In [1]:
############ADD FEATURES AND RESPONSE############

In [2]:
# import required modules
import sqlite3
import pandas as pd
import numpy as np
import ta

In [3]:
def df_from_db(dbname):
    """
    Setup sqlite connection and returns the dataframe
    """
    conn = sqlite3.connect(dbname)
    # load table to dataframe
    tdf = pd.read_sql("SELECT * FROM eow", 
                 conn, parse_dates="date")
    conn.close()
    print(tdf.head())
    return tdf

In [4]:
def optimize_columns(tdf):
    """
    Optimize the memory allocation of the columns
    """
    tdf['open'] = tdf['open'].astype(float)
    tdf['high'] = tdf['high'].astype(float)
    tdf['low'] = tdf['low'].astype(float)
    tdf['close'] = tdf['close'].astype(float)
    tdf['adjclose'] = tdf['adjclose'].astype(float)
    tdf['volume'] = tdf['volume'].astype(int)
    print(tdf.memory_usage())
    return tdf

In [5]:
def index_operations(tdf):
    """
    Performs preliminery index operations
    """
    # Make symbol as index
    tdf.set_index(keys= "symbol", inplace = True, append=False)
    # Make date as index
    tdf.set_index(keys= "date", inplace = True, append=True)
    # Sort the entire dataset
    tdf.sort_index(inplace=True)
    print(tdf.head())
    return tdf

In [6]:
def add_response(tdf):
    """
    Add response variables:
    shifted logret and shifted adjclose
    """
    # Add logret
    tdf['logret'] = np.log(tdf.adjclose).groupby(tdf.company).diff()
    # Shift the log returns
    tdf['slogret'] = tdf['logret'].shift(-1)
    # Shifted adjclose
    tdf['sadjclose'] = tdf.adjclose.groupby(tdf.company).shift(-1)
    # Drop last row of each ticker, (has NAN)
    tdf = tdf.dropna(axis=0)
    print(tdf.head())
    return tdf

In [7]:
def print_stats(df):
    print("########COLUMN NAN##############")
    print("MAX:",df.isnull().sum(axis = 1).groupby(df.company).sum().min())
    print("MIN:",df.isnull().sum(axis = 1).groupby(df.company).sum().max())
    print(df.isnull().sum(axis = 1).groupby(df.company).sum())
    print("########ROW NAN##############")
    print("MAX:", df.isnull().sum(axis = 0).max())
    print("MIN:",df.isnull().sum(axis = 0).min())
    print(df.isnull().sum(axis = 0))
    print("########DF SHAPE##############")
    print(df.shape)
    print("########DF HEAD###############")
    print(df.head)
    return

In [8]:
# Store it in the database
def df_to_db(tdf, dbname):
    """
    save database as sqlite db
    """
    conn = sqlite3.connect(dbname)
    tdf.to_sql(name='eow', con=conn, if_exists='replace', index=True, index_label=['symbol', 'date'])
    conn.close()
    return

In [None]:
############################################

In [9]:
df = df_from_db("stockseow.db")
df = optimize_columns(df)
df = index_operations(df)

  symbol       date company     open     high    low  close  adjclose  \
0      A 2011-01-07       A  41.5600  42.1391  41.00  41.62   27.3601   
1      A 2011-01-14       A  41.4100  43.4100  41.30  43.26   28.4382   
2      A 2011-01-21       A  43.3700  44.4500  41.46  42.11   27.6822   
3      A 2011-01-28       A  42.1834  43.5200  40.88  40.98   26.9394   
4      A 2011-02-04       A  41.2100  43.1300  40.23  42.99   28.2607   

     volume  
0  23040100  
1  17917600  
2  21525200  
3  22294000  
4  27637800  
Index           128
symbol      1679200
date        1679200
company     1679200
open        1679200
high        1679200
low         1679200
close       1679200
adjclose    1679200
volume      1679200
dtype: int64
                  company     open     high    low  close  adjclose    volume
symbol date                                                                  
A      2011-01-07       A  41.5600  42.1391  41.00  41.62   27.3601  23040100
       2011-01-14       A  41.

In [10]:
## remove LHX, just 145 values
grouped = df.groupby('company')
df = df.drop(grouped.get_group('LHX').index)

In [11]:
# Check if all companies have equal entries
grouped = df.adjclose.groupby(df.company)
grouped.count().min() == grouped.count().max()

True

In [12]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,company,open,high,low,close,adjclose,volume
symbol,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,2011-01-07,A,41.56,42.1391,41.0,41.62,27.3601,23040100
A,2011-01-14,A,41.41,43.41,41.3,43.26,28.4382,17917600
A,2011-01-21,A,43.37,44.45,41.46,42.11,27.6822,21525200
A,2011-01-28,A,42.1834,43.52,40.88,40.98,26.9394,22294000
A,2011-02-04,A,41.21,43.13,40.23,42.99,28.2607,27637800


In [13]:
# Add all features
ta.add_all_ta_features(
    df, open="open", high="high", low="low", close="close", volume="volume", fillna=True)
df.head()

  dip[i] = 100 * (self._dip[i]/self._trs[i])
  din[i] = 100 * (self._din[i]/self._trs[i])


Unnamed: 0_level_0,Unnamed: 1_level_0,company,open,high,low,close,adjclose,volume,volume_adi,volume_obv,volume_cmf,...,momentum_uo,momentum_stoch,momentum_stoch_signal,momentum_wr,momentum_ao,momentum_kama,momentum_roc,others_dr,others_dlr,others_cr
symbol,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A,2011-01-07,A,41.56,42.1391,41.0,41.62,27.3601,23040100,2040862.0,23040100,0.088579,...,1.340688,54.428935,54.428935,-45.571065,0.0,41.62,0.0,-52.295222,0.0,0.0
A,2011-01-14,A,41.41,43.41,41.3,43.26,28.4382,17917600,17410940.0,40957700,0.425096,...,5.335547,93.775934,74.102434,-6.224066,0.0,43.26,0.0,3.940413,3.86476,3.940413
A,2011-01-21,A,43.37,44.45,41.46,42.11,27.6822,21525200,5244519.0,19432500,0.083935,...,6.290787,32.173913,60.126261,-67.826087,0.0,42.11,0.0,-2.658345,-2.694318,1.177319
A,2011-01-28,A,42.1834,43.52,40.88,40.98,26.9394,22294000,-15360540.0,-2861500,-0.181188,...,6.168388,2.80112,42.916989,-97.19888,0.0,40.98,0.0,-2.683448,-2.72011,-1.537722
A,2011-02-04,A,41.21,43.13,40.23,42.99,28.2607,27637800,9608781.0,24776300,0.085476,...,10.705823,65.402844,33.459292,-34.597156,0.0,42.99,0.0,4.904832,4.788339,3.291687


In [14]:
df = add_response(df)

                  company     open   high    low  close  adjclose    volume  \
symbol date                                                                   
A      2011-01-14       A  41.4100  43.41  41.30  43.26   28.4382  17917600   
       2011-01-21       A  43.3700  44.45  41.46  42.11   27.6822  21525200   
       2011-01-28       A  42.1834  43.52  40.88  40.98   26.9394  22294000   
       2011-02-04       A  41.2100  43.13  40.23  42.99   28.2607  27637800   
       2011-02-11       A  43.4500  45.15  42.57  45.02   29.5952  27674300   

                     volume_adi  volume_obv  volume_cmf  ...  momentum_wr  \
symbol date                                              ...                
A      2011-01-14  1.741094e+07    40957700    0.425096  ...    -6.224066   
       2011-01-21  5.244519e+06    19432500    0.083935  ...   -67.826087   
       2011-01-28 -1.536054e+07    -2861500   -0.181188  ...   -97.198880   
       2011-02-04  9.608781e+06    24776300    0.085476  ... 

In [15]:
print_stats(df)

########COLUMN NAN##############
MAX: 0
MIN: 0
company
A       0
AAL     0
AAP     0
AAPL    0
ABC     0
       ..
XRAY    0
XRX     0
YUM     0
ZBH     0
ZION    0
Length: 461, dtype: int64
########ROW NAN##############
MAX: 0
MIN: 0
company       0
open          0
high          0
low           0
close         0
             ..
others_dlr    0
others_cr     0
logret        0
slogret       0
sadjclose     0
Length: 71, dtype: int64
########DF SHAPE##############
(208833, 71)
########DF HEAD###############
<bound method NDFrame.head of                   company     open     high    low  close  adjclose    volume  \
symbol date                                                                     
A      2011-01-14       A  41.4100  43.4100  41.30  43.26   28.4382  17917600   
       2011-01-21       A  43.3700  44.4500  41.46  42.11   27.6822  21525200   
       2011-01-28       A  42.1834  43.5200  40.88  40.98   26.9394  22294000   
       2011-02-04       A  41.2100  43.1300  40.23  42

In [16]:
df_to_db(df, 'stocks-eow.db')

In [None]:
########################################################