In [1]:
#library imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import talib
import warnings
from matplotlib.pyplot import figure
warnings.filterwarnings('ignore')

In [16]:
#data imports
#volumes, prices and the index listings use their 'date' column as the index
#(parsed as dates); stock_info is loaded with the default integer index
prices = pd.read_csv(
    'Data/stock_prices.csv',
    index_col = 'date',
    parse_dates = True,
)
volumes = pd.read_csv(
    'Data/stock_volumes.csv',
    index_col = 'date',
    parse_dates = True,
)
sp_listings = pd.read_csv(
    'Data/sp500_listings.csv',
    index_col = 'date',
    parse_dates = True,
)
info = pd.read_csv('Data/stock_info.csv')

## Data Cleaning

In [None]:
#Blank out every price recorded after a stock's last appearance in sp_listings.
#sp_listings is indexed by date and has a 'stock' column; rows that do not
#match the current code are turned into NaN by .where(), so last_valid_index()
#returns the last date on which the code is listed.
#NOTE(review): if a price column never appears in sp_listings, last_valid_index()
#returns None and the date arithmetic below will fail -- confirm every column is covered.
for col in prices.columns:
    #masking starts one calendar day after the last listing date
    delist_date = sp_listings.where(sp_listings['stock'] == col).last_valid_index() + pd.DateOffset(1)
    #assign through a single .loc call on the frame itself; the previous
    #`stock.loc[delist_date:][col] = np.nan` was chained indexing on a temporary
    #copy, which pandas does not guarantee to write through (and the resulting
    #warning is silenced at the top of the notebook)
    prices.loc[delist_date:, col] = np.nan

In [58]:
#inspect the price matrix: dates down the rows, one column per stock code
prices

Unnamed: 0_level_0,905270,921795,904261,905261,916328,923024,936365,902355,912215,929813,...,9660J1,69568X,543755,77463M,29235J,131745,69487D,68157P,9110RA,292703
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-01-02,22.75,12.63,30.00,29.61,3.8736,11.00,4.1250,24.50,58.00,19.00,...,,,,,,,,,,
1990-01-03,22.56,12.57,31.50,29.37,3.8876,11.00,4.0000,24.50,57.75,18.75,...,,,,,,,,,,
1990-01-04,22.38,12.41,32.25,28.89,3.8806,11.38,3.9375,24.63,57.88,18.75,...,,,,,,,,,,
1990-01-05,21.81,12.60,32.13,28.41,3.8387,11.50,3.8125,24.75,57.50,18.50,...,,,,,,,,,,
1990-01-08,22.25,12.49,32.13,29.13,3.8387,11.50,3.8125,25.00,57.75,19.13,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-24,,,,,,,,,,,...,,,,,,,,,,
2021-05-25,,,,,,,,,,,...,,,,,,,,,,
2021-05-26,,,,,,,,,,,...,,,,,,,,,,
2021-05-27,,,,,,,,,,,...,,,,,,,,,,


In [38]:
#Collect and drop the stocks that have no price on any date.
#isna().all() reduces column-wise and is True exactly for columns whose every
#row is NaN. The earlier `isna().value_counts()[1]` looked up an integer key on
#a boolean index; how pandas resolves that key differs between versions, and it
#can raise when a column contains only one of the two True/False outcomes.
drop_columns = prices.columns[prices.isna().all()].tolist()

prices = prices.drop(columns = drop_columns)

In [51]:
#daily returns in percent for every stock; pct_change builds a new frame,
#so `prices` itself is left untouched
#NOTE(review): how pct_change treats NaN gaps depends on its fill_method
#default in the installed pandas -- confirm this matches the intent
prices_r = prices.pct_change() * 100

In [52]:
#Re-apply the delisting mask to the return matrix: any return dated after a
#stock's last appearance in sp_listings is set to NaN.
#(presumably needed because pct_change can produce values past the last
#listed price -- TODO confirm)
#NOTE(review): if a column never appears in sp_listings, last_valid_index()
#returns None and the date arithmetic below will fail -- confirm every column is covered.
for col in prices_r.columns:
    #masking starts one calendar day after the last listing date
    delist_date = sp_listings.where(sp_listings['stock'] == col).last_valid_index() + pd.DateOffset(1)
    #assign through a single .loc call on the frame itself; the previous
    #`stock.loc[delist_date:][col] = np.nan` was chained indexing on a temporary
    #copy, which pandas does not guarantee to write through
    prices_r.loc[delist_date:, col] = np.nan

In [56]:
#write the masked return matrix to disk as a pickle
#(in notebook order this happens before the first row is dropped below)
prices_r.to_pickle('./Data/returns.pkl')

## Data Preparation

In [72]:
#drop the very first row: pct_change has no earlier price to compare against,
#so that row is NaN for every stock.
#The guard makes the cell safe to re-run: an unconditional `iloc[1:]` strips
#one more row of real returns on every execution.
if len(prices_r) and prices_r.iloc[0].isna().all():
    prices_r = prices_r.iloc[1:, :]

In [60]:
#sample stock: column label (stock code) of the stock whose returns are modelled
stock = '905270'

In [99]:
#cut-off date: the target series runs up to and including this day
t = pd.Timestamp('1999-02-24')

In [100]:
#target: the sample stock's returns up to and including t, as a one-column
#frame named 'Outcome', newest date first
y = (
    prices_r.loc[:t, [stock]]
    .rename(columns = {stock: 'Outcome'})
    .sort_index(ascending = False)
)
y

Unnamed: 0_level_0,Outcome
date,Unnamed: 1_level_1
1999-02-24,-0.243263
1999-02-23,-0.576744
1999-02-22,0.111753
1999-02-19,0.111878
1999-02-18,1.668246
...,...
1990-01-09,-1.123596
1990-01-08,2.017423
1990-01-05,-2.546917
1990-01-04,-0.797872


In [101]:
#features: returns of every stock through the calendar day before t,
#newest date first (the row for t itself is excluded)
X = prices_r.loc[:t - pd.DateOffset(1), :].sort_index(ascending = False)
X

Unnamed: 0_level_0,905270,921795,904261,905261,916328,923024,936365,902355,912215,929813,...,9660J1,69568X,543755,77463M,29235J,131745,69487D,68157P,9110RA,292703
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1999-02-23,-0.576744,-1.551363,-0.504323,-0.718177,-0.132424,,-2.913124,-0.444444,-1.667515,,...,,,-0.745092,,,4.413908,,,,
1999-02-22,0.111753,5.019815,4.282494,0.937383,2.717539,,9.575035,0.106781,2.025974,,...,,,1.512727,,,-0.743955,,,,
1999-02-19,0.111878,0.000000,0.452830,-0.137294,0.000000,,2.919708,0.106895,-1.521934,,...,,,-2.225699,,,-2.143579,,,,
1999-02-18,1.668246,-3.155650,0.913938,1.843142,2.222244,,0.735294,0.000000,0.000000,,...,,,3.847290,,,-0.702811,,,,
1999-02-17,0.228007,0.903614,-1.868460,-0.367275,-4.127726,,-0.366884,0.000000,-2.335748,,...,,,0.000000,,,-0.697906,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1990-01-09,-1.123596,-2.962370,-0.404606,-1.098524,-0.364707,-1.043478,1.639344,0.000000,0.432900,-3.293257,...,,,,,,,,,,
1990-01-08,2.017423,-0.873016,0.000000,2.534319,0.000000,0.000000,0.000000,1.010101,0.434783,3.405405,...,,,,,,,,,,
1990-01-05,-2.546917,1.531023,-0.372093,-1.661475,-1.079730,1.054482,-3.174603,0.487211,-0.656531,-1.333333,...,,,,,,,,,,
1990-01-04,-0.797872,-1.272872,2.380952,-1.634321,-0.180060,3.454545,-1.562500,0.530612,0.225108,0.000000,...,,,,,,,,,,


In [118]:
#place features and target side by side, aligned on date, newest first;
#the row for t has a target but no features at this stage
df_full = pd.concat([X, y], axis = 'columns').sort_index(ascending = False)
df_full

Unnamed: 0_level_0,905270,921795,904261,905261,916328,923024,936365,902355,912215,929813,...,69568X,543755,77463M,29235J,131745,69487D,68157P,9110RA,292703,Outcome
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1999-02-24,,,,,,,,,,,...,,,,,,,,,,-0.243263
1999-02-23,-0.576744,-1.551363,-0.504323,-0.718177,-0.132424,,-2.913124,-0.444444,-1.667515,,...,,-0.745092,,,4.413908,,,,,-0.576744
1999-02-22,0.111753,5.019815,4.282494,0.937383,2.717539,,9.575035,0.106781,2.025974,,...,,1.512727,,,-0.743955,,,,,0.111753
1999-02-19,0.111878,0.000000,0.452830,-0.137294,0.000000,,2.919708,0.106895,-1.521934,,...,,-2.225699,,,-2.143579,,,,,0.111878
1999-02-18,1.668246,-3.155650,0.913938,1.843142,2.222244,,0.735294,0.000000,0.000000,,...,,3.847290,,,-0.702811,,,,,1.668246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1990-01-09,-1.123596,-2.962370,-0.404606,-1.098524,-0.364707,-1.043478,1.639344,0.000000,0.432900,-3.293257,...,,,,,,,,,,-1.123596
1990-01-08,2.017423,-0.873016,0.000000,2.534319,0.000000,0.000000,0.000000,1.010101,0.434783,3.405405,...,,,,,,,,,,2.017423
1990-01-05,-2.546917,1.531023,-0.372093,-1.661475,-1.079730,1.054482,-3.174603,0.487211,-0.656531,-1.333333,...,,,,,,,,,,-2.546917
1990-01-04,-0.797872,-1.272872,2.380952,-1.634321,-0.180060,3.454545,-1.562500,0.530612,0.225108,0.000000,...,,,,,,,,,,-0.797872


In [119]:
#shift outcome down 1 row so each date's features sit next to the return of
#the row above it (the next date, since the frame is sorted newest first),
#then discard the top row, which is left without an outcome.
#Note: this rewrites df_full from itself, so re-running the cell shifts again;
#re-run the concatenation cell first.
df_full = df_full.assign(Outcome = df_full['Outcome'].shift(1)).iloc[1:, :]
df_full

Unnamed: 0_level_0,905270,921795,904261,905261,916328,923024,936365,902355,912215,929813,...,69568X,543755,77463M,29235J,131745,69487D,68157P,9110RA,292703,Outcome
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1999-02-23,-0.576744,-1.551363,-0.504323,-0.718177,-0.132424,,-2.913124,-0.444444,-1.667515,,...,,-0.745092,,,4.413908,,,,,-0.243263
1999-02-22,0.111753,5.019815,4.282494,0.937383,2.717539,,9.575035,0.106781,2.025974,,...,,1.512727,,,-0.743955,,,,,-0.576744
1999-02-19,0.111878,0.000000,0.452830,-0.137294,0.000000,,2.919708,0.106895,-1.521934,,...,,-2.225699,,,-2.143579,,,,,0.111753
1999-02-18,1.668246,-3.155650,0.913938,1.843142,2.222244,,0.735294,0.000000,0.000000,,...,,3.847290,,,-0.702811,,,,,0.111878
1999-02-17,0.228007,0.903614,-1.868460,-0.367275,-4.127726,,-0.366884,0.000000,-2.335748,,...,,0.000000,,,-0.697906,,,,,1.668246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1990-01-09,-1.123596,-2.962370,-0.404606,-1.098524,-0.364707,-1.043478,1.639344,0.000000,0.432900,-3.293257,...,,,,,,,,,,-1.136364
1990-01-08,2.017423,-0.873016,0.000000,2.534319,0.000000,0.000000,0.000000,1.010101,0.434783,3.405405,...,,,,,,,,,,-1.123596
1990-01-05,-2.546917,1.531023,-0.372093,-1.661475,-1.079730,1.054482,-3.174603,0.487211,-0.656531,-1.333333,...,,,,,,,,,,2.017423
1990-01-04,-0.797872,-1.272872,2.380952,-1.634321,-0.180060,3.454545,-1.562500,0.530612,0.225108,0.000000,...,,,,,,,,,,-2.546917


In [120]:
#sanity check: True would mean the sample stock has a missing return in the window
df_full[stock].isna().any()

False

In [124]:
#investable universe: labels of the columns (stocks, plus 'Outcome') that have
#no missing value on any date in the window.
#notna().all() reduces column-wise in one pass and avoids applying bitwise `~`
#to a scalar: `~` is a logical "not" only for numpy booleans, whereas on a
#plain Python bool it yields -1 / -2, both truthy, which would admit every column.
investable_universe = df_full.columns[df_full.notna().all()].tolist()

len(investable_universe)

604

In [125]:
#keep only the fully populated columns
df_investable = df_full.loc[:, investable_universe]

In [128]:
#inspect the modelling frame: complete stock columns followed by 'Outcome'
df_investable

Unnamed: 0_level_0,905270,921795,904261,905261,916328,936365,902355,912215,905271,921246,...,905652,701667,921509,511339,541798,906828,923298,992765,922853,Outcome
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1999-02-23,-0.576744,-1.551363,-0.504323,-0.718177,-0.132424,-2.913124,-0.444444,-1.667515,-2.025804,-1.684533,...,0.000000,0.172733,1.497504,-0.512821,0.810485,-1.345756,-0.727277,0.000000,0.000000,-0.243263
1999-02-22,0.111753,5.019815,4.282494,0.937383,2.717539,9.575035,0.106781,2.025974,2.259972,1.240310,...,2.058724,1.049510,-1.313629,-0.510204,3.932692,1.594155,-0.182359,-1.298701,29.166667,-0.576744
1999-02-19,0.111878,0.000000,0.452830,-0.137294,0.000000,2.919708,0.106895,-1.521934,0.188414,0.000000,...,-0.403361,-0.694444,0.827815,-1.754386,-0.280335,0.688310,-1.076628,-1.910828,-7.692308,0.111753
1999-02-18,1.668246,-3.155650,0.913938,1.843142,2.222244,0.735294,0.000000,0.000000,-4.158921,0.467290,...,0.404995,-2.702703,-2.737520,-0.250000,-0.833000,3.073388,-0.713015,0.000000,0.000000,0.111878
1999-02-17,0.228007,0.903614,-1.868460,-0.367275,-4.127726,-0.366884,0.000000,-2.335748,2.979519,-2.134146,...,0.417985,-3.425523,1.803279,0.000000,-0.826118,0.000000,-1.232676,-1.875000,0.000000,1.668246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1990-01-09,-1.123596,-2.962370,-0.404606,-1.098524,-0.364707,1.639344,0.000000,0.432900,-0.999463,-1.973684,...,0.697708,0.000000,1.167253,2.127587,-1.504126,-1.796383,0.648312,0.000000,-1.448930,-1.136364
1990-01-08,2.017423,-0.873016,0.000000,2.534319,0.000000,0.000000,1.010101,0.434783,1.009554,1.672241,...,-0.692873,-6.717850,0.390918,0.713973,1.527095,-0.297825,0.000000,0.756322,0.000000,-1.123596
1990-01-05,-2.546917,1.531023,-0.372093,-1.661475,-1.079730,-3.174603,0.487211,-0.656531,1.279191,0.673401,...,0.000000,3.475670,-1.538689,2.941670,-0.757762,0.000000,1.315789,0.000000,0.000000,2.017423
1990-01-04,-0.797872,-1.272872,2.380952,-1.634321,-0.180060,-1.562500,0.530612,0.225108,-1.263035,0.677966,...,-0.688106,0.000000,1.961319,-2.158205,-2.941877,-0.885564,1.333333,0.000000,0.000000,-2.546917


## Model

In [129]:
#split the modelling frame: y_m is the one-column target frame,
#X_m is every remaining column (the investable stocks' returns)
y_m = df_investable[['Outcome']]
X_m = df_investable.drop(columns = 'Outcome')

In [132]:
from sklearn.linear_model import LinearRegression

In [133]:
#linear regression estimator, constructed with scikit-learn's default settings
lm = LinearRegression()

In [134]:
#fit on the whole window; y_m is passed as a one-column DataFrame
#NOTE(review): no train/test split is made here -- confirm evaluation happens elsewhere
lm.fit(X_m, y_m)

LinearRegression()