In [1]:
#library imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import warnings
from matplotlib.pyplot import figure
warnings.filterwarnings('ignore')

In [5]:
# Load the input tables. All paths are relative to the notebook's working directory.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling, so only
# load pickle files that come from a trusted source.
returns = pd.read_pickle("./Data/returns.pkl")

# The two date-indexed CSVs are parsed so that their index holds datetimes.
volumes = pd.read_csv(
    './Data/stock_volumes.csv',
    index_col='date',
    parse_dates=True,
)
info = pd.read_csv(
    './Data/stock_info.csv',
    index_col='Instrument',
)
sp_listings = pd.read_csv(
    './Data/sp500_listings.csv',
    index_col='date',
    parse_dates=True,
)

In [6]:
# Discard the first row of the returns table.
# NOTE: this rebinds `returns` from itself, so the cell is not idempotent:
# every re-run without reloading the data drops one more row.
returns = returns.iloc[1:]

In [69]:
def get_investable(stock, t_start, t_end, past_n_days):
    """Build a lagged training frame for predicting ``stock``'s next return.

    Reads the module-level ``returns`` frame (date-indexed, one column per
    stock). Every output row pairs all stocks' returns on a date with
    ``stock``'s return on the next later date present in the combined index;
    that target is stored in the column ``'t+1 return'``. Rows are ordered
    newest first.

    Parameters
    ----------
    stock : column label of ``returns``
        Stock whose next-period return is the prediction target.
    t_start, t_end : datetime-like
        First and last target dates (inclusive). Predictor rows are taken
        from one calendar day earlier on both ends. Strings are accepted and
        converted with ``pd.to_datetime``.
    past_n_days : int
        Number of most recent *rows* to keep (a row count, not a number of
        calendar days).

    Returns
    -------
    pandas.DataFrame
        The ``past_n_days`` newest rows, restricted to the columns that have
        no missing value anywhere in those rows. The ``'t+1 return'`` column
        is subject to the same filter, so it is absent from the result if the
        target itself has a gap inside the window.
    """
    t_start = pd.to_datetime(t_start)
    t_end = pd.to_datetime(t_end)
    one_day_back = pd.DateOffset(-1)

    # Target: the stock's own returns over [t_start, t_end].
    target = returns.loc[t_start:t_end, [stock]].copy()
    target.columns = ['t+1 return']

    # Predictors: every stock's returns over the window moved back one
    # calendar day, so that only past information sits next to each target.
    predictors = returns.loc[(t_start + one_day_back):(t_end + one_day_back), :]

    # Align on date, newest first. Shifting the target down by one row then
    # places the return of the next later date beside each predictor row.
    lagged = pd.concat([predictors, target], axis = 1).sort_index(ascending = False)
    lagged['t+1 return'] = lagged['t+1 return'].shift(1)
    # The newest row has no following return after the shift, so drop it.
    lagged = lagged.iloc[1:, :]

    # Keep only the columns that are fully populated over the recent window;
    # a gap means that column cannot be used for this training set.
    window = lagged.iloc[:past_n_days]
    fully_populated = window.notna().all(axis = 0)

    return window.loc[:, fully_populated]

In [70]:
def get_start_split_end(stock, date, split_ratio = 0.9):
    """Locate the start of the trailing ``split_ratio`` share of a stock's history.

    Reads the module-level ``returns`` frame, discards the rows where
    ``stock`` has a missing value (per the original author's note, these mark
    dates on which the stock was not listed on the S&P) and measures the time
    span between the first and the last remaining date.

    NOTE(review): ``date`` is accepted but never referenced in the body, so
    the window is anchored at the stock's last non-missing date rather than
    at ``date``. Confirm whether that is intended.

    Parameters
    ----------
    stock : column label of ``returns``
    date : unused (see note above)
    split_ratio : float, default 0.9
        Fraction of the first-to-last span that the window should cover.

    Returns
    -------
    tuple
        ``(start, n_days)`` where ``start`` is the last non-missing date minus
        the scaled span, and ``n_days`` is the whole number of calendar days
        in that scaled span (calendar days, not a count of rows).
    """
    listed_dates = returns[[stock]].dropna().index
    first_listing, last_listing = listed_dates[0], listed_dates[-1]

    # Timedelta covering `split_ratio` of the stock's available history.
    window_span = split_ratio * (last_listing - first_listing)

    return last_listing - window_span, window_span.days

In [86]:
from sklearn.linear_model import LinearRegression

def linear_predict(stock, date):
    """Prepare and show the training data for one stock's linear model.

    Work in progress: no model is fitted yet. The function builds the
    training frame, splits it into predictors and target, displays both and
    prints the calendar day that follows ``date``. It returns ``None``.

    Parameters
    ----------
    stock : column label of the returns table
        Stock to build the training set for.
    date : datetime-like
        Last date of the training window; strings are accepted.
    """
    # Convert up front: the date is used in Timestamp arithmetic below and is
    # handed on to get_investable, so a plain string must not get that far.
    date = pd.to_datetime(date)

    #split into train and test set
    start, n_days = get_start_split_end(stock, date)
    investable = get_investable(stock, start, date, n_days)

    X_train = investable.drop('t+1 return', axis = 1)
    y_train = investable[['t+1 return']]

    display(X_train)
    display(y_train)

    # Next calendar day after `date`.
    # NOTE(review): this is not necessarily the next trading day - confirm
    # which one the prediction should target.
    next_date = date + pd.DateOffset(1)
    print(next_date)
    

In [87]:
# Example run: choose a stock identifier and the date after which to predict.
stock = '905270'
pred_after = pd.to_datetime('1999-02-24')
# linear_predict displays the training predictors and target, then prints the
# calendar day that follows `pred_after`.
linear_predict(stock, pred_after)

Unnamed: 0_level_0,905270,921795,904261,905261,916328,936365,902355,912215,905271,921246,...,905652,701667,921509,511339,541798,906828,923298,992765,922853,543755
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1999-02-23,-0.576744,-1.551363,-0.504323,-0.718177,-0.132424,-2.913124,-0.444444,-1.667515,-2.025804,-1.684533,...,0.000000,0.172733,1.497504,-0.512821,0.810485,-1.345756,-0.727277,0.000000,0.000000,-0.745092
1999-02-22,0.111753,5.019815,4.282494,0.937383,2.717539,9.575035,0.106781,2.025974,2.259972,1.240310,...,2.058724,1.049510,-1.313629,-0.510204,3.932692,1.594155,-0.182359,-1.298701,29.166667,1.512727
1999-02-19,0.111878,0.000000,0.452830,-0.137294,0.000000,2.919708,0.106895,-1.521934,0.188414,0.000000,...,-0.403361,-0.694444,0.827815,-1.754386,-0.280335,0.688310,-1.076628,-1.910828,-7.692308,-2.225699
1999-02-18,1.668246,-3.155650,0.913938,1.843142,2.222244,0.735294,0.000000,0.000000,-4.158921,0.467290,...,0.404995,-2.702703,-2.737520,-0.250000,-0.833000,3.073388,-0.713015,0.000000,0.000000,3.847290
1999-02-17,0.228007,0.903614,-1.868460,-0.367275,-4.127726,-0.366884,0.000000,-2.335748,2.979519,-2.134146,...,0.417985,-3.425523,1.803279,0.000000,-0.826118,0.000000,-1.232676,-1.875000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1990-12-07,-0.584270,-0.722674,0.961538,-0.834202,-0.278092,2.777778,-0.747986,-3.134403,0.927569,0.543478,...,0.652377,6.300630,-0.836446,0.757245,1.052901,2.000000,0.000000,0.756322,0.000000,3.451130
1990-12-06,-0.268938,2.216066,1.443621,0.000000,1.122851,-2.702703,0.000000,3.235827,-1.597901,-1.866667,...,0.656660,0.000000,0.843502,2.325495,-3.061983,2.738123,2.000000,1.539626,1.960784,1.751579
1990-12-05,-1.414052,3.537285,0.509804,-0.827301,-0.559284,5.714286,2.962085,2.331126,0.689880,3.591160,...,2.683363,0.000000,0.850677,-2.272645,-1.010349,0.692808,0.000000,3.999155,2.000000,0.000000
1990-12-04,1.982875,2.048780,-0.507218,0.834202,1.704418,0.000000,0.776119,-1.307190,-0.457815,3.724928,...,0.000000,0.000000,2.174503,-1.491896,0.000000,-1.362094,7.528732,0.807060,4.166667,0.000000


Unnamed: 0_level_0,t+1 return
date,Unnamed: 1_level_1
1999-02-23,-0.243263
1999-02-22,-0.576744
1999-02-19,0.111753
1999-02-18,0.111878
1999-02-17,1.668246
...,...
1990-12-07,-0.271248
1990-12-06,-0.584270
1990-12-05,-0.268938
1990-12-04,-1.414052


1999-02-25 00:00:00
