In [1]:
#library imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import warnings
from matplotlib.pyplot import figure
warnings.filterwarnings('ignore')

In [5]:
#data imports
# All paths are relative to the notebook's working directory (./Data/...).
# NOTE(security): read_pickle executes arbitrary code embedded in the file; only load
# pickles produced by a trusted source.
# `returns` is later indexed by date (rows) and by stock identifier (columns).
returns = pd.read_pickle("./Data/returns.pkl")
# CSV indexed by its 'date' column, parsed to datetimes.
# presumably per-stock traded volumes - TODO confirm against the file
volumes = pd.read_csv('./Data/stock_volumes.csv', index_col = 'date', parse_dates = True)
# CSV indexed by its 'Instrument' column.
# presumably static per-stock metadata - TODO confirm against the file
info = pd.read_csv('./Data/stock_info.csv', index_col = 'Instrument')
# CSV indexed by its 'date' column, parsed to datetimes.
# presumably S&P 500 membership by date - TODO confirm against the file
sp_listings = pd.read_csv('./Data/sp500_listings.csv', index_col = 'date', parse_dates = True)

In [6]:
#drop very first row
# presumably the first row carries no usable return (e.g. NaN from differencing) - TODO confirm
# NOTE: this rebinds `returns` to a shortened copy of itself, so the cell is not idempotent:
# every re-execution without reloading the data removes one more leading row.
returns = returns.iloc[1:, :]

In [190]:
def get_features_targets(stock):
    """Build the feature/target frame for one stock from the global `returns`.

    The result is sorted newest-first. Each row holds the returns of every
    column of `returns` on that row's date, plus a '<stock> t+1' column
    holding the given stock's return on the next later date present in the
    frame. The date range is bounded by the stock's first and last non-missing
    return.

    Note: other stocks' columns can still contain missing values; callers are
    expected to filter further for the prediction date and the amount of
    history they want to use.

    Parameters:
        stock: column label of the stock in `returns`.

    Returns:
        DataFrame with all columns of `returns` followed by '<stock> t+1'.
    """
    target_col = str(stock) + ' t+1'

    # dates on which the stock has a return (missing values mean "not listed")
    listed_dates = returns[[stock]].dropna().index
    t_start, t_end = listed_dates[0], listed_dates[-1]

    # target: the stock's own returns over its listing window, newest first
    target = (
        returns[[stock]]
        .loc[t_start:t_end, :]
        .rename(columns = {stock: target_col})
        .sort_index(ascending = False)
    )

    # features: every column's returns over the same window moved back one calendar day
    one_day_back = pd.DateOffset(-1)
    features = (
        returns
        .loc[(t_start + one_day_back):(t_end + one_day_back), :]
        .sort_index(ascending = False)
    )

    combined = pd.concat([features, target], axis = 1).sort_index(ascending = False)

    # with newest-first ordering, shift(1) moves each target down onto the previous date's row,
    # so a row's features are paired with the following date's return
    combined[target_col] = combined[target_col].shift(1)

    # the newest row has no following return after the shift, so drop it
    return combined.iloc[1:, :]

In [180]:
def get_train(stock, date, n_rows_pct = 0.9):
    """Compute the date bounds of the training window that ends just before `date`.

    Parameters:
        stock: column label of the stock in the global `returns`.
        date: date to predict after; must support datetime arithmetic
            (e.g. a pandas Timestamp).
        n_rows_pct: fraction of the stock's listing span (time between its
            first and last non-missing return) to look back from `date`.
            Note this scales a time span, not a row count.

    Returns:
        (most_recent, least_recent): `date` minus one calendar day, and
        `date` minus the look-back span.
    """
    # dates on which the stock has a return (missing values mean "not listed")
    listed_dates = returns[[stock]].dropna().index
    listing_span = listed_dates[-1] - listed_dates[0]

    lookback = n_rows_pct * listing_span  # a timedelta, not a number of rows

    return date + pd.DateOffset(-1), date - lookback

In [181]:
def get_investable(df, most_recent, least_recent):
    """Restrict a feature/target frame to columns without missing values and to a date window.

    Parameters:
        df: a stock's frame from get_features_targets (expected newest-first).
        most_recent: newest date to keep in the returned rows.
        least_recent: oldest date to keep in the returned rows.

    Returns:
        DataFrame containing only the columns that have no missing value in
        `df.loc[:least_recent]`, with rows sliced to `most_recent:least_recent`.

    NOTE(review): the missing-value screen runs over `df.loc[:least_recent]`,
    i.e. from the first row of `df` down to `least_recent`; `most_recent` only
    affects the final row slice. With a newest-first frame this means rows dated
    after `most_recent` also decide which columns are kept. That behaviour is
    preserved here because fit_predict calls this function twice with different
    `most_recent` values and needs identical columns from both calls; confirm
    whether screening on future rows is intended.
    """
    # rows that take part in the missing-value screen
    screened = df.loc[:least_recent]

    # use logical `not`: bitwise `~` only negates correctly for numpy booleans
    # (for a Python bool, ~True == -2, which is truthy)
    investable_universe = [
        col for col in df.columns if not screened[col].isna().any()
    ]

    # a dataframe with no missing returns for any kept column
    df_investable = df[investable_universe]
    return df_investable.loc[most_recent:least_recent]

In [219]:
from sklearn.linear_model import LinearRegression

def fit_predict(stock, date):
    """Fit a linear model for one stock and predict its return for the trading day after `date`.

    Training uses the rows of the stock's feature/target frame between the
    bounds returned by get_train (the newest training row is `date` minus one
    calendar day), restricted to the columns kept by get_investable. The
    prediction is made from the feature row at `date`, whose '<stock> t+1'
    value is the realised next return.

    Parameters:
        stock: column label of the stock.
        date: date to predict after; it must be present in the frame's index,
            otherwise the row lookup raises KeyError.

    Returns:
        (y_test, prediction): the realised '<stock> t+1' value at `date` as a
        one-element Series, and the array returned by model.predict for that row.
    """
    date = pd.Timestamp(date)  # accept strings / datetimes as well as Timestamps
    target_col = str(stock) + ' t+1'

    df = get_features_targets(stock)
    most_recent, least_recent = get_train(stock, date)
    train_investable = get_investable(df, most_recent, least_recent)

    X_train = train_investable.drop(target_col, axis = 1)
    y_train = train_investable[[target_col]]

    model = LinearRegression()
    model.fit(X_train, y_train)

    # Features and label must come from the SAME row. Previously the features were read
    # from `date - 1 day` (the newest training row, so the model was evaluated on data it
    # was fitted on, and the lookup failed whenever that calendar day was not in the index)
    # while the label was read from `date`.
    # Selecting the training columns directly keeps the feature set identical to X_train.
    test_row = df.loc[[date], train_investable.columns]
    X_test = test_row.drop(target_col, axis = 1)
    y_test = test_row[target_col]

    prediction = model.predict(X_test)

    return y_test, prediction