In [4]:
#library imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import warnings
from matplotlib.pyplot import figure
warnings.filterwarnings('ignore')
from sklearn.linear_model import LinearRegression

In [8]:
#data imports
# All inputs are read from ../Data, relative to the notebook's working directory.
# NOTE: read_pickle can execute arbitrary code embedded in the file; only load
# pickles that come from a trusted source.
returns = pd.read_pickle("../Data/returns.pkl")

# Date-indexed CSVs: the 'date' column becomes a parsed DatetimeIndex.
volumes = pd.read_csv(
    '../Data/stock_volumes.csv',
    index_col='date',
    parse_dates=True,
)
sp_listings = pd.read_csv(
    '../Data/sp500_listings.csv',
    index_col='date',
    parse_dates=True,
)

# Per-instrument metadata, keyed by the 'Instrument' column.
info = pd.read_csv('../Data/stock_info.csv', index_col='Instrument')

In [228]:
# Display the returns frame (rows are dates, columns are instrument codes).
returns

Unnamed: 0_level_0,905270,921795,904261,905261,916328,923024,936365,902355,912215,929813,...,9660J1,69568X,543755,77463M,29235J,131745,69487D,68157P,9110RA,292703
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-01-03,-0.835165,-0.475059,5.000000,-0.810537,0.361421,0.000000,-3.030303,0.000000,-0.431034,-1.315789,...,,,,,,,,,,
1990-01-04,-0.797872,-1.272872,2.380952,-1.634321,-0.180060,3.454545,-1.562500,0.530612,0.225108,0.000000,...,,,,,,,,,,
1990-01-05,-2.546917,1.531023,-0.372093,-1.661475,-1.079730,1.054482,-3.174603,0.487211,-0.656531,-1.333333,...,,,,,,,,,,
1990-01-08,2.017423,-0.873016,0.000000,2.534319,0.000000,0.000000,0.000000,1.010101,0.434783,3.405405,...,,,,,,,,,,
1990-01-09,-1.123596,-2.962370,-0.404606,-1.098524,-0.364707,-1.043478,1.639344,0.000000,0.432900,-3.293257,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-24,,,,,0.136600,,0.349877,,,,...,0.376702,4.400221,1.590198,-0.991004,1.682900,3.364703,2.271817,0.634962,4.859794,1.204016
2021-05-25,,,,,0.946372,,0.542355,,,,...,0.230947,-0.288569,0.449063,-1.661106,0.439588,-0.548765,0.185928,0.727530,1.884279,-0.498203
2021-05-26,,,,,-1.393581,,0.616491,,,,...,1.296083,2.388001,-0.166049,3.255613,2.317760,4.702784,0.673960,2.336242,-0.429666,-1.520393
2021-05-27,,,,,-0.907923,,0.102119,,,,...,0.056867,1.893011,-0.063971,-0.510347,1.044185,-0.275482,1.979237,1.876893,1.247655,2.369727


### Pipeline pseudocode
    for t in range (first_date to last_date):
        find stocks in investable universe at t (stocks in the S&P500 that have prices  
                                                 recorded for the last n days)
    
        for stock in investable universe:
            get training data
            fit the model
            predict return for t+1
        
        #how much return we should make according to predictions
        top5 predicted returns at time t+1
        bottom5 predicted returns at time t+1
        predicted average return = top5 - bottom5
        
        #how much return we actually make
        top5 actual returns at time t+1 = actual returns of the predicted top5 stocks
        bottom5 actual returns at time t+1 = actual returns of the predicted bottom5 stocks
        actual average return = top5 - bottom5

In [9]:
# Discard the very first row of the returns frame.
# NOTE: this overwrites `returns`, so re-running the cell drops one more row
# each time; reload the data cell before running it again.
returns = returns.iloc[1:]

In [78]:
def get_investable(t, n_rows):
    """Return recent returns for the stocks that are investable at time ``t``.

    A stock is considered investable when it has no missing value in the
    first ``n_rows`` rows of the look-back window.

    Parameters
    ----------
    t : str or datetime-like
        End of the window (anything ``pd.to_datetime`` accepts).
    n_rows : int
        Length of the look-back window. The window start is computed with
        ``pd.DateOffset(-n_rows)``, i.e. ``n_rows`` calendar days before
        ``t``, so it normally holds fewer than ``n_rows`` trading rows.

    Returns
    -------
    pandas.DataFrame
        Slice of the module-level ``returns`` frame, sorted with the most
        recent date first, restricted to the window and to the investable
        columns.
    """
    time = pd.to_datetime(t)
    window_start = time + pd.DateOffset(-n_rows)

    # sort_index already returns a new frame, so no explicit copy is needed.
    # The index is descending, hence the slice runs from `time` back to
    # `window_start`.
    window = returns.sort_index(ascending=False).loc[time:window_start]

    # Keep columns with a complete history. A boolean mask replaces the
    # previous `~scalar` test, which only behaves as logical NOT for NumPy
    # bools (on a Python bool `~` yields -1/-2, both truthy).
    has_full_history = window.iloc[:n_rows].notna().all()

    return window.loc[:, has_full_history.to_numpy()]

In [115]:
def get_train(df_investable, stock, t):
    """Build next-row training data for one stock.

    Parameters
    ----------
    df_investable : pandas.DataFrame
        Returns frame with a date index sorted most-recent-first.
    stock : column label
        Column of ``df_investable`` whose next-row return is the target.
    t : str or datetime-like
        Reference date; features are taken from one calendar day before
        ``t`` backwards.

    Returns
    -------
    (X_train, y_train) : tuple of pandas.DataFrame
        ``X_train`` holds every column of ``df_investable``; ``y_train`` has
        the single column ``'<stock> t+1'`` containing the stock's return
        from the next more recent row.
    """
    target_col = str(stock) + ' t+1'
    cutoff = pd.to_datetime(t) + pd.DateOffset(-1)

    # Features: rows dated at or before the cutoff (index is descending).
    features = df_investable.loc[cutoff:]

    # Target: the stock's own return series, relabelled as the outcome.
    target = df_investable[[stock]].rename(columns={stock: target_col})

    combined = pd.concat([features, target], axis=1).sort_index(ascending=False)

    # Shift the outcome down one row so each row is paired with the return
    # of the next more recent date, then drop the top row, which has no
    # outcome after the shift.
    combined = combined.assign(**{target_col: combined[target_col].shift(1)}).iloc[1:]

    X_train = combined.drop(columns=target_col)
    y_train = combined[[target_col]]
    return X_train, y_train

In [117]:
def get_test(stock, t, n_rows):
    """Build the single-row test sample used to predict ``stock`` after ``t``.

    The investable universe is rebuilt one calendar day after ``t`` and run
    through the same feature/target construction as the training data; only
    the most recent resulting row is kept.

    Parameters
    ----------
    stock : column label
        Stock whose next-row return is the target.
    t : str or datetime-like
        Date whose features are used for the prediction.
    n_rows : int
        Look-back window passed to ``get_investable``.

    Returns
    -------
    (X_test, y_test) : tuple of pandas.DataFrame
        One-row frames: features, and the ``'<stock> t+1'`` outcome.

    Notes
    -----
    The offset is a calendar day. NOTE(review): when ``t + 1 day`` has no
    row in the data (e.g. a weekend), the returned row is presumably dated
    before ``t`` -- confirm against the caller, which looks the row up by
    ``t``.
    """
    time = pd.to_datetime(t) + pd.DateOffset(1)

    test = get_investable(time, n_rows)

    # Shift in the same way the training data is (outcome = return at t+1).
    # This previously called `get_training`, which is not defined anywhere
    # in the notebook and raised NameError on a fresh kernel.
    X_test, y_test = get_train(test, stock, time)

    # Keep only the most recent row.
    X_test = X_test.iloc[0:1, :]
    y_test = y_test.iloc[0:1, :]

    return X_test, y_test

In [194]:
def fit_model(X_train, y_train):
    """Fit a default-configured ``LinearRegression`` and return the fitted model.

    Parameters
    ----------
    X_train : features to fit on.
    y_train : target values to fit against.
    """
    # Estimator.fit returns the estimator itself, so this can be chained.
    return LinearRegression().fit(X_train, y_train)

In [160]:
def pred_next(model, X_test, y_test):
    """Predict on ``X_test`` and return features, outcome and prediction together.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X_test : pandas.DataFrame of features.
    y_test : pandas.DataFrame holding the actual outcome column.

    Returns
    -------
    pandas.DataFrame
        ``X_test`` and ``y_test`` side by side, plus a ``'predicted t+1'``
        column with the model's predictions.
    """
    predicted = model.predict(X_test)
    side_by_side = pd.concat([X_test, y_test], axis=1)
    return side_by_side.assign(**{'predicted t+1': predicted})

In [204]:
def pred_next_all(t, investable, n_rows=80):
    """Predict the next return of every stock in the investable universe.

    For each column of ``investable`` a model is fitted on that stock's
    training data and used to predict the row following ``t``.

    Parameters
    ----------
    t : str or datetime-like
        Date the predictions are made from.
    investable : pandas.DataFrame
        Investable universe at ``t`` (as returned by ``get_investable``).
    n_rows : int, default 80
        Look-back window forwarded to ``get_test``. The default matches the
        value that used to be hard-coded here.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'stock'`` with columns ``'actual return'`` and
        ``'pred return'``.
    """
    time = pd.to_datetime(t)

    # Collect one record per stock and build the frame once. The previous
    # version assigned through `df.loc[stock]['col'] = value`, which is
    # chained indexing: it writes to an intermediate object and the write
    # can be silently lost (always, under pandas copy-on-write).
    records = []
    for stock in investable.columns:
        # get train and test data
        X_train, y_train = get_train(investable, stock, time)
        X_test, y_test = get_test(stock, time, n_rows)

        # fit model
        model = fit_model(X_train, y_train)

        # predict
        predictions = pred_next(model, X_test, y_test)

        records.append({
            'actual return': predictions.loc[time, str(stock) + ' t+1'],
            'pred return': predictions.loc[time, 'predicted t+1'],
        })

    return pd.DataFrame(
        records,
        index=pd.Index(investable.columns, name='stock'),
        columns=['actual return', 'pred return'],
    )

In [229]:
def pred_vs_act_return(df, n):
    """Compare the predicted and realised top-minus-bottom return.

    Stocks are ranked by ``'pred return'``. The top ``n`` and bottom ``n``
    of that ranking are used for both figures, so the actual figure is what
    the prediction-driven selection would have earned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'pred return'`` and ``'actual return'`` columns.
    n : int
        Number of stocks taken from each end of the ranking.

    Returns
    -------
    (return_p, return_a) : tuple
        Sum over the top ``n`` minus sum over the bottom ``n``, for the
        predicted and for the actual returns respectively (sums, not means).
    """
    ranked = df.sort_values(by='pred return', ascending=False)
    winners, losers = ranked.head(n), ranked.tail(n)

    def spread(column):
        # Top-n total minus bottom-n total for the given column.
        return winners[column].sum() - losers[column].sum()

    return spread('pred return'), spread('actual return')

In [209]:
def top_bottom_pred(df, n):
    """Return the index labels of the ``n`` highest and ``n`` lowest predictions.

    Parameters
    ----------
    df : pandas.DataFrame with a ``'pred return'`` column.
    n : int, number of labels taken from each end.

    Returns
    -------
    (topn, botn) : tuple of lists, both in descending order of ``'pred return'``.
    """
    ranked = df.sort_values(by='pred return', ascending=False)
    best = [label for label in ranked.head(n).index]
    worst = [label for label in ranked.tail(n).index]
    return best, worst

In [215]:
def top_bottom_actual(df, n):
    """Return the index labels of the ``n`` highest and ``n`` lowest actual returns.

    Parameters
    ----------
    df : pandas.DataFrame with an ``'actual return'`` column.
    n : int, number of labels taken from each end.

    Returns
    -------
    (topn, botn) : tuple of lists, both in descending order of ``'actual return'``.
    """
    ranked = df.sort_values(by='actual return', ascending=False)
    best = [label for label in ranked.head(n).index]
    worst = [label for label in ranked.tail(n).index]
    return best, worst

In [250]:
def pipeline(n_stocks, time_list=None):
    """Run the predict-and-rank strategy over a list of dates.

    For each date the investable universe is built, every stock's next
    return is predicted, and the predicted and actual top-minus-bottom
    returns are recorded.

    Parameters
    ----------
    n_stocks : int
        Number of stocks taken from each end of the predicted ranking.
    time_list : iterable of date labels, optional
        Dates to evaluate. Defaults to ``['2021-05-24', '2021-05-25']``,
        the dates that used to be hard-coded here.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'date'`` (the labels from ``time_list``) with columns
        ``'predicted portfolio return'`` and ``'actual portfolio return'``.
    """
    if time_list is None:
        time_list = ['2021-05-24', '2021-05-25']
    else:
        time_list = list(time_list)

    # Collect one record per date and build the frame once. The previous
    # version assigned through `master_df.loc[time]['col'] = value`, which
    # is chained indexing and can silently fail to write (always, under
    # pandas copy-on-write).
    records = []
    for time in time_list:
        # get the investable universe of stocks
        investable = get_investable(time, 80)

        # predict the next day's returns for each stock in investable
        df = pred_next_all(time, investable)

        # get predicted and actual returns based on predicted topn and bottomn
        pred_r, act_r = pred_vs_act_return(df, n_stocks)

        records.append({
            'predicted portfolio return': pred_r,
            'actual portfolio return': act_r,
        })

    return pd.DataFrame(
        records,
        index=pd.Index(time_list, name='date'),
        columns=['predicted portfolio return', 'actual portfolio return'],
    )

In [251]:
# Run the pipeline taking the top 5 and bottom 5 predicted stocks on each date.
pipeline(5)

Unnamed: 0_level_0,predicted portfolio return,actual portfolio return
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-05-24,42.2686,11.3629
2021-05-25,49.5955,-27.9481


In [218]:
# Single-date walk-through: build the universe for 2020-02-24 (80-day look-back)
# and predict the next return of every stock in it. `df` is reused by the
# cells below.
time = pd.to_datetime('2020-02-24')
investable = get_investable(time, 80)
df = pred_next_all(time, investable)

UsageError: Line magic function `%%time` not found.


In [219]:
# Per-stock actual vs. predicted return produced by pred_next_all above.
df

Unnamed: 0_level_0,actual return,pred return
stock,Unnamed: 1_level_1,Unnamed: 2_level_1
916328,-4.13565,-1.47219
936365,-3.15554,2.95392
905271,-4.08911,-1.1107
905113,-4.11272,-0.224207
905802,-4.69944,-0.237983
...,...,...
131745,-10.7679,-0.240092
69487D,-2.59594,-3.52746
68157P,-4.85015,-0.173878
9110RA,-11.7003,-1.02061


In [220]:
# The 5 highest and 5 lowest stocks by predicted return.
top_bottom_pred(df, 5)

(['15168M', '27020T', '771767', '772128', '8858FM'],
 ['516649', '905274', '298514', '54060C', '88874X'])

In [221]:
# The 5 highest and 5 lowest stocks by actual return, for comparison.
top_bottom_actual(df, 5)

(['88874X', '298514', '905277', '905284', '27020T'],
 ['905047', '131745', '923497', '9110RA', '874114'])

In [222]:
# Predicted vs. actual top-5-minus-bottom-5 return (stocks chosen by prediction).
pred_vs_act_return(df, 5)

(86.23110200611373, -12.321719220622898)