In [129]:
import pandas as pd
import numpy as np
import seaborn as sns
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, LSTM
import warnings
warnings.filterwarnings("ignore")

In [2]:
returns = pd.read_pickle("./Data/returns.pkl")
returns = returns.iloc[1:]

In [3]:
# dropping columns with all NaN
drop_columns = []

for col in returns.columns:
    if returns[col].isnull().all() == True:
        drop_columns.append(col)
        
returns.drop(columns=drop_columns, inplace=True)

In [182]:
def get_investable(t, n_rows):
    "Find stocks in investable universe at time t\
    (stocks in the S&P500 that have prices recorded for the last n_rows days)"
    
    df_investable = returns.copy(deep = True).sort_index(ascending = False)
    
    #add 1 date to get the test features in investable
    t = t + pd.DateOffset(1)
    
    #if t is now a non-trading day, advance until we reach a valid trading day
    while t not in df_investable.index:
        t = t + pd.DateOffset(1)
    
    t_index = df_investable.index.get_loc(t)
    
    #take n_rows worth of data upto time specified
    df_investable = df_investable.iloc[t_index + 1:t_index + n_rows + 1]
    
    #find all stocks that exist in the S&P at this time period
    investable_universe = []
    for col in df_investable.columns:
        if ~df_investable[col].iloc[:n_rows].isna().any():
            investable_universe.append(col)
        
    df_investable = df_investable[investable_universe]
    
    return df_investable

In [138]:
def format_investable(df, n_in=1, n_out=1, dropnan = True):
    "Takes investable dataframe and formats it to have all stock returns at time t-1\
    (the features) and all corresponding stock returns at time t (the targets)"
    n_vars = df.shape[1]
    
    cols = []
    input_col_names = df.add_suffix(' (t-1)').columns
    forecast_col_names = df.add_suffix(' (t)').columns
    names = input_col_names.append(forecast_col_names)
    
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))

    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    
    # drop the first row (it contains NaNs)
    agg.dropna(inplace=True)
    
    return agg

In [223]:
def reshape_investable(df):
    "Takes formatted investable and reshapes inputs it into 3D array [samples, timesteps, features]"
    
    # select columns ending with ' (t-1)' for features
    # and columns ending with ' (t)' for targets
    X_col = [df.columns[i] for i in range(df.shape[1]) if df.columns[i].endswith(' (t-1)')]
    y_col = [df.columns[i] for i in range(df.shape[1]) if df.columns[i].endswith(' (t)')]
    
    X_train = df[X_col].iloc[2:,:].values
    y_train = df[y_col].iloc[2:,:].values
    
    X_test = df[X_col].iloc[0:1,:].values
    y_test = df[y_col].iloc[0:1,:].values
    
    #reshape
    X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
    X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

    return X_train, y_train, X_test, y_test

In [247]:
def pred_vs_act_return(df, n):
    "Takes dataframe with predicted and actual returns and outputs predicted portfolio returns\
    (based on strategy) and what the portfolio would have actually earned"
    pred = df.sort_values(by = 'pred return', ascending = False)
    topn_p = pred.head(n)
    botn_p = pred.tail(n)
    return_a = topn_p['actual return'].sum() - botn_p['actual return'].sum()
    
    return return_a

In [194]:
t = pd.to_datetime('2020-05-22')
inv = get_investable(t, 360)
inv

Unnamed: 0_level_0,916328,936365,905271,905113,905802,905425,906156,916305,992816,921093,...,311917,69568X,543755,77463M,29235J,131745,69487D,68157P,9110RA,292703
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-05-22,1.194822,0.951510,-0.505222,-0.482703,-0.296359,1.822412,-0.556607,0.277681,0.643838,0.740319,...,-0.071324,-1.295312,-0.126968,-5.856833,-0.128433,2.462057,-0.566286,-0.034953,2.631579,-0.924499
2020-05-21,0.333000,-3.085653,-1.776480,-2.509804,-0.211238,-0.488746,-0.991954,2.746077,-0.745544,-0.650636,...,3.275705,1.476286,-0.379459,-6.607449,-3.374541,10.613691,-4.546332,1.265375,4.576000,-0.358240
2020-05-20,2.119701,1.658554,1.597255,5.809129,3.227213,0.790770,3.976622,1.483894,1.944817,1.522114,...,0.777259,0.934394,0.739042,-4.281026,3.815470,12.958281,4.604200,2.624410,8.657858,3.520132
2020-05-19,-2.270965,1.612017,-1.222013,-2.350081,-6.617797,-3.430145,-1.300758,-2.882250,-0.577851,-2.737430,...,-0.239589,-0.690671,1.552795,4.067060,-1.469583,6.651685,-2.968551,-1.157885,3.453237,-1.106066
2020-05-18,0.422739,0.719557,4.784248,14.577530,11.436351,1.848782,7.528582,7.480166,2.356115,4.923798,...,2.955555,1.809315,9.152542,10.355460,6.118854,14.987080,12.846877,6.408787,19.775959,1.160120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-24,-2.541995,-1.653869,-3.421612,-4.947292,-12.195710,-3.768154,-2.003723,-0.187166,-2.587408,-2.892157,...,-0.673594,-7.624386,-1.114754,-2.109705,-3.432660,-1.449275,-3.738982,-3.538136,-3.668357,-5.024853
2018-12-21,-1.623282,-5.629877,0.420603,-3.303169,-4.642611,-1.533110,-4.636107,-1.965924,-3.889562,-1.042930,...,-3.499066,1.392132,-3.542062,-5.200000,-2.066702,-5.129561,-2.237665,-3.278689,-2.896032,-2.930082
2018-12-20,-0.898551,-1.211454,0.285529,-2.008634,-5.413043,0.988811,-3.037360,-3.024911,-2.523463,-2.759434,...,0.242617,-5.282766,-2.647783,-3.660886,-0.079766,-4.301619,0.897490,-2.788845,-1.595896,-4.024585
2018-12-19,-1.541096,-6.871795,-0.426467,-4.171318,-2.768971,-0.517732,-2.227282,4.739084,-3.119167,-1.027077,...,-0.838603,-1.204789,-1.813785,-1.142857,-4.284018,-7.620383,-7.401636,-2.391600,-5.315704,-1.033247


In [184]:
f_inv = format_investable(inv)
f_inv

Unnamed: 0_level_0,916328 (t-1),936365 (t-1),905271 (t-1),905113 (t-1),905802 (t-1),905425 (t-1),906156 (t-1),916305 (t-1),992816 (t-1),921093 (t-1),...,311917 (t),69568X (t),543755 (t),77463M (t),29235J (t),131745 (t),69487D (t),68157P (t),9110RA (t),292703 (t)
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-05-21,1.194822,0.951510,-0.505222,-0.482703,-0.296359,1.822412,-0.556607,0.277681,0.643838,0.740319,...,3.275705,1.476286,-0.379459,-6.607449,-3.374541,10.613691,-4.546332,1.265375,4.576000,-0.358240
2020-05-20,0.333000,-3.085653,-1.776480,-2.509804,-0.211238,-0.488746,-0.991954,2.746077,-0.745544,-0.650636,...,0.777259,0.934394,0.739042,-4.281026,3.815470,12.958281,4.604200,2.624410,8.657858,3.520132
2020-05-19,2.119701,1.658554,1.597255,5.809129,3.227213,0.790770,3.976622,1.483894,1.944817,1.522114,...,-0.239589,-0.690671,1.552795,4.067060,-1.469583,6.651685,-2.968551,-1.157885,3.453237,-1.106066
2020-05-18,-2.270965,1.612017,-1.222013,-2.350081,-6.617797,-3.430145,-1.300758,-2.882250,-0.577851,-2.737430,...,2.955555,1.809315,9.152542,10.355460,6.118854,14.987080,12.846877,6.408787,19.775959,1.160120
2020-05-15,0.422739,0.719557,4.784248,14.577530,11.436351,1.848782,7.528582,7.480166,2.356115,4.923798,...,2.416058,-0.517844,1.490826,2.448227,-3.956011,9.198646,-3.355059,4.003179,4.880253,4.177914
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-24,6.192800,7.507508,4.697091,4.337458,11.034861,1.101292,4.849162,3.375301,7.042158,2.650177,...,-0.673594,-7.624386,-1.114754,-2.109705,-3.432660,-1.449275,-3.738982,-3.538136,-3.668357,-5.024853
2018-12-21,-2.541995,-1.653869,-3.421612,-4.947292,-12.195710,-3.768154,-2.003723,-0.187166,-2.587408,-2.892157,...,-3.499066,1.392132,-3.542062,-5.200000,-2.066702,-5.129561,-2.237665,-3.278689,-2.896032,-2.930082
2018-12-20,-1.623282,-5.629877,0.420603,-3.303169,-4.642611,-1.533110,-4.636107,-1.965924,-3.889562,-1.042930,...,0.242617,-5.282766,-2.647783,-3.660886,-0.079766,-4.301619,0.897490,-2.788845,-1.595896,-4.024585
2018-12-19,-0.898551,-1.211454,0.285529,-2.008634,-5.413043,0.988811,-3.037360,-3.024911,-2.523463,-2.759434,...,-0.838603,-1.204789,-1.813785,-1.142857,-4.284018,-7.620383,-7.401636,-2.391600,-5.315704,-1.033247


In [222]:
X_train, y_train, X_test, y_test = reshape_investable(f_inv)
print(f'X_train: {X_train.shape}')
print(f'y_train: {y_train.shape}')
print(f'X_test: {X_test.shape}')
print(f'y_test: {y_test.shape}')

Unnamed: 0_level_0,916328 (t-1),936365 (t-1),905271 (t-1),905113 (t-1),905802 (t-1),905425 (t-1),906156 (t-1),916305 (t-1),992816 (t-1),921093 (t-1),...,311917 (t-1),69568X (t-1),543755 (t-1),77463M (t-1),29235J (t-1),131745 (t-1),69487D (t-1),68157P (t-1),9110RA (t-1),292703 (t-1)
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-05-21,1.19482,0.95151,-0.50522,-0.4827,-0.29636,1.82241,-0.55661,0.27768,0.64384,0.74032,...,-0.07132,-1.29531,-0.12697,-5.85683,-0.12843,2.46206,-0.56629,-0.03495,2.63158,-0.9245


X_train: (357, 1, 628)
y_train: (357, 628)
X_test: (1, 1, 628)
y_test: (1, 628)


In [199]:
model = Sequential()
model.add(LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(y_train.shape[1]))

In [200]:
model.compile(loss = 'mae', optimizer = 'adam')

In [203]:
history = model.fit(X_train, y_train, epochs=200, batch_size=16)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 

In [213]:
yhat = model.predict(X_test, verbose=0)

In [216]:
print(yhat)

[[-5.22004426e-01 -1.97123241e+00 -1.30962932e+00 -2.03062534e+00
  -1.72277939e+00 -3.51463586e-01 -1.84353650e+00 -5.89816332e-01
  -1.13358390e+00 -5.41906357e-01 -7.80382633e-01 -1.50734618e-01
  -4.04313952e-01 -1.59157264e+00 -1.21829224e+00 -8.82632196e-01
  -6.56326294e-01  4.01184589e-01 -2.48371631e-01  2.25418776e-01
  -4.83745039e-01 -1.94339395e+00  2.40264624e-01 -4.96885777e-01
   4.21555638e-01 -9.43320632e-01  9.53524172e-01 -7.44188488e-01
  -5.71696341e-01  1.97319698e+00  7.17757344e-01 -2.21909332e+00
  -1.17449033e+00 -1.68417561e+00  3.70041251e-01 -2.82256991e-01
   1.11040509e+00 -3.77048314e-01 -1.33853769e+00  2.36461306e+00
  -1.78164437e-01 -3.50669861e-01 -2.04963112e+00 -2.08733273e+00
  -5.39807141e-01 -1.17005682e+00 -4.65292931e-01 -1.99108922e+00
  -9.19438362e-01 -1.54699397e+00 -2.03182912e+00 -2.13598698e-01
   3.40402102e+00 -1.07124805e+00 -5.38633287e-01 -2.09031367e+00
  -1.21477616e+00 -4.83930707e-01 -1.64541018e+00 -1.88396558e-01
  -1.87915

In [206]:
print(yhat.shape)

(1, 628)


In [238]:
col = [f_inv.columns[i] for i in range(f_inv.shape[1]) if f_inv.columns[i].endswith(' (t)')]
df = pd.DataFrame(index = col, data = yhat.T)
df = df.rename(columns = {0: 'pred return'})
df['actual return'] = y_test.T
df

Unnamed: 0,pred return,actual return
916328 (t),-0.52200,0.33300
936365 (t),-1.97123,-3.08565
905271 (t),-1.30963,-1.77648
905113 (t),-2.03063,-2.50980
905802 (t),-1.72278,-0.21124
...,...,...
131745 (t),-0.21856,10.61369
69487D (t),-1.17168,-4.54633
68157P (t),-2.11980,1.26537
9110RA (t),-1.33081,4.57600


In [246]:
pred_vs_act_return(df, 5)

16.78028196964031