In [36]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.callbacks import EarlyStopping
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

### Pipeline pseudocode
    n = look back window
    k = number of PCs to keep

    for each time point t:
        p = number of stocks in investable universe at time t
        Define an n x p feature matrix X (lagged returns)

        Perform PCA on X
        Keep the first k PCs in an n x k matrix Z

        for each stock s in the investable universe at time t:
            Define an n x 1 outcome vector y (future returns of stock s)
            Perform a linear regression of y on Z
            Predict y for stock s at time t+1

## Data Preparation

In [21]:
# Load the pickled returns table and discard its first row.
returns = pd.read_pickle("./Data/returns.pkl").iloc[1:]

In [22]:
# Collect every column of `returns` that contains no data at all, then remove
# those columns from `returns` in place.
drop_columns = [col for col in returns.columns if returns[col].isna().all()]

returns.drop(columns=drop_columns, inplace=True)

## Pipeline

In [23]:
def get_investable(t, n):
    """Find stocks in investable universe at time t
    (stocks in the S&P500 that have prices recorded for the last n days).

    The window is read from the module-level ``returns`` frame, sorted newest
    first. One extra row is included on top of the ``n`` look-back rows so the
    caller can hold out the newest row as its test observation.

    Parameters
    ----------
    t : pandas.Timestamp
        Date for which the window is built.
    n : int
        Look-back length; the result holds up to ``n + 1`` rows.

    Returns
    -------
    pandas.DataFrame
        The rows that immediately precede the first available date strictly
        after ``t`` (newest first), restricted to the columns that have no
        missing value anywhere in that window.

    Raises
    ------
    ValueError
        If ``returns`` is empty or contains no date after ``t``.
    """
    # sort_index returns a new frame, so the module-level `returns` is untouched
    df_investable = returns.sort_index(ascending=False)
    dates = df_investable.index
    if len(dates) == 0:
        raise ValueError("returns is empty; cannot build an investable universe")

    #add 1 date to get the test features in investable
    t = t + pd.DateOffset(1)
    n += 1

    #if t is now a non-trading day, advance until we reach a valid trading day
    last_date = dates.max()
    while t not in dates:
        # without this bound the scan would keep stepping past the end of the data
        if t > last_date:
            raise ValueError(
                f"no trading day found after the requested date (data ends {last_date})"
            )
        t = t + pd.DateOffset(1)

    t_index = dates.get_loc(t)

    #take n rows of data strictly older than the adjusted t
    df_investable = df_investable.iloc[t_index + 1:t_index + n + 1]

    #keep only the stocks with no missing value anywhere in the window
    has_full_history = df_investable.notna().all(axis=0)

    return df_investable.loc[:, has_full_history]

In [24]:
def apply_PCA(inv, k):
    """Standardise ``inv`` and project it onto its first ``k`` principal components.

    Parameters
    ----------
    inv : pandas.DataFrame
        Feature matrix; it is passed to ``StandardScaler`` before the PCA fit.
    k : int
        Number of principal components to keep.

    Returns
    -------
    pandas.DataFrame
        The transformed data with default integer row and column labels
        (the index of ``inv`` is not carried over).
    """
    pca = PCA(n_components = k)
    inv_scaled = StandardScaler().fit_transform(inv)
    principal_components = pca.fit_transform(inv_scaled)

    # The explained-variance table that used to be assembled here was never
    # returned or displayed; `pca.explained_variance_ratio_` still holds the
    # figures if a diagnostics table is needed.
    return pd.DataFrame(data = principal_components)

In [25]:
def define_y(inv, stock, n):
    """Return the outcome frame for ``stock``: a one-column DataFrame holding
    the first ``n + 1`` rows of that column of ``inv``."""
    return inv[[stock]].head(n + 1)

In [26]:
def train_test(X, y):
    """Split features and outcomes by position.

    Row 0 of each input is held out as the single test observation; every
    later row is training data.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    held_out = slice(0, 1)
    remainder = slice(1, None)

    return X.iloc[remainder, :], y.iloc[remainder], X.iloc[held_out, :], y.iloc[held_out]

In [57]:
def reshape(X_train, y_train, X_test, y_test, timesteps):
    """Convert the pandas train/test splits into NumPy arrays.

    ``X_train`` is regrouped into shape ``(-1, timesteps, n_columns)`` and
    ``X_test`` into ``(-1, 1, n_columns)``; ``y_train`` and ``y_test`` are
    returned as their underlying ``.values`` arrays with no regrouping.

    NOTE(review): because only X_train is regrouped, X_train and y_train have
    different leading dimensions whenever ``timesteps != 1`` -- confirm this
    is intended.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` as NumPy arrays.
    """
    train_width = X_train.shape[1]
    train_cube = X_train.values.reshape(-1, timesteps, train_width)

    test_width = X_test.shape[1]
    test_cube = X_test.values.reshape(-1, 1, test_width)

    return train_cube, y_train.values, test_cube, y_test.values

In [64]:
def model_fit(X_train, y_train):
    """Build and train an LSTM regressor on the given training arrays.

    The network is one 50-unit LSTM layer followed by a dense output layer,
    compiled with mean-squared-error loss and the Adam optimiser. Training
    runs for at most 300 epochs with a 20% validation split and stops early
    once the validation loss has failed to improve by 1e-5 for 20 epochs.

    Parameters
    ----------
    X_train : numpy.ndarray
        3-D array; its second and third dimensions are used as the LSTM
        ``input_shape``.
    y_train : numpy.ndarray
        Training targets, 1-D or 2-D with one column per target.

    Returns
    -------
    The fitted ``Sequential`` model.
    """
    # The output layer needs one unit per target column. It was previously
    # sized by y_train.shape[0], i.e. by the number of training samples.
    target_shape = np.shape(y_train)
    n_outputs = target_shape[1] if len(target_shape) > 1 else 1

    model = Sequential()
    model.add(LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(Dense(n_outputs))

    model.compile(loss = 'mse', optimizer = 'adam')

    early_stopping = EarlyStopping(monitor="val_loss", min_delta=1e-5, patience=20)

    model.fit(X_train, y_train, epochs=300, batch_size=16,
              verbose = 0, validation_split = 0.2,
              callbacks = [early_stopping])

    return model

In [65]:
def model_predict(model, X_test):
    """Return ``model.predict(X_test)`` with the model's progress output
    switched off (``verbose=0``)."""
    return model.predict(X_test, verbose = 0)

In [66]:
def predict_returns(t, n, k, timesteps, model, refit = True):
    """Predict the return of every investable stock at time ``t``.

    Builds the investable window, reduces it to ``k`` principal components
    and, for each stock, splits the data into train/test, optionally refits
    the model and stores the prediction next to the held-out actual value.

    Parameters
    ----------
    t : pandas.Timestamp
        Date passed to ``get_investable``.
    n : int
        Look-back window length.
    k : int
        Number of principal components to keep.
    timesteps : int
        Timestep count passed to ``reshape``.
    model
        Model used for prediction when ``refit`` is false.
    refit : bool, default True
        When true a new model is fitted for every stock.

    Returns
    -------
    tuple
        ``(returns_t, model)``: a float DataFrame indexed by stock with
        columns ``'Pred'`` and ``'Actual'``, and the last model used.

    NOTE(review): with ``refit=False`` the same model and the same ``X_test``
    are used for every stock -- confirm this is intended.
    NOTE(review): ``'Pred'`` is multiplied by 100 while ``'Actual'`` is stored
    unscaled -- confirm the units.
    """
    inv = get_investable(t, n)
    X = apply_PCA(inv, k)

    returns_t = pd.DataFrame(index = inv.columns, columns = ['Pred', 'Actual'], dtype = float)

    for stock in inv.columns:
        y = define_y(inv, stock, n)
        X_train, y_train, X_test, y_test = train_test(X, y)
        X_train, y_train, X_test, y_test = reshape(X_train, y_train, X_test, y_test, timesteps)

        if refit:
            model = model_fit(X_train, y_train)

        yhat = model_predict(model, X_test)[0][0]

        # A single .loc[row, col] write always reaches returns_t; the chained
        # returns_t[col].loc[row] form may write to a temporary copy instead.
        returns_t.loc[stock, 'Pred'] = float(yhat * 100)
        # y_test arrives as an array; store its single value, not the array
        returns_t.loc[stock, 'Actual'] = float(np.asarray(y_test).ravel()[0])

    return returns_t, model

In [67]:
def rank_stocks(returns, num_stocks):
    """Order the rows of ``returns`` by descending ``'Pred'`` and return the
    first ``num_stocks`` rows and the last ``num_stocks`` rows of that
    ordering as ``(topn, botn)``."""
    ranked = returns.sort_values(by = 'Pred', ascending = False)

    return ranked.head(num_stocks), ranked.tail(num_stocks)

In [68]:
def portfolio_return(topn, botn, returns):
    """Return the mean of ``topn['Actual']`` minus the mean of
    ``botn['Actual']``. The ``returns`` argument is accepted but not used."""
    top_mean = topn['Actual'].mean()
    bottom_mean = botn['Actual'].mean()

    return top_mean - bottom_mean

In [None]:
def pipeline(n, k, timesteps, num_stocks, max_steps=2):
    """Run the prediction and ranking steps over the dates in ``returns``.

    Dates are taken from the 2007-2021 slice of the module-level ``returns``
    frame, starting at the first date that is also a month-end label. For
    each processed date the stocks are ranked by predicted return and the
    mean actual return of the top ``num_stocks`` minus that of the bottom
    ``num_stocks`` is recorded. The model is refitted on dates that are
    month-end labels and reused on all other dates.

    Parameters
    ----------
    n : int
        Look-back window length.
    k : int
        Number of principal components to keep.
    timesteps : int
        Timestep count passed on to ``predict_returns``.
    num_stocks : int
        Number of stocks taken from each end of the ranking.
    max_steps : int or None, default 2
        Number of leading dates to process. The default matches the two
        dates this function was previously hard-coded to; pass ``None`` to
        process the whole range.

    Returns
    -------
    pandas.DataFrame
        Indexed by every date in the range with one column,
        ``'Portfolio Return'``; dates that were not processed stay NaN.
    """
    time_range = returns.loc['2007':'2021'].index

    # Month-end bin labels. An offset object is used because the 'M' string
    # alias is deprecated in recent pandas releases.
    month_ends = returns.resample(pd.offsets.MonthEnd()).size().index

    # start on the first month-end date so the very first step fits a model
    is_month_end = time_range.isin(month_ends)
    if is_month_end.any():
        time_range = time_range[is_month_end.argmax():]

    portfolio = pd.DataFrame(index = time_range, columns = ['Portfolio Return'])
    current_model = Sequential()

    steps = time_range if max_steps is None else time_range[:max_steps]
    total = len(steps)

    for count, t in enumerate(steps, start=1):
        pred_actual, current_model = predict_returns(
            t, n, k, timesteps, current_model, refit = t in month_ends
        )

        topn, botn = rank_stocks(pred_actual, num_stocks)
        return_t = portfolio_return(topn, botn, pred_actual)

        # single .loc[row, col] write so the value always lands in `portfolio`
        portfolio.loc[t, 'Portfolio Return'] = return_t

        # progress is measured against the dates actually being processed
        print(f'{(count/total)*100:.2f}% complete')

    return portfolio

In [None]:
# Run the pipeline: 200-row look-back, 20 principal components, 50 timesteps,
# 5 stocks taken from each end of the ranking.
portfolio = pipeline(n=200, k=20, timesteps=50, num_stocks=5)

In [None]:
#start at 2:15pm
# pd.datetime was removed in pandas 2.0; pd.Timestamp.now() reports the same
# local wall-clock time.
print(pd.Timestamp.now())