In [346]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

### Pipeline pseudocode
    n = look back window
    k = number of PCs to keep

    for each time point t:
        p = number of stocks in investable universe at time t
        Define an n x p feature matrix X (lagged returns)

        Perform PCA on X
        Keep the first k PCs in an n x k matrix Z

    for each stock s in the investable universe at time t:
        Define an n x 1 outcome vector y (future returns of stock s)
        Perform a linear regression of y on Z
        Predict y for stock s at time t+1

## Data Preparation

In [347]:
# Load the pickled returns table and discard its first row
# (presumably the empty row left over from computing returns -- TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
returns = pd.read_pickle("./Data/returns.pkl").iloc[1:]

In [348]:
# Columns (stocks) that contain no observations at all carry no information,
# so they are removed before the pipeline runs.
# The mask is computed in one vectorised pass instead of a Python loop, and
# drop() returns a new frame rather than mutating the sliced frame in place.
drop_columns = returns.columns[returns.isna().all()].tolist()

returns = returns.drop(columns=drop_columns)

## Pipeline

In [349]:
def get_investable(t, n, data=None):
    """Find the stocks in the investable universe at time t+1.

    A stock is investable when it has a recorded return on every one of the
    n + 1 most recent trading days ending at the first trading day after t
    (the extra day is the row whose returns are the prediction target).

    Parameters
    ----------
    t : pd.Timestamp
        Reference date. The window ends at the first date strictly after t
        that is present in the data.
    n : int
        Look-back window length; n + 1 rows are taken.
    data : pd.DataFrame, optional
        Returns table indexed by date with one column per stock. Defaults to
        the notebook-level ``returns`` frame.

    Returns
    -------
    pd.DataFrame
        Up to n + 1 rows in descending date order (row 0 is the first trading
        day after t), restricted to the columns with no missing value in that
        window. Fewer rows are returned if the data does not reach back far
        enough.

    Raises
    ------
    KeyError
        If the data contains no date after t. The previous implementation
        looped forever in that situation.
    """
    source = returns if data is None else data

    # sort_index already returns a new frame, so no explicit deep copy is needed
    ordered = source.sort_index(ascending=False)

    # add 1 date to get the test features in investable
    target = t + pd.DateOffset(1)
    window = n + 1

    # if target is a non-trading day, advance until we reach a valid trading day,
    # but stop once we have walked past the end of the data
    last_date = ordered.index.max() if len(ordered.index) else None
    while target not in ordered.index:
        if last_date is None or target > last_date:
            raise KeyError(f"no trading day after {t} in the returns data")
        target = target + pd.DateOffset(1)

    start = ordered.index.get_loc(target)

    # take `window` rows of data up to (and including) the target date
    window_df = ordered.iloc[start:start + window]

    # keep only the stocks observed on every day of the window
    complete = window_df.notna().all()

    return window_df.loc[:, complete]

In [350]:
def apply_PCA(inv, k):
    """Project the lagged returns onto their first k principal components.

    Parameters
    ----------
    inv : pd.DataFrame
        Window of returns; its first row is skipped and the remaining rows
        are used as the feature matrix.
    k : int
        Number of principal components to keep.

    Returns
    -------
    pd.DataFrame
        One row per feature row and k columns of component scores, with a
        default integer index and integer column labels.
    """
    # skip the first row of the window; only the remaining rows are features
    X = inv.iloc[1:, :]

    # standardise each column before PCA
    scaled = StandardScaler().fit_transform(X)
    scores = PCA(n_components=k).fit_transform(scaled)

    # The explained-variance table that used to be built here was never
    # returned or displayed, so that dead code has been removed.
    return pd.DataFrame(data=scores)

In [351]:
def define_y(inv, stock):
    """Return the outcome column for one stock as a single-column frame.

    Selects the `stock` column of `inv` (kept two-dimensional) and leaves out
    the final row.
    """
    single_column = inv.loc[:, [stock]]
    return single_column.iloc[:-1]

In [352]:
def train_test(X, y):
    """Split features and outcomes by position.

    The first row of each input becomes the test set and every later row
    becomes the training set.

    Returns
    -------
    tuple
        (X_train, y_train, X_test, y_test)
    """
    test_rows = slice(0, 1)
    train_rows = slice(1, None)

    return (
        X.iloc[train_rows, :],
        y.iloc[train_rows],
        X.iloc[test_rows, :],
        y.iloc[test_rows],
    )

In [353]:
def reshape(X_train, y_train, X_test, y_test):
    """Convert the four 2-D frames into 3-D numpy arrays.

    X_train, y_train and y_test each become a single block of shape
    (1, rows, columns). X_test is laid out as (-1, 1, columns), i.e. one
    time step per sample.

    Returns
    -------
    tuple
        (X_train, y_train, X_test, y_test) as numpy arrays.
    """
    def _as_single_block(frame):
        # one sample holding every row of the frame
        n_rows, n_cols = frame.shape
        return frame.values.reshape(-1, n_rows, n_cols)

    X_train_3d = _as_single_block(X_train)

    # NOTE(review): the middle dimension here was originally taken from the
    # already-reshaped X_train, whose leading dimension is always 1; the
    # literal 1 keeps that exact behaviour while making it visible.
    X_test_3d = X_test.values.reshape(-1, 1, X_test.shape[1])

    y_train_3d = _as_single_block(y_train)
    y_test_3d = _as_single_block(y_test)

    return X_train_3d, y_train_3d, X_test_3d, y_test_3d

In [354]:
def model_fit(X_train, y_train):
    """Build, compile and train a new LSTM network on the given arrays.

    The network is an LSTM layer with 100 units, a 20% dropout layer and a
    dense output layer whose width is the leading dimension of y_train. It is
    trained with mean-squared-error loss and the Adam optimiser for 100
    epochs in batches of 16, without console output.

    Parameters
    ----------
    X_train : array of shape (samples, time steps, features)
    y_train : array; only its leading dimension is read here

    Returns
    -------
    The fitted Keras model.
    """
    n_timesteps, n_features = X_train.shape[1], X_train.shape[2]
    n_outputs = y_train.shape[0]

    network = Sequential()
    network.add(LSTM(100, input_shape=(n_timesteps, n_features)))
    network.add(Dropout(0.2))
    network.add(Dense(n_outputs))

    network.compile(loss='mse', optimizer='adam')

    # Early stopping is currently switched off:
    # EarlyStopping(monitor="val_loss", min_delta=1e-5, patience=20)
    network.fit(
        X_train,
        y_train,
        epochs=100,
        batch_size=16,
        verbose=0,
    )

    return network

In [355]:
def model_predict(model, X_test):
    """Return the model's predictions for X_test, with progress output silenced."""
    return model.predict(X_test, verbose=0)

In [364]:
def predict_returns(t, n, k, model, refit = True):
    """Predict the next-day return of every investable stock at time t.

    Parameters
    ----------
    t : pd.Timestamp
        Date at which the prediction is made.
    n : int
        Look-back window passed to get_investable.
    k : int
        Number of principal components passed to apply_PCA.
    model :
        Model used for prediction when refit is False. When refit is True it
        is replaced by a newly fitted model for each stock.
    refit : bool, default True
        Whether to fit a new model for each stock before predicting.

    Returns
    -------
    returns_t : pd.DataFrame
        Indexed by stock, with float columns 'Pred' (predicted return) and
        'Actual' (the held-out value taken from the first row of the window).
    model :
        The model used for the last stock processed.
    """
    inv = get_investable(t, n)
    X = apply_PCA(inv, k)

    # numeric columns so that later sorting and averaging work on floats
    returns_t = pd.DataFrame(index = inv.columns, columns = ['Pred', 'Actual'], dtype = float)

    for stock in inv.columns:
        y = define_y(inv, stock)
        X_train, y_train, X_test, y_test = train_test(X, y)
        X_train, y_train, X_test, y_test = reshape(X_train, y_train, X_test, y_test)

        if refit:
            model = model_fit(X_train, y_train)

        yhat = model_predict(model, X_test)[0][0]

        # a single .loc[row, column] write avoids chained assignment, which
        # may silently write to a temporary copy instead of the frame
        returns_t.loc[stock, 'Pred'] = float(yhat)
        returns_t.loc[stock, 'Actual'] = float(y_test[0][0][0])

    # The stray `count += 1` that used to sit in this loop referenced a local
    # that was never initialised and raised UnboundLocalError on the first stock.
    return returns_t, model

In [357]:
def rank_stocks(returns, num_stocks):
    """Pick the stocks with the highest and lowest predicted returns.

    Rows are ordered by the 'Pred' column from largest to smallest; the first
    `num_stocks` rows and the last `num_stocks` rows of that ordering are
    returned as (topn, botn).
    """
    ranked = returns.sort_values('Pred', ascending=False)

    return ranked.head(num_stocks), ranked.tail(num_stocks)

In [358]:
def portfolio_return(topn, botn, returns):
    """Return the mean 'Actual' of topn minus the mean 'Actual' of botn.

    The `returns` argument is accepted but not used.
    """
    long_leg = topn['Actual'].mean()
    short_leg = botn['Actual'].mean()

    return long_leg - short_leg

In [359]:
def pipeline(n, k, num_stocks, max_dates = 2):
    """Run the back-test and collect one portfolio return per date.

    For each processed date t the stocks are scored with predict_returns,
    ranked, and the difference between the mean actual return of the top and
    bottom `num_stocks` stocks is stored against the date that follows t.

    Parameters
    ----------
    n : int
        Look-back window length.
    k : int
        Number of principal components to keep.
    num_stocks : int
        Number of stocks taken from each end of the ranking.
    max_dates : int or None, default 2
        Maximum number of dates to process. The default reproduces the
        previously hard-coded two-date run; pass None to process every date.

    Returns
    -------
    pd.DataFrame
        Indexed by date with a single 'Portfolio Return' column; dates that
        were not processed keep missing values.
    """
    time_range = returns.loc['2007':'2021'].index
    # NOTE(review): the 'M' alias labels each bin with the calendar month end,
    # so only trading days that fall on that exact date trigger a refit;
    # newer pandas versions also spell this alias 'ME' -- confirm intent.
    returns_monthly = returns.resample('M').mean()

    # start from the first date that is also a monthly label, so the first
    # iteration always fits a model
    is_monthly_label = time_range.isin(returns_monthly.index)
    if is_monthly_label.any():
        time_range = time_range[is_monthly_label.argmax():]

    portfolio = pd.DataFrame(index = time_range, columns = ['Portfolio Return'])
    current_model = Sequential()

    # Each return is stored against the following date, so the last date has
    # no slot to write to and is excluded (this used to overrun the index).
    candidate_dates = time_range[:-1]
    run_dates = candidate_dates if max_dates is None else candidate_dates[:max_dates]

    count = 0
    for position, t in enumerate(run_dates):
        refit = t in returns_monthly.index
        pred_actual, current_model = predict_returns(t, n, k, current_model, refit = refit)

        topn, botn = rank_stocks(pred_actual, num_stocks)
        return_t = portfolio_return(topn, botn, pred_actual)

        # position + 1 is the next date in time_range; the old code tried to
        # subscript the get_loc method itself, which raised TypeError
        next_date = time_range[position + 1]
        portfolio.loc[next_date, 'Portfolio Return'] = return_t

        count +=1
        # progress is measured against the dates actually being processed
        print(f'{(count/len(run_dates))*100:.2f}% complete')

    return portfolio

In [363]:
# Run the pipeline with a 50-day look-back window (n), 20 principal
# components (k) and the top/bottom 5 ranked stocks (num_stocks).
portfolio = pipeline(50, 20, 5)

0.13% models complete
0.27% models complete
0.40% models complete
0.53% models complete
0.66% models complete
0.80% models complete
0.93% models complete
1.06% models complete
1.19% models complete
1.33% models complete
1.46% models complete
1.59% models complete
1.72% models complete
1.86% models complete
1.99% models complete
2.12% models complete
2.25% models complete
2.39% models complete
2.52% models complete
2.65% models complete
2.79% models complete
2.92% models complete
3.05% models complete
3.18% models complete
3.32% models complete
3.45% models complete
3.58% models complete
3.71% models complete
3.85% models complete
3.98% models complete
4.11% models complete
4.24% models complete
4.38% models complete
4.51% models complete
4.64% models complete
4.77% models complete
4.91% models complete
5.04% models complete
5.17% models complete
5.31% models complete
5.44% models complete
5.57% models complete
5.70% models complete
5.84% models complete
5.97% models complete
6.10% mode

47.88% models complete
48.01% models complete
48.14% models complete
48.28% models complete
48.41% models complete
48.54% models complete
48.67% models complete
48.81% models complete
48.94% models complete
49.07% models complete
49.20% models complete
49.34% models complete
49.47% models complete
49.60% models complete
49.73% models complete
49.87% models complete
50.00% models complete
50.13% models complete
50.27% models complete
50.40% models complete
50.53% models complete
50.66% models complete
50.80% models complete
50.93% models complete
51.06% models complete
51.19% models complete
51.33% models complete
51.46% models complete
51.59% models complete
51.72% models complete
51.86% models complete
51.99% models complete
52.12% models complete
52.25% models complete
52.39% models complete
52.52% models complete
52.65% models complete
52.79% models complete
52.92% models complete
53.05% models complete
53.18% models complete
53.32% models complete
53.45% models complete
53.58% mode

95.23% models complete
95.36% models complete
95.49% models complete
95.62% models complete
95.76% models complete
95.89% models complete
96.02% models complete
96.15% models complete
96.29% models complete
96.42% models complete
96.55% models complete
96.68% models complete
96.82% models complete
96.95% models complete
97.08% models complete
97.21% models complete
97.35% models complete
97.48% models complete
97.61% models complete
97.75% models complete
97.88% models complete
98.01% models complete
98.14% models complete
98.28% models complete
98.41% models complete
98.54% models complete
98.67% models complete
98.81% models complete
98.94% models complete
99.07% models complete
99.20% models complete
99.34% models complete
99.47% models complete
99.60% models complete
99.73% models complete
99.87% models complete
100.00% models complete
0.03% complete
0.13% models complete
0.27% models complete
0.40% models complete
0.53% models complete
0.67% models complete
0.80% models complete
0

43.60% models complete
43.73% models complete
43.87% models complete
44.00% models complete
44.13% models complete
44.27% models complete
44.40% models complete
44.53% models complete
44.67% models complete
44.80% models complete
44.93% models complete
45.07% models complete
45.20% models complete
45.33% models complete
45.47% models complete
45.60% models complete
45.73% models complete
45.87% models complete
46.00% models complete
46.13% models complete
46.27% models complete
46.40% models complete
46.53% models complete
46.67% models complete
46.80% models complete
46.93% models complete
47.07% models complete
47.20% models complete
47.33% models complete
47.47% models complete
47.60% models complete
47.73% models complete
47.87% models complete
48.00% models complete
48.13% models complete
48.27% models complete
48.40% models complete
48.53% models complete
48.67% models complete
48.80% models complete
48.93% models complete
49.07% models complete
49.20% models complete
49.33% mode

91.20% models complete
91.33% models complete
91.47% models complete
91.60% models complete
91.73% models complete
91.87% models complete
92.00% models complete
92.13% models complete
92.27% models complete
92.40% models complete
92.53% models complete
92.67% models complete
92.80% models complete
92.93% models complete
93.07% models complete
93.20% models complete
93.33% models complete
93.47% models complete
93.60% models complete
93.73% models complete
93.87% models complete
94.00% models complete
94.13% models complete
94.27% models complete
94.40% models complete
94.53% models complete
94.67% models complete
94.80% models complete
94.93% models complete
95.07% models complete
95.20% models complete
95.33% models complete
95.47% models complete
95.60% models complete
95.73% models complete
95.87% models complete
96.00% models complete
96.13% models complete
96.27% models complete
96.40% models complete
96.53% models complete
96.67% models complete
96.80% models complete
96.93% mode

In [None]:
#start at 4:58pm
# pd.datetime was deprecated in pandas 1.0 and later removed;
# pd.Timestamp.now() is the supported way to get the current time.
print(pd.Timestamp.now())