In [1]:
import pandas as pd
import numpy as np
import random
import time
import pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler
from Statistics import Statistics

import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dropout,Dense,Input,add
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, CSVLogger, LearningRateScheduler
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras import optimizers
import warnings
warnings.filterwarnings("ignore")

import os
# Fix every random number generator the notebook relies on so that runs are
# repeatable from a fresh kernel.
SEED = 9
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so setting it here
# only affects child processes, not the hashing of the running kernel.
os.environ['PYTHONHASHSEED']=str(SEED)
random.seed(SEED)
np.random.seed(SEED)
# TensorFlow keeps its own generator (weight initialisation, dropout); it was
# previously left unseeded. Support both the TF2 and the TF1 spelling.
if hasattr(tf.random, 'set_seed'):
    tf.random.set_seed(SEED)
else:
    tf.compat.v1.set_random_seed(SEED)

In [2]:
# Monthly index-constituent table: one column per month, one ticker per cell,
# padded with NaN where a month has fewer members than the longest column.
SP500_df = pd.read_csv('data/SPXconst.csv')
# Every distinct ticker that appears anywhere in the table. NaN padding is
# filtered with pd.notna instead of list.remove(np.nan): remove() depends on
# the NaN in the set being the very same object as np.nan and raises
# ValueError when the table happens to contain no NaN at all.
all_companies = [company for company in set(SP500_df.values.flatten())
                 if pd.notna(company)]

In [3]:
# Display the constituents table loaded above (rich DataFrame repr).
SP500_df

Unnamed: 0,01/1990,02/1990,03/1990,04/1990,05/1990,06/1990,07/1990,08/1990,09/1990,10/1990,...,03/2018,04/2018,05/2018,06/2018,07/2018,08/2018,09/2018,10/2018,11/2018,12/2018
0,S626,S626,S626,S626,S626,S626,S626,S626,S626,S626,...,S456,S456,S456,S456,S456,S456,S456,S456,S456,S198
1,S1335,S1335,S1335,S490,S490,S490,S490,S490,S490,S490,...,S698,S309,S309,S309,S309,S309,S309,S76,S76,S550
2,S83,S83,S83,S1335,S1335,S1335,S1141,S1141,S1141,S1141,...,S709,S698,S698,S698,S30,S30,S30,S309,S309,S118
3,S1090,S1090,S1090,S83,S83,S83,S1335,S1335,S1335,S1335,...,S658,S976,S976,S976,S698,S698,S698,S976,S976,S1368
4,S59,S59,S59,S1090,S1090,S1090,S83,S83,S83,S83,...,S834,S709,S709,S709,S631,S631,S631,S709,S709,S1170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,,,,,,,,,,,...,S446,S1245,S123,S95,S95,S95,S95,S602,S1032,S613
502,,,,,,,,,,,...,S854,S248,S124,S617,S617,S617,S617,S316,S758,S852
503,,,,,,,,,,,...,S699,S1349,S1245,S699,S699,S699,S699,S609,S316,S51
504,,,,,,,,,,,...,S654,S654,S248,S654,S654,S654,S654,S1161,S609,S1349


In [4]:
# Map every column of the constituents table to the set of its non-missing
# entries. The column name is split on '/' and its parts are joined in reverse
# order with '-', so a header such as '01/1990' becomes the key '1990-01'.
constituents = dict(
    ('-'.join(reversed(col.split('/'))), set(SP500_df[col].dropna()))
    for col in SP500_df.columns
)

In [5]:
# Display the month-key -> ticker-set mapping built in the previous cell.
constituents

{'1990-01': {'S1',
  'S1000',
  'S1009',
  'S1010',
  'S1015',
  'S1023',
  'S1025',
  'S1027',
  'S1034',
  'S1035',
  'S1036',
  'S1038',
  'S1039',
  'S104',
  'S1041',
  'S1044',
  'S1049',
  'S1050',
  'S1052',
  'S1057',
  'S1063',
  'S1064',
  'S1066',
  'S1069',
  'S1070',
  'S1077',
  'S108',
  'S1080',
  'S1081',
  'S1083',
  'S1085',
  'S109',
  'S1090',
  'S1092',
  'S1093',
  'S1098',
  'S1099',
  'S110',
  'S1100',
  'S1104',
  'S1105',
  'S1107',
  'S1109',
  'S111',
  'S1111',
  'S1115',
  'S1117',
  'S1118',
  'S1127',
  'S1129',
  'S113',
  'S1130',
  'S1134',
  'S1141',
  'S1142',
  'S1143',
  'S1147',
  'S1148',
  'S1152',
  'S1154',
  'S1156',
  'S1159',
  'S1165',
  'S1166',
  'S1167',
  'S1168',
  'S117',
  'S1171',
  'S1174',
  'S1175',
  'S1185',
  'S1187',
  'S1192',
  'S12',
  'S1206',
  'S1210',
  'S1211',
  'S1212',
  'S1214',
  'S1217',
  'S1218',
  'S1219',
  'S122',
  'S1221',
  'S1223',
  'S1229',
  'S1237',
  'S1239',
  'S1240',
  'S1242',
  'S1243',
 

In [6]:
# For each test year, collect every ticker that was a constituent in at least
# one month of the three calendar years preceding it.
constituents_train = {}
for test_year in range(1993,2016):
    # The 36 'YYYY-MM' keys covering test_year-3 .. test_year-1.
    months = ['{}-{:02d}'.format(t, m)
              for t in range(test_year-3,test_year)
              for m in range(1,13)]
    # Union of the monthly constituent sets over that window.
    constituents_train[test_year] = set().union(*(constituents[m] for m in months))

In [12]:
def makeLSTM():
    """Build and compile the single-layer LSTM classifier.

    Architecture: input of 240 time steps with 1 feature -> LSTM with 25
    units returning only the last state -> Dropout(0.1) -> Dense softmax
    over 2 classes. The model is compiled with categorical cross-entropy,
    RMSprop (default settings) and an accuracy metric.

    Side effect: prints the model summary.

    Returns:
        The compiled Keras ``Model``.
    """
    inputs = Input(shape=(240,1))
    # The original call used `CuDNNLSTM`, which this notebook never imports,
    # so it raised NameError on a fresh kernel. `LSTM` is imported at the top
    # of the notebook; per the TensorFlow 2 documentation it selects the cuDNN
    # kernel on GPU when left at its default arguments, as it is here.
    x = LSTM(25,return_sequences=False)(inputs)
    x = Dropout(0.1)(x)
    outputs = Dense(2,activation='softmax')(x)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='categorical_crossentropy',optimizer=optimizers.RMSprop(),
                          metrics=['accuracy'])
    model.summary()
    return model
    

def callbacks_req(model_type='LSTM'):
    """Create the Keras callbacks used while fitting one yearly model.

    Reads the module-level globals ``model_folder`` and ``test_year`` to
    build the output file names, so both must be set before calling.

    Args:
        model_type: Tag embedded in the log and checkpoint file names.

    Returns:
        ``[csv_logger, earlyStopping, model_checkpoint]``:
        a CSV log of the training history, early stopping on ``val_loss``
        (patience 10, best weights restored), and a checkpoint written
        after every epoch (``save_best_only=False``).
    """
    csv_logger = CSVLogger(model_folder+'/training-log-'+model_type+'-'+str(test_year)+'.csv')
    filepath = model_folder+"/model-" + model_type + '-' + str(test_year) + "-E{epoch:02d}.h5"
    # The deprecated `period=1` argument is dropped: saving once per epoch is
    # already the default behaviour of ModelCheckpoint, and newer Keras
    # releases no longer accept `period`.
    model_checkpoint = ModelCheckpoint(filepath, monitor='val_loss',save_best_only=False)
    earlyStopping = EarlyStopping(monitor='val_loss',mode='min',patience=10,restore_best_weights=True)
    return [csv_logger,earlyStopping,model_checkpoint]

def reshaper(arr):
    """Split `arr` into 3 equal column groups and stack them as a new axis 2.

    The array is cut into three equal parts along axis 1; the parts are then
    placed side by side on a new axis at position 2, so an input of shape
    (n, 3*k) comes back with shape (n, k, 3) where
    ``out[i, j, g] == arr[i, g*k + j]``.
    """
    thirds = np.array(np.split(arr,3,axis=1))
    # The group axis is created first; move it behind the two original axes.
    return np.moveaxis(thirds,0,2)

def trainer(train_data,test_data,model_type='LSTM'):
    """Fit a fresh model on `train_data` and score every day of `test_data`.

    Row layout read here: column 0 is the day key, columns ``2:-2`` are the
    240 input features and the last column is the class label.

    Args:
        train_data: 2-D array of training rows. It is shuffled IN PLACE so
            that the trailing 20% taken by ``validation_split`` is a random
            sample rather than the last rows appended.
        test_data: 2-D array of test rows with the same layout.
        model_type: Only ``'LSTM'`` is supported.

    Returns:
        ``(model, predictions)`` where ``predictions`` maps each distinct day
        key to the predicted probability of class 1 for that day's rows, in
        row order.

    Raises:
        ValueError: if `model_type` is not supported. Previously the function
            returned ``None`` in that case, which only surfaced later as an
            unpacking error in the caller.
    """
    if model_type != 'LSTM':
        raise ValueError('Unsupported model_type: %r' % (model_type,))

    np.random.shuffle(train_data)
    # Rows produced by create_stock_data mix strings and floats, so the array
    # is object-dtype; Keras needs a numeric tensor, hence the explicit cast.
    train_x = np.reshape(train_data[:,2:-2],(len(train_data),240,1)).astype('float32')
    train_y = np.reshape(train_data[:,-1],(-1, 1))
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(train_y)
    enc_y = enc.transform(train_y).toarray()

    model = makeLSTM()
    callbacks = callbacks_req(model_type)

    model.fit(train_x,
              enc_y,
              epochs=1000,
              validation_split=0.2,
              callbacks=callbacks,
              batch_size=512
              )

    predictions = {}
    for day in set(test_data[:,0]):
        day_rows = test_data[test_data[:,0]==day]
        day_x = np.reshape(day_rows[:,2:-2],(len(day_rows),240,1)).astype('float32')
        # Keep only the probability assigned to class 1.
        predictions[day] = model.predict(day_x)[:,1]
    return model,predictions

def trained(filename,train_data,test_data):
    """Load a saved model and score every day of `test_data`.

    Args:
        filename: Path of the saved Keras model passed to ``load_model``.
        train_data: Unused; kept so the signature mirrors ``trainer``.
        test_data: 2-D array whose column 0 is the day key and whose columns
            ``2:-2`` are the 240 input features.

    Returns:
        ``(model, predictions)`` where ``predictions`` maps each distinct day
        key to the predicted probability of class 1 for that day's rows, in
        row order.
    """
    model = load_model(filename)

    predictions = {}
    for day in set(test_data[:,0]):
        day_rows = test_data[test_data[:,0]==day]
        # Rows produced by create_stock_data mix strings and floats, so the
        # slice is object-dtype; cast it to a numeric tensor for Keras.
        day_x = np.reshape(day_rows[:,2:-2],(len(day_rows),240,1)).astype('float32')
        # Keep only the probability assigned to class 1.
        predictions[day] = model.predict(day_x)[:,1]
    return model,predictions

def simulate(test_data,predictions,k=10):
    """Compute daily long and short returns of a top-k / bottom-k strategy.

    For every day in `predictions`, the rows of `test_data` whose column 0
    equals that day are selected and their second-to-last column is taken as
    the realised return. The `k` rows with the highest prediction form the
    long leg and the `k` rows with the lowest prediction form the short leg
    (its return is negated). If a day has fewer than `k` rows, all of them
    are used for both legs.

    Args:
        test_data: 2-D array; column 0 is the day key, column -2 the return.
        predictions: dict mapping day key -> 1-D array of scores aligned with
            that day's rows in `test_data`.
        k: Number of positions per leg (default 10, the former constant).

    Returns:
        DataFrame indexed by day (sorted) with float columns
        ``'Long'`` and ``'Short'`` holding the mean return of each leg.

    Side effect: prints the column means of the result.
    """
    days = sorted(predictions.keys())
    rows = []
    for day in days:
        # Cast because rows taken from an object-dtype array are not numeric.
        day_returns = test_data[test_data[:,0]==day][:,-2].astype(float)
        # One ascending sort serves both legs: tail = best, head = worst.
        order = np.argsort(predictions[day])
        long_ret = np.mean(day_returns[order[-k:]])
        short_ret = np.mean(-day_returns[order[:k]])
        rows.append((long_ret,short_ret))
    # Build the frame once instead of enlarging it row by row.
    rets = pd.DataFrame(rows,index=days,columns=['Long','Short'],dtype=float)
    print('Result : ',rets.mean())
    return rets

    
def create_label(df_open,df_close,perc=[0.5,0.5]):
    """Label each stock per row by the quantile bucket of its close/open return.

    The first column of both frames must be identical (it is compared
    element-wise); if it is not, a message is printed and ``None`` is
    returned. Otherwise the return ``close/open - 1`` is computed for every
    remaining column, ranked within each row (ties broken by order of
    appearance) and cut into buckets whose widths are the fractions in
    `perc`. Buckets are numbered from 0 for the lowest returns.

    Returns:
        DataFrame of integer bucket codes with the first row dropped, or
        ``None`` when the first columns disagree.
    """
    if np.any(df_close.iloc[:,0]!=df_open.iloc[:,0]):
        print('Date Index issue')
        return
    # Turn bucket widths into cumulative quantile edges, e.g. [0, 0.5, 1].
    edges = [0.]+list(np.cumsum(perc))
    intraday = df_close.iloc[:,1:]/df_open.iloc[:,1:]-1

    def bucket(row):
        # Ranking first guarantees distinct values, so qcut edges are unique.
        return pd.qcut(row.rank(method='first'),edges,labels=False)

    label = intraday.apply(bucket, axis=1)
    return label.iloc[1:]

def create_stock_data(df_open,df_close,st,m=240):
    """Build the lagged-return feature rows of one stock, split by year.

    Reads the module-level globals ``label`` (per-stock labels) and
    ``test_year``; both must be set before calling.

    Resulting column order: ``Date``, ``Name``, ``IntraR{m-1}`` ... ``IntraR0``
    (the close/open return lagged by that many rows), ``IntraR-future`` (the
    next row's return) and ``label``. Rows containing any missing value are
    dropped.

    Args:
        df_open, df_close: Frames with a ``Date`` column and one column per stock.
        st: Column name of the stock to process.
        m: Number of lagged returns to include.

    Returns:
        ``(train_rows, test_rows)`` as NumPy arrays: rows whose year string is
        lexicographically below ``str(test_year)``, and rows whose year string
        equals it.
    """
    intraday = df_close[st]/df_open[st]-1

    st_data = pd.DataFrame([])
    st_data['Date'] = list(df_close['Date'])
    st_data['Name'] = [st]*len(st_data)
    # Oldest lag first, so the features read left-to-right in time order.
    for lag in reversed(range(m)):
        st_data['IntraR'+str(lag)] = intraday.shift(lag)

    st_data['IntraR-future'] = intraday.shift(-1)
    # `label` is one entry shorter than the frame; pad the final row with NaN
    # so that row is removed by dropna below.
    st_data['label'] = list(label[st])+[np.nan]
    # Date with its last three characters removed
    # (presumably 'YYYY-MM-DD' -> 'YYYY-MM'; verify against the data files).
    st_data['Month'] = list(df_close['Date'].str[:-3])
    st_data = st_data.dropna()

    trade_year = st_data['Month'].str[:4]
    st_data = st_data.drop(columns=['Month'])
    is_before = trade_year<str(test_year)
    is_test = trade_year==str(test_year)
    return np.array(st_data[is_before]),np.array(st_data[is_test])

def scalar_normalize(train_data,test_data):
    """Scale the feature columns of both arrays in place with a RobustScaler.

    The scaler is fitted on columns ``2:-2`` of `train_data` only and then
    applied to the same columns of `train_data` and `test_data`. Both arrays
    are modified in place; nothing is returned.
    """
    features = (slice(None),slice(2,-2))
    scaler = RobustScaler()
    # Fit on the training features only, then reuse the fitted statistics.
    train_data[features] = scaler.fit(train_data[features]).transform(train_data[features])
    test_data[features] = scaler.transform(test_data[features])

In [13]:
# Output locations: per-epoch checkpoints and training logs go to
# model_folder, daily returns and the summary text file to result_folder.
model_folder = 'models5'
result_folder = 'results5'
for directory in [model_folder,result_folder]:
    # exist_ok avoids the check-then-create race and keeps re-runs idempotent.
    os.makedirs(directory,exist_ok=True)

In [None]:
# Walk-forward study: for every test year, build features from the price files
# starting three years earlier, train a model and simulate the strategy.
# `test_year` and `label` are deliberately module-level names: callbacks_req
# and create_stock_data read them as globals.
for test_year in range(1993,2020):
    
    print('-'*40)
    print(test_year)
    print('-'*40)
    
    filename = 'data/Open-'+str(test_year-3)+'.csv'
    df_open = pd.read_csv(filename)
    filename = 'data/Close-'+str(test_year-3)+'.csv'
    df_close = pd.read_csv(filename)
    
    label = create_label(df_open,df_close)
    if label is None:
        # create_label only prints on mismatching date columns; stop here
        # instead of failing later with an unrelated-looking error.
        raise ValueError('Open/Close date columns differ for test year '+str(test_year))
    # Universe: constituents in December of the year before the test year.
    stock_names = sorted(list(constituents[str(test_year-1)+'-12']))
    train_data,test_data = [],[]

    start = time.time()
    for st in stock_names:
        st_train_data,st_test_data = create_stock_data(df_open,df_close,st)
        # Keep each per-stock block whole (2-D, Date column included).
        # Appending individual rows with the first column sliced off and then
        # concatenating them produced one flat 1-D array without dates, which
        # breaks every later `[:,2:-2]` slice and `[:,0]==day` lookup.
        train_data.append(st_train_data)
        test_data.append(st_test_data)
    train_data = np.concatenate(train_data)
    test_data = np.concatenate(test_data)
    
    scalar_normalize(train_data,test_data)
    print(train_data.shape,test_data.shape,time.time()-start)
    
    model,predictions = trainer(train_data,test_data)
    returns = simulate(test_data,predictions)
    returns.to_csv(result_folder+'/avg_daily_rets-'+str(test_year)+'.csv')
    
    # Combined daily return of the long and short legs.
    result = Statistics(returns.sum(axis=1))
    print('\nAverage returns prior to transaction charges')
    result.shortreport() 
    
    # Append this year's summary to a running text report.
    with open(result_folder+"/avg_returns.txt", "a") as myfile:
        res = '-'*30 + '\n'
        res += str(test_year) + '\n'
        res += 'Mean = ' + str(result.mean()) + '\n'
        res += 'Sharpe = '+str(result.sharpe()) + '\n'
        res += '-'*30 + '\n'
        myfile.write(res)
        