TODO: 

1. Select (probability 15) financial features, preferably exactly as paper does

2. Rolling over time window

3. Fine-tune the model (if the models were correct, then seems there's not a lot to be done. After all test accuracy has been pushed to its limit at the very start)

4. Trade! (This, even not tightly related with DL course itself, is perhaps the most tricky part)

5. Compare with mean-reversion and momentum (??   need to check the paper)

In [230]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Concatenate, Input
import pandas as pd
import numpy as np

In [229]:
tf.__version__

'0.22.0'

In [207]:
'''Model 1: LSTM only'''

inputs=Input(shape=(12, 1)) # 12 is the sequence length (monthly, was 80-daily in paper), and 1 is the input dimension (stock-price only)
h1=LSTM(units=50, return_sequences=True)(inputs) # 50 for the dimension of hidden layers
h2=LSTM(units=50, return_sequences=False)(h1) # still 50 hidden layers

outputs=Dense(5, activation='softmax')(h2)
model_LSTM=tf.keras.Model(inputs=inputs, outputs=outputs)

opt=tf.keras.optimizers.RMSprop(
    learning_rate=0.003, # default is 0.001, which seems a little bit too low
)
model_LSTM.compile(
    optimizer=opt, loss='CategoricalCrossentropy', metrics='accuracy', 
)


In [150]:
model_LSTM.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         [(None, 12, 1)]           0         
_________________________________________________________________
lstm_8 (LSTM)                (None, 12, 50)            10400     
_________________________________________________________________
lstm_9 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_14 (Dense)             (None, 5)                 255       
Total params: 30,855
Trainable params: 30,855
Non-trainable params: 0
_________________________________________________________________


In [210]:
'''Model 2: MLP only'''

inputs=Input(shape=(27,)) 
# 27 comes from 12 monthly return sequence + 15 financial indicators

h1=Dense(128, activation='relu')(inputs)
h2=Dense(64, activation='relu')(h1)
h3=Dense(32, activation='relu')(h2)
outputs=Dense(5, activation='softmax')(h3)
model_MLP=tf.keras.Model(inputs=inputs, outputs=outputs)

opt=tf.keras.optimizers.RMSprop(
    learning_rate=0.003,
)
model_MLP.compile(
    optimizer=opt, loss='CategoricalCrossentropy', metrics='accuracy', 
)


In [195]:
model_MLP.summary()

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         [(None, 27)]              0         
_________________________________________________________________
dense_15 (Dense)             (None, 128)               3584      
_________________________________________________________________
dense_16 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_17 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_18 (Dense)             (None, 5)                 165       
Total params: 14,085
Trainable params: 14,085
Non-trainable params: 0
_________________________________________________________________


In [209]:
'''Model 3: LSTM + MLP'''

inputs_rets=tf.keras.Input(shape=(12, 1)) 
# historical 12-month returns

h1_rets=LSTM(units=50, return_sequences=True)(inputs_rets) # 50 for the dimension of hidden layers
outputs_rets=LSTM(units=30, return_sequences=False)(h1_rets) 

inputs_fin=tf.keras.Input(shape=(15, )) 
# 15 financial indicators

combined_features = Concatenate()([outputs_rets, inputs_fin])

h1=Dense(128, activation='relu')(combined_features)
h2=Dense(64, activation='relu')(h1)
h3=Dense(32, activation='relu')(h2)
outputs=Dense(5, activation='softmax')(h3)
model_hybrid=tf.keras.Model(inputs=[inputs_rets, inputs_fin], outputs=outputs)

opt=tf.keras.optimizers.RMSprop(
    learning_rate=0.003,
)
model_hybrid.compile(
    optimizer=opt, loss='CategoricalCrossentropy', metrics='accuracy',
)


In [15]:
model_hybrid.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, 12, 1)]      0                                            
__________________________________________________________________________________________________
lstm_6 (LSTM)                   (None, 12, 50)       10400       input_6[0][0]                    
__________________________________________________________________________________________________
lstm_7 (LSTM)                   (None, 30)           9720        lstm_6[0][0]                     
__________________________________________________________________________________________________
input_7 (InputLayer)            [(None, 15)]         0                                            
____________________________________________________________________________________________

In [21]:
data_raw=pd.read_csv('complete_universe_cleaned_normalized_jonn.csv')

In [23]:
data_raw.shape

(2349536, 79)

In [46]:
data_raw.set_index(keys=['DATE', 'NCUSIP'], inplace=True) # make it multi-index

In [145]:
def ConvertQuintile(data_raw, partition=5):
    columns=['m+1_q{}'.format(i) for i in range(partition)]
    y=pd.DataFrame(index=data_raw.index, columns=columns)
    
    dates=data_raw.index.get_level_values('DATE').unique()
    dates=dates.sort_values() 
    
    for date in dates:
        print(date)
        quintiles=pd.qcut(data_raw.loc[date]['m+1'], partition, labels=False) # calculate quintile
        quintiles=pd.get_dummies(quintiles) # one hot       
        y.loc[date]=quintiles.values
               
    return y

In [148]:
y=ConvertQuintile(data_raw) # This might take some time

1971-01-31
1971-02-28
1971-03-31
1971-04-30
1971-05-31
1971-06-30
1971-07-31
1971-08-31
1971-09-30
1971-10-31
1971-11-30
1971-12-31
1972-01-31
1972-02-29
1972-03-31
1972-04-30
1972-05-31
1972-06-30
1972-07-31
1972-08-31
1972-09-30
1972-10-31
1972-11-30
1972-12-31
1973-01-31
1973-02-28
1973-03-31
1973-04-30
1973-05-31
1973-06-30
1973-07-31
1973-08-31
1973-09-30
1973-10-31
1973-11-30
1973-12-31
1974-01-31
1974-02-28
1974-03-31
1974-04-30
1974-05-31
1974-06-30
1974-07-31
1974-08-31
1974-09-30
1974-10-31
1974-11-30
1974-12-31
1975-01-31
1975-02-28
1975-03-31
1975-04-30
1975-05-31
1975-06-30
1975-07-31
1975-08-31
1975-09-30
1975-10-31
1975-11-30
1975-12-31
1976-01-31
1976-02-29
1976-03-31
1976-04-30
1976-05-31
1976-06-30
1976-07-31
1976-08-31
1976-09-30
1976-10-31
1976-11-30
1976-12-31
1977-01-31
1977-02-28
1977-03-31
1977-04-30
1977-05-31
1977-06-30
1977-07-31
1977-08-31
1977-09-30
1977-10-31
1977-11-30
1977-12-31
1978-01-31
1978-02-28
1978-03-31
1978-04-30
1978-05-31
1978-06-30
1978-07-31

In [212]:
data_raw.columns

Index(['Unnamed: 0', 'COMNAM', 'PERMNO', 'PERMCO', 'SICCD', 'TICKER_PERMNO',
       'accru', 'adv_sale', 'aftret_eq', 'aftret_equity', 'aftret_invcapx',
       'at_tu', 'bm', 'CAPEI', 'capital_rat', 'cash_debt', 'cash_lt',
       'cash_rat', 'cfm', 'curr_debt', 'curr_rat', 'de_rat', 'debt_assets',
       'debt_at', 'debt_capit', 'debt_ebitd', 'debt_invcap', 'divyield',
       'dltt_be', 'equity_invcap', 'evm', 'gpm', 'GProf', 'intcov',
       'intcov_rat', 'invt_act', 'lt_debt', 'lt_ppent', 'npm', 'ocf_lct',
       'opmad', 'opmbd', 'pay_tu', 'pcf', 'pe_ex', 'pe_inc', 'pretret_earnat',
       'pretret', 'profit_lct', 'ps', 'ptb', 'ptpm', 'quick_rat', 'rd_sale',
       'rect_act', 'rect_tu', 'roa', 'roce', 'roe', 'sale_equity',
       'sale_invcap', 'short_debt', 'staff_sale', 'totdebt_invcap', 'm-12',
       'm-11', 'm-10', 'm-9', 'm-8', 'm-7', 'm-6', 'm-5', 'm-4', 'm-3', 'm-2',
       'm-1', 'm+1'],
      dtype='object')

In [161]:
columns_ret=['m-{}'.format(i+1) for i in range(12)]

In [192]:
'''TODO: select the columns'''
columns_fin=list(data_raw.columns)[6:21] # select 15 columns (arbitrarily for now)

In [211]:
# train LSTM for 10 years
'''TODO: add rolling machenism for all models'''

start_date='1971-01-31'
end_date='1981-01-31'
test_end_date='1981-02-28'
mask_train=(data_raw.index.get_level_values(0)>=start_date) & (data_raw.index.get_level_values(0)<end_date)
mask_test=(data_raw.index.get_level_values(0)>=end_date) & (data_raw.index.get_level_values(0)<test_end_date)

x=data_raw[columns_ret]

x_train, x_test, y_train, y_test=x[mask_train], x[mask_test], y[mask_train], y[mask_test]

history_LSTM=model_LSTM.fit(
    x=x_train, y=y_train, batch_size=256, epochs=10, verbose=1, 
    validation_data=(x_test, y_test)
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [213]:
# train MLP for 10 years
start_date='1971-01-31'
end_date='1981-01-31'
test_end_date='1981-02-28'
mask_train=(data_raw.index.get_level_values(0)>=start_date) & (data_raw.index.get_level_values(0)<end_date)
mask_test=(data_raw.index.get_level_values(0)>=end_date) & (data_raw.index.get_level_values(0)<test_end_date)

x=data_raw[columns_ret+columns_fin]

x_train, x_test, y_train, y_test=x[mask_train], x[mask_test], y[mask_train], y[mask_test]

history_MLP=model_MLP.fit(
    x=x_train, y=y_train, batch_size=256, epochs=10, verbose=1, 
    validation_data=(x_test, y_test)
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [214]:
# train LSTM+MLP for 10 years
start_date='1971-01-31'
end_date='1981-01-31'
test_end_date='1981-02-28'
mask_train=(data_raw.index.get_level_values(0)>=start_date) & (data_raw.index.get_level_values(0)<end_date)
mask_test=(data_raw.index.get_level_values(0)>=end_date) & (data_raw.index.get_level_values(0)<test_end_date)

x=[data_raw[columns_ret], data_raw[columns_fin]]

x_train, x_test, y_train, y_test=[i[mask_train] for i in x], [i[mask_test] for i in x], y[mask_train], y[mask_test]

history_hybrid=model_hybrid.fit(
    x=x_train, y=y_train, batch_size=256, epochs=10, verbose=1, 
    validation_data=(x_test, y_test)
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [216]:
ynew = model_hybrid.predict(x_test)

In [221]:
ynew

array([[0.19838303, 0.24431789, 0.22690049, 0.20285915, 0.12753941],
       [0.14375706, 0.2214937 , 0.23945446, 0.2325799 , 0.16271496],
       [0.44275892, 0.14754176, 0.11160321, 0.11253779, 0.18555823],
       ...,
       [0.19481744, 0.2731891 , 0.26049924, 0.20586607, 0.06562808],
       [0.27223945, 0.22168463, 0.18538223, 0.17361432, 0.14707941],
       [0.4151327 , 0.11317569, 0.08970512, 0.08570243, 0.29628414]],
      dtype=float32)

In [222]:
y_test

Unnamed: 0_level_0,Unnamed: 1_level_0,m+1_q0,m+1_q1,m+1_q2,m+1_q3,m+1_q4
DATE,NCUSIP,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1981-01-31,00462610,0,0,1,0,0
1981-01-31,00077410,0,0,1,0,0
1981-01-31,00087410,1,0,0,0,0
1981-01-31,01736110,0,0,0,1,0
1981-01-31,43851610,0,0,0,0,1
1981-01-31,00101510,0,0,0,1,0
1981-01-31,00103010,0,0,0,1,0
1981-01-31,02312710,0,1,0,0,0
1981-01-31,00103210,0,0,0,0,1
1981-01-31,00103810,0,0,0,0,1


In [227]:
from sklearn.metrics import r2_score
r2_score(y_test, ynew)

0.006683151182036129