In [79]:
%matplotlib inline

# Modules
import datetime as dt
import matplotlib as mp
from statsmodels.graphics.tsaplots import plot_acf
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from keras.layers import Dense, Dropout, GaussianNoise, GRU, LSTM, Conv1D, Flatten
from keras.layers.pooling import MaxPooling1D, GlobalAveragePooling1D
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers.wrappers import Bidirectional
import keras
import os
import pandas as pd

# Source data
data_file_path = ''
data_filename = 'spx_history.csv'
processed_data_file = 'processed_data.csv'
model_input_file = 'model_input.csv'

# Model
model_file = 'model.hd5'
num_epochs = 10000
validation_frac = 0.2
batch_size = 128
optimizer = 'adam'
loss = 'binary_crossentropy'
metrics = ['accuracy']

# Other config
dt_format = '%Y-%m-%d'
window = 2500 # days
investment_horizon = 250 # days
stride = 1 #days

In [80]:
def get_price_data(filename, dt_format):
    prices = pd.read_csv(filename,
                         delimiter=',',
                         header=0,
                         names=['date', 'P_close'],
                         index_col=0,
                         parse_dates=True,
                         date_parser=lambda date_str: dt.datetime.strptime(date_str, dt_format))
    return prices

In [81]:
def get_processed_data(prices_df, investment_horizon):
    prices_df['log_P'] = prices_df['P_close'].apply(np.log)
    prices_df['diff'] = prices_df['log_P'].diff(1)
    midpoint = (prices_df['diff'].max() + prices_df['diff'].min())/2
    scale = (prices_df['diff'].max() - prices_df['diff'].min())/2
    prices_df['scaled'] = (prices_df['diff'] - midpoint)/scale

    def map_outcome(x):
        if x == False:
            return 0
        elif x == True:
            return 1
        else:
            return np.nan
    
    prices_df['return'] = prices_df['log_P'].diff(investment_horizon)
    prices_df['outcome'] = (prices_df['return'] > prices_df['diff'].median()).map(map_outcome)
    
    prices_df.to_csv(processed_data_file)
    print(prices_df.head())
    print(prices_df.describe())

    return prices_df[['scaled', 'outcome']].dropna(), midpoint, scale

In [82]:
def get_samples(data_df, window, stride):
    time_series = data_df.iloc[:, 0].values
    outcomes = data_df.iloc[:, 1].values
    x, y = zip(*[(time_series[i-window:i], outcomes[i]) for i in range(window, len(time_series), stride)])
    
    return np.array(x).reshape(-1, window, 1), np.array(y).reshape(-1, 1)

In [83]:
def save_processed_data(df, filename):
    df.to_csv(filename, sep=',')

In [84]:
def get_model(window,):
    dropout = 0.5
    model = keras.models.Sequential()
    
    # Convolutions
    model.add(Conv1D(32, 2, padding='valid', activation='relu', input_shape=(window,1)))
    model.add(MaxPooling1D(2, padding='valid'))
    model.add(Conv1D(32, 2, padding='valid', activation='relu'))
    model.add(MaxPooling1D(2, padding='valid'))
    model.add(Conv1D(32, 2, padding='valid', activation='relu'))
    model.add(MaxPooling1D(2, padding='valid'))
    model.add(Dropout(dropout))
    model.add(Conv1D(32, 2, padding='valid', activation='relu'))
    model.add(MaxPooling1D(2, padding='valid'))
    model.add(Conv1D(32, 2, padding='valid', activation='relu'))
    model.add(MaxPooling1D(2, padding='valid'))
    model.add(Conv1D(32, 2, padding='valid', activation='relu'))
    model.add(MaxPooling1D(2, padding='valid'))
    model.add(Dropout(dropout))
    model.add(Conv1D(32, 2, padding='valid', activation='relu'))
    model.add(MaxPooling1D(2, padding='valid'))
    model.add(Conv1D(32, 2, padding='valid', activation='relu'))
    model.add(MaxPooling1D(2, padding='valid'))
    model.add(Conv1D(32, 2, padding='valid', activation='relu'))
    model.add(MaxPooling1D(2, padding='valid'))
    model.add(Dropout(dropout))
    model.add(GlobalAveragePooling1D())
#     model.add(Flatten())
    
#     # Recurrents
#     model.add(GRU(128, return_sequences=True, go_backwards=True))
#     model.add(Dropout(dropout))
#     model.add(GRU(128, return_sequences=False, go_backwards=False))
#     model.add(Dropout(dropout))
    
    # Dense for final prediction
#     model.add(Dense(64, activation='relu'))
#     model.add(Dropout(dropout))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dropout(dropout))
    model.add(Dense(4, activation='relu'))
    model.add(Dense(2, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    return model

In [85]:
full_filename = os.path.join(data_file_path, data_filename)
data = get_price_data(full_filename, dt_format)
processed_data, midpoint, scale = get_processed_data(data, investment_horizon)
X, y = get_samples(processed_data, window, stride)
save_processed_data(processed_data, model_input_file)

            P_close     log_P      diff    scaled  return  outcome
date                                                              
1927-12-30    17.66  2.871302       NaN       NaN     NaN        0
1928-01-03    17.76  2.876949  0.005647  0.226387     NaN        0
1928-01-04    17.72  2.874694 -0.002255  0.185090     NaN        0
1928-01-05    17.55  2.865054 -0.009640  0.146491     NaN        0
1928-01-06    17.66  2.871302  0.006248  0.229532     NaN        0
            P_close         log_P          diff        scaled        return  \
count  22504.000000  22504.000000  22503.000000  22503.000000  22254.000000   
mean     399.850251      4.734416      0.000220      0.198023      0.053370   
std      573.245719      1.723863      0.011763      0.061478      0.203029   
min        4.400000      1.481605     -0.228997     -1.000000     -1.223378   
25%       22.987500      3.134951     -0.004572      0.172979     -0.044455   
50%       96.700000      4.571613      0.000460      0.19

In [86]:
print('X.shape: ', X.shape)
print('y.shape: ', y.shape)

X.shape:  (20003, 2500, 1)
y.shape:  (20003, 1)


In [87]:
model = get_model(window)
model.summary()
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_97 (Conv1D)           (None, 2499, 32)          96        
_________________________________________________________________
max_pooling1d_97 (MaxPooling (None, 1249, 32)          0         
_________________________________________________________________
conv1d_98 (Conv1D)           (None, 1248, 32)          2080      
_________________________________________________________________
max_pooling1d_98 (MaxPooling (None, 624, 32)           0         
_________________________________________________________________
conv1d_99 (Conv1D)           (None, 623, 32)           2080      
_________________________________________________________________
max_pooling1d_99 (MaxPooling (None, 311, 32)           0         
_________________________________________________________________
dropout_144 (Dropout)        (None, 311, 32)           0         
__________

In [68]:
checkpointer = ModelCheckpoint(filepath=model_file, monitor='val_loss', save_best_only=True, verbose=1)
earlystopper = EarlyStopping(monitor='val_loss', patience=10)

training_history = model.fit(X, y, batch_size=batch_size, epochs=num_epochs, verbose=1,
          callbacks=[checkpointer, earlystopper], validation_split=validation_frac)

Train on 16002 samples, validate on 4001 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000


Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000


In [72]:
predictions = model.predict(X[0], verbose=1)

ValueError: Error when checking : expected conv1d_88_input to have 3 dimensions, but got array with shape (2500, 1)

In [70]:
predictions.shape

(20003, 1)

In [71]:
predictions

array([[ 0.69730878],
       [ 0.69730878],
       [ 0.69730878],
       ..., 
       [ 0.69730878],
       [ 0.69730878],
       [ 0.69730878]], dtype=float32)

In [77]:
check_data = pd.DataFrame(X.reshape(-1, window))

In [78]:
check_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
0,0.226387,0.185090,0.146491,0.229532,0.149306,0.157904,0.190854,0.232900,0.229681,0.109938,...,0.168598,0.019034,0.007702,0.252343,0.266630,0.191923,0.181991,0.425317,0.163478,0.430871
1,0.185090,0.146491,0.229532,0.149306,0.157904,0.190854,0.232900,0.229681,0.109938,0.199897,...,0.019034,0.007702,0.252343,0.266630,0.191923,0.181991,0.425317,0.163478,0.430871,0.150906
2,0.146491,0.229532,0.149306,0.157904,0.190854,0.232900,0.229681,0.109938,0.199897,0.184776,...,0.007702,0.252343,0.266630,0.191923,0.181991,0.425317,0.163478,0.430871,0.150906,0.444841
3,0.229532,0.149306,0.157904,0.190854,0.232900,0.229681,0.109938,0.199897,0.184776,0.233087,...,0.252343,0.266630,0.191923,0.181991,0.425317,0.163478,0.430871,0.150906,0.444841,0.231982
4,0.149306,0.157904,0.190854,0.232900,0.229681,0.109938,0.199897,0.184776,0.233087,0.226861,...,0.266630,0.191923,0.181991,0.425317,0.163478,0.430871,0.150906,0.444841,0.231982,0.148541
5,0.157904,0.190854,0.232900,0.229681,0.109938,0.199897,0.184776,0.233087,0.226861,0.244498,...,0.191923,0.181991,0.425317,0.163478,0.430871,0.150906,0.444841,0.231982,0.148541,0.121287
6,0.190854,0.232900,0.229681,0.109938,0.199897,0.184776,0.233087,0.226861,0.244498,0.217574,...,0.181991,0.425317,0.163478,0.430871,0.150906,0.444841,0.231982,0.148541,0.121287,0.210294
7,0.232900,0.229681,0.109938,0.199897,0.184776,0.233087,0.226861,0.244498,0.217574,0.140499,...,0.425317,0.163478,0.430871,0.150906,0.444841,0.231982,0.148541,0.121287,0.210294,0.183456
8,0.229681,0.109938,0.199897,0.184776,0.233087,0.226861,0.244498,0.217574,0.140499,0.229588,...,0.163478,0.430871,0.150906,0.444841,0.231982,0.148541,0.121287,0.210294,0.183456,0.142853
9,0.109938,0.199897,0.184776,0.233087,0.226861,0.244498,0.217574,0.140499,0.229588,0.214632,...,0.430871,0.150906,0.444841,0.231982,0.148541,0.121287,0.210294,0.183456,0.142853,0.151426
