In [1]:
# Import Libraries
import numpy as np
import pandas as pd

# Path
import os
import random
from os.path import join
# from pathlib2 import Path

# ML libraries
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPool2D, Input
from tensorflow.keras import backend as K, callbacks
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error
import tensorflow as tf
import tensorflow.keras as keras

In [2]:
TRAIN_TEST_CUTOFF = '2016-04-21' # limit date for training
TRAIN_VALID_RATIO = 0.75 # Size of training set

# Data Generator
def datagen(data, seq_len, batch_size, targetcol, kind):
    """As a generator to produce samples for Keras model"""
    batch = []
    while True:
        # Pick one dataframe from the pool
        key = random.choice(list(data.keys())) # One filename from dataset
        df = data[key]
        # extract column name for features except target column and save it as an array
        input_cols = [c for c in df.columns if c != targetcol]
        # assign index for train set / Dataframe
        index = df.index[df.index < TRAIN_TEST_CUTOFF]
        # Set point of split for train and validation set / integer
        split = int(len(index) * TRAIN_VALID_RATIO)
        if kind == 'train':
            index = index[:split]   # range for the training set
        elif kind == 'valid':
            index = index[split:]   # range for the validation set
        # Pick one position, then clip a sequence length
        while True:
            t = random.choice(index)      # pick one time step / integer
            n = (df.index == t).argmax()  # find its position in the dataframe / argmax return index where the condition is true
            # Check if n length is greater than 0
            if n-seq_len+1 < 0:
                continue # can't get enough data for one sequence length
            # Get the dataframe 
            frame = df.iloc[n-seq_len+1:n+1]
            #              # ARRAY                      # 1 / integer
            batch.append([frame[input_cols].values, df.loc[t, targetcol]])
            break
        # if we get enough for a batch, dispatch
        if len(batch) == batch_size:
            # Create 1 dimensional array
            X, y = zip(*batch) # The * in a function call "unpacks" a list (or other iterable), making each of its elements a separate argument.
            # Create 4 dimensional array for X, and 1 for y
            X, y = np.expand_dims(np.array(X), 3), np.array(y)
            # returns a generator object
            yield X, y
            batch = []

In [3]:
# Dictionary consist of dataframe for each index
data = {}
DATADIR = '../datasets/processed'
for filename in os.listdir(DATADIR):
    if not filename.lower().endswith(".csv"):
        continue # read only the CSV files
    filepath = os.path.join(DATADIR, filename)
    X = pd.read_csv(filepath, index_col="Date", parse_dates=True)
    # basic preprocessing: get the name, the classification
    # Save the target variable as a column in dataframe for easier dropna()
    name = X["Name"][0] # All name has the same value which is the stock company name
    del X["Name"]
    cols = X.columns
    X["Target"] = (X["Close"].pct_change().shift(-1) > 0).astype(int) # Will input 1 or 0 for up and down
    X.dropna(inplace=True)
    # Fit the standard scaler using the training dataset
    index = X.index[X.index > TRAIN_TEST_CUTOFF] # Get index until TRAIN_TEST_CUTOFF
    index = index[:int(len(index) * TRAIN_VALID_RATIO)] # Get train set 
    # To set all data to standard value. Formula: ((x1-mean)/standardDeviation)
    scaler = StandardScaler().fit(X.loc[index, cols]) # Standardize train set
    # Save scale transformed dataframe
    X[cols] = scaler.transform(X[cols])
    # save each transformed dataframe on data dictionary
    data[name] = X

print("Data columns: ", data.keys())
print("Data Lenght: ", len(data))

Data columns:  dict_keys(['NYA', 'S&P', 'NASDAQ', 'RUT', 'DJI'])
Data Lenght:  5


In [4]:
# Building the Model
# CNN structure
def cnnpred_2d(seq_len=60, n_features=82, n_filters=(8,8,8), droprate=0.1):
    "2D-CNNpred model according to the paper"
    model = Sequential([
        Input(shape=(seq_len, n_features, 1)),
        Conv2D(n_filters[0], kernel_size=(1, n_features), activation="relu"),
        Conv2D(n_filters[1], kernel_size=(3,1), activation="relu"),
        MaxPool2D(pool_size=(2,1)),
        Conv2D(n_filters[2], kernel_size=(3,1), activation="relu"),
        MaxPool2D(pool_size=(2,1)),
        Flatten(),
        Dropout(droprate),
        Dense(1, activation="sigmoid")
    ])
    return model

In [5]:
# Recall = (TP/(TP+FN))
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

# Precision = (TP/(TP+FP))
def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

# F1 Score metrics
def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

# Average of the F1 from positive and negative classification (F1-macro metric)
def f1macro(y_true, y_pred):
    f_pos = f1_m(y_true, y_pred)
    # negative version of the data and prediction
    f_neg = f1_m(1-y_true, 1-K.clip(y_pred,0,1))
    return (f_pos + f_neg)/2

In [6]:
# Optional
# Checkpoints to interupt or resume model training
checkpoint_path = "./cp2d-{epoch}-{val_f1macro:.2f}.h5"
callbacks = [
    callbacks.ModelCheckpoint(checkpoint_path,
                    monitor='val_f1macro', mode="max", verbose=0,
                    save_best_only=True, save_weights_only=False, save_freq="epoch")
]

We set up a filename template checkpoint_path and ask Keras to fill in the epoch number as well as validation F1 score into the filename. We save it by monitoring the validation’s F1 metric, and this metric is supposed to increase when the model gets better. Hence we pass in the mode="max" to it.

In [9]:
seq_len    = 60
batch_size = 128
n_epochs   = 20
n_features = 82

model = cnnpred_2d(seq_len, n_features)
model.compile(optimizer="adam", loss="mae", metrics=["acc", f1macro])
model.fit(datagen(data, seq_len, batch_size, "Target", "train"),
          validation_data=datagen(data, seq_len, batch_size, "Target", "valid"),
          epochs=n_epochs, steps_per_epoch=400, validation_steps=10, verbose=1,
          callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fd3787395b0>

In [12]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_3 (Conv2D)           (None, 60, 1, 8)          664       
                                                                 
 conv2d_4 (Conv2D)           (None, 58, 1, 8)          200       
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 29, 1, 8)         0         
 2D)                                                             
                                                                 
 conv2d_5 (Conv2D)           (None, 27, 1, 8)          200       
                                                                 
 max_pooling2d_3 (MaxPooling  (None, 13, 1, 8)         0         
 2D)                                                             
                                                                 
 flatten_1 (Flatten)         (None, 104)              

In [10]:
def testgen(data, seq_len, targetcol):
    "Return array of all test samples"
    batch = []
    for key, df in data.items():
        input_cols = [c for c in df.columns if c != targetcol]
        # find the start of test sample
        t = df.index[df.index >= TRAIN_TEST_CUTOFF][0]
        n = (df.index == t).argmax()
        # extract sample using a sliding window
        for i in range(n+1, len(df)+1):
            frame = df.iloc[i-seq_len:i]
            batch.append([frame[input_cols].values, frame[targetcol][-1]])
    X, y = zip(*batch)
    return np.expand_dims(np.array(X),3), np.array(y)

# Prepare test data
test_data, test_target = testgen(data, seq_len, "Target")
 
# Test the model
test_out = model.predict(test_data)
test_pred = (test_out > 0.5).astype(int)
print("accuracy:", accuracy_score(test_pred, test_target))
print("MAE:", mean_absolute_error(test_pred, test_target))
print("F1:", f1_score(test_pred, test_target))

accuracy: 0.5209756097560976
MAE: 0.4790243902439024
F1: 0.5985282093213409


Model accuracy that is above 50% can be considered plausible. Market trend wouldnt be able to get high prediction because of a lot of outside factors.