In this final notebook, we will use 3 different models to predict the stock price trend using previously collected and aggregated data.
The goal is to predict the FutureTrend using price data, current trend score and sentiment scores. For this use case we will only use the FinBERT sentiment scores.
The models we will be using are SVM, LSTM and EA-LSTM.
EA-LSTM is an augmented version of the LSTM model that has an additional attention layer.
More information about EA-LSTM are available [here.](https://arxiv.org/pdf/1811.03760.pdf)


In [1]:
import pandas as pd
import numpy as np
from math import sqrt

In [2]:
atvi = pd.read_csv('data/final/atvi.csv')
ntdoy = pd.read_csv('data/final/ntdoy.csv')

In [3]:
for col in atvi.columns:
    print(col + " " + str(sum(atvi[col].isna())))

Date (GMT) 0
Last 0
emaTrend 0
trendBrDana 0
fbPos 359
fbNeg 359
fbNeu 359
FutureEmaTrend 10


In [5]:
for col in ntdoy.columns:
    print(col + " " + str(sum(ntdoy[col].isna())))

Date (GMT) 0
Last 0
emaTrend 0
trendBrDana 0
fbPos 8
fbNeg 8
fbNeu 8
FutureEmaTrend 10


In [3]:
def findLastFalse(df, column, index):
    if index == 0:
        return 0
    if not pd.isna(df.loc[index - 1, column]):
        return index - 1
    return findLastFalse(df, column, index - 1)


def findNextFalse(df, column , index):
    if index == len(df[column]) - 1:
        return len(df[column]) - 1
    if not pd.isna(df.loc[index + 1, column]):
        return index + 1
    return findNextFalse(df, column, index + 1)




In [4]:
#we are replacing the missing sentiment values with the mean of the last available sentiment score and the first next available sentiment score
def replaceNAs(df):

    if pd.isna(df.loc[0, "fbNeg"]):
        df.loc[0, ["fbNeg", "fbPos", "fbNeu"]] = df.iloc[findNextFalse(df,"fbNeg", 0), "fbNeg"]

    if pd.isna(df.loc[df.shape[0] - 1, "fbNeg"]):
        df.loc[df.shape[0] - 1, ["fbNeg", "fbPos", "fbNeu"]] = df.iloc[findLastFalse(df,"fbNeg", df.shape[0] - 1), "fbNeg"]

    for i in range(df.shape[0]):
        if pd.isna(df.loc[i, "fbNeg"]):
            last = findLastFalse(df,"fbNeg",i)
            next = findNextFalse(df,"fbNeg",i)
            for j in [i, next - 1]:
                df.loc[j, "fbNeg"] = (df.loc[last, "fbNeg"]+df.loc[next, "fbNeg"])/2
                df.loc[j, "fbNeu"] = (df.loc[last, "fbNeu"]+df.loc[next, "fbNeu"])/2
                df.loc[j, "fbPos"] = (df.loc[last, "fbPos"]+df.loc[next, "fbPos"])/2
            i = next






In [5]:
replaceNAs(atvi)
replaceNAs(ntdoy)

atvi.FutureEmaTrend.fillna(0,inplace=True)
ntdoy.FutureEmaTrend.fillna(0, inplace=True)

In [8]:
#checking if there are any missing values left
for col in atvi.columns:
    print(col + " " + str(sum(atvi[col].isna())))
print()
for col in ntdoy.columns:
    print(col + " " + str(sum(ntdoy[col].isna())))

Date (GMT) 0
Last 0
emaTrend 0
trendBrDana 0
fbPos 0
fbNeg 0
fbNeu 0
FutureEmaTrend 0

Date (GMT) 0
Last 0
emaTrend 0
trendBrDana 0
fbPos 0
fbNeg 0
fbNeu 0
FutureEmaTrend 0


Because of some problems with tweet scraping, ATVI had more missing values and therefore we will be using NTDOY

In [12]:
data = ntdoy.copy()

In [13]:
data.Last = (data.Last-data.Last.min())/(data.Last.max()-data.Last.min()) #normalization

In [14]:
#SVM does not allow tensors for input so we will manually add shifted collumns
def pomereneKolone(data, columns_to_shift, num_shifts):

    df = data.copy()

    last_column_index = df.columns.get_loc(df.columns[-1])

    for column in columns_to_shift:
        for i in range(1, num_shifts + 1):
            shifted_column_name = f'{column}_shifted_{i}'
            df[shifted_column_name] = df[column].shift(i)

    df.dropna(inplace=True)

    #moving the output column to the last space
    column_to_move = df.columns[last_column_index]
    other_columns = [col for col in df.columns if col != column_to_move]
    df = df[other_columns + [column_to_move]]

    return df


In [15]:

data_svm = pomereneKolone(data,["Last","emaTrend"],5)

In [16]:
#added normalization for LSTM
data.emaTrend = (data.emaTrend + 1)/2
data.trendBrDana = (data.trendBrDana + 1)/2
data.FutureEmaTrend = data.FutureEmaTrend + 1

In [17]:
data_svm.head()

Unnamed: 0,Date (GMT),Last,emaTrend,trendBrDana,fbPos,fbNeg,fbNeu,Last_shifted_1,Last_shifted_2,Last_shifted_3,Last_shifted_4,Last_shifted_5,emaTrend_shifted_1,emaTrend_shifted_2,emaTrend_shifted_3,emaTrend_shifted_4,emaTrend_shifted_5,FutureEmaTrend
5,2019-01-09,0.058337,1,0,0.056432,0.026387,0.917181,0.043096,0.036334,0.020993,0.012919,0.018571,1.0,1.0,1.0,-1.0,0.0,1.0
6,2019-01-10,0.046023,1,0,0.288594,0.163498,0.547908,0.058337,0.043096,0.036334,0.020993,0.012919,1.0,1.0,1.0,1.0,-1.0,1.0
7,2019-01-11,0.044913,1,0,0.073317,0.02239,0.904293,0.046023,0.058337,0.043096,0.036334,0.020993,1.0,1.0,1.0,1.0,1.0,-1.0
8,2019-01-14,0.05874,1,0,0.141267,0.065509,0.793224,0.044913,0.046023,0.058337,0.043096,0.036334,1.0,1.0,1.0,1.0,1.0,1.0
9,2019-01-15,0.082862,1,0,0.148546,0.031439,0.820014,0.05874,0.044913,0.046023,0.058337,0.043096,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
#Now we are splitting the data nad removing the Date column
def podela(df):
    #podela podataka
    train = df.loc[df["Date (GMT)"] < "2022-01-01"]
    test = df.loc[df["Date (GMT)"] > "2021-12-31"]

    #izbacivanje kolone datum
    train = train.iloc[:,1:]
    test = test.iloc[:,1:]

    X_train = train.iloc[:, :-1]
    X_test = test.iloc[:, :-1]
    y_train = train.iloc[:, -1]
    y_test = test.iloc[:, -1]



    return X_train,X_test,y_train,y_test

In [19]:
X_train,X_test,y_train,y_test = podela(data)
X_trainsvm,X_testsvm,y_trainsvm,y_testsvm = podela(data_svm)

<b>SVM</b>

In [25]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [21]:
model_svc = SVC(kernel='rbf',gamma=0.5, C=10)

In [23]:
model_svc.fit(X_trainsvm,y_trainsvm)

In [26]:
predictions = model_svc.predict(X_testsvm)

In [32]:
accuracy_score(y_testsvm, predictions)

0.5059760956175299

Testing different hyperparameters

In [101]:
parameters = {
    'C':[2500, 2000, 1800, 1500, 1000, 100, 10, 1],
    'gamma': ['scale', 'auto', 1, 0.5, 0.2],
    'kernel':['linear', 'poly', 'rbf', 'sigmoid']
}

In [102]:
from sklearn.model_selection import GridSearchCV

In [103]:
cv = [(slice(None), slice(None))] #to use GridSearch without cross-validation because we are working with a time series


In [104]:
gs = GridSearchCV(estimator=model_svc, param_grid=parameters,
                   cv=cv, n_jobs=-1)


In [108]:

gs.fit(data_svm.iloc[:,1:12],data_svm.iloc[:,12])


GridSearchCV(cv=[(slice(None, None, None), slice(None, None, None))],
             estimator=SVC(C=10, gamma=0.5), n_jobs=-1,
             param_grid={'C': [2500, 2000, 1800, 1500, 1000, 100, 10, 1],
                         'gamma': ['scale', 'auto', 1, 0.5, 0.2],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})

In [111]:
print(f"Best params: C:" + str(gs.best_estimator_.C) + ", gamma:" + str(gs.best_estimator_.gamma) + " kernel:" + str(gs.best_estimator_.kernel))

Best params: C:2500, gamma:1 kernel:rbf


New model with optimal hyperparameters

In [117]:
model_svc = SVC(kernel='rbf',gamma=1, C=2500)
model_svc.fit(X_trainsvm,y_trainsvm)
predictions = model_svc.predict(X_testsvm)

In [119]:
accuracy_score(y_testsvm, predictions)

0.5338645418326693

<b>LSTM</b>

In [19]:
#pip install tensorflow
import tensorflow as tf
from keras.models import Sequential
from keras.layers import *
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.models import load_model, save_model
from sklearn.metrics import mean_squared_error, accuracy_score

Adding time-steps

In [20]:
time_steps = 7
num_classes = 3

prepend_features = X_train.iloc[-(time_steps-1):]
X_test2 = pd.concat([prepend_features, X_test], axis=0)

X_train1, y_train1 = [], []
for i in range(y_train.shape[0] - (time_steps-1)):
    X_train1.append(X_train.iloc[i:i+time_steps].values)
    y_train1.append(y_train.iloc[i + (time_steps-1)])
X_train1, y_train1 = np.array(X_train1), np.array(y_train1).reshape(-1, 1)
print(f'Train data dimensions: {X_train1.shape}, {y_train1.shape}')

X_test1, y_test1 = [], []
for i in range(y_test.shape[0]):
    X_test1.append(X_test2.iloc[i:i+time_steps].values)
    y_test1.append(y_test.iloc[i])
X_test1, y_test1 = np.array(X_test1), np.array(y_test1).reshape(-1, 1)

print(f'Test data dimensions: {X_test1.shape}, {y_test1.shape}')

y_testcat = to_categorical(y_test1, num_classes=3)
y_traincat = to_categorical(y_train1, num_classes=3)

Train data dimensions: (751, 7, 6), (751, 1)
Test data dimensions: (251, 7, 6), (251, 1)


Creating initial model

In [36]:
model = Sequential()
model.add(InputLayer((7,X_train1.shape[2])))
model.add(LSTM(256, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(3,"softmax"))

In [37]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 7, 256)            269312    
                                                                 
 lstm_1 (LSTM)               (None, 128)               197120    
                                                                 
 dense (Dense)               (None, 3)                 387       
                                                                 
Total params: 466,819
Trainable params: 466,819
Non-trainable params: 0
_________________________________________________________________


In [223]:
#saving the model from the best training epoch
cp = ModelCheckpoint('models/lstm/', save_best_only=True, monitor='val_accuracy')

In [224]:
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.0007), metrics=['accuracy'])

In [225]:
# from keras.callbacks import ReduceLROnPlateau
# reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.9, patience=5, verbose=1)

In [227]:
#training the network
model.fit(X_train1,y_traincat,validation_data=(X_test1,y_testcat),epochs=50,callbacks=[cp])#reduce_lr

Epoch 1/50



INFO:tensorflow:Assets written to: model5c\assets


INFO:tensorflow:Assets written to: model5c\assets


Epoch 2/50



INFO:tensorflow:Assets written to: model5c\assets


INFO:tensorflow:Assets written to: model5c\assets


Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50



INFO:tensorflow:Assets written to: model5c\assets


INFO:tensorflow:Assets written to: model5c\assets


Epoch 9/50
Epoch 10/50



INFO:tensorflow:Assets written to: model5c\assets


INFO:tensorflow:Assets written to: model5c\assets


Epoch 11/50
Epoch 12/50
Epoch 13/50



INFO:tensorflow:Assets written to: model5c\assets


INFO:tensorflow:Assets written to: model5c\assets


Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50



INFO:tensorflow:Assets written to: model5c\assets


INFO:tensorflow:Assets written to: model5c\assets


Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50



INFO:tensorflow:Assets written to: model5c\assets


INFO:tensorflow:Assets written to: model5c\assets


Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50



INFO:tensorflow:Assets written to: model5c\assets


INFO:tensorflow:Assets written to: model5c\assets


Epoch 34/50
Epoch 35/50
Epoch 36/50



INFO:tensorflow:Assets written to: model5c\assets


INFO:tensorflow:Assets written to: model5c\assets


Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50



INFO:tensorflow:Assets written to: model5c\assets


INFO:tensorflow:Assets written to: model5c\assets


Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1fc54ab5610>

In [199]:
#Checking the best learning rate
# lrs = [0.004,0.003,0.002,0.0018,0.0013,0.0011,0.0010,0.0009,0.0008,0.0007,0.0006,0.0005,0.0004,0.0003,0.0002]
#
# for lr in lrs:
#     model = Sequential()
#     model.add(InputLayer((11,1)))
#     model.add(LSTM(128, return_sequences=True))
#     model.add(LSTM(128))
#     model.add(Dense(8,"relu"))
#     model.add(Dense(3,"softmax"))
#
#     model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=lr), metrics=['accuracy'])
#     model.fit(X_train,(y_train+1),validation_data=(X_test,(y_test + 1)),epochs=20,callbacks=[cp])

In [17]:
#loading the best epoch model
model = load_model('models/lstm/')

In [24]:
lstm_predictions = model.predict(X_test1)



In [26]:
lstm_predictions[0:5]

array([[5.8446944e-01, 8.9780529e-05, 4.1544074e-01],
       [5.5865902e-01, 1.1288827e-04, 4.4122815e-01],
       [6.3111371e-01, 1.7095068e-04, 3.6871538e-01],
       [7.2440594e-01, 3.0656825e-04, 2.7528745e-01],
       [6.5444452e-01, 3.8827819e-04, 3.4516716e-01]], dtype=float32)

In [27]:
predicted_classes = np.argmax(lstm_predictions, axis=1)

In [30]:
accuracy_score(y_test1,predicted_classes)

0.5936254980079682

<b>EA-LSTM</b>

In [52]:
from ea_lstm import (initialize_weights, individual_to_key,pop_to_weights,
                     select, reconstruct_population,apply_weight, is_minimum, is_maximum)
from sklearn.metrics import mean_absolute_error
from math import sqrt
from copy import copy
from keras.callbacks import EarlyStopping, LambdaCallback
import json

In [28]:
#Creating initial model
def make_model():
    model = Sequential()
    model.add(InputLayer((7,6)))
    model.add(LSTM(128, return_sequences=True))
    model.add(LSTM(128))
    model.add(Dense(3,"softmax"))

    return model

In [64]:
best_model = make_model()
best_weight = [1.0] * time_steps

In [66]:
best_model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.0007), metrics=['accuracy'])
cpea = ModelCheckpoint('models/lstm_temp/', save_best_only=True, monitor='val_accuracy')

In [68]:
#Initial training

best_model.fit(apply_weight(X_train1, best_weight), y_traincat, epochs=35,
               validation_data=(apply_weight(X_test1, best_weight), y_testcat), callbacks=[cpea])

Epoch 1/35



INFO:tensorflow:Assets written to: modelealstm\assets


INFO:tensorflow:Assets written to: modelealstm\assets


Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35



INFO:tensorflow:Assets written to: modelealstm\assets


INFO:tensorflow:Assets written to: modelealstm\assets


Epoch 20/35
Epoch 21/35
Epoch 22/35



INFO:tensorflow:Assets written to: modelealstm\assets


INFO:tensorflow:Assets written to: modelealstm\assets


Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35



INFO:tensorflow:Assets written to: modelealstm\assets


INFO:tensorflow:Assets written to: modelealstm\assets


Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<keras.callbacks.History at 0x20aa39e5910>

In [142]:
best_model = load_model('models/lstm_temp/')

In [113]:
#Defining LambdaCallback-a so we can save the best epoch model in each loop iteration without creating new save files
def save_best_epoch_model(epoch, logs):
    global best_val_accuracy, best_model_weights
    val_accuracy = logs.get('val_accuracy')
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_model_weights = model.get_weights()

save_best_model = LambdaCallback(on_epoch_end=lambda epoch, logs: save_best_epoch_model(epoch, logs))

In [115]:
#to reduce overfitting we are adding an earlystopper to end the training if val_acc doesnt improve for 10 epochs
early_stop = EarlyStopping(monitor='val_accuracy', patience=10, verbose=1, restore_best_weights=True)

In [209]:
#competitive random search
pop_size = 36
code_length = 6
iterations = 20
n_select = 6
time_steps = 7

pop, weights = initialize_weights(pop_size, time_steps, code_length)
key_to_rmse = {}

for iteration in range(iterations):
    for enum, (indiv, weight) in enumerate(zip(pop, weights)):
        print('iteration: [%d/%d] indiv_no: [%d/%d]' % (iteration + 1, iterations, enum + 1, pop_size))
        key = individual_to_key(indiv)
        if key not in key_to_rmse.keys():

            best_val_accuracy = 0.0
            best_model_weights = None

            model = make_model()
            model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.0007), metrics=['accuracy'])
            model.set_weights(best_model.get_weights())
            model.fit(apply_weight(X_train1, weight), y_traincat, epochs=20,
                      validation_data=(apply_weight(X_test1, weight), y_testcat), callbacks=[early_stop, save_best_model])

            model.set_weights(best_model_weights)

            pred_y = model.predict(apply_weight(X_test1, weight))
            inv_pred_y = np.argmax(pred_y, axis=1)
            inv_valid_y = y_test1
            val_loss, acc = model.evaluate(apply_weight(X_test1, weight), y_testcat)
            rmse = sqrt(mean_squared_error(inv_valid_y, inv_pred_y))
            mae = mean_absolute_error(inv_valid_y, inv_pred_y)
            print("RMSE: %.4f, MAE: %.4f, ACC: %.4f, VAL_LOSS: %.4f" % (rmse, mae, acc, val_loss))
            #algoritam je napravljen za minimiziranje rmse, pa ako koristimo accuracy moramo da ga promenimo
            negacc = 1 - acc
            if is_minimum(negacc, key_to_rmse):
                best_model.set_weights(model.get_weights())
                best_weight = copy(weight)
            key_to_rmse[key] = negacc

    pop_selected, fitness_selected = select(pop, n_select, key_to_rmse)
    pop = reconstruct_population(pop_selected, pop_size)
    weights = pop_to_weights(pop, time_steps, code_length)


iteration: [1/20] indiv_no: [1/36]
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 19: early stopping
RMSE: 1.2781, MAE: 0.8367, ACC: 0.5618, VAL_LOSS: 1.1968
iteration: [1/20] indiv_no: [2/36]
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 11: early stopping
RMSE: 1.2718, MAE: 0.8287, ACC: 0.5657, VAL_LOSS: 1.0884
iteration: [1/20] indiv_no: [3/36]
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 12: early stopping
RMSE: 1.2718, MAE: 0.8287, ACC: 0.5657, VAL_LOSS: 0.9687
iteration: [1/20] indiv_no: [4/36]
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Ep

In [56]:
def test_eval(model,weight,X,y,ycat):
    print('test evaluation:')
    pred_y = model.predict(apply_weight(X, weight))
    inv_pred_y = np.argmax(pred_y, axis=1)
    inv_test_y = y
    rmse = sqrt(mean_squared_error(inv_test_y, inv_pred_y))
    mae = mean_absolute_error(inv_test_y, inv_pred_y)
    val_loss, acc = model.evaluate(apply_weight(X, weight), ycat)
    print("RMSE: %.4f, MAE: %.4f, ACC: %.4f, VAL_LOSS: %.4f" % (rmse, mae, acc, val_loss))


In [149]:
test_eval(best_model,best_weight,X_test1,y_test1,y_testcat)

test evaluation:
RMSE: 1.1741, MAE: 0.7092, ACC: 0.6255, VAL_LOSS: 1.1501


In [147]:
#saving the model
save_model(best_model, "models/best_ealstm/") #acc 0.6255 rmse 1.1741



INFO:tensorflow:Assets written to: models/best_ealstm/assets


INFO:tensorflow:Assets written to: models/best_ealstm/assets


In [148]:
#load the model
best_model = load_model("models/best_ealstm/")

In [154]:
best_weight # T-6, T-5, ... T-1, T

[0.05, 0.44, 0.89, 0.87, 0.84, 0.49, 0.29]

In [151]:
#saving best weight
with open('models/best_weight.json', 'w') as f:
    json.dump(best_weight, f)

In [153]:
#loading best weight
with open('models/best_weight.json', 'r') as f:
    best_weight = json.load(f)