# Voting Predictor 

In [1]:
#imports
import numpy as np
from math import sqrt
from numpy import concatenate

import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle

from matplotlib import pyplot
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

import pandas as pd
from pandas import read_csv
from pandas import DataFrame
from pandas import concat


import keras.utils
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Activation
from keras.layers import Bidirectional
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import Adam

import tensorflow as tf
import seaborn as sn
import seed
import os
tf.get_logger().setLevel('ERROR')

In [2]:
"""
method to create lagged features

data - data
to_keep - number of lagged_features
to_remove - number of days to remove

"""
def create_lagged_features(data, to_keep=1, to_remove=1):
    variables = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    columns, names = list(), list()
    
    for i in range(to_keep, 0, -1):
        columns.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(variables)]

    for i in range(0, to_remove):
        columns.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(variables)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(variables)]
            
    #put it all together
    final = concat(columns, axis=1)
    final.columns = names
    
    #drop rows with NaN values
    final.dropna(inplace=True)
        
    return final

In [3]:
"""
function to calculate rsi

data - data
period - RSI period

"""
def rsi(data, period: int = 14):
    
    delta = data["Close"].diff()

    up, down = delta.copy(), delta.copy()
    up[up < 0] = 0
    down[down > 0] = 0

    gain = up.ewm(com=(period - 1), min_periods=period).mean()
    loss = down.abs().ewm(com=(period - 1), min_periods=period).mean()

    RS = gain / loss
    return 100 - (100 / (1 + RS))

In [93]:
def load_data(lag, lagged_features):
    #lag granularity - days or hours
    lag_granularity = "days"
    # type of analyser - TextBlob or vader
    analyser = "vader"
    #dataset grouped type - day or hour
    dataset_grouped_by = "day"
    
    #read dataset
    folder = "./../../datasets/tweets_prices_volumes_sentiment/"+analyser+"/"+dataset_grouped_by+"_datasets/cleaned"
    filename = folder+"/final_data_lag_"+lag_granularity+"_"+str(lag)+".csv" if (lag > 0) else folder+"/final_data_no_lag.csv"
    print(filename)
    df = pd.read_csv(filename)
    
    #group by datetime
    df = df.groupby('DateTime').agg(lambda x: x.mean())
    
    #get change label
    df["Change"] = (df["Close"] > df["Close"].shift(1)).astype(int)

    add_RSI = False
    add_longMAvg = False
    add_shortMAvg = False

    if(add_RSI):
        #calcualte RSI
        RSI = 14
        df['RSI'] = rsi(df, RSI)
        df = df.iloc[RSI:]

    #calculate moving averages
    if(add_shortMAvg):
        short_window = 9
        df['short_mavg'] = df.rolling(window=short_window)["Close"].mean()

    if(add_longMAvg):
        long_window = 21
        df["long_mavg"] = df.rolling(window=long_window)["Close"].mean()

    if(add_longMAvg):
        df = df.iloc[long_window:]
    elif(add_RSI):
        df = df.iloc[RSI:]
    elif(add_shortMAvg):
        df = df.iloc[short_window:]
        
    #keep only wanted columns
    features = ['Change', 'Close', 'pos_pol', 'neg_pol', 'Tweet_vol']

    if(add_RSI):
        features.append("RSI")

    if(add_longMAvg):
        features.append("long_mavg")

    if(add_shortMAvg):
        features.append("short_mavg")

    df = df[features]
    
    #if lag is 3 remove first two records
    if(lag == 3):
        df = df.iloc[2:]
        
    
    #number of previous records to consider for every example
    n_lag = lagged_features
    #number of features
    n_features = len(features)
    #calculate total_features
    total_features = n_lag*n_features

    if(total_features == 0):
        total_features = n_features
        
    #add lagged data to records
    data_with_lagged = create_lagged_features(df, n_lag, 1)
    data_with_lagged = data_with_lagged.reset_index()
    
    return data_with_lagged, total_features


In [94]:
def get_features_labels(data, total_features):
    data_y = data["var1(t)"].values
    data_X = data.iloc[:, :total_features]
    data_X = data_X.drop(['DateTime'], axis=1)
    
    return data_X, data_y

In [95]:
#load data for 3 days lag
data_lag3, data_lag3_total_features = load_data(3, 3)
#load data for 1 day lag
data_lag1, data_lag1_total_features = load_data(1, 7)

#get dates of lag1
dates_lag1 = data_lag1["DateTime"].unique()
#get dates of lag3
dates_lag3 = data_lag3["DateTime"].unique()

#get common dates
common_dates = list(set(dates_lag1).intersection(dates_lag3))
#keep only common dates
data_lag3 = data_lag3.loc[data_lag3['DateTime'].isin(common_dates)]
data_lag1 = data_lag1.loc[data_lag1['DateTime'].isin(common_dates)]

#split into features and labels
data_lag3_X, data_lag3_y = get_features_labels(data_lag3, data_lag3_total_features)
data_lag1_X, data_lag1_ = get_features_labels(data_lag1, data_lag1_total_features)

./../../datasets/tweets_prices_volumes_sentiment/vader/day_datasets/cleaned/final_data_lag_days_3.csv
./../../datasets/tweets_prices_volumes_sentiment/vader/day_datasets/cleaned/final_data_lag_days_1.csv


In [96]:
data_lag3_X

Unnamed: 0,var1(t-3),var2(t-3),var3(t-3),var4(t-3),var5(t-3),var1(t-2),var2(t-2),var3(t-2),var4(t-2),var5(t-2),var1(t-1),var2(t-1),var3(t-1),var4(t-1)
4,0.0,6435.58,0.113801,0.045525,1008.0,0.0,6416.13,0.118816,0.039380,1011.0,0.0,6178.21,0.123119,0.049271
5,0.0,6416.13,0.118816,0.039380,1011.0,0.0,6178.21,0.123119,0.049271,856.0,1.0,6363.43,0.122972,0.033356
6,0.0,6178.21,0.123119,0.049271,856.0,1.0,6363.43,0.122972,0.033356,907.0,0.0,6294.42,0.123615,0.044161
7,1.0,6363.43,0.122972,0.033356,907.0,0.0,6294.42,0.123615,0.044161,958.0,1.0,6303.27,0.114486,0.045826
8,0.0,6294.42,0.123615,0.044161,958.0,1.0,6303.27,0.114486,0.045826,863.0,0.0,6299.99,0.108031,0.044800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
443,0.0,8123.89,0.178603,0.070517,47858.0,0.0,8085.36,0.182801,0.072054,48467.0,0.0,7618.17,0.172670,0.067985
444,0.0,8085.36,0.182801,0.072054,48467.0,0.0,7618.17,0.172670,0.067985,59488.0,0.0,7272.99,0.181425,0.067010
445,0.0,7618.17,0.172670,0.067985,59488.0,0.0,7272.99,0.181425,0.067010,62973.0,1.0,7308.09,0.195825,0.060037
446,0.0,7272.99,0.181425,0.067010,62973.0,1.0,7308.09,0.195825,0.060037,61602.0,0.0,7027.11,0.184113,0.071272


In [97]:
data_lag1_X

Unnamed: 0,var1(t-7),var2(t-7),var3(t-7),var4(t-7),var5(t-7),var1(t-6),var2(t-6),var3(t-6),var4(t-6),var5(t-6),...,var5(t-3),var1(t-2),var2(t-2),var3(t-2),var4(t-2),var5(t-2),var1(t-1),var2(t-1),var3(t-1),var4(t-1)
0,0.0,7027.99,0.130654,0.043827,557.0,1.0,7187.23,0.112971,0.039833,902.0,...,980.0,0.0,6927.90,0.118960,0.039350,987.0,0.0,6435.58,0.121899,0.049716
1,1.0,7187.23,0.112971,0.039833,902.0,1.0,7275.00,0.117174,0.036618,818.0,...,987.0,0.0,6435.58,0.121899,0.049716,1027.0,0.0,6416.13,0.121202,0.032114
2,1.0,7275.00,0.117174,0.036618,818.0,1.0,7301.01,0.115190,0.047145,860.0,...,1027.0,0.0,6416.13,0.121202,0.032114,1012.0,0.0,6178.21,0.123376,0.043544
3,1.0,7301.01,0.115190,0.047145,860.0,1.0,7357.07,0.114770,0.045408,980.0,...,1012.0,0.0,6178.21,0.123376,0.043544,800.0,1.0,6363.43,0.120444,0.045632
4,1.0,7357.07,0.114770,0.045408,980.0,0.0,6927.90,0.118960,0.039350,987.0,...,800.0,1.0,6363.43,0.120444,0.045632,787.0,0.0,6294.42,0.109227,0.045101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
439,0.0,8746.45,0.179880,0.064749,63453.0,0.0,8647.04,0.179688,0.067637,57852.0,...,46589.0,0.0,8200.74,0.182751,0.072099,48266.0,0.0,8123.89,0.172602,0.067930
440,0.0,8647.04,0.179688,0.067637,57852.0,0.0,8503.74,0.179675,0.069204,57564.0,...,48266.0,0.0,8123.89,0.172602,0.067930,59746.0,0.0,8085.36,0.181439,0.066980
441,0.0,8503.74,0.179675,0.069204,57564.0,0.0,8469.20,0.176000,0.070255,56993.0,...,59746.0,0.0,8085.36,0.181439,0.066980,62924.0,0.0,7618.17,0.195808,0.060034
442,0.0,8469.20,0.176000,0.070255,56993.0,1.0,8545.36,0.178359,0.070595,46589.0,...,62924.0,0.0,7618.17,0.195808,0.060034,61536.0,0.0,7272.99,0.184078,0.071387


In [98]:
data_lag3_y

array([1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1,

In [77]:
#get trend model
trend_model = load_model("../bilstm_trend/saved")
trend_model.load_weights("../bilstm_trend/saved")

#get multiclass model
multiclass_model = load_model("../cnn_multiclass/saved")
multiclass_model.load_weights("../cnn_multiclass/saved")

#get normalizer

In [None]:
#normalise features
xscaler = joblib.load("saved/scaler.pkl") 
train = xscaler.fit_transform(train)
test = xscaler.transform(test)

In [None]:
#prepare data
train_labels = train_y
test_labels = test_y

In [None]:
#remove the last set of values(data of time to be predicted)
train = train[:, :total_features]
test = test[:, :total_features]

In [None]:
#keep only prices array
train_X, train_y = train[:, :total_features], train_y[:, -1]
test_X, test_y = test[:, :total_features], test_y[:, -1]

In [None]:
# reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], n_lag, n_features))
test_X = test_X.reshape((test_X.shape[0], n_lag, n_features))

In [None]:
#set labels for training data to categorical
train_y = keras.utils.to_categorical(train_y, 2)

In [None]:
np.random.seed(1) 
tf.random.set_seed(1)

# design network
model = Sequential()
neurons = 64
epochs = 10000
dropout = 0.25
batch_size = 80
activ_func = "linear"

model.add(Bidirectional(LSTM(neurons, return_sequences=True, input_shape=(train_X.shape[1], train_X.shape[2]), activation=activ_func)))
model.add(Dropout(dropout))

model.add(Bidirectional(LSTM(neurons, return_sequences=False, activation=activ_func)))
model.add(Dropout(dropout))

model.add(Dense(2, activation="softmax"))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# early stopping
callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience =50)

# fit network
history = model.fit(train_X, train_y, epochs=10000, batch_size=batch_size, verbose=2, shuffle=True,validation_split=0.2, callbacks=[callback])

In [None]:
#save model
model_file = 'model.png'
tf.keras.utils.plot_model(model, to_file=model_file, show_shapes=True)

In [None]:
#plot loss graph
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.title("Loss graph")
plt.show()

In [None]:
#plot training and validation accuracy
loss_train = history.history['accuracy']
loss_val = history.history['val_accuracy']
epochs = range(1,len(loss_val) + 1)
plt.plot(epochs, loss_train, 'g', label='Training accuracy')
plt.plot(epochs, loss_val, 'b', label='validation accuracy')
plt.title('Training and Validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
#reshape
test_X = test_X.reshape((test_X.shape[0], n_lag, n_features))

#predict values for test data
pred = model.predict(test_X)

#reshape again
test_X = test_X.reshape((test_X.shape[0], n_lag* n_features,))

In [None]:
#change back from categorical
pred = np.argmax(pred, axis=1)

In [None]:
#calculate accuracy
prices = pd.DataFrame()
prices["Actual"] = test_y
prices["Predicted"] = pred

prices["Correct"] = (prices["Actual"] - prices["Predicted"]) == 0
incorrect = prices.loc[prices['Correct'] == False]
incorrect_len = len(incorrect)
prices_len = len(prices)

print("Accuracy = ",((prices_len-incorrect_len)/prices_len))