# Notebook to test the parameters of the LSTM trend model

In [1]:
#imports
import numpy as np
from math import sqrt
from numpy import concatenate
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot
import pandas as pd

from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from keras.models import Sequential
from keras.layers import Dropout
import sklearn
from keras.layers import Activation
from keras.layers import Bidirectional
# from keras.layers import CuDNNLSTM
from keras.optimizers import Adam
from keras.layers import Dense
import keras.utils
from keras.layers import LSTM
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
import tensorflow as tf
import seaborn as sn
import matplotlib.pyplot as plt
import seed
import os
tf.get_logger().setLevel('ERROR')

In [2]:
# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a table of lagged and forecast columns.

    Args:
        data: Anything ``DataFrame(data)`` accepts: a flat list, a list of
            rows, a 1-D or 2-D array, or a DataFrame. Rows are time steps.
        n_in: Number of lagged steps (t-n_in ... t-1) placed first.
        n_out: Number of steps from t onwards (t, t+1, ...) placed after the lags.
        dropnan: When True, rows containing NaN (introduced by the shifts at
            both ends of the series) are removed.

    Returns:
        DataFrame whose columns are named ``var<k>(t-<i>)``, ``var<k>(t)`` and
        ``var<k>(t+<i>)``, with ``k`` counting the input columns from 1.
    """
    df = DataFrame(data)
    # Count the variables from the frame itself. Deriving it from the input type
    # (list -> 1, otherwise data.shape[1]) mislabels a list of rows and fails on
    # 1-D arrays, which have no second dimension.
    n_vars = df.shape[1]
    cols, names = [], []

    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]

    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]

    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names

    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()

    return agg

In [3]:
#function to calculate rsi
def rsi(ohlc, period: int = 14) -> pd.Series:
    """Relative Strength Index of the ``Close`` column.

    Implementation follows https://github.com/peerchemist/finta with the fix
    discussed at https://www.tradingview.com/wiki/Talk:Relative_Strength_Index_(RSI)

    Close-to-close changes are split into gains and (absolute) losses, each is
    smoothed with an exponentially weighted mean (``com = period - 1``,
    ``min_periods = period``), and the ratio of the two is mapped through
    ``100 - 100 / (1 + RS)``.

    Args:
        ohlc: Frame (or mapping of Series) exposing a ``"Close"`` column.
        period: Smoothing period; it also sets how many observations are
            needed before a value is produced.

    Returns:
        Series aligned with ``ohlc["Close"]``; entries without enough
        observations are NaN.
    """
    change = ohlc["Close"].diff()

    # Keep only the upward moves in one series and only the downward moves
    # (as magnitudes) in the other; NaN entries stay NaN in both.
    gains = change.clip(lower=0)
    losses = change.clip(upper=0).abs()

    smoothing = dict(com=(period - 1), min_periods=period)
    avg_gain = gains.ewm(**smoothing).mean()
    avg_loss = losses.ewm(**smoothing).mean()

    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))

In [4]:
def get_cols_to_rem_label(n_lag, n_features, label_col):
    """List every lagged/current column name except the label's current step.

    Names follow the ``var<k>(t)`` / ``var<k>(t-<j>)`` pattern for
    ``k`` in 1..n_features and ``j`` in 0..n_lag, ordered by variable first and
    lag second. Only ``var<label_col>(t)`` is left out.
    """
    def column_name(var, lag):
        suffix = "(t)" if lag == 0 else "(t-" + str(lag) + ")"
        return "var" + str(var) + suffix

    return [
        column_name(var, lag)
        for var in range(1, n_features + 1)
        for lag in range(n_lag + 1)
        if not (lag == 0 and var == label_col)
    ]
    
def get_cols_to_rem_feat(n_lag, n_features, label_col):
    """Return the current-step column names of the first two variables.

    The arguments are accepted but not consulted: the result is always a new
    list holding ``"var1(t)"`` and ``"var2(t)"``.
    """
    current_step_cols = ("var1(t)", "var2(t)")
    return list(current_step_cols)

In [5]:
def create_model_test(epochs, neurons, batch_size, layers, train_X, test_X, train_y, test_y, lag_features, features, df, train_size,
                      dropout=0.25, activ_func="linear", verbose=1):
    """Build, train and score a stacked-LSTM two-class classifier.

    Args:
        epochs: Number of training epochs.
        neurons: Units in every LSTM layer.
        batch_size: Mini-batch size used for fitting.
        layers: Number of stacked LSTM layers (each followed by Dropout).
        train_X: 3-D training input; its 2nd and 3rd dimensions give the
            network input shape.
        test_X: Test input; reshaped to (samples, lag_features, features), or
            to (samples, 1, features-1) when ``lag_features`` is 0.
        train_y: Training targets fed to categorical cross-entropy against a
            2-unit softmax (so presumably one-hot with two columns).
        test_y: True class index per test sample, compared with the argmax of
            the prediction.
        lag_features: Number of lagged time steps per sample.
        features: Number of features per time step.
        df: Unused; kept so existing callers keep working.
        train_size: Unused; kept so existing callers keep working.
        dropout: Dropout rate applied after each LSTM layer.
        activ_func: Activation passed to every LSTM layer.
        verbose: Verbosity passed to ``model.fit``.

    Returns:
        Fraction of test samples whose predicted class equals ``test_y``.

    Raises:
        ValueError: If the number of predictions differs from ``len(test_y)``.
    """
    # design network
    model = Sequential()

    # The first LSTM declares the input shape; every LSTM except the last must
    # return full sequences so the next LSTM receives 3-D input.
    model.add(LSTM(neurons, return_sequences=layers > 1, input_shape=(train_X.shape[1], train_X.shape[2]), activation=activ_func))
    model.add(Dropout(dropout))

    for i in range(1, layers):
        model.add(LSTM(neurons, return_sequences=i != (layers - 1), activation=activ_func))
        model.add(Dropout(dropout))

    model.add(Dense(2, activation="softmax"))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # fit network (shuffle=False keeps the rows in their given order)
    model.fit(train_X, train_y, epochs=epochs, batch_size=batch_size, verbose=verbose, shuffle=False, validation_split=0.2)

    # Accept flat or already-3-D test input.
    if lag_features > 0:
        test_X = test_X.reshape((test_X.shape[0], lag_features, features))
    else:
        test_X = test_X.reshape((test_X.shape[0], 1, features - 1))

    pred = np.argmax(model.predict(test_X), axis=1)
    actual = np.asarray(test_y).ravel()

    if len(actual) != len(pred):
        raise ValueError("test_y has %d labels but the model produced %d predictions" % (len(actual), len(pred)))

    n_correct = int((actual == pred).sum())
    return n_correct / len(actual)

In [6]:
def load_lag_sets(lag_features, train_ratio, lag_granularity, lag, dataset_grouped_by, cleaned,
                  analyser="vader", add_RSI=False, add_longMAvg=False, add_shortMAvg=False):
    """Load the merged sentiment/price CSV and build scaled, lagged train/test sets.

    Args:
        lag_features: Number of previous rows used as input for each example.
        train_ratio: Fraction of rows (from the start) used for training.
        lag_granularity: Part of the CSV file name ("days" or "hours").
        lag: Lag value in the CSV file name; values <= 0 select the
            "no lag" file instead.
        dataset_grouped_by: Part of the folder name ("day" or "hour").
        cleaned: When True, read from the ``cleaned`` sub-folder.
        analyser: Sentiment analyser folder to read, "vader" or "TextBlob"
            (the feature list is chosen case-insensitively).
        add_RSI: Append a 14-period RSI column as a feature.
        add_longMAvg: Append a 21-row moving average of Close as a feature.
        add_shortMAvg: Append a 9-row moving average of Close as a feature.

    Returns:
        Tuple ``(train_X, test_X, train_y, test_y, n_features, df, train_size)``:
        ``train_X``/``test_X`` are 3-D arrays (samples, steps, features),
        ``train_y`` is one-hot encoded with two classes, ``test_y`` holds the
        raw 0/1 ``Change`` values, ``df`` is the feature frame before
        splitting and ``train_size`` the number of rows in the training slice.
    """
    #read dataset
    folder = "./../datasets/tweets_prices_volumes_sentiment/"+analyser+"/"+dataset_grouped_by+"_datasets"

    if cleaned:
        folder = folder + '/cleaned'

    print(folder)
    if lag > 0:
        filename = folder+"/final_data_lag_"+lag_granularity+"_"+str(lag)+".csv"
    else:
        filename = folder+"/final_data_no_lag.csv"

    df = pd.read_csv(filename)

    #group by datetime
    df = df.groupby('DateTime').agg(lambda x: x.mean())

    #label: 1 when Close is strictly above the previous row's Close, else 0
    df["Change"] = (df["Close"] > df["Close"].shift(1)).astype(int)

    #optional technical indicators; every indicator is computed on the full
    #frame and the longest warm-up is dropped exactly once afterwards
    rsi_period = 14
    short_window = 9
    long_window = 21
    warmup = 0

    if add_RSI:
        df['RSI'] = rsi(df, rsi_period)
        warmup = max(warmup, rsi_period)

    if add_shortMAvg:
        df['short_mavg'] = df["Close"].rolling(window=short_window).mean()
        warmup = max(warmup, short_window)

    if add_longMAvg:
        df["long_mavg"] = df["Close"].rolling(window=long_window).mean()
        warmup = max(warmup, long_window)

    if warmup > 0:
        df = df.iloc[warmup:]

    #keep only wanted columns ("Change" must stay in the list: it is the label)
    if analyser.lower() == "textblob":
        features = ['Change', 'subjectivity', 'polarity', 'Tweet_vol', 'Volume_(BTC)']
    else:
        features = ['Change', 'Close', 'pos_pol', 'neg_pol', 'Tweet_vol']

    if add_RSI:
        features.append("RSI")

    if add_longMAvg:
        features.append("long_mavg")

    if add_shortMAvg:
        features.append("short_mavg")

    df = df[features]

    #number of previous records to consider for every example
    n_lag = lag_features
    #number of features
    n_features = len(features)

    #divide df into train and test (chronological split, no shuffling)
    train_size = int(len(df)*train_ratio)
    train = df.iloc[:train_size]
    test = df.iloc[train_size:]

    #labels: the first n_lag rows of each split have no complete lag window
    train_y = train["Change"].iloc[n_lag:].values
    test_y = test["Change"].iloc[n_lag:].values

    #scale with statistics from the training slice only
    xscaler = MinMaxScaler(feature_range=(0, 1))
    train = xscaler.fit_transform(train)
    test = xscaler.transform(test)

    train = series_to_supervised(train, n_lag, 1).values
    test = series_to_supervised(test, n_lag, 1).values

    if n_lag > 0:
        #keep only the lagged columns; the trailing n_features columns are the
        #values at the time being predicted
        total_features = n_lag*n_features
        train_X = train[:, :total_features]
        test_X = test[:, :total_features]

        # reshape input to be 3D [samples, timesteps, features]
        train_X = train_X.reshape((train_X.shape[0], n_lag, n_features))
        test_X = test_X.reshape((test_X.shape[0], n_lag, n_features))
    else:
        #no lags: each example is the current row without its own label, which
        #is what the (samples, 1, n_features-1) input shape requires
        label_idx = features.index("Change")
        train_X = np.delete(train, label_idx, axis=1)
        test_X = np.delete(test, label_idx, axis=1)

        train_X = train_X.reshape((train_X.shape[0], 1, n_features-1))
        test_X = test_X.reshape((test_X.shape[0], 1, n_features-1))

    train_y = keras.utils.to_categorical(train_y, 2)

    return train_X, test_X, train_y, test_y, n_features, df, train_size

In [7]:
def average(lag_granularity, lag, dataset_grouped_by, cleaned, epochs, layers, neurons, batch_size, lag_features, n_runs=10, train_ratio=0.85):
    """Train and score the same model configuration several times and average the accuracy.

    Args:
        lag_granularity: Passed to ``load_lag_sets`` ("days" or "hours").
        lag: Passed to ``load_lag_sets`` (lag value of the dataset file).
        dataset_grouped_by: Passed to ``load_lag_sets`` ("day" or "hour").
        cleaned: Passed to ``load_lag_sets`` (use the cleaned dataset).
        epochs: Training epochs per run.
        layers: Number of stacked LSTM layers.
        neurons: Units per LSTM layer.
        batch_size: Mini-batch size.
        lag_features: Number of previous rows fed to the model per example.
        n_runs: How many times the model is trained and scored.
        train_ratio: Fraction of the data used for training.

    Returns:
        Mean of the accuracies returned by the individual runs.

    Raises:
        ValueError: If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    train_X, test_X, train_y, test_y, features, df, train_size = load_lag_sets(lag_features, train_ratio, lag_granularity, lag, dataset_grouped_by, cleaned)

    print("Testing model: lag:", lag_features, ", epochs:", epochs, ", neurons:", neurons, ", layers:", layers, ", batch_size:", batch_size)
    accuracies = []

    for run in range(n_runs):
        # The 4th positional argument of create_model_test is the layer count;
        # lag_features is passed separately further along.
        acc = create_model_test(epochs, neurons, batch_size, layers, train_X, test_X, train_y, test_y, lag_features, features, df, train_size)
        accuracies.append(acc)
        print("Run", (run+1), ":", acc)

    return np.array(accuracies).mean()
        

In [8]:
# Dataset selection: these values pick the CSV file that gets loaded.
#dataset grouped type - day or hour
dataset_grouped_by = "day"
#use the cleaned variant of the dataset
cleaned = True
#lag granularity - days or hours
lag_granularity = "days"
#lag value (a value of 0 or less selects the no-lag file)
lag = 3

In [9]:
# Hyper-parameters of the single configuration evaluated below.
epochs, batch_size = 1000, 5    # training epochs / samples per mini-batch
layers, neurons = 3, 32         # stacked LSTM layers / units per LSTM layer
feature_lag = 14                # previous rows fed to the model per example

In [10]:
# Keyword arguments keep each value bound to the parameter it is meant for.
average(
    lag_granularity=lag_granularity,
    lag=lag,
    dataset_grouped_by=dataset_grouped_by,
    cleaned=cleaned,
    epochs=epochs,
    layers=layers,
    neurons=neurons,
    batch_size=batch_size,
    lag_features=feature_lag,
)

./../datasets/tweets_prices_volumes_sentiment/vader/day_datasets/cleaned
Testing model: lag: 14 , epochs: 1000 , neurons: 32 , layers: 3 , batch_size: 5
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
 6/60 [==>...........................] - ETA: 3s - loss: 0.7328 - accuracy: 0.4000

KeyboardInterrupt: 