# LSTM Online Prediction
Retrain the models and run the online test once a month.

In [None]:
# Import all necessary packages
from math import sqrt, floor
from numpy import concatenate
from matplotlib import pyplot
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keras
from indicator import *
from keras.layers import AlphaDropout

In [None]:
# Create a dataframe for each stock and label each row as winning (1) or losing (0) for the following period
def preposessing(close,volume,high,low,lags):
    """Build one labelled feature frame per stock.

    Each stock gets a frame with its own Close/Volume/High/Low series plus
    the one-step returns of every stock and their cross-sectional median.
    The label ``Y`` is 1 when the stock's return over the next ``lags`` rows
    is above the cross-sectional median of that forward return, else 0.

    Args:
        close, volume, high, low: DataFrames with one column per stock,
            sharing the same index and column order.
        lags: Horizon, in rows, of the forward return used for labelling.

    Returns:
        List of DataFrames, one per column of ``close``, with incomplete
        rows removed and the forward return itself dropped.
    """
    frames = []
    forward_returns = []

    # One raw frame per ticker, with its forward return attached
    for col in range(close.shape[1]):
        frame = pd.DataFrame(
            data={
                'Close': close.iloc[:, col].values,
                'Volume': volume.iloc[:, col].values,
                'High': high.iloc[:, col].values,
                'Low': low.iloc[:, col].values,
            },
            index=close.index,
        )
        # Return earned from row t to row t+lags; only used to derive the label
        frame['FutureReturn'] = frame['Close'].shift(-lags) / frame['Close'] - 1
        forward_returns.append(frame['FutureReturn'])
        frames.append(frame)

    # Cross-sectional median of the forward returns on each date
    median_forward = pd.concat(forward_returns, axis=1).quantile(q=0.5, axis=1)

    # One-step returns of all stocks, plus their median, shared as features
    step_returns = close / close.shift(1) - 1
    step_returns.columns = [name + ' Return' for name in close.columns]
    step_returns['Median Return'] = step_returns.quantile(q=0.5, axis=1)

    labelled = []
    for frame in frames:
        merged = pd.concat([frame, step_returns], axis=1)
        beats_median = merged['FutureReturn'] > median_forward
        merged['Y'] = beats_median.values.astype('int64')
        # Rows without a forward return or a one-step return are unusable
        merged.dropna(inplace=True)
        merged.drop(columns='FutureReturn', inplace=True)
        labelled.append(merged)

    return labelled

In [None]:
# Generate technical indicators
def indicator(df):
    """Append the technical-indicator columns to ``df`` and drop incomplete rows.

    The indicator helpers are applied in a fixed order, each receiving the
    frame returned by the previous one. Rows containing NaN are then
    removed, and the ``High`` and ``Low`` columns are dropped.

    Args:
        df: Per-stock DataFrame; the helpers are assumed to need the
            Close/Volume/High/Low columns -- confirm against the
            ``indicator`` module.

    Returns:
        The frame produced by the last helper, cleaned as described above.
    """
    # (helper, extra positional arguments) in application order
    pipeline = (
        (moving_average, (20,)),
        (exponential_moving_average, (40,)),
        (stochastic_oscillator_d, (3,)),
        (macd, (12, 26)),
        (on_balance_volume, (30,)),
        (relative_strength_index, (6,)),
        (commodity_channel_index, (14,)),
        (average_directional_movement_index, (14, 6)),
        (trix, (12,)),
        (standard_deviation, (20,)),
    )
    for transform, params in pipeline:
        df = transform(df, *params)

    df.dropna(inplace=True)
    for column in ('High', 'Low'):
        df.drop(column, axis=1, inplace=True)
    return df

In [None]:
# convert series to supervised learning for LSTM
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a supervised-learning table of shifted copies.

    Every output row contains the observations from ``n_in`` steps back up
    to ``n_out - 1`` steps ahead, oldest first.

    Args:
        data: Observations in any form ``DataFrame(data)`` accepts: a flat
            list, a list of rows, a 1-D or 2-D NumPy array, or a DataFrame.
        n_in: Number of lagged steps (t-n_in ... t-1) used as inputs.
        n_out: Number of steps (t ... t+n_out-1) used as outputs.
        dropnan: If true, drop the rows left incomplete by shifting.

    Returns:
        DataFrame whose columns are named ``var<j>(t-<i>)``, ``var<j>(t)``
        and ``var<j>(t+<i>)``, with ``j`` numbering the variables from 1.
    """
    df = DataFrame(data)
    # Count the variables on the frame itself. Guessing from the input type
    # miscounts a list of rows (treated as one variable) and fails on 1-D
    # arrays, which have no second dimension.
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [None]:
# Train LSTM model and predict the state of last period 
def trainLSTM(stocks, Ys, date, train_ratio, window=40, max_epoch=50, lstm_node = 50, LSTMs = None, epoch_validation = True, yLookback = 20):
    """Train one LSTM classifier per stock and score the last row of the block.

    For every stock the features are min-max scaled (scaler fitted on the
    training rows only), framed into ``window``-step samples and used to
    train an ``LSTM -> Dense(1, sigmoid)`` network with binary
    cross-entropy. The last sample of the block is then scored as the test
    prediction.

    Relies on names defined outside this function: ``series_to_supervised``,
    the module-level ``symbols`` list (prediction columns are labelled by
    stock position) and the scikit-learn / Keras imports of this file.
    NumPy's global random seed is reset to 0 on every call.

    Args:
        stocks: Sequence of per-stock feature DataFrames.
        Ys: Sequence of per-stock label Series, row-aligned with ``stocks``.
        date: Index-like holding the dates of the block; only its last
            element is used, as the index of the test prediction.
        train_ratio: Fraction of the rows used to fit the scaler and to
            train; the remaining rows feed validation.
        window: Number of time steps in each LSTM sample.
        max_epoch: Maximum number of training epochs.
        lstm_node: Number of units in the LSTM layer of newly built models.
        LSTMs: Optional list of existing models, one per stock, to continue
            training. Missing entries are built and appended, so a list
            passed in is updated in place.
        epoch_validation: If true, train one epoch at a time, restore the
            weights of the epoch with the lowest validation loss, then fit
            on the validation samples for that many epochs. If false, fit
            on the training and validation samples together for
            ``max_epoch`` epochs.
        yLookback: Number of trailing samples kept out of the validation
            set (presumably because their labels depend on returns that are
            not yet known at prediction time -- confirm). ``0`` keeps all.

    Returns:
        Tuple ``(LSTMs, optimal_epoch, performance_record, trainPredict,
        validationPredict, testPredict)``. ``optimal_epoch`` and
        ``performance_record`` get one entry per stock only when
        ``epoch_validation`` is true. ``trainPredict`` and
        ``validationPredict`` are always ``None``. ``testPredict`` is a
        one-row DataFrame indexed by the last date with one column per
        stock, or ``None`` when ``stocks`` is empty.
    """
    np.random.seed(0)

    if LSTMs is None:
        LSTMs = []

    performance_record = []
    optimal_epoch = []
    trainPredict = None
    validationPredict = None
    testPredict = None

    # A stop of -0 would select nothing, so "skip no trailing rows" has to
    # be expressed as an open-ended slice instead.
    validation_stop = -yLookback if yLookback else None
    # Number of leading rows consumed to build the first full window
    offset = window - 1

    for stock_idx, stock in enumerate(stocks):
        print('Training LSTM: '+str(stock_idx)+'/'+str(len(stocks)))

        values = stock.values.astype('float32')
        valuesY = Ys[stock_idx].values.astype('float32')
        n_features = values.shape[1]

        # normalize features; the scaler only ever sees the training rows
        n_train_days = floor(values.shape[0]*train_ratio)
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaled_train = scaler.fit_transform(values[:n_train_days, :])
        scaled_validation = scaler.transform(values[n_train_days:, :])
        scaled = np.append(scaled_train, scaled_validation, axis=0)

        # get lagged features: each row holds `window` consecutive time steps
        supervised = series_to_supervised(scaled, window-1, 1).values

        # split into train, validation and test sets, input and outputs
        training_len = n_train_days - offset
        test_date = date[-1]

        train_y = valuesY[offset:(offset+training_len)]
        validation_y = valuesY[(offset+training_len):validation_stop]

        train_X = supervised[:training_len, :]
        validation_X = supervised[training_len:validation_stop, :]
        test_X = supervised[-1, :]

        # reshape input to be 3D [samples, timesteps, features]
        train_X = train_X.reshape((train_X.shape[0], window, n_features))
        validation_X = validation_X.reshape((validation_X.shape[0], window, n_features))
        test_X = test_X.reshape((1, window, n_features))

        if len(LSTMs) < stock_idx+1:
            # design network
            model = Sequential()
            model.add(LSTM(lstm_node, input_shape=(train_X.shape[1], train_X.shape[2])))
            model.add(Dense(1, activation='sigmoid'))
            model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
            LSTMs.append(model)
        else:
            model = LSTMs[stock_idx]

        # fit network
        if epoch_validation:
            model_weights = []
            history_record = []
            for _ in range(max_epoch):
                history = model.fit(train_X, train_y, epochs=1, validation_data=(validation_X, validation_y), verbose=0, shuffle=False)
                history_record.append(history.history)
                model_weights.append(model.get_weights())

            # roll back to the epoch with the lowest validation loss
            val_loss = [dic['val_loss'][0] for dic in history_record]
            best_idx = np.argmin(val_loss)
            optimal_epoch.append(best_idx + 1)
            model.set_weights(model_weights[best_idx])
            performance_record.append(history_record[best_idx])

            # then let the model see the validation samples for the same number of epochs
            model.fit(validation_X, validation_y, verbose=0, shuffle=False, epochs=optimal_epoch[-1])

        else:
            train_X = np.append(train_X, validation_X, axis=0)
            train_y = np.append(train_y, validation_y, axis=0)
            model.fit(train_X, train_y, verbose=0, shuffle=False, epochs=max_epoch)

        LSTMs[stock_idx] = model

        # make a prediction for the last sample of the block
        ticker = symbols[stock_idx]
        prediction = np.squeeze(model.predict(test_X))
        if testPredict is None:
            testPredict = DataFrame({ticker: prediction}, index=pd.Index([test_date], name='Date'))
        else:
            testPredict[ticker] = prediction

    return LSTMs, optimal_epoch, performance_record, trainPredict, validationPredict, testPredict


In [None]:
# Define a function to calculate the portfolio's performance metrics
def Performancematrix(portfolio,rf):
    """Summarise a portfolio's return series.

    Adds the columns ``Cum_Wealth`` (compounded growth of 1 unit) and
    ``Cum_Return`` (``Cum_Wealth - 1``) to ``portfolio`` in place.

    Args:
        portfolio: DataFrame with a ``Return`` column of per-period returns.
        rf: Per-period risk-free rate subtracted from the mean return.

    Returns:
        Tuple ``(Sharpe, max_drawdown, Cumulative_Return)``, each rounded to
        two decimals. The Sharpe ratio is scaled by ``sqrt(12)``. The
        drawdown is the most negative gap between cumulative wealth and its
        running peak, measured in wealth units rather than as a percentage
        of the peak.
    """
    wealth = pd.Series(1 + portfolio.Return).cumprod()
    portfolio['Cum_Wealth'] = wealth
    portfolio['Cum_Return'] = portfolio['Cum_Wealth'] - 1

    period_returns = portfolio.Return
    excess_mean = np.mean(period_returns) - rf
    volatility = np.std(period_returns)
    Sharpe = round(excess_mean / volatility * sqrt(12), 2)

    wealth_curve = portfolio['Cum_Wealth']
    gap_to_peak = wealth_curve - wealth_curve.cummax()
    max_drawdown = round(gap_to_peak, 2).min()

    Cumulative_Return = round(portfolio['Cum_Return'].values[-1], 2)
    return Sharpe, max_drawdown, Cumulative_Return


In [None]:
# Load the close, volume, high and low tables; the first CSV column becomes the index
close, volume, high, low = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('price.csv', 'volume.csv', 'high.csv', 'low.csv')
)

In [None]:
# Set Basic Parameters
holding = 20  # Forward-return horizon in rows; also the step between consecutive training blocks
rf = 0.02/12  # Per-period risk-free rate (presumably 2% a year spread over 12 months -- confirm)
n_stocks = 10  # Not referenced in the visible code; presumably the portfolio size -- confirm
n_block_len = 200  # Number of rows in each training block
training_ratio = 0.8  # Fraction of each block used for training; the rest is used for validation
lookback_window = 20  # Time steps per LSTM input sample (passed as `window`)
first_train_max_epoch = 50  # Epoch cap for the initial training run
online_train_max_epoch = 50  # Epoch cap for each later online training run
n_LSTM_node = 50  # Units in each newly built LSTM layer (passed as `lstm_node`)
block_start_idx = 300  # Row offset at which the first training block starts
testingperiod = 95 # Number of online retrain-and-predict iterations

In [None]:
# Build the per-stock feature frames and label series
symbols = list(close.columns)
dataset = preposessing(close, volume, high, low, holding)
date = dataset[0].index  # calendar dates, captured before the index is reset below

# Columns kept out of the model inputs: the label, raw price/volume and three indicators
excluded_columns = ['Y','Close','Volume','MA_20','EMA_40', 'OBV_30']
stocks, Ys = [], []
for i, labelled in enumerate(dataset):
    # A positional index lets the surviving rows be mapped back onto `date`
    stock = indicator(labelled.reset_index(drop=True))
    Ys.append(stock['Y'])
    stocks.append(stock.drop(columns=excluded_columns))

# indicator() removes rows containing NaN; start the dates at the first surviving row
date = date[stocks[0].index[0]:]

In [None]:
# Return of each stock over the following holding period (row t to row t+lagged)
lagged = holding
stockreturns = pd.DataFrame()
for ticker in symbols:
    prices = close[ticker]
    stockreturns[ticker] = prices.shift(-lagged) / prices - 1
# The last `lagged` rows have no future price yet
stockreturns.dropna(inplace=True)

In [None]:
# Slice out the first block of data
predictedResult = []

block_rows = slice(block_start_idx, block_start_idx + n_block_len)
Xblock = [features.iloc[block_rows, :] for features in stocks]
Yblock = [labels.iloc[block_rows] for labels in Ys]
dateBlock = date[block_rows]

# Build a fresh model per stock (LSTMs=None) and train it on the first block
(LSTMs, optimal_epoch, performance_record,
 trainPredict, validationPredict, testPredict) = trainLSTM(
    Xblock,
    Yblock,
    dateBlock,
    training_ratio,
    window=lookback_window,
    max_epoch=first_train_max_epoch,
    lstm_node=n_LSTM_node,
    LSTMs=None,
)

# Keep the prediction made for the last date of the block
predictedResult.append(testPredict)

In [None]:
# Start online training and get the prediction
# The first online block begins `holding` rows after the initial block.
# NOTE: the online loop rebinds `block_start_idx`, so re-running this line
# after that loop has executed produces a different `start_idx`.
start_idx = block_start_idx +  holding

In [None]:
for i in range(testingperiod):
    # Slide the block forward by one holding period per iteration
    block_start_idx = start_idx + holding * i
    block_rows = slice(block_start_idx, block_start_idx + n_block_len)
    # NOTE(review): slicing past the end of the data silently yields a
    # shorter block -- confirm the data covers every requested block.
    Xblock = [features.iloc[block_rows, :] for features in stocks]
    Yblock = [labels.iloc[block_rows] for labels in Ys]
    dateBlock = date[block_rows]
    print('Training: ' + str(dateBlock[-1]))
    # Continue training the existing models on the new block
    (LSTMs, optimal_epoch, performance_record,
     trainPredict, validationPredict, testPredict) = trainLSTM(
        Xblock,
        Yblock,
        dateBlock,
        training_ratio,
        window=lookback_window,
        max_epoch=online_train_max_epoch,
        lstm_node=n_LSTM_node,
        LSTMs=LSTMs,
    )
    # Keep the prediction made for the last date of this block
    predictedResult.append(testPredict)

# Stack the per-block predictions row-wise into one dataframe for portfolio construction
predictedResult = pd.concat(predictedResult, axis=0)


In [None]:
# Write the stacked predictions to `predicted.csv` in the working directory
predictedResult.to_csv('predicted.csv')