# Stock Prediction
### Ignacio Fernández Adrados

---

In [3]:
import pandas as pd # type: ignore

def getStockDataFrame(stock, years):
    """Load the yearly CSV files of one stock into a single DataFrame.

    One comma-separated file per year is read from
    ``Datasets/Stock-<stock>-<year>.csv``. The rows of all files are
    returned together, sorted by their ``timestamp`` column (ascending).

    Args:
        stock: Ticker symbol used in the file name (e.g. ``'AAPL'``).
        years: Iterable of years. Each item is formatted into the file
            name, so both ``'2022'`` and ``2022`` are accepted.

    Returns:
        pandas.DataFrame holding the rows of every file. The per-file row
        labels are kept, so the index may contain duplicates.

    Raises:
        ValueError: If ``years`` is empty.
    """
    frames = [
        pd.read_csv(f'Datasets/Stock-{stock}-{year}.csv', sep=',')
        for year in years
    ]
    if not frames:
        raise ValueError('years must contain at least one year')
    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat(frames).sort_values(by='timestamp', ascending=True)

In [4]:
from datetime import datetime

def groupStocks(dataset, n, datefilter):
    """Aggregate raw price rows into bars of frequency ``n``.

    Args:
        dataset: DataFrame with a string ``timestamp`` column and
            ``open``, ``close``, ``low``, ``high`` and ``volume`` columns.
        n: Grouping frequency passed to ``pd.Grouper`` (e.g. ``'10min'``).
        datefilter: Only rows whose ``timestamp`` string starts with this
            prefix are used; ``''`` keeps every row.

    Returns:
        DataFrame with a ``timestamp`` column followed by ``open`` (first),
        ``close`` (last), ``low`` (min), ``high`` (max) and ``volume``
        (sum) per bucket. Each row pairs the values of one bucket with the
        label of the *next* bucket, so the last bucket is dropped.
    """
    selected = dataset['timestamp'].str.startswith(datefilter)
    bars = pd.DataFrame(dataset[selected])
    bars['timestamp'] = pd.to_datetime(bars['timestamp'])

    def bucket(**grouper_options):
        # Fresh groupby over the timestamp column at the requested frequency.
        return bars.groupby(pd.Grouper(key='timestamp', freq=n, **grouper_options))

    grouped = pd.DataFrame(bucket().first()['open'])
    remaining = (
        bucket().last()['close'],
        bucket().min()['low'],
        bucket().max()['high'],
        # The volume buckets are built with a 60 minute offset.
        bucket(offset='60min').sum()['volume'],
    )
    for column in remaining:
        grouped = grouped.join(column)

    # Label every bucket with the start of the following one.
    next_labels = pd.DataFrame(pd.to_datetime(grouped.index[1:]))
    bucket_values = grouped[:-1].reset_index(drop=True)
    return next_labels.join(bucket_values)

In [5]:

def get_rsi(close, lookback):
    """Compute the Relative Strength Index (RSI) of a price series.

    Gains and losses are the positive and negative parts of
    ``close.diff()``. Both are smoothed with an exponentially weighted
    mean (``com = lookback - 1``, ``adjust=False``) and combined as
    ``100 - 100 / (1 + avg_gain / avg_loss)``.

    Args:
        close: pandas Series of prices. Any index is accepted; the
            computation is positional.
        lookback: Smoothing period of the RSI.

    Returns:
        Single-column DataFrame (``'rsi'``) indexed like ``close``, without
        the NaN rows and without the first three remaining values.
    """
    # Work on a positional copy so the result does not depend on the
    # labels of ``close`` (label lookups fail unless the index is 0..n-1).
    change = close.diff().reset_index(drop=True)
    falling = change < 0

    # The leading NaN of diff() is not "< 0": it stays in the gains and
    # becomes 0 in the losses, so the first RSI value is NaN and is dropped.
    gains = change.mask(falling, 0)
    losses = change.where(falling, 0).abs()

    avg_gain = gains.ewm(com=lookback - 1, adjust=False).mean()
    avg_loss = losses.ewm(com=lookback - 1, adjust=False).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))

    rsi_df = pd.DataFrame({'rsi': rsi.to_numpy()}, index=close.index)
    rsi_df = rsi_df.dropna()
    # Skip the first three valid values as warm-up.
    return rsi_df[3:]

def getIndicators(dataset):
    """Add the technical-indicator columns used as model features.

    The columns ``EMA7``, ``MACD``, ``SignalMACD`` and ``RSI`` are written
    into ``dataset`` itself, all derived from its ``close`` column.

    Args:
        dataset: DataFrame with a ``close`` column.

    Returns:
        A new DataFrame with the rows containing NaN removed and the index
        renumbered from 0.
    """
    closes = dataset['close']
    ema_fast = closes.ewm(span=12, adjust=False).mean()
    ema_slow = closes.ewm(span=26, adjust=False).mean()

    dataset['EMA7'] = closes.ewm(span=7, adjust=False).mean()
    dataset['MACD'] = ema_fast - ema_slow
    dataset['SignalMACD'] = dataset['MACD'].ewm(span=9, adjust=False).mean()
    dataset['RSI'] = get_rsi(dataset['close'], 14)

    # get_rsi returns fewer rows than it receives, which leaves NaN in the
    # RSI column; those rows are discarded here.
    return dataset.dropna().reset_index(drop=True)

In [6]:
from sklearn.preprocessing import MinMaxScaler

def normalize(dataset):
    """Rescale the price, volume and indicator columns to the (0, 1) range.

    The scaler is fitted on every row of ``dataset``. The scaled values
    overwrite the columns of ``dataset`` itself, and that same object is
    returned.
    """
    feature_columns = ['open','close','low','high','volume','EMA7','MACD','SignalMACD','RSI']
    min_max = MinMaxScaler(feature_range=(0, 1))
    scaled_values = min_max.fit_transform(dataset[feature_columns])
    dataset[feature_columns] = scaled_values
    return dataset

In [7]:
import matplotlib.pyplot as plt

def plotDataset(dataset):
    """Show a line chart of the ``close`` column of ``dataset``."""
    close_values = dataset['close']

    plt.figure(figsize=(14,4))
    plt.plot(close_values, color='black')
    plt.title("Dataset 'Close' data ")
    plt.show()

### Split the values in train and test

The data is split chronologically: the first `split` fraction of the rows (70% in the run below, where `split = 0.7`) is used for training, and the rest of the data is set aside for testing.

Looking at the time-series plot, we think **it is not easy for a standard model to come up with correct trend predictions.**

In [8]:
def splitData(dataset, split, step):
    """Cut ``dataset`` into a leading train part and a trailing test part.

    Args:
        dataset: DataFrame to split, in row order.
        split: Fraction of the rows (0..1) that goes to the train part.
        step: Number of trailing train rows that are repeated at the start
            of the test part.

    Returns:
        ``(train, test)``. ``train`` keeps the original row labels; ``test``
        is renumbered from 0.
    """
    cut = int(len(dataset) * split)
    train = dataset[:cut]
    overlap = train.tail(step)
    remainder = dataset[cut:]
    test = pd.concat([overlap, remainder]).reset_index(drop=True)
    return train, test

In [9]:
def plotSplitDataset(dataset, split):
    """Plot the ``close`` column with a red vertical line at the split row.

    Args:
        dataset: DataFrame with a ``close`` column.
        split: Fraction of the rows (0..1) located before the line.
    """
    boundary_row = int(len(dataset) * split)
    positions = dataset.index.values

    plt.figure(figsize=(14,4))
    plt.title("Dataset 'Close' data split")
    plt.plot(positions, dataset['close'], c='black')
    plt.axvline(dataset.index[boundary_row], c="r")
    plt.show()


### Converting to a multi-dimensional array
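Next, we convert the train and test data into arrays of sliding windows: each sample holds `step` consecutive rows, and its target is the value in column 1 (`close` in this dataset) of the row that follows the window.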

In [10]:
import numpy as np

def convertToMatrix(data, step, target_col=1):
    """Turn a 2-D array into sliding windows and next-row targets.

    Window ``i`` holds rows ``i .. i + step - 1``. Its target is the value
    in column ``target_col`` of row ``i + step``.

    Args:
        data: 2-D array-like of shape ``(rows, features)``.
        step: Number of consecutive rows per window.
        target_col: Column used as target. Defaults to 1, the column the
            original implementation always used.

    Returns:
        ``(X, Y)`` with ``X`` of shape ``(rows - step, step, features)``
        and ``Y`` of shape ``(rows - step,)``. When ``data`` has no more
        than ``step`` rows both arrays are empty but keep those ranks.
    """
    data = np.asarray(data)
    window_count = len(data) - step
    if window_count <= 0:
        # Keep the rank so callers can still read X.shape[2].
        empty_x = np.empty((0, step) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0,), dtype=data.dtype)
        return empty_x, empty_y

    X = np.stack([data[start:start + step] for start in range(window_count)])
    Y = data[step:, target_col].copy()
    return X, Y

def dataset2Matrix(train, test, step):
    """Window the train and test DataFrames with ``convertToMatrix``.

    Args:
        train: Training DataFrame.
        test: Test DataFrame.
        step: Window length passed to ``convertToMatrix``.

    Returns:
        ``(trainX, trainY, testX, testY)``.
    """
    windowed = [convertToMatrix(frame.to_numpy(), step) for frame in (train, test)]
    (trainX, trainY), (testX, testY) = windowed
    return trainX, trainY, testX, testY


### Keras model with `SimpleRNN` layer

- 256 neurons in the RNN layer
- 32 neurons in the densely connected layer
- a single neuron for the output layer
- ReLU activation
- learning rate: 0.001

In [11]:
from keras import Sequential
from keras.layers import Dense, SimpleRNN, Input, LSTM, Dropout # type: ignore

def createModel(type, units, trainX, step):
    """Build and compile a two-layer recurrent regression model.

    The network is: input of shape ``(step, features)``, a recurrent layer
    with ``units`` units, dropout 0.2, a recurrent layer with
    ``units // 2`` units, dropout 0.2, a 32-unit ``tanh`` dense layer and
    a single linear output. It is compiled with mean squared error loss
    and the Adam optimizer.

    Args:
        type: ``"LSTM"`` (tanh LSTM layers) or ``"RNN"`` (ReLU SimpleRNN
            layers). The name shadows the builtin but is kept because it
            is part of the public signature.
        units: Number of units of the first recurrent layer.
        trainX: Training windows; only ``trainX.shape[2]`` (the feature
            count) is read.
        step: Number of time steps per window.

    Returns:
        The compiled ``Sequential`` model.

    Raises:
        ValueError: If ``type`` is neither ``"LSTM"`` nor ``"RNN"``.
    """
    DENSEUNITS = 32  # Neurons of the dense layer that follows the recurrent layers

    if type == "LSTM":
        recurrent_layer, activation = LSTM, "tanh"
    elif type == "RNN":
        recurrent_layer, activation = SimpleRNN, "relu"
    else:
        raise ValueError("type must be 'LSTM' or 'RNN', got {!r}".format(type))

    model = Sequential()
    model.add(Input((step, trainX.shape[2])))
    # The first recurrent layer must emit the whole sequence; otherwise the
    # second recurrent layer receives 2-D input and cannot be built.
    model.add(recurrent_layer(units=units, activation=activation, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(recurrent_layer(units=units // 2, activation=activation))
    model.add(Dropout(0.2))
    model.add(Dense(DENSEUNITS, activation="tanh"))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])
    return model

### Fit the model

In [23]:
from keras.callbacks import Callback # type: ignore

'''class MyCallback(Callback):
    def on_epoch_end(self, epoch, logs=None):
        #if (epoch+1) % 10 == 0 and epoch>0:
            print("Epoch number {} done".format(epoch+1))
'''
def trainModel(model, batch, epochs, trainX, trainY):
    """Fit ``model`` on the training windows and return the same model.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        batch: Batch size handed to ``fit``.
        epochs: Number of epochs handed to ``fit``.
        trainX: Training inputs.
        trainY: Training targets.
    """
    fit_options = {'epochs': epochs, 'batch_size': batch}
    model.fit(trainX, trainY, **fit_options)
    return model

### Plot loss

In [13]:
def plotLosss(model):
    """Plot the square root of the per-epoch training loss of ``model``.

    The values are read from ``model.history.history['loss']``.
    """
    rmse_per_epoch = np.sqrt(model.history.history['loss'])
    label_size = 14

    plt.figure(figsize=(8,3))
    plt.title("RMSE loss over epochs",fontsize=16)
    plt.plot(rmse_per_epoch, c='k', lw=2)
    plt.grid(True)
    plt.xlabel("Epochs", fontsize=label_size)
    plt.ylabel("Root-mean-squared error", fontsize=label_size)
    plt.xticks(fontsize=label_size)
    plt.yticks(fontsize=label_size)
    plt.show()

### Predictions
Note that the model was fitted only with the `trainX` and `trainY` data.

In [14]:
def predict(model, trainX, testX):
    """Run ``model.predict`` on the train and the test windows.

    The shapes of both prediction arrays are printed.

    Returns:
        ``(trainPredict, testPredict, predicted)`` where ``predicted`` is
        the train predictions followed by the test predictions along
        axis 0.
    """
    trainPredict, testPredict = (model.predict(windows) for windows in (trainX, testX))
    predicted = np.concatenate((trainPredict, testPredict), axis=0)
    for output in (trainPredict, testPredict):
        print(output.shape)
    return trainPredict, testPredict, predicted


### Comparing it with the ground truth (test set)

In [15]:

def plotCompare(normData, testPredict, split):
    """Plot the ``close`` values after the split row against the predictions.

    Args:
        normData: DataFrame with a ``close`` column.
        testPredict: Predicted values, plotted in blue.
        split: Fraction of the rows (0..1) that precede the plotted part.
    """
    OFFSET = 0  # Number of leading test points to skip in both curves
    first_row = int(len(normData) * split) + OFFSET
    ground_truth = normData['close'].iloc[first_row:].reset_index(drop=True)

    plt.figure(figsize=(14,4))
    plt.title("Ground truth and prediction together",fontsize=18)
    plt.plot(ground_truth, color='black', label='Ground Truth')
    plt.plot(testPredict[OFFSET:], color='blue', label='Prediction')
    plt.legend()
    plt.show()

In [16]:
def calculateDecision(test, testPredict, step, profit):
    """Turn true and predicted prices into -1 / 0 / +1 move signals.

    A move is the difference between two consecutive values. Moves whose
    absolute size is below ``profit`` become 0; the others become their
    sign.

    Args:
        test: DataFrame with a ``close`` column; rows from position
            ``step`` onwards are used.
        testPredict: 2-D array of predictions; column 0 is used.
        step: Number of leading ``test`` rows to skip.
        profit: Minimum absolute move that counts as a signal.

    Returns:
        ``(decision, predictedDecision)``: DataFrames with the columns
        ``buysell`` and ``buysellPredicted`` respectively.
    """
    def to_signal(moves):
        # Sign of each move, zeroed where the move is smaller than the threshold.
        direction = np.sign(moves).astype('int')
        return np.where(abs(moves) < profit, 0, direction)

    true_moves = test['close'].iloc[step:].diff().dropna()
    decision = pd.DataFrame(data={'buysell': to_signal(true_moves)})

    predicted_prices = pd.DataFrame(data={'buysellPredicted': testPredict[:,0]})
    predicted_moves = predicted_prices.diff().dropna()
    # The frame input yields a 2-D array; keep its only column.
    predicted_signal = to_signal(predicted_moves)[:,0]
    predictedDecision = pd.DataFrame(data={'buysellPredicted': predicted_signal})

    return decision, predictedDecision

In [17]:
def plotResult(normData, testPredict, split, profit, decision, predictedDecision):
    """Plot test prices, predictions and the buy/sell markers of both signals.

    Args:
        normData: DataFrame with a ``close`` column.
        testPredict: Predicted values, plotted in blue.
        split: Fraction of the rows (0..1) that precede the plotted part.
        profit: Not used by the plot.
        decision: DataFrame with the true ``buysell`` signal.
        predictedDecision: DataFrame with the ``buysellPredicted`` signal.
    """
    OFFSET = 0  # Number of leading test points to skip
    first_row = int(len(normData) * split) + OFFSET

    plt.figure(figsize=(14,4))
    plt.title("Ground truth and prediction together",fontsize=18)
    plt.plot(normData['close'].iloc[first_row:].reset_index(drop=True), color='black')
    plt.plot(testPredict[OFFSET:], color='blue')

    x = decision.iloc[OFFSET:].reset_index(drop=True).index
    window_start = len(normData) - len(predictedDecision.index) - 1 + OFFSET
    y = normData['close'].iloc[window_start:-1].reset_index(drop=True)

    truth = decision['buysell'].iloc[OFFSET:]
    guess = predictedDecision['buysellPredicted'].iloc[OFFSET:]

    # (signal, value to match, marker height, colour, marker shape, legend text)
    marker_specs = (
        (truth, -1, y, "red", "v", 'Ground Truth Sell'),
        (truth, 1, y, "blue", "^", 'Ground Truth Buy'),
        # Predicted markers are nudged vertically so they do not hide the true ones.
        (guess, -1, y-0.015, "orange", "v", 'Prediction Sell'),
        (guess, 1, y+0.015, "green", "^", 'Prediction Buy'),
    )
    for signal, side, height, colour, shape, caption in marker_specs:
        plt.scatter(x, np.where(signal == side, height, None), color=colour, marker=shape, label=caption)

    plt.legend()
    plt.show()


In [18]:
def plotProfit(normData, split, decision, predictedDecision):
    """Plot the test prices and mark wrong and correct predicted signals.

    A red point marks a non-zero predicted signal that differs from the
    true signal. A green point marks a non-zero true signal that the
    prediction matched.

    Args:
        normData: DataFrame with a ``close`` column.
        split: Fraction of the rows (0..1) that precede the plotted part.
        decision: DataFrame with the true ``buysell`` signal.
        predictedDecision: DataFrame with the ``buysellPredicted`` signal.
    """
    OFFSET = 0  # Number of leading test points to skip
    split = int(len(normData) * split)

    plt.figure(figsize=(14,4))
    plt.title("Ground truth and prediction together",fontsize=18)
    plt.plot(normData['close'].iloc[split+OFFSET:].reset_index(drop=True),color='black')

    x = decision.iloc[OFFSET:].reset_index(drop=True).index
    y = normData['close'].iloc[len(normData)-len(predictedDecision.index)-1+OFFSET:-1].reset_index(drop=True)

    truth = decision['buysell'].iloc[OFFSET:]
    guess = predictedDecision['buysellPredicted'].iloc[OFFSET:]
    wrong = (truth != guess) & (guess != 0)
    correct = (truth == guess) & (truth != 0)

    # The legend names what the points mean; the previous labels were
    # copied from the buy/sell plot and did not match the plotted masks.
    plt.scatter(x, np.where(wrong, y, None), color="red", label='Wrong prediction')
    plt.scatter(x, np.where(correct, y, None), color="green", label='Correct prediction')

    plt.legend()
    plt.show()

In [19]:
def calculateProfit(normData, decision, predictedDecision):
    """Count the predicted signals that match and miss the true signals.

    Args:
        normData: Kept for backward compatibility; not needed for the
            counts.
        decision: DataFrame with the true ``buysell`` signal.
        predictedDecision: DataFrame with the ``buysellPredicted`` signal,
            indexed like ``decision``.

    Returns:
        ``(coincide, nocoincide)``: ``coincide`` is the number of non-zero
        true signals that the prediction matched; ``nocoincide`` is the
        number of non-zero predicted signals that differ from the true
        signal.
    """
    truth = decision['buysell']
    guess = predictedDecision['buysellPredicted']

    # Count the boolean masks themselves. Counting the non-zero close
    # values selected by the mask skips hits whose close is exactly 0.0,
    # which is the minimum after min-max scaling.
    nocoincide = int(((truth != guess) & (guess != 0)).sum())
    coincide = int(((truth == guess) & (truth != 0)).sum())
    return coincide, nocoincide

In [24]:
# Grid search over bar interval, network type, window length, layer size,
# batch size, signal threshold and epoch count. One result row is stored
# per combination.
from itertools import product

split = 0.7
stock = 'AAPL'
dates = ['2022','2023']

intervals = ['10min','1h','1D']
algorithms = ['LSTM', 'RNN']
steps = range(5, 16, 5)
units = range(100,500,100)
batchs = range(5,100, 20)
profits = range(1, 10, 5)
epochs = range (5,50,50)

# Rows are collected in a list so that an interrupted run still leaves the
# finished combinations available in `results`.
results = []
i = 0

# The raw data does not depend on any grid parameter: load it once.
s1 = getStockDataFrame(stock, dates)
for interval in intervals:
    # Grouping, indicators and scaling only depend on the interval.
    s2 = groupStocks(s1, interval, '')
    s2 = s2.drop(['timestamp'], axis=1)
    s3 = getIndicators(s2)
    s4 = normalize(s3)
    for algorithm, step in product(algorithms, steps):
        # The windows only depend on the scaled data and the step.
        train, test = splitData(s4, split, step)
        trainX, trainY, testX, testY = dataset2Matrix(train, test, step)
        for unit, batch, profit, epoch in product(units, batchs, profits, epochs):
            print('Calculating iteration ', i)
            i += 1
            # Use the same threshold that is stored in the result row;
            # before, the unscaled `profit` was applied but `profit*0.1`
            # was recorded.
            threshold = profit * 0.1
            model = createModel(algorithm, unit, trainX, step)
            trainedModel = trainModel(model, batch, epoch, trainX, trainY)
            loss = np.sqrt(trainedModel.history.history['loss'][-1])
            trainPredict, testPredict, predicted = predict(trainedModel, trainX, testX)
            decision, predictedDecision = calculateDecision(test, testPredict, step, threshold)
            plotProfit(s4, split, decision, predictedDecision)
            coincide, nocoincide = calculateProfit(s4, decision, predictedDecision)
            results.append({
                'interval': interval,
                'algo': algorithm,
                'step': step,
                'units': unit,
                'batchs': batch,
                'profit': threshold,
                'epochs': epoch,
                'loss': loss,
                'coincide': coincide,
                'nocoincide': nocoincide,
            })

df = pd.DataFrame(results)
df

Calculating iteration  0
Epoch 1/5
[1m59563/59563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m585s[0m 10ms/step - loss: 5.8906e-04 - mse: 5.8906e-04
Epoch 2/5
[1m34712/59563[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m3:21[0m 8ms/step - loss: 6.2728e-05 - mse: 6.2728e-05

KeyboardInterrupt: 