# Stock Prediction
### Ignacio Fernández Adrados

---

In [1]:
'''
%pip install pandas
%pip install scikit-learn
%pip install matplotlib
%pip install numpy
%pip install keras
%pip install tensorflow
'''

'\n%pip install pandas\n%pip install scikit-learn\n%pip install matplotlib\n%pip install numpy\n%pip install keras\n%pip install tensorflow\n'

In [2]:
import pandas as pd # type: ignore

def getStockDataFrame (stock, years):
    STOCK = stock
    YEARS = years

    file = None
    for year in YEARS:
        FILE='Datasets/Stock-'+STOCK+'-'+year+'.csv'
        ds=pd.read_csv(FILE, sep=',')
        file = pd.concat([file,ds])
    file = file.sort_values(by='timestamp', ascending=True)
    return file
#getStockDataFrame('AAPL', ['2022','2023'])

In [3]:
from datetime import datetime

def groupStocks(dataset, n, datefilter):

    N=n     #Minutos agrupados
    DATEFILTER=datefilter #Filtro de fecha
    file = dataset
    
    df = file[file['timestamp'].str.startswith(DATEFILTER)]
    df=pd.DataFrame(df)
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    dfgroup = pd.DataFrame(df.groupby(pd.Grouper(key='timestamp', freq=N)).first()['open'])
    dfgroup = dfgroup.join(df.groupby(pd.Grouper(key='timestamp', freq=N)).last()['close'])
    dfgroup = dfgroup.join(df.groupby(pd.Grouper(key='timestamp', freq=N)).min()['low'])
    dfgroup = dfgroup.join(df.groupby(pd.Grouper(key='timestamp', freq=N)).max()['high'])
    dfgroup = dfgroup.join(df.groupby(pd.Grouper(key='timestamp', freq=N)).sum()['volume'])
    #dfgroup = dfgroup[:-1].reset_index(drop=True)
    dfgroup = pd.DataFrame(pd.to_datetime(dfgroup.index[:])).join(dfgroup.reset_index(drop=True))
    dfgroup = dfgroup.dropna().reset_index(drop=True)
    return dfgroup
#xxx = groupStocks(getStockDataFrame('AAPL', ['2022']), '1D', '')
#xxx

In [4]:

def get_rsi(close, lookback):
    ret = close.diff()
    up = []
    down = []
    for i in range(len(ret)):
        if ret[i] < 0:
            up.append(0)
            down.append(ret[i])
        else:
            up.append(ret[i])
            down.append(0)
    up_series = pd.Series(up)
    down_series = pd.Series(down).abs()
    up_ewm = up_series.ewm(com = lookback - 1, adjust = False).mean()
    down_ewm = down_series.ewm(com = lookback - 1, adjust = False).mean()
    rs = up_ewm/down_ewm
    rsi = 100 - (100 / (1 + rs))
    rsi_df = pd.DataFrame(rsi).rename(columns = {0:'rsi'}).set_index(close.index)
    rsi_df = rsi_df.dropna()
    return rsi_df[3:]

def getIndicators(dataset):
    
    dfgroup = dataset

    dfgroup['EMA7']= dfgroup['close'].ewm(span=7, adjust=False).mean()
    dfgroup['MACD']= dfgroup['close'].ewm(span=12, adjust=False).mean()- dfgroup['close'].ewm(span=26, adjust=False).mean()
    dfgroup['SignalMACD'] = dfgroup['MACD'].ewm(span=9, adjust=False).mean()
    dfgroup['RSI'] = get_rsi(dfgroup['close'], 14)
    dfgroup = dfgroup.dropna()
    dfgroup = dfgroup.reset_index(drop=True)
    return dfgroup

#getIndicators(xxx)

Añadiendo sentimiento

In [5]:
def getNewsDataFrame(stock, date, interval, stockDataFrame, relevance):
    STOCK = stock
    YEARS = date
    file = None
    for year in YEARS:
        FILE='Datasets/News-'+STOCK+'-'+year+'.csv'
        ds=pd.read_csv(FILE, sep=',')
        file = pd.concat([file,ds])
    file = file.sort_values(by='date', ascending=True)
    file['date'] = pd.to_datetime(file['date'], format='%Y%m%dT%H%M%S')
    file = file.drop(['title','summary','ticker'], axis=1)
    file = file[file['relevance']>=relevance]

    dfgroup = pd.DataFrame(file.groupby(pd.Grouper(key='date', freq=interval)).last()['relevance'])
    dfgroup = dfgroup.join(file.groupby(pd.Grouper(key='date', freq=interval)).last()['sentiment'])
    dfgroup = pd.DataFrame(pd.to_datetime(dfgroup.index[:])).join(dfgroup.reset_index(drop=True))
    dfgroup = dfgroup.fillna(0)
    stockDataFrame['timestamp'] = pd.to_datetime(stockDataFrame['timestamp'])
    res = stockDataFrame.join(dfgroup.set_index("date"), on='timestamp', how='left')
    res = res.fillna(0)
    res = res.reset_index(drop=True)

    return res

In [6]:
from sklearn.preprocessing import MinMaxScaler

def normalize(dataset):
    #Normalise data into (0,1) range
    normData=dataset
    scaler = MinMaxScaler(feature_range=(0, 1))

    #normData[['open','close','low','high','volume','EMA7','MACD','SignalMACD','RSI']] = \
    #    scaler.fit_transform(normData[['open','close','low','high','volume','EMA7','MACD','SignalMACD','RSI']])
    #print(normData[['open','close','low','high','volume','EMA7','MACD','SignalMACD','RSI']].head(1))
    c=normData.columns.values
    normData[c] = scaler.fit_transform(normData[c])
    return normData

##################################################3
#data = getStockDataFrame('AAPL', ['2023'])
#indicatorData = groupStocks(data, '1D', '')
#indicatorData = getNewsDataFrame('AAPL', ['2023'], '1D', indicatorData, 0)
#indicatorData = indicatorData.drop(['timestamp'], axis=1)
#normalize(indicatorData)
##################################################3

In [7]:
import matplotlib.pyplot as plt

def plotDataset(dataset):

    normData = dataset
    
    plt.figure(figsize=(14,4))
    plt.title("Dataset 'Close' data ")
    plt.plot(normData['close'],color='black')
    #plt.scatter(normData.index, np.where(normData['buysell'] == -1,normData['close'], None), color="red", marker="v")
    #plt.scatter(normData.index, np.where(normData['buysell'] ==  1,normData['close'], None), color="blue", marker="^")
    plt.show()

### Split the values in train and test

So, we took only 25% of the data as training samples and set aside the rest of the data for testing.

Looking at the time-series plot, we think **it is not easy for a standard model to come up with correct trend predictions.**

In [8]:
def splitData(dataset, split, step):

    S=split
    step = step
    normData = dataset

    split = int(len(normData) * S)
    #values = normData.values
    #print(values)
    train = normData[:split]#.drop(['buysell'],axis=1)
    test = pd.concat([train.tail(step),normData[split:]]).reset_index(drop=True)

    #print("Train data length:", train.shape)
    #print("Test data length:", test.shape)

    return train, test

In [9]:
def plotSplitDataset(dataset, split):
    
    normData = dataset
    split = int(len(normData) * split)

    plt.figure(figsize=(14,4))
    plt.title("Dataset 'Close' data split")
    plt.plot(normData.index.values,normData['close'],c='black')
    plt.axvline(normData.index[split], c="r")
    #plt.scatter(normData.index, np.where(normData['buysell'] == -1,normData['close'], None), color="red",  marker="v")
    #plt.scatter(normData.index, np.where(normData['buysell'] ==  1,normData['close'], None), color="blue", marker="^")
    plt.show()


### Converting to a multi-dimensional array
Next, we'll convert test and train data into the matrix with step value as it has shown above example.

In [10]:
import numpy as np

def convertToMatrix(data, step):
    X, Y =[], []
    for i in range(len(data)-step):
        d=i+step
        #print(i, d, data[i:d)
        X.append(data[i:d])
        Y.append(data[d,1])
    return np.array(X), np.array(Y)

def dataset2Matrix(train, test, step):
    trainX,trainY =convertToMatrix(train.to_numpy(),step)
    testX,testY =convertToMatrix(test.to_numpy(),step)
    #print(trainY)
    #print("Training data shape:", trainX.shape,', ',trainY.shape)
    #print("Test data shape:", testX.shape,', ',testY.shape)
    return trainX, trainY, testX, testY


### Keras model with `SimpleRNN` layer

- 256 neurons in the RNN layer
- 32 denurons in the densely connected layer
- a single neuron for the output layer
- ReLu activation
- learning rate: 0.001

In [11]:
from keras import Sequential
from keras.layers import Dense, SimpleRNN, Input, LSTM, Dropout # type: ignore

def createModel(type, units, trainX, step):
    UNITS = units #num_units: Number of units of a the simple RNN layer
    DENSEUNITS = 32 #Number of neurons in the dense layer followed by the RNN layer
    
    model = Sequential()
    model.add(Input((step, trainX.shape[2])))
    if type=="LSTM":
        model.add(LSTM(units=UNITS, activation="tanh",return_sequences=True))
        model.add(Dropout(0.2))
        model.add(LSTM(units=UNITS//2, activation="tanh"))
        model.add(Dropout(0.2))      
    elif type == "RNN":
        model.add(SimpleRNN(units=UNITS, activation="relu",return_sequences=True))
        model.add(Dropout(0.2))
        model.add(SimpleRNN(units=UNITS//2, activation="relu"))
        model.add(Dropout(0.2))     
    model.add(Dense(DENSEUNITS, activation="tanh"))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam',metrics=['mse'])    
    return model

### Fit the model

In [12]:
from keras.callbacks import Callback # type: ignore

class MyCallback(Callback):
    def on_epoch_end(self, epoch, logs=None):
        #if (epoch+1) % 10 == 0 and epoch>0:
            print("Epoch number {} done".format(epoch+1))

def trainModel(model, batch, epochs, trainX, trainY):
    batch_size=batch
    num_epochs = epochs

    model.fit(trainX,trainY,
            epochs=num_epochs,
            batch_size=batch_size,
            callbacks=None,verbose=0
            )
    return model

### Plot loss

In [13]:
def plotLosss(model):
    plt.figure(figsize=(8,3))
    plt.title("RMSE loss over epochs",fontsize=16)
    plt.plot(np.sqrt(model.history.history['loss']),c='k',lw=2)
    plt.grid(True)
    plt.xlabel("Epochs",fontsize=14)
    plt.ylabel("Root-mean-squared error",fontsize=14)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.show()

### Predictions
Note that the model was fitted only with the `trainX` and `trainY` data.

In [14]:
def predict(model, trainX, testX):
    trainPredict = model.predict(trainX, verbose=0)
    testPredict= model.predict(testX, verbose=0)
 
    predicted=np.concatenate((trainPredict,testPredict),axis=0)
    #print(trainPredict.shape)
    #print(testPredict.shape)

    return trainPredict, testPredict, predicted


### Comparing it with the ground truth (test set)

In [15]:

def plotCompare(normData, testPredict, split):

    OFFSET=0
    split = int(len(normData) * split)
    #index = normData.index.values
    plt.figure(figsize=(14,4))
    plt.title("Ground truth and prediction together",fontsize=18)
    plt.plot(normData['close'].iloc[split+OFFSET:].reset_index(drop=True),color='black', label='Ground Truth')
    plt.plot(testPredict[OFFSET:],color='blue', label='Prediction')
    plt.legend()
    plt.show()

In [16]:
def calculateDecision(test, testPredict, step, profit):

    PROFIT = profit

    decision=test['close'].iloc[step:]
    decision = decision.diff()
    decision = decision.dropna()
    decision = np.where(abs(decision)<PROFIT,0,np.sign(decision).astype('int'))
    decision = pd.DataFrame(data={'buysell':decision})#.drop(0).reset_index(drop=True)

    predictedDecision = pd.DataFrame(data={'buysellPredicted':testPredict[:,0]})
    predictedDecision = predictedDecision.diff()
    predictedDecision = predictedDecision.dropna()#.reset_index(drop=True)
    predictedDecision = np.where(abs(predictedDecision)<PROFIT,0,np.sign(predictedDecision).astype('int'))
    predictedDecision = pd.DataFrame(data={'buysellPredicted':predictedDecision[:,0]})

    decision = decision.join(predictedDecision)
    return decision

In [17]:
def plotResult(normData, testPredict, split, profit, decision):

    PROFIT=profit
    OFFSET=0
    split = int(len(normData) * split)

    

    index = normData.index.values
    plt.figure(figsize=(14,4))
    plt.title("Ground truth and prediction together",fontsize=18)
    plt.plot(normData['close'].iloc[split+OFFSET:].reset_index(drop=True),color='black')
    plt.plot(testPredict[OFFSET:],color='blue')

    x = decision.iloc[OFFSET:].reset_index(drop=True).index
    y = normData['close'].iloc[len(normData)-len(decision.index)-1+OFFSET:-1].reset_index(drop=True)

    plt.scatter(x, np.where(decision['buysell'].iloc[OFFSET:] == -1,y, None), color="red",  marker="v", label='Ground Truth Sell')
    plt.scatter(x, np.where(decision['buysell'].iloc[OFFSET:] ==  1,y, None), color="blue", marker="^", label='Ground Truth Buy')

    plt.scatter(x, np.where(decision['buysellPredicted'].iloc[OFFSET:] == -1,y-0.015, None), color="orange",  marker="v", label='Prediction Sell')
    plt.scatter(x, np.where(decision['buysellPredicted'].iloc[OFFSET:] ==  1,y+0.015, None), color="green",  marker="^", label='Prediction Buy')
    plt.legend()
    plt.show()


In [18]:
def plotProfit(normData, split, decision):

    OFFSET=0
    split = int(len(normData) * split)
    index = normData.index.values
    plt.figure(figsize=(14,4))
    plt.title("Ground truth and prediction together",fontsize=18)
    plt.plot(normData['close'].iloc[split+OFFSET:].reset_index(drop=True),color='black')
    #plt.plot(testPredict[OFFSET:],color='blue')

    x = decision.iloc[OFFSET:].reset_index(drop=True).index
    y = normData['close'].iloc[len(normData)-len(decision.index)-1+OFFSET:-1].reset_index(drop=True)

    plt.scatter(x, np.where((decision['buysell'].iloc[OFFSET:] !=  decision['buysellPredicted'].iloc[OFFSET:]) & (decision['buysellPredicted'].iloc[OFFSET:] != 0),y, None), color="red", label='Ground Truth Buy')
    plt.scatter(x, np.where((decision['buysell'].iloc[OFFSET:] == decision['buysellPredicted'].iloc[OFFSET:]) & (decision['buysell'].iloc[OFFSET:] != 0),y, None), color="green",  label='Ground Truth Sell')

    plt.legend()
    plt.show()

In [19]:
def calculateProfit(normData, decision):

    OFFSET=0
    y = normData['close'].iloc[len(normData)-len(decision.index)-1+OFFSET:-1].reset_index(drop=True)
    
    #print("Predicciones no coincidentes con el conjunto de pruebas: ", \
    nocoincide =np.count_nonzero(np.where((decision['buysell'].iloc[OFFSET:] !=  decision['buysellPredicted'].iloc[OFFSET:]) \
          & (decision['buysellPredicted'].iloc[OFFSET:] != 0),y, None))#)
    #print("Predicciones coincidentes con el conjunto de pruebas: ", \
    coincide=np.count_nonzero(np.where((decision['buysell'].iloc[OFFSET:] == decision['buysellPredicted'].iloc[OFFSET:]) \
          & (decision['buysell'].iloc[OFFSET:] != 0),y, None))#)
    return coincide, nocoincide

In [20]:
def runTest(split, stock, dates, profitMult, intervals, algorithms, steps, units, batchs, profits, epochs, sentiment, relevance, useIndicators):
    df = None
    i=0
    for interval in intervals:
        for algorithm in algorithms:
            for step in steps:
                for unit in units:
                    for batch in batchs:
                        for epoch in epochs:
                            for indicator in useIndicators:
                                for sent in sentiment:
                                    #print('Calculating iteration:', i, ':', 'interval:', str(interval) + ",",
                                    #    'algorithm:', algorithm + ",", 'step:', str(step) + ",", 'unit:', str(unit), 'batch:', str(batch) + ",",
                                    #    'profit:', str(profit*profitMult) + ",", 'epoch:', str(epoch))
                                    i += 1
                                    data = getStockDataFrame(stock, dates )
                                    groupedData = groupStocks(data,interval, '')#.drop(['timestamp'],axis=1)
                                    if useIndicators:
                                        indicatorData = getIndicators(groupedData)
                                    else:
                                        indicatorData = groupedData                        
                                    
                                    if sentiment:
                                        indicatorData = getNewsDataFrame(stock, dates, interval, indicatorData, relevance)
                                    indicatorData = indicatorData.drop(["timestamp"], axis=1)
                                    finalData = normalize(indicatorData)                            
                                    #print(finalData)
                                    #plotDataset(finalData)
                                    train, test = splitData(finalData, split, step)
                                    #plotSplitDataset(s4, split)
                                    trainX, trainY, testX, testY = dataset2Matrix(train, test, step)
                                    model = createModel(algorithm, unit,trainX, step)
                                    #model.summary()
                                    trainedModel = trainModel(model,batch,epoch,trainX,trainY)
                                    #plotLosss(trainedModel)
                                    loss = np.sqrt(trainedModel.history.history['loss'][-1])
                                    #print ('RMSE loss :', loss)
                                    trainPredict, testPredict, predicted = predict(trainedModel,trainX, testX)
                                    #plotCompare(finalData, testPredict, split)

                                    for profit in profits:
                                        decision = calculateDecision(test, testPredict, step, profit*profitMult)

                                        #plotResult(finalData, testPredict,split, profit, decision)
                                        #plotProfit(finalData, split, decision)
                                        coincide, nocoincide = calculateProfit(finalData, decision)
                                        if coincide+nocoincide == 0:
                                            ben = '-'
                                        else:
                                            ben = str(round(coincide/(coincide+nocoincide)*100,2))+'%'
                                        df = pd.concat([df, pd.DataFrame({
                                            'int': ["{:>3}".format(interval)],
                                            'algo': ["{:>5}".format(algorithm)],
                                            'step':["{:3}".format(step)],
                                            'unit':["{:4}".format(unit)],
                                            'bat':["{:5}".format(batch)],
                                            'epo':["{:3}".format(epoch)],
                                            'profit':["{:9.6f}".format(profit*profitMult)],
                                            'loss': ["{:9.6f}".format(loss)],
                                            'indicators:' : ["{:>6}".format(str(indicator))],
                                            'sentiment' : ["{:>6}".format(str(sent))],
                                            'relevance' : ["{:>4}".format(relevance)],
                                            'good': ["{:4}".format(coincide)],
                                            'bad': ["{:4}".format(nocoincide)],
                                            'benefit': ["{:>7}".format(ben)]                                    
                                        })])

                                        print(df.iloc[-1:].to_string(header=False, index=False))
                                        df.to_csv('Datasets/Tests_Results-'+stock + str(dates)+ \
                                            str(intervals)+str(algorithms)+str(steps)+str(units)+str(batchs)+ \
                                            str(epochs)+str(profitMult)+str(sentiment)+str(useIndicators)+'.csv', index=False)

                              
    df.reset_index(drop=True)
    

In [None]:
split = 0.7
stock = 'AAPL'
dates = ['2022','2023']
profitMult=0.000001

intervals = ['12h']         #['1min', '10min', '60min', '6h', '12h', '1D']
algorithms = ['LSTM']       #['RNN', 'LSTM']
steps   = [10, 20]          #[2, 5, 10, 20]
units   = [100]             #[50, 100., 150]
batchs  = [16,64,128]       #[16,64,128]
epochs  = [5, 10]           #[5, 10, 20]
profits = [1,10,100,1000]   #[1,10,100,1000]
sentiment = [True, False]   #[True, False]
relevance = 0.8             # 0.0-1.0
useIndicators=[True, False] #[True, False]

numberoftests = len(intervals)*len(algorithms)*len(steps)*len(units)*len(batchs)*len(profits)*len(epochs)*len(sentiment)*len(useIndicators)
print ('Ejecutando', numberoftests, 'tests:')

runTest(split, stock, dates,profitMult,intervals, algorithms, steps, units, batchs, profits, epochs, sentiment, relevance, useIndicators)
print("Ejecución finalizada")

Ejecutando 48 tests:
12h  LSTM  10  100    16   5  0.000000  0.068164   True   True  0.8  173  126  57.86%
12h  LSTM  10  100    16   5  0.000000  0.064848   True  False  0.8  183  116   61.2%
12h  LSTM  10  100    16   5  0.000000  0.063458  False   True  0.8  170  129  56.86%
12h  LSTM  10  100    16   5  0.000000  0.066876  False  False  0.8  175  124  58.53%
12h  LSTM  10  100    16  10  0.000000  0.063470   True   True  0.8  181  118  60.54%
12h  LSTM  10  100    16  10  0.000000  0.057191   True  False  0.8  175  124  58.53%
12h  LSTM  10  100    16  10  0.000000  0.057954  False   True  0.8  176  123  58.86%
12h  LSTM  10  100    16  10  0.000000  0.055583  False  False  0.8  176  123  58.86%
12h  LSTM  10  100    64   5  0.000000  0.075478   True   True  0.8  169  130  56.52%
12h  LSTM  10  100    64   5  0.000000  0.074481   True  False  0.8  180  119   60.2%


KeyboardInterrupt: 