The aim of this project is to create an LSTM capable of predicting next-day share price increases for the ASX Schools Sharemarket game.


In [1]:
from pathlib import Path

import pandas as pd

In [2]:
import yfinance as yf

# Ticker symbols come from the SYMBOL column of a tab-separated file. ".AX" is appended
# to every symbol (presumably the Yahoo Finance suffix for ASX listings - confirm).
stocks = pd.read_csv("./stocks.tsv", sep="\t")["SYMBOL"].to_list()
stocks = [f"{stock}.AX" for stock in stocks]

# Request six months of history and keep only the closing prices.
data = yf.download(stocks, period="6mo")["Close"]

# DataFrame.to_csv does not create missing directories, so make sure ./data exists
# before writing; otherwise the (slow) download above is lost to an OSError.
output_path = Path("./data/stock_values.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(output_path)

KeyboardInterrupt: 

### Preparing the data.

The data is reshaped from wide to long format; moving averages are then added, each stock's values are min-max normalised, rows with null values are dropped, and the stock ticker is one-hot encoded.


In [None]:
# Load the saved closing prices and reshape them from wide to long format, so that each
# row is one date-stock combination with its closing price. The date itself is not kept.
all_data = (
    pd.read_csv("./data/stock_values.csv")
    .melt(id_vars="Date", var_name="Stock", value_name="Close")
    .drop(columns=["Date"])
)

# Add 5-, 10- and 20-row rolling means of the close, computed separately for every stock.
# The first (window - 1) rows of each stock have no complete window and come out as NaN.
for window in (5, 10, 20):
    all_data[f"MA_{window}"] = all_data.groupby("Stock")["Close"].transform(
        lambda closes, span=window: closes.rolling(span).mean()
    )

all_data.sample(10, random_state=42)

Unnamed: 0,Stock,Close,MA_5,MA_10,MA_20
24985,ORI.AX,18.74,18.602,18.419,18.235
19125,LOV.AX,20.639999,,,
3077,ARG.AX,8.76,8.794,8.808,8.809
24526,ORA.AX,2.65,2.638,2.62,2.6055
32428,TCL.AX,13.47,13.292,13.132,13.212
2847,ARB.AX,37.849998,38.338,38.387001,39.035
6747,CCP.AX,14.29,14.52,14.417,14.6735
35091,VOC.AX,,,,
21693,MTS.AX,3.91,3.938,3.857,3.7605
1133,ALD.AX,35.529999,35.324,,


In [None]:
# normalise data for each stock


def normalise_data(df: pd.DataFrame) -> None:
    """Min-max normalise the price columns of each stock IN PLACE.

    `Close`, `MA_5`, `MA_10` and `MA_20` are rescaled independently within every
    `Stock` group using `(x - min) / (max - min)`, so each column spans [0, 1]
    per stock.

    A column that is constant within a group has `max == min`; it is mapped to
    0.0 instead of dividing by zero, which would turn every value into NaN.
    Missing values stay missing.

    Args:
        df: Frame containing a `Stock` column plus the four price columns listed
            above. It is modified directly.

    Returns:
        None. The result is written back into `df`.
    """
    price_columns = ["Close", "MA_5", "MA_10", "MA_20"]
    grouped = df.groupby("Stock")[price_columns]

    group_min = grouped.transform("min")
    group_range = grouped.transform("max") - group_min

    # A zero range means the column never changes for that stock; dividing by 1
    # instead leaves (x - min) == 0 and avoids 0 / 0 -> NaN.
    df[price_columns] = (df[price_columns] - group_min) / group_range.replace(0, 1)

In [None]:
# Rescale each stock's close and moving averages (this mutates `all_data` in place).
normalise_data(all_data)

# Rows without a complete set of values (e.g. the first rows of each stock, where the
# rolling means are undefined) are dropped. Selecting the base columns explicitly keeps
# this cell idempotent: on a re-run the one-hot columns appended below are discarded
# instead of being concatenated a second time.
base_columns = ["Stock", "Close", "MA_5", "MA_10", "MA_20"]
all_data = all_data[base_columns].dropna()

# One indicator column per ticker.
one_hot = pd.get_dummies(all_data["Stock"])

# "Stock" is deliberately kept next to its one-hot encoding: the windowing step later
# selects rows per ticker using this column.
all_data = pd.concat([all_data, one_hot], axis=1)

all_data

Before Dropna:  (37500, 5)
After Dropna:  (27898, 5)
              Close          MA_5         MA_10         MA_20
count  27898.000000  27898.000000  27898.000000  27898.000000
mean       0.520851      0.529396      0.525246      0.518439
std        0.264341      0.286520      0.304226      0.324673
min        0.000000      0.000000      0.000000      0.000000
25%        0.308139      0.286434      0.258594      0.215942
50%        0.525140      0.541045      0.536445      0.533419
75%        0.741936      0.778132      0.799168      0.819747
max        1.000000      1.000000      1.000000      1.000000


Unnamed: 0,Stock,Close,MA_5,MA_10,MA_20,A200.AX,A2M.AX,AAA.AX,ABC.AX,AFI.AX,...,WGX.AX,WHC.AX,WLE.AX,WOR.AX,WOW.AX,WPR.AX,WTC.AX,XRO.AX,YAL.AX,ZIM.AX
19,A200.AX,0.297922,0.380515,0.354623,0.000000,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
20,A200.AX,0.388761,0.349721,0.348189,0.039956,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
21,A200.AX,0.321786,0.334414,0.331744,0.069468,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
22,A200.AX,0.375673,0.339636,0.311249,0.104280,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
23,A200.AX,0.345650,0.339636,0.273355,0.130944,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37495,ZIM.AX,0.195313,0.182215,0.128934,0.040162,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
37496,ZIM.AX,0.171875,0.181544,0.135193,0.044121,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
37497,ZIM.AX,0.146875,0.176174,0.136087,0.045569,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
37498,ZIM.AX,0.146875,0.165772,0.132332,0.045279,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
WINDOW_SIZE = 10

X, y = [], []

# sort=False iterates tickers in order of first appearance, i.e. the same order as
# all_data["Stock"].unique().
for ticker, stock_data in all_data.groupby("Stock", sort=False):
    # Column 0 is the "Stock" label; everything after it is a feature. The frame mixes
    # float price columns with boolean one-hot columns, which would otherwise come out
    # as an `object` array, so cast once to float32 here.
    features = stock_data.iloc[:, 1:].to_numpy(dtype=np.float32)

    # NOTE(review): rows removed by the earlier dropna() may leave date gaps inside a
    # ticker, so a window is WINDOW_SIZE consecutive *rows*, not necessarily days.
    for i in range(len(features) - WINDOW_SIZE):
        X.append(features[i : i + WINDOW_SIZE])  # Previous WINDOW_SIZE rows
        y.append(features[i + WINDOW_SIZE])  # Next row (all feature columns)

X, y = np.array(X), np.array(y)

# NOTE(review): this is a shuffled split of overlapping windows, so test windows share
# rows with training windows; consider a chronological split if that matters.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

The data is now ready.

We construct an LSTM with PyTorch as follows:
