# Stock Price Prediction 2022

## Data Prepare

+ I use the S&P 500 Stocks dataset from Kaggle
([download link here](https://www.kaggle.com/datasets/andrewmvd/sp-500-stocks)).

+ I have already preprocessed the data and split it into train and test sets, so I just load the files from Google Drive.


In [None]:
# Mount Google Drive so the dataset files stored under MyDrive become readable.
from google.colab import drive
drive.mount('/content/drive')
# Extract the raw Kaggle archive (presumably into /content, where the next cell reads it).
!unzip /content/drive/MyDrive/sp500_stocks.csv.zip
# Copy the already preprocessed table and its train/test split into /content.
!cp /content/drive/MyDrive/sp500_preprocessed.csv /content/sp500_preprocessed.csv
!cp  /content/drive/MyDrive/sp500_train.csv /content/sp500_train.csv
!cp  /content/drive/MyDrive/sp500_test.csv /content/sp500_test.csv

In [None]:
import pandas as pd
# Load the raw daily price table.
df = pd.read_csv('/content/sp500_stocks.csv')
# Preview the first five rows (shown as the cell output).
df.head(5)

Unnamed: 0,Date,Symbol,Adj Close,Close,High,Low,Open,Volume
0,2010-01-04 00:00:00-05:00,MMM,59.318886,83.019997,83.449997,82.669998,83.089996,3043700.0
1,2010-01-05 00:00:00-05:00,MMM,58.947342,82.5,83.230003,81.699997,82.800003,2847000.0
2,2010-01-06 00:00:00-05:00,MMM,59.783295,83.669998,84.599998,83.510002,83.879997,5268500.0
3,2010-01-07 00:00:00-05:00,MMM,59.826176,83.730003,83.760002,82.120003,83.32,4470100.0
4,2010-01-08 00:00:00-05:00,MMM,60.247749,84.32,84.32,83.300003,83.690002,3405800.0



+ Our data is tabular, so I use only a few columns to create the training dataset: date, symbol, and close.

+ Each row in the table is the price record of a specific day from 2010 to 2023. I don't use every daily record; instead, I keep only the last date of each month. For example, of the 31 days in January 2010, I keep only the 2010-01-31 date for my dataset.

+ I choose the close price as the prediction target to keep things simple.

+ Then I split the dataset into two parts: data up to the end of 2018 (for training) and data after 2018 (for testing).

In [None]:
# def preprocess(df):

#   years = [*range(2010, 2023)]
#   months = [*range(1, 13)]
#   cols = []
#   for year in years:
#     for month in months:
#       cols.append(f'{year} - {month}')
#   new_df = pd.DataFrame(columns = cols)

#   old_month = '01'
#   end_prices = []
#   old_symbol = 'MMM'

#   for index, row in df.iterrows():
    
#     month = row['Date'].split('-')[1]
#     symbol = row['Symbol']

#     if old_month == month:
#       continue

#     if symbol != old_symbol:
#       new_df.loc[old_symbol] = end_prices
#       end_prices = []
#       old_symbol = symbol

#     end_price = df['Close'].iloc[index - 1]
#     end_prices.append(end_price)
#     old_month = month

#   new_df.to_csv('sp500_preprocessed.csv')
#   return new_df

# preprocess(df)

In [None]:
# def split_train_test(df, split_year):
#   split_col_index = (split_year - 2010 + 1) * 12

#   train = df.iloc[:, :split_col_index]
#   test = df.iloc[:, split_col_index:]
  
#   train.to_csv('sp500_train.csv')
#   test.to_csv('sp500_test.csv')
#   return train, test


# preprocessed_df = pd.read_csv('sp500_preprocessed.csv', index_col = 0)
# train, test = split_train_test(preprocessed_df, 2018)

## Create Dataset

+ Use MinMaxScaler to normalize the data.

+ The way to create training data for a time series problem rests on one idea: the future value is a result of the past values. That means the INPUT for our model is data from the past and the OUTPUT is data in the future.

+ Choose a look-back period of 30.

+ The look-back period is the number of timestamps we look back at to predict a future value. In my case, I use the prices of the previous 30 months to predict the next month's price.


In [None]:
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import numpy as np
class SP500Dataset(Dataset):
  """Sliding-window dataset built from a symbols-by-periods price table.

  The CSV is read with its first column as the index (kept as the symbol
  list). The remaining values are cast to float32, min-max scaled and cut
  into ``(window, next value)`` pairs by :meth:`split_period`.

  Args:
    csv_path: Path of the CSV file to load.
    mode: ``'training'`` produces every sliding window of every row;
      ``'predict'`` produces exactly one window per row, in row order.
  """

  def __init__(self, csv_path, mode = 'training'):
    # Zero-argument super(): the one-argument form used previously only
    # built an unbound super object and never invoked Dataset.__init__.
    super().__init__()
    data = pd.read_csv(csv_path, index_col = 0)
    # Row labels in file order.
    self.symbols = data.index.to_list()
    # Fitted inside transform(); kept so callers can undo the scaling.
    self.scaler = MinMaxScaler()
    data = self.transform(data)
    self.X, self.y = self.split_period(data = data, mode = mode)

  def __len__(self):
    """Return the number of (window, target) samples."""
    return len(self.X)

  def __getitem__(self, index):
    """Return the ``(window, target)`` pair stored at ``index``."""
    return self.X[index], self.y[index]

  def transform(self, data):
    """Cast the frame to float32 and min-max scale it, fitting ``self.scaler``.

    NOTE(review): MinMaxScaler scales every column independently and is
    re-fitted on whichever table is passed in - confirm this is intended.
    """
    values = data.values.astype(np.float32)
    return self.scaler.fit_transform(values)

  def split_period(self, data, periods = 30, mode = 'training', horizon = 12):
    """Cut a 2-D ``(rows, columns)`` array into look-back windows and targets.

    Args:
      data: 2-D array; each row is one series, each column one time step.
      periods: Look-back length, i.e. number of columns per window.
      mode: ``'training'`` slides the window over all columns. ``'predict'``
        keeps, for every row, only the window that ends ``horizon`` columns
        before the end of the table.
      horizon: Number of trailing columns left out in ``'predict'`` mode
        (must be >= 1). Ignored in ``'training'`` mode.

    Returns:
      ``(X, y)``: two lists of equal length. ``X`` holds 1-D windows of
      length ``periods``; ``y`` holds the value that follows each window.
      Samples are ordered window position first, then row. Both lists are
      empty when the table has no more than ``periods`` columns.

    Raises:
      ValueError: In ``'predict'`` mode, when ``horizon < 1`` or the table
        has fewer than ``periods + horizon`` columns.
    """
    X, y = [], []
    if mode == 'predict':
      total = data.shape[1]
      first = total - (periods + horizon)
      if horizon < 1 or first < 0:
        raise ValueError(
          f'predict mode needs horizon >= 1 and at least {periods + horizon} '
          f'columns, got horizon={horizon} and {total} columns')
      # Slice the time axis (columns), not the rows, and keep one extra
      # column so the single window still has a target value.
      data = data[:, first: first + periods + 1]
    size = data.shape[1]

    # range() is empty when size <= periods, so short tables yield no
    # samples instead of indexing past the last column.
    for start in range(size - periods):
      stop = start + periods
      X.extend(data[:, start:stop])
      y.extend(data[:, stop])
    return X, y

  

## Model

I chose an LSTM for my solution. With an LSTM, a time series forecasting model can predict future values based on previous, sequential data.

In [None]:
class BaseModel(nn.Module):
  """LSTM followed by a linear layer that emits one value per sequence.

  Args:
    valid_symbols: Accepted for interface compatibility; not used by the model.
    input_size: Number of features per time step.
    hidden_size: Size of the LSTM hidden state.
    num_layers: Number of stacked LSTM layers.
  """

  def __init__(self,
               valid_symbols,
               input_size = 1,
               hidden_size = 30,
               num_layers = 1,
               ):
    super().__init__()
    self.lstm = nn.LSTM(input_size = input_size,
                        hidden_size = hidden_size,
                        num_layers = num_layers,
                        batch_first = True)
    self.fc = nn.Linear(hidden_size, 1)

  def forward(self, batch):
    """Predict one value for each sequence in ``batch``.

    Args:
      batch: Tensor of shape ``(batch, seq_len, input_size)``; it is moved
        to the device that holds the model parameters.

    Returns:
      Tensor of shape ``(batch,)``.
    """
    batch = batch.to(next(self.parameters()).device)
    _, (hn, _) = self.lstm(batch)
    # hn[-1] is the final hidden state of the top layer: (batch, hidden).
    # Only the trailing size-1 output dimension is squeezed, so a batch of
    # one sequence (or hidden_size == 1) keeps its batch dimension.
    return self.fc(hn[-1]).squeeze(-1)

In [None]:
# Install the Weights & Biases client and log in (the login step prompts for an API key).
!pip install wandb
!wandb login

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
from torch.optim import Adam
import time
import wandb
torch.manual_seed(42)
def train(device = 'cpu', epochs = 20, lr = 1e-3, batch = 32):
  """Train ``BaseModel`` on the train CSV, evaluate on the test CSV, save weights.

  Per epoch, the MSE losses of all batches are summed (not averaged) for the
  train and test loaders, logged to Weights & Biases and printed. After the
  last epoch the model ``state_dict`` is written to the working directory
  under a name made of the current time, the last test loss and ``lr``.

  Args:
    device: ``'cuda'`` to train on GPU when one is available; anything else
      (or no GPU) selects the CPU.
    epochs: Number of passes over the training set.
    lr: Learning rate for Adam.
    batch: Batch size used by both data loaders.
  """
  # Record the hyper-parameters actually used for this run.
  wandb.init(project="stock prediction", entity="kmthchai",
             config={
               "learning_rate": lr,
               "epochs": epochs,
               "batch_size": batch,
             })

  train_ds = SP500Dataset('/content/sp500_train.csv', mode = 'training')
  test_ds = SP500Dataset('/content/sp500_test.csv', mode = 'training')

  valid_symbols = train_ds.symbols
  train_loader = DataLoader(train_ds, batch_size = batch, shuffle = True)
  test_loader = DataLoader(test_ds, batch_size = batch)

  use_cuda = device == 'cuda' and torch.cuda.is_available()
  device = torch.device('cuda' if use_cuda else 'cpu')

  model = BaseModel(valid_symbols = valid_symbols).to(device)
  opt = Adam(params = model.parameters(), lr = lr)
  mse = nn.MSELoss()

  # Defined up front so the checkpoint name is still valid when epochs == 0.
  test_loss = 0.0
  for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    for features, true_prices in train_loader:
      opt.zero_grad()
      output = model(features.unsqueeze(-1))
      loss = mse(output, true_prices.to(device))
      loss.backward()
      opt.step()
      # .item() stores a plain float, so the autograd graph of each batch
      # is released instead of being kept alive by the running sum.
      train_loss += loss.item()

    model.eval()
    test_loss = 0.0
    # Evaluation needs no gradients.
    with torch.no_grad():
      for features, true_prices in test_loader:
        output = model(features.unsqueeze(-1))
        test_loss += mse(output, true_prices.to(device)).item()

    wandb.log({"train_loss": train_loss, 'test_loss': test_loss})
    print(f'epoch {epoch}: train_loss: {train_loss} - test_loss: {test_loss}')

  now = time.localtime()
  current_time = time.strftime("%H:%M:%S", now)

  torch.save(model.state_dict(), f'time: {current_time} loss:{test_loss} - lr:{lr}.w')

# Train for 20 epochs; train() itself falls back to the CPU when CUDA is unavailable.
train(device = 'cuda', 
      epochs = 20)

VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
test_loss,█▄▃▂▂▁▂▂▁▄▁▃▅▁▁▁▁▂▁▁
train_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
test_loss,0.0065
train_loss,0.04274


epoch 0: train_loss: 1.1105679273605347 - test_loss: 0.01905941031873226
epoch 1: train_loss: 0.11553435772657394 - test_loss: 0.010796588845551014
epoch 2: train_loss: 0.09300500154495239 - test_loss: 0.009283967316150665
epoch 3: train_loss: 0.07317405939102173 - test_loss: 0.008673260919749737
epoch 4: train_loss: 0.061385322362184525 - test_loss: 0.007237032521516085
epoch 5: train_loss: 0.05375215783715248 - test_loss: 0.0069282082840800285
epoch 6: train_loss: 0.049857042729854584 - test_loss: 0.007356479298323393
epoch 7: train_loss: 0.046261388808488846 - test_loss: 0.007233374752104282
epoch 8: train_loss: 0.047504913061857224 - test_loss: 0.006405081134289503
epoch 9: train_loss: 0.04452311620116234 - test_loss: 0.010926172137260437
epoch 10: train_loss: 0.04388606175780296 - test_loss: 0.00680704228579998
epoch 11: train_loss: 0.042821671813726425 - test_loss: 0.009588507004082203
epoch 12: train_loss: 0.044367920607328415 - test_loss: 0.012967843562364578
epoch 13: train_lo

## Prediction

For prediction, the dataset is loaded starting 30 months + 12 months (1 year) before December 2022. I then loop 12 times (once per month): each iteration uses the previous 30 months to predict the next month, and the predicted value is joined with the latest 29 months to become the input of the next iteration.

After the 12 iterations, I inverse-transform the values and take the prediction of the last iteration (December 2022) to compare with the value from one year earlier.

In [None]:
def predict(model_path, symbol, N):
  """Forecast 12 steps ahead for every symbol and rank symbols by growth.

  Starting from the windows supplied by the ``'predict'`` dataset, the model
  is applied 12 times; each prediction is appended to the window and the
  oldest value is dropped. Growth is the last predicted price divided by the
  last price of the initial window.

  Args:
    model_path: Checkpoint file. Either a ``state_dict`` (as written by
      ``torch.save(model.state_dict(), ...)``) or a pickled ``nn.Module``.
    symbol: Symbol whose 12 predicted prices are returned.
    N: Number of top-growth symbols to report.

  Returns:
    ``None`` when ``symbol`` is not in the dataset. Otherwise a tuple
    ``(symbol_predict_prices, topN_growth_rates, topN_predict_prices,
    topN_symbols)`` of plain Python lists.
  """
  predict_ds = SP500Dataset('/content/sp500_test.csv', mode = 'predict')
  valid_symbols = predict_ds.symbols
  try:
    symbol_index = valid_symbols.index(symbol)
  except ValueError:
    # Unknown symbol: keep the original "return None" contract.
    return None

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  loaded = torch.load(model_path, map_location = device)
  if isinstance(loaded, nn.Module):
    model = loaded
  else:
    # A state_dict has no forward(); rebuild the network first. This relies
    # on the checkpoint matching BaseModel's default hyper-parameters.
    model = BaseModel(valid_symbols = valid_symbols)
    model.load_state_dict(loaded)
  model = model.to(device)
  model.eval()

  predict_loader = DataLoader(predict_ds, batch_size = len(valid_symbols))
  old_periods, _ = next(iter(predict_loader))
  last_year_prices = old_periods[:, -1]

  monthly_predicts = []
  with torch.no_grad():
    for _ in range(12):
      # reshape keeps a (rows, 1) column even when there is a single row.
      next_period_predict = model(old_periods.unsqueeze(-1)).reshape(-1, 1)
      old_periods = torch.cat(
        (old_periods[:, 1:].to(next_period_predict.device), next_period_predict),
        dim = 1)
      monthly_predicts.append(next_period_predict)
  all_predicts = torch.cat(monthly_predicts, dim = 1).cpu()

  # Undo the min-max scaling: x = (x_scaled - min_) / scale_.
  # NOTE(review): as before, the parameters of column 0 are applied to every
  # value, although the scaler was fitted per column - confirm this is intended.
  offset = float(predict_ds.scaler.min_[0])
  factor = float(predict_ds.scaler.scale_[0])

  def _unscale(values):
    return (values - offset) / factor

  last_year_prices = _unscale(last_year_prices)
  all_predicts = _unscale(all_predicts)
  all_final_predicts = all_predicts[:, -1]

  growth_rates = torch.div(all_final_predicts, last_year_prices)
  sorted_growth_rates, sorted_indexs = torch.sort(growth_rates, descending = True)
  top_indexs = sorted_indexs[:N]

  topN_growth_rates = sorted_growth_rates[:N].tolist()
  topN_predict_prices = torch.index_select(all_predicts, 0, top_indexs).tolist()
  topN_symbols = [valid_symbols[idx] for idx in top_indexs.tolist()]

  symbol_predict_prices = all_predicts[symbol_index].tolist()
  return symbol_predict_prices, topN_growth_rates, topN_predict_prices, topN_symbols

In [None]:
# Checkpoint written by train(); its name encodes the save time, final test loss and learning rate.
path = '/content/time: 09:21:03 loss:0.006499168463051319 - lr:0.001.w'
# Predicted prices for 'MMM' plus the top 5 symbols ranked by predicted growth.
symbol_predict_prices, topN_growth_rates, topN_predict_prices, topN_symbols= predict(path, 'MMM', 5)