# Fundamentals of Machine Learning, Fall 2021, Final Project

## Stock Closing Price Prediction

1. Download ```final_project.pdf```, ```Kaggle & Colab Guide.pptx```, and ```utils.py``` from i-campus.
2. Go to [Kaggle competition page](https://www.kaggle.com/c/2021mlfinal), join Kaggle & competition, and download dataset.
3. Following the guide slides, upload ```utils.py```.
4. Mount Google Drive.
5. Implement your own model and predict on test dates.
6. Download and submit ```submission.csv``` to Kaggle.
7. Write a report on your project and submit on i-campus.

# INITIAL PACKAGES

In [None]:
# INITIAL PACKAGES
import os
import numpy as np
import pandas as pd

from utils import load_data, run

## Mount Google Drive

Assume you made a ```final_project``` directory in the root of your Google Drive,
and the data files are there.

In [None]:
# Colab-only: mount Google Drive so its contents are reachable under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Data directory on the mounted Drive: 'final_project' directly under 'My Drive'.
gdrive_root = '/content/gdrive/My Drive'
data_path = os.path.join(gdrive_root, 'final_project')
# Last expression of the cell, so the notebook displays the directory listing.
os.listdir(data_path)

['sample_submission.csv',
 'valid_id_answer.csv',
 'test_id.csv',
 'train_id_answer.csv',
 'test_input.npy',
 'valid_input.npy',
 'train_input.npy',
 'utils.py']

In [None]:
# load_data is imported from the course-provided utils.py. The cells below index
# each returned value as [0] (a table with .head()) and [1] (an array with .shape).
train_data, valid_data, test_input = load_data(data_path)

In [None]:
# Preview every split: first rows of its id table, then the shape of its input array.
for _title, _split, _data in (
    ('train id answer', 'train', train_data),
    ('valid id answer', 'valid', valid_data),
    ('test id', 'test', test_input),
):
    _table, _array = _data[0], _data[1]
    print(f'{_title}:\n {_table.head()}')
    print(f'{_split} input shape: {_array.shape}\n')

train id answer:
            id    answer
0  Z1HpN8DdqD -0.897532
1  4kZUPHdZCm  0.222965
2  4B9Zruxygn  3.562945
3  8BhHCriaH2  2.666667
4  mkYyKwYdek -2.083333
train input shape: (739, 142, 60, 11)

valid id answer:
            id    answer
0  XqAfjiZoin  2.627939
1  P8fIDWGztk  1.814882
2  qSWi7pDeyq -7.017544
3  JZq9kbg8gY  1.449275
4  dOiCHAt5wv -5.991736
valid input shape: (248, 142, 60, 11)

test id:
            id
0  Nqx4Oqo6eJ
1  RKI4KKxRdT
2  U6MPA99ktR
3  ztP24qyofv
4  TjGmwz9Z7T
test input shape: (245, 142, 60, 11)



---

# SHOW YOUR WORK
From here, import packages you need as long as they are permitted. <br>
Fill the ```train_and_predict``` function with your code. <br>
If you want, you can implement your own classes or functions within the "SHOW YOUR WORK" block. <br>
The rest of work is ours.

In [None]:
# set random seed for reproducibility
import torch
import numpy as np
import random


def set_random_seed(seed):
    """Seed the Python, NumPy and PyTorch RNGs and pin cuDNN to deterministic mode.

    arguments:
        seed: integer seed passed to every generator.
    """
    # cuDNN flags are plain attribute writes, independent of the seeding below.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # Same seed for every CPU-side generator, in the original order.
    for seed_fn in (random.seed, np.random.seed, torch.random.manual_seed):
        seed_fn(seed)

    # CUDA generators only exist when a GPU is visible.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_random_seed(2021)

In [None]:
# IMPORT PACKAGES YOU NEED

import torch.nn as nn
from time import time
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import mean_absolute_error

In [None]:
# YOUR OWN CLASSES OR FUNCTIONS

class MyModel(nn.Module):
    """Per-step linear projection with tanh, a single-layer RNN, and a linear head.

    arguments:
        input_dim: number of features per time step.
        hidden_dim: size of the RNN hidden state.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        # nn.RNN returns a tuple (output, h_n). Because it is the last stage of
        # the Sequential, that tuple is what self.layers(x) returns.
        self.layers = nn.Sequential(
            nn.Linear(self.input_dim, self.input_dim),
            nn.Tanh(),
            nn.RNN(
                input_size=self.input_dim,
                hidden_size=self.hidden_dim,
                num_layers=1,
                batch_first=True,
            ),
        )
        self.logit_layer = nn.Linear(self.hidden_dim, 1)

    def forward(self, x, index=None):
        """Predict one scalar per input sequence.

        arguments:
            x: tensor of (batch, sequence length, input_dim).
            index: unused; kept so existing callers keep working.

        returns:
            tensor of (batch,), also when batch == 1.
        """
        _, final_hidden = self.layers(x)

        # final_hidden is (num_layers, batch, hidden_dim); take the last layer
        # and drop only the trailing size-1 output dimension. A bare squeeze()
        # would also drop the batch dimension for a single-sample batch and
        # return a 0-d tensor.
        return self.logit_layer(final_hidden[-1]).squeeze(-1)


In [None]:
def _windows_per_stock(raw_input, lookback, last_timestamps=None):
    """Turn a raw input array into one short sequence per (timestamp, stock).

    arguments:
        raw_input: np.array of (# of timestamps, 1 + # of stocks, # of previous dates, # of features).
        lookback: number of most recent previous dates kept in each sequence.
        last_timestamps: if given, only the last `last_timestamps` timestamps are kept.

    returns:
        np.array of (# of kept timestamps * # of stocks, lookback, # of features),
        ordered timestamp-major.
    """
    # Slot 0 on axis 1 is the extra "index" series; only the stocks are modelled.
    x = raw_input[:, 1:]
    if last_timestamps is not None:
        x = x[-last_timestamps:]
    x = x[:, :, -lookback:]
    return x.reshape(x.shape[0] * x.shape[1], x.shape[2], -1)


def train_and_predict(train_data, valid_data, test_data):
    """Train a model and return prediction on test input.

    Trains MyModel with SGD and L1 loss on the last 60 train timestamps, using
    the 3 most recent previous dates of every stock as one sequence. The last
    10 valid timestamps drive early stopping. The weights with the lowest
    validation MAE are written to 'best_rnn.p' and reloaded for prediction.

    arguments:
        train_data: tuple of (pandas.DataFrame, np.array).
        - 0: pandas.DataFrame with columns ['id', 'answer']
          'id' contains unique id assigned to each timestamp.
          'answer' contains closing price ratio corresponding to its timestamp.
        - 1: train input in np.array of (# of train timestamps, 1 + # of stocks, # of previous dates to be input, # of features)

        valid_data: tuple of (pandas.DataFrame, np.array).
        - 0: pandas.DataFrame with columns ['id', 'answer']
          'id' contains unique id assigned to each timestamp.
          'answer' contains closing price ratio corresponding to its timestamp.
        - 1: valid input in np.array of (# of valid timestamps, 1 + # of stocks, # of previous dates to be input, # of features)

        test_data: tuple of (pandas.DataFrame, np.array).
        - 0: pandas.DataFrame with columns ['id']
          'id' contains unique id assigned to each timestamp.
        - 1: test input in np.array of (# of test timestamps, 1 + # of stocks, # of previous dates to be input, # of features)

    returns:
        pandas.DataFrame, predictions on test input with columns ['id', 'answer'].
        'id' should contain unique id assigned to test input.
        'answer' should contain prediction on the test input correspond to its id
        The DataFrame passed in test_data is not modified.

    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    LEARNING_RATE = 0.01
    BATCH_SIZE = 256
    TEST_BATCH_SIZE = 1024
    NUM_EPOCHS = 200
    PATIENCE = 10          # epochs without improvement before stopping early
    TRAIN_TIMESTAMPS = 60  # most recent train timestamps used for fitting
    VALID_TIMESTAMPS = 10  # most recent valid timestamps used for early stopping
    LOOKBACK = 3           # previous dates fed to the RNN per sample
    HIDDEN_DIM = 100
    CHECKPOINT_PATH = 'best_rnn.p'

    train_id_answer, train_input = train_data
    valid_id_answer, valid_input = valid_data
    test_id, test_input = test_data

    # One sequence per (timestamp, stock).
    x_train = _windows_per_stock(train_input, LOOKBACK, TRAIN_TIMESTAMPS)
    x_valid = _windows_per_stock(valid_input, LOOKBACK, VALID_TIMESTAMPS)
    x_test = _windows_per_stock(test_input, LOOKBACK)

    # NOTE(review): assumes 'answer' holds one row per (timestamp, stock) in
    # timestamp-major order, so the tail of the column lines up with the tail
    # of the inputs -- confirm against utils.load_data.
    train_stocks = train_input.shape[1] - 1
    valid_stocks = valid_input.shape[1] - 1
    y_train = train_id_answer['answer'].values[-TRAIN_TIMESTAMPS * train_stocks:]
    y_valid = valid_id_answer['answer'].values[-VALID_TIMESTAMPS * valid_stocks:]

    # Convert data into float32 tensors
    x_train = torch.as_tensor(x_train, dtype=torch.float32)
    y_train = torch.as_tensor(y_train, dtype=torch.float32)
    x_valid = torch.as_tensor(x_valid, dtype=torch.float32)
    y_valid = torch.as_tensor(y_valid, dtype=torch.float32)
    x_test = torch.as_tensor(x_test, dtype=torch.float32)

    # Build torch dataset, dataloader
    print(x_train.shape, y_train.shape)
    print(x_valid.shape, y_valid.shape)
    print(x_test.shape)
    train_loader = DataLoader(TensorDataset(x_train, y_train), batch_size=BATCH_SIZE)
    valid_loader = DataLoader(TensorDataset(x_valid, y_valid), batch_size=BATCH_SIZE)
    test_loader = DataLoader(TensorDataset(x_test), batch_size=TEST_BATCH_SIZE)

    # Build RNN model
    model = MyModel(x_train.shape[-1], HIDDEN_DIM).to(device)

    # Optimizer and loss function
    optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
    loss_fn = nn.L1Loss()

    # Early-stopping state. The counter starts at zero and the sentinel is
    # infinite, so the first epoch always records a best epoch and writes a
    # checkpoint before the patience test can fire.
    best_mae = float('inf')
    best_epoch = 0
    epochs_without_improvement = 0

    train_s = time()
    for epoch in range(1, NUM_EPOCHS + 1):
        epoch_s = time()

        model.train()
        train_losses = []
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()

            # reshape(-1) keeps a 1-D (batch,) output even if the model
            # returns a 0-d tensor for a single-sample batch.
            outputs = model(inputs).reshape(-1)
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())

        model.eval()
        valid_losses = []
        val_pred = []
        val_gt = []
        # Disable gradient calculation for memory, computation efficiency
        with torch.no_grad():
            for inputs, targets in valid_loader:
                inputs, targets = inputs.to(device), targets.to(device)

                outputs = model(inputs).reshape(-1)
                valid_losses.append(loss_fn(outputs, targets).item())
                val_pred.extend(outputs.cpu().tolist())
                val_gt.extend(targets.cpu().tolist())

        epoch_elapsed = time() - epoch_s

        valid_mae = mean_absolute_error(val_gt, val_pred)
        print(
            'epoch: {}, train loss: {:.4f}, valid loss: {:.4f}, valid mae: {:.4f}, elapsed: {:.4f}'
            .format(epoch, np.mean(train_losses), np.mean(valid_losses), valid_mae, epoch_elapsed)
        )

        if valid_mae < best_mae:
            print('Best Accuracy updated (%.4f => %.4f)' % (best_mae, valid_mae))
            best_mae = valid_mae
            best_epoch = epoch
            epochs_without_improvement = 0
            # Save best model
            torch.save(model.state_dict(), CHECKPOINT_PATH)
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= PATIENCE:
                print('Early stop triggered...!')
                break
    train_elapsed = time() - train_s

    print('Training Finished...!!')
    print('Time: %.4f' % train_elapsed)
    print('Best Valid acc : %.4f at epoch %d' % (best_mae, best_epoch))

    # Load best model; map_location keeps this working on whichever device is in use.
    model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=device))
    model.eval()

    # Make prediction on test data
    test_preds = []
    with torch.no_grad():
        for (inputs,) in test_loader:
            outputs = model(inputs.to(device)).reshape(-1)
            # .cpu() is a no-op for CPU tensors, so no device branch is needed.
            test_preds.extend(outputs.cpu().tolist())

    # Build the result on a copy so the caller's DataFrame is left untouched.
    pred = test_id.loc[:, ['id']].copy()
    pred['answer'] = test_preds

    return pred

---

# YOUR WORK IS DONE!
Do not touch any line below. <br>
The ```run``` function will grab your prediction and make ```submission.csv```. <br>
Take it and submit to Kaggle!

In [None]:
# Hand train_and_predict and the loaded splits to the course-provided run helper (imported from utils).
run(train_and_predict, train_data, valid_data, test_input)

torch.Size([8460, 3, 11]) torch.Size([8460])
torch.Size([1410, 3, 11]) torch.Size([1410])
torch.Size([34545, 3, 11])
epoch: 1, train loss: 1.5276, valid loss: 1.8992, valid mae: 1.8772, elapsed: 0.1647
Best Accuracy updated (99999.0000 => 1.8772)
epoch: 2, train loss: 1.5231, valid loss: 1.9053, valid mae: 1.8836, elapsed: 0.1834
epoch: 3, train loss: 1.5224, valid loss: 1.9089, valid mae: 1.8874, elapsed: 0.1374
epoch: 4, train loss: 1.5221, valid loss: 1.9114, valid mae: 1.8900, elapsed: 0.1395
epoch: 5, train loss: 1.5218, valid loss: 1.9131, valid mae: 1.8917, elapsed: 0.1523
epoch: 6, train loss: 1.5218, valid loss: 1.9121, valid mae: 1.8908, elapsed: 0.1471
epoch: 7, train loss: 1.5214, valid loss: 1.9116, valid mae: 1.8903, elapsed: 0.1377
epoch: 8, train loss: 1.5211, valid loss: 1.9112, valid mae: 1.8900, elapsed: 0.1571
epoch: 9, train loss: 1.5209, valid loss: 1.9110, valid mae: 1.8898, elapsed: 0.1407
epoch: 10, train loss: 1.5207, valid loss: 1.9108, valid mae: 1.8896, ela