<!-- # Overview
- Project
    - data engineer 
        - keep acceptable sessions
        - create prediction classes and eliminate end of each session
        - make same input size for rnn
        - add features if this is suitable
    - code each rnn
        - vanilla rnn
        - lstm
        - gru -->

# Overview
- Load Data
    - 
- Modelling
    - Train-Test-Split
    - RNN
        - Model Def
        - Model Training
    - LSTM
        - Model Def
        - Model Training
    - GRU
        - Model Def
        - Model Training
    - (Transformer Model Def)
        - Transfer Learning/Model Def
        - Model Training
        
- Evaluation and Comparison of Models
    - Numerical Evalutaion Metrics
        - AUC and F1 Score
       - (Plotting number results comparing each as parameters change)
    - Plotted Evaluation Metrics
        - ROC Curve

# Imports

In [1]:
# manipulating data
import pandas as pd
import numpy as np

# Neural Networks
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch import Tensor
import torch.optim as optim
from torch.autograd import Variable

# Evaluation
from torchmetrics import F1Score
from torchmetrics.functional import auc
from torchmetrics import ConfusionMatrix
from torchmetrics.classification import MulticlassF1Score
from torchmetrics.functional import precision_recall
from torchmetrics.functional import auc
from torchmetrics import ROC
from sklearn.metrics import roc_auc_score

# handling time data
import time # for timestamps
import datetime

# data analysis
import matplotlib.pyplot as plt 

#
import math



# Modelling

### Baseline - The baseline would be predicting that nobody will purchase
## THIS SHOULD BE PUT LATER ACTUALLY ONCE CLASSES ARE MADE MORE BALANCED

#### For 2 Labels

In [48]:
np.unique(my2Labels, return_counts = True)[1][0] / len(my2Labels)

0.9587844058150812

#### For 3 Labels

In [49]:
np.unique(my3Labels, return_counts = True)[1][0] / len(my2Labels)

0.8106947870051741

####
- https://ai.stackexchange.com/questions/3156/how-to-select-number-of-hidden-layers-and-number-of-memory-cells-in-an-lstm

- There's one additional rule of thumb that helps for supervised learning problems. You can usually prevent over-fitting if you keep your number of neurons below:

𝑁ℎ=𝑁𝑠(𝛼∗(𝑁𝑖+𝑁𝑜))

- 𝑁𝑖 = number of input neurons.
- 𝑁𝑜 = number of output neurons.
- 𝑁𝑠 = number of samples in training data set.
- 𝛼 = an arbitrary scaling factor usually 2-10.

Guy says he geneerally uses 2

#### https://www.reddit.com/r/MachineLearning/comments/4behuh/does_the_number_of_layers_in_an_lstm_network/
Some discussion about what "depth" in recurrent architectures means. Downward skip-connections seem to be the most helpful, but in general skip connections are critical in deep recurrent networks.

#### 
- https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw
- usually one hidden layer is fine

### This looks like best tutorial to follow so far (in an article)
- https://blog.floydhub.com/a-beginners-guide-on-recurrent-neural-networks-with-pytorch/
- https://coderzcolumn.com/tutorials/artificial-intelligence/pytorch-rnn-for-text-classification-tasks
    - didn't look bad either
- https://docs.wandb.ai/guides/integrations/pytorch
    - logging gradients with wandb

### Youtube Tutorial - Just Like Best One up There
- https://www.youtube.com/watch?v=1vGOQAel2yU&ab_channel=SungKim
- packed sequence

## Metric Functions

#### Binary Classification

In [3]:
def bin_class_metrics(pred, target, positive_class = 1):
    conf_mat = ConfusionMatrix(num_classes=2)
    cm = conf_mat(pred, target)
    print("Confusion Matrix (0 in Top Left): ")
    print(cm)
    
    if positive_class == 1: 
        # true positives / (true positives + false positives)
        # recall = true positives / (true positives + false negatives)
        tp = cm[1][1]
        fp = cm[0][1]
        fn = cm[1][0]
    else:
        tp = cm[0][0]
        fp = cm[1][0]
        fn = cm[0][1]
      
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1score = 2 * precision * recall / (precision + recall)
    
    if not ((precision.item() > 0) & (precision.item() < 100)):
        precision = 0
        print("\n\nprecision is nan (has been set to 0)")
    
    if not ((recall.item() > 0) & (recall.item() < 100)):
        recall = 0
        print("recall is nan (has been set to 0)")
    
    if not ((f1score.item() > 0) & (f1score.item() < 100)):
        f1score = 0
        print("F1score is nan (has been set to 0)\n\n")
    
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1-Score: ", f1score)
    
#     roc = ROC(num_classes = 2, pos_label=1)
#     score_fpr, score_tpr, _ = roc(target, pred)
#     score_roc_auc = roc_auc_score(target, pred)
    
    

In [4]:
def multiclass_metrics(pred, target, num_classes):
    conf_mat = ConfusionMatrix(num_classes=num_classes)
    cm = conf_mat(pred, target)
    print("Confusion Matrix (0 in Top Left): ")
    print(cm)
    
    
    metric = MulticlassF1Score(num_classes=num_classes, average='macro')
    f1_score_avg = metric(pred, target)
    print("F1-Score (Average)", f1_score_avg)

    metric = MulticlassF1Score(num_classes=num_classes, average=None)
    f1_score_each = metric(pred, target)
    print("F1-Score (each):")
    for i, f in enumerate(f1_score_each):
        print("Class ", i, ":", f)
    

In [5]:
def evaluate_model_metrics(model, num_class, dataloader):
    # to store all labels and predictions for f1-score
    all_pred = []
    all_label = []

    # since we're not training, we don't need to calculate the gradients for our outputs
    with torch.no_grad():
        for data in dataloader:
            inputs, labels = data
            # calculate outputs by running images through the network
            outputs, h = model(inputs)
            # the class with the highest energy is what we choose as prediction
            _, predicted = torch.max(outputs, 1)
            all_pred.append(predicted)
            all_label.append(labels)

    # get all predictions and labels into one array and as integer tensors
    all_pred = [i for s in all_pred for i in s]
    all_label = [i for s in all_label for i in s]
    all_label = [all_label[i][0] for i in range(0, len(all_label))]
    all_label = [all_label[i].to(dtype=torch.long) for i in range(len(all_label))]

    all_pred = torch.LongTensor(all_pred)
    all_label = torch.LongTensor(all_label)
    if num_class == 2:
        bin_class_metrics(all_pred, all_label, positive_class = 1)
    elif num_class > 2:
        multiclass_metrics(all_pred, all_label, num_class)
    return all_pred

In [6]:
def print_metrics(model, model_name, num_classes, train_dl, test_dl):
    print("-----------------------------------------------------------------------------------")
    print(model_name, " Metrics")
    print("Train")
    preds_lstm_train = evaluate_model_metrics(model, num_classes, train_dl)

    print("Test")
    preds_lstm = evaluate_model_metrics(lstm, num_classes, test_dl)

    print("-----------------------------------------------------------------------------------")

# Model Definitions
#### Training Function
- https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
- https://www.geeksforgeeks.org/training-neural-networks-with-validation-using-pytorch/
    - training with validation as well

## Vanilla RNN Model Definition

In [7]:
    
#         print("hidden2 shape: ", hidden.shape)
#         print("output1 shape: ", out.shape)
       
        # out = out[:, -1]   # found from online for a certain error, might want to investigate further
                            # refer to this for fix -> i think about three entries down
                            # https://stackoverflow.com/questions/4493554/neural-network-always-produces-same-similar-outputs-for-any-input
       
        # out = out[:, -1, :] # this solution from https://discuss.pytorch.org/t/cross-entropy-loss-target-size-and-output-size-mismatch/99031/8
                            # makes the loss huuuuggee
        
        # https://discuss.pytorch.org/t/valueerror-expected-target-size-32-7-got-torch-size-32/42409/4
            # might have some useful information about this stuff
        
        
        #out = out.contiguous().view(-1, self.hidden_size) # from second resource below
                                                          # doesn't seem to work either
#         print("out1 reshaped: ", out.reshape(-1, hidden_size))  # also doesn't seem to work
#         out = self.linear(out.reshape(-1, hidden_size))
        #scores = torch.nn.CrossEntropyLoss()

In [8]:
class RNNClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers = 1):
        super(RNNClassifier, self).__init__()
        
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        # self.embedding = (input_size, hidden_size)
        
        # input_size might need to be hidden_size as well
        #nonlinearity='relu',
        self.rnn = torch.nn.RNN(input_size, hidden_size,  batch_first = True, dropout = 0)
        # MAYBE NEED TO ADD ANOTHER LINEAR LAYER
        self.linear = torch.nn.Linear(hidden_size, output_size)
        
    def forward(self, sequences):

        batch_size = sequences.size(0)
    
        #embedded = self.embedding(sequence)
#         print(sequences.shape)
        
        hidden = self._init_hidden(batch_size)
        out, hidden = self.rnn(sequences, hidden) # embedded here for sequence if not commented out
                
        out = self.linear(hidden[-1])
    
        return out, hidden
    
    def _init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size)
        return Variable(hidden)
           

In [9]:
# https://github.com/hunkim/PyTorchZeroToAll/blob/master/13_2_rnn_classification.py
# - written with this, use the one below
# https://blog.floydhub.com/a-beginners-guide-on-recurrent-neural-networks-with-pytorch/

# TRY THIS - THIS ONE WORKED!!!
# https://github.com/rasbt/stat453-deep-learning-ss21/blob/main/L15/1_lstm.ipynb


## LSTM Model Definition
- can try stacking lstm
- dropout
- bidirectional lstm (isn't really used for predicting the future)
- cnn lstm hybrid

In [10]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers = 1):
        super(LSTMClassifier, self).__init__()
        
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        # self.embedding = (input_size, hidden_size)
        
        # input_size might need to be hidden_size as well
        self.lstm = torch.nn.LSTM(input_size, hidden_size, batch_first = True, dropout = 0)
        # MAYBE NEED TO ADD ANOTHER LINEAR LAYER
        self.linear = torch.nn.Linear(hidden_size, output_size)
        
    def forward(self, sequences):

        batch_size = sequences.size(0)
    
        #embedded = self.embedding(sequence)
#         print(sequences.shape)
        
        hidden = self._init_hidden(batch_size)
        cell = self._init_hidden(batch_size)
        out, (hidden, cell) = self.lstm(sequences, (hidden, cell)) # embedded here for sequence if not commented out
    
#         output, hidden = self.lstm(sequences)
        out = self.linear(hidden[-1])
    
        return out, hidden
    
    def _init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size)
        return Variable(hidden)

## GRU Model Definition

In [11]:
class GRUClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers = 1):
        super(GRUClassifier, self).__init__()
        
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        # self.embedding = (input_size, hidden_size)
        
        # input_size might need to be hidden_size as well
        self.gru = torch.nn.GRU(input_size, hidden_size, batch_first = True, dropout = 0)
        # MAYBE NEED TO ADD ANOTHER LINEAR LAYER
        self.linear = torch.nn.Linear(hidden_size, output_size)
        
    def forward(self, sequences):

        batch_size = sequences.size(0)
    
        #embedded = self.embedding(sequence)
#         print(sequences.shape)
        
        hidden = self._init_hidden(batch_size)
        out, hidden = self.gru(sequences, hidden) # embedded here for sequence if not commented out
            
        out = self.linear(hidden[-1])
    
        return out, hidden
    
    def _init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size)
        return Variable(hidden)

## Model Training Function

In [12]:
def train_model(model, loss_fn, optimizer, num_epochs, train_dataloader, val_dataloader):

    for epoch in range(num_epochs):  # loop over the dataset multiple times

        running_loss = 0.0
        for i, data in enumerate(train_dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            #labels = labels.long() # convert to expected target datatype (Long which is equivalent to int here)
            labels = labels.type(torch.LongTensor)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs, h = model(inputs)

            loss = loss_fn(outputs,labels.view(-1).long()) # do i need to fix what's in here (even necessary to have it)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999:    # print every 2000 mini-batches
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
                running_loss = 0.0


    print('Finished Training')
    return model

## Model Training

#### Get Dataloaders

In [227]:
# get dataloders for 2 classes
batch_size = 64

train_dataloader_2 
test_dataloader_2 = get_dataloaders(2, batch_size)

train_dataloader_3 
test_dataloader_3 = get_dataloaders(3, batch_size)


## Hyperparameter Tuning

In [None]:
# https://medium.com/distributed-computing-with-ray/scaling-up-pytorch-lightning-hyperparameter-tuning-with-ray-tune-4bd9e1ff9929

## Early Stopping

In [None]:
# https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py

#### Train RNN

In [None]:
# model parameters for instantiation
input_size = seq_arrays.shape[2]
hidden_size = 32
output_size = 2
n_layers = 1

#instantiate model
rnn = RNNClassifier(input_size, hidden_size, output_size, n_layers)

loss_fn_rnn = torch.nn.CrossEntropyLoss() # SHOULD USE WEIGHT PARAMETER SINCE UNBALANCED
learning_rate = 0.001

optimizer_rnn = torch.optim.Adam(rnn.parameters(), lr=learning_rate)

In [None]:
# train
num_epochs = 3
rnn = train_model(rnn, loss_fn_rnn, optimizer_rnn, num_epochs, train_dataloader_2, val_dataloader_2)

In [None]:
print("-----------------------------------------------------------------------------------")
print("RNN Metrics")
print("Train")
preds_rnn_train = evaluate_model_metrics(rnn, 2, train_dataloader_2)

print("Test")
preds_rnn = evaluate_model_metrics(rnn, 2, test_dataloader_2)
print("-----------------------------------------------------------------------------------")

#### Train LSTM

In [None]:
# model parameters for instantiation
input_size = seq_arrays.shape[2]
hidden_size = 32
output_size = 2
n_layers = 1

#instantiate model
lstm = LSTMClassifier(input_size, hidden_size, output_size, n_layers)

loss_fn_lstm = torch.nn.CrossEntropyLoss() # SHOULD USE WEIGHT PARAMETER SINCE UNBALANCED
learning_rate = 0.001

optimizer_lstm = torch.optim.Adam(lstm.parameters(), lr=learning_rate)

In [None]:
# train
num_epochs = 3
lstm = train_model(lstm, loss_fn_lstm, optimizer_lstm, num_epochs, train_dataloader_2, val_dataloader_2)

In [None]:
print("-----------------------------------------------------------------------------------")
print("LSTM Metrics")
print("Train")
preds_lstm_train = evaluate_model_metrics(lstm, 2, train_dataloader_2)

print("Test")
preds_lstm = evaluate_model_metrics(lstm, 2)

print("-----------------------------------------------------------------------------------")

#### Train GRU

In [292]:
# model parameters for instantiation
input_size = seq_arrays.shape[2]
hidden_size = 64
output_size = 2
n_layers = 1

#instantiate model
gru = GRUClassifier(input_size, hidden_size, output_size, n_layers)

loss_fn_gru = torch.nn.CrossEntropyLoss() # SHOULD USE WEIGHT PARAMETER SINCE UNBALANCED
learning_rate = 0.001

optimizer_gru = torch.optim.Adam(gru.parameters(), lr=learning_rate)

In [295]:
# train
num_epochs = 3
gru = train_model(gru, loss_fn_gru, optimizer_gru, num_epochs, train_dataloader_2, val_dataloader_2)

[1,  2000] loss: 0.149
[2,  2000] loss: 0.080
[3,  2000] loss: 0.059
Finished Training


In [None]:
print("-----------------------------------------------------------------------------------")
print("GRU Metrics")
print("Train")
preds_gru_train = evaluate_model_metrics(gru, 2, train_dataloader_2)

print("Test")
preds_gru = evaluate_model_metrics(gru, 2, test_dataloader_2)
print("-----------------------------------------------------------------------------------")

In [296]:
# # model parameters for instantiation
# input_size = seq_arrays.shape[2]
# hidden_size = 32
# output_size = 3
# n_layers = 1

# #instantiate model
# gru = GRUClassifier(input_size, hidden_size, output_size, n_layers)

# loss_fn_gru = torch.nn.CrossEntropyLoss() # SHOULD USE WEIGHT PARAMETER SINCE UNBALANCED
# learning_rate = 0.001

# optimizer_gru = torch.optim.Adam(gru.parameters(), lr=learning_rate)

# # train
# num_epochs = 3
# gru = train_model(gru, loss_fn_gru, optimizer_gru, num_epochs, train_dataloader_3, val_dataloader_3)

print("-----------------------------------------------------------------------------------")
print("GRU Metrics")
print("Train")
preds_gru_train = evaluate_model_metrics(gru, 3, train_dataloader_3)

print("Test")
preds_gru = evaluate_model_metrics(gru, 3, test_dataloader_3)
print("-----------------------------------------------------------------------------------")

-----------------------------------------------------------------------------------
GRU Metrics
Train
Confusion Matrix (0 in Top Left): 
tensor([[115083,    282,      0],
        [ 19618,   1353,      0],
        [  1166,   4687,      0]])
F1-Score (Average) tensor(0.3384)
F1-Score (each):
Class  0 : tensor(0.9161)
Class  1 : tensor(0.0991)
Class  2 : tensor(0.)
Test
Confusion Matrix (0 in Top Left): 
tensor([[24571,    61,     0],
        [ 4302,   261,     0],
        [  267,  1007,     0]])
F1-Score (Average) tensor(0.3342)
F1-Score (each):
Class  0 : tensor(0.9139)
Class  1 : tensor(0.0886)
Class  2 : tensor(0.)
-----------------------------------------------------------------------------------


RNN Metrics
Train
Confusion Matrix (0 in Top Left): 
tensor([[136334,      2],
        [  5853,      0]])


precision is nan (has been set to 0)
recall is nan (has been set to 0)
F1score is nan (has been set to 0)


Precision:  0
Recall:  0
F1-Score:  0
Test
Confusion Matrix (0 in Top Left): 
tensor([[29194,     1],
        [ 1274,     0]])


precision is nan (has been set to 0)
recall is nan (has been set to 0)
F1score is nan (has been set to 0)


Precision:  0
Recall:  0
F1-Score:  0


In [205]:
print("-----------------------------------------------------------------------------------")
print("LSTM Metrics")
print("Train")
preds_lstm_train = evaluate_model_metrics(lstm, 2, train_dataloader_2)

print("Test")
preds_lstm = evaluate_model_metrics(lstm, 2)

print("-----------------------------------------------------------------------------------")

LSTM Metrics
Train
Confusion Matrix (0 in Top Left): 
tensor([[136336,      0],
        [  5853,      0]])


precision is nan (has been set to 0)
recall is nan (has been set to 0)
F1score is nan (has been set to 0)


Precision:  0
Recall:  0
F1-Score:  0
Test
Confusion Matrix (0 in Top Left): 
tensor([[29195,     0],
        [ 1274,     0]])


precision is nan (has been set to 0)
recall is nan (has been set to 0)
F1score is nan (has been set to 0)


Precision:  0
Recall:  0
F1-Score:  0


In [235]:
print("-----------------------------------------------------------------------------------")
print("GRU Metrics")
print("Train")
preds_gru_train = evaluate_model_metrics(gru, 2, train_dataloader_2)

print("Test")
preds_gru = evaluate_model_metrics(gru, 2, test_dataloader_2)
print("-----------------------------------------------------------------------------------")

GRU Metrics
Train
Confusion Matrix (0 in Top Left): 
tensor([[133283,   3053],
        [  1170,   4683]])
Precision:  tensor(0.6054)
Recall:  tensor(0.8001)
F1-Score:  tensor(0.6892)
Test
Confusion Matrix (0 in Top Left): 
tensor([[28553,   642],
        [  259,  1015]])
Precision:  tensor(0.6126)
Recall:  tensor(0.7967)
F1-Score:  tensor(0.6926)


In [None]:
num_epochs = 3

for epoch in range(num_epochs):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(train_dataloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        
        #labels = labels.long() # convert to expected target datatype (Long which is equivalent to int here)
        labels = labels.type(torch.LongTensor)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs, h = gru(inputs)
        
#         outputs = outputs[:,154,0:2]
        loss = loss_fn_gru(outputs,labels.view(-1).long()) # do i need to fix what's in here (even necessary to have it)
#         loss = loss_fn(outputs, labels.reshape(-1))
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0
        

print('Finished Training')