In [None]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Dict, List, Optional

import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Fix the seed of every random number generator used in this notebook
# (Python's `random`, NumPy and PyTorch) so that a re-run draws the same numbers.
SEED = 190
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## 1. Split into Train and Validation Dataset

In [None]:
def get_category(df):
    """Build an integer code table for every text-valued column of ``df``.

    Args:
        df: DataFrame to scan.

    Returns:
        A dict mapping each object- or string-dtype column name to a dict
        ``{value: code}``. Codes are 0, 1, 2, ... in order of first appearance
        of each distinct value. Columns of any other dtype are left out.
    """
    data_category = {}
    for col in df.columns:
        column = df[col]
        # Read the dtype straight from the column. The previous
        # ``df.dtypes[[col]][0]`` used an integer key positionally on a
        # label-indexed Series, which pandas has deprecated.
        is_text = column.dtype == object or isinstance(column.dtype, pd.StringDtype)
        if is_text:
            distinct_values = column.drop_duplicates().tolist()
            data_category[col] = {cat: i for i, cat in enumerate(distinct_values)}

    return data_category


In [None]:
# Load the full table and derive the category codes from all of its rows,
# before splitting, so both splits are encoded with the same code table.
df = pd.read_csv('predict_df.csv', low_memory=False)
data_category = get_category(df)

# Row-wise random split: a row goes to the training set when its uniform draw
# exceeds 0.2, otherwise to the validation set.
msk = np.random.rand(len(df)) > 0.2
train_df, valid_df = df[msk], df[~msk]

# Uncomment to persist the two splits:
# train_df.to_csv('sbc_train.csv', index=False)
# valid_df.to_csv('sbc_valid.csv', index=False)

## 2. Dataset for Prediction

In [None]:
from torch.utils.data.dataset import Dataset
from sklearn.preprocessing import OneHotEncoder

class SBCDataset(Dataset):
    """Map-style dataset that turns predictor columns into a dense float matrix.

    Predictor names are interpreted as follows:

    * a name containing ``":"`` (``"factor:covariate"``) is an interaction term:
      the one-hot columns of ``factor`` multiplied by the ``covariate`` column;
    * any other name containing ``"currLength"`` is copied as a numeric column;
    * every remaining name is treated as categorical and one-hot encoded with
      its first level dropped.

    Attributes:
        cat_dict: ``{column: {value: code}}`` table used to encode categories.
        pred_data: ``numpy`` array of shape ``(n_rows, n_features)``.
        labels: ``numpy`` array of shape ``(n_rows, 1)``, or ``None`` when
            ``inference`` is true and the response column is absent.
        inference: when true, ``__getitem__`` returns features only.
        shape: shape of ``pred_data``.
    """

    def __init__(self, raw_data, cat_dict: dict, predictors: List[str], response: str, inference=False):
        """Encode ``raw_data`` and store features and labels.

        Args:
            raw_data: DataFrame holding the predictor (and response) columns.
            cat_dict: ``{column: {value: code}}`` code table.
            predictors: predictor names, see the class docstring.
            response: name of the label column.
            inference: if true, items are returned without a label and the
                response column may be missing from ``raw_data``.
        """
        # one_hot() reads self.cat_dict, so it has to be set first.
        self.cat_dict = cat_dict
        self.pred_data = self.one_hot(self.encode(raw_data, cat_dict, predictors))
        if inference and response not in raw_data.columns:
            # Unlabelled data at inference time: do not fail on the missing column.
            self.labels = None
        else:
            self.labels = raw_data[[response]].to_numpy(dtype=float)
        self.inference = inference
        self.shape = self.pred_data.shape

    def encode(self, raw_data, cat_dict, predictors):
        """Return a frame with one column per predictor, categories as codes.

        Categorical columns that have an entry in ``cat_dict`` are mapped to
        their integer codes; values without a code become NaN. All other
        columns are copied unchanged.
        """
        data = pd.DataFrame(index=raw_data.index)
        for col in predictors:
            if "currLength" not in col and ":" not in col:
                if col in cat_dict:
                    # Series.map gives a plain numeric column on every pandas
                    # version, unlike DataFrame.replace with a nested dict.
                    data[col] = raw_data[col].map(cat_dict[col])
                else:
                    data[col] = raw_data[col]
            elif ":" in col and col not in raw_data.columns:
                # one_hot() rebuilds an interaction from its two component
                # columns and only needs the term's name, so a literal
                # "a:b" column is not required in the raw data.
                # NOTE(review): both components must themselves be listed in
                # `predictors`, otherwise one_hot() cannot find them.
                data[col] = np.nan
            else:
                data[col] = raw_data[col]
        return data

    def _dummies(self, frame, col):
        """One-hot encode ``frame[col]`` with the first level dropped.

        The levels come from ``self.cat_dict[col]``, not from the values that
        happen to be present, so every split of the data yields the same
        columns in the same order. Values without a known code encode as an
        all-zero row.
        """
        codes = self.cat_dict.get(col)
        if not codes:
            # No code table for this column: keep the pandas default, which
            # encodes text columns and passes numeric columns through.
            return pd.get_dummies(frame[[col]], drop_first=True, dtype=float).to_numpy(dtype=float)
        kept_levels = np.array(sorted(set(codes.values()))[1:], dtype=float)
        values = pd.to_numeric(frame[col], errors="coerce").to_numpy(dtype=float)
        # (n, 1) == (1, k - 1) broadcasts to the (n, k - 1) indicator matrix;
        # NaN compares unequal to every level.
        return (values[:, None] == kept_levels[None, :]).astype(float)

    def one_hot(self, raw_data):
        """Expand an encoded frame (see ``encode``) into the feature matrix.

        Returns:
            Float ``numpy`` array with one row per row of ``raw_data``; the
            column blocks follow the order of ``raw_data.columns``.
        """
        blocks = []
        for col in raw_data.columns:
            if "currLength" not in col and ":" not in col:
                blocks.append(self._dummies(raw_data, col))
            elif ":" in col:
                temp_col = col.split(":")
                covariate = raw_data[[temp_col[1]]].to_numpy(dtype=float)
                blocks.append(self._dummies(raw_data, temp_col[0]) * covariate)
            else:
                blocks.append(raw_data[[col]].to_numpy(dtype=float))
        if not blocks:
            return np.zeros(shape=(len(raw_data), 0))
        # Concatenate once instead of re-allocating with np.append per column.
        return np.concatenate(blocks, axis=1)

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.pred_data)

    def __getitem__(self, idx):
        """Return ``(features, label)`` tensors, or only ``features`` in inference mode."""
        features = torch.tensor(self.pred_data[idx], dtype=torch.float)
        if self.inference:
            return features
        return features, torch.tensor(self.labels[idx], dtype=torch.float)


In [None]:
# Mini-batch sizes used for training and for validation.
BATCH_SIZE = 2048
TEST_BATCH_SIZE = 2048

# Columns fed to the model; the target column is "preBoundary".
predictors = [
    "prevEndnote",
    "prev1", "prev2", "prev3",
    "currLength",
    "init1", "init2", "init3",
]
train_ds = SBCDataset(train_df, data_category, predictors, "preBoundary")
valid_ds = SBCDataset(valid_df, data_category, predictors, "preBoundary")

# Training batches are reshuffled each epoch; validation keeps the row order.
train_loader = torch.utils.data.DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE)
valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size=TEST_BATCH_SIZE)

## 3. Define the Training and Validation Loops

In [None]:
class Trainer(object):
    """Training harness: Adam optimisation with per-epoch validation and checkpointing.

    The model and the criterion are moved to the module-level ``device``.
    """

    def __init__(self, model, criterion, lr_rate, max_epoch):
        """
        Args:
            model: ``nn.Module`` to train.
            criterion: loss module called as ``criterion(outputs, labels)``.
            lr_rate: learning rate passed to Adam.
            max_epoch: number of training epochs executed by ``run``.
        """
        self.model = model.to(device)
        self.criterion = criterion.to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr_rate)
        self.max_epoch = max_epoch

    def run(self, train_loader, valid_loader):
        """Validate once, then alternate one training epoch and one validation pass."""
        # calculate the initial loss and accuracy on the validation set
        valid_best_loss = self.validate(-1, valid_loader, best_loss=None)
        for epoch in range(self.max_epoch):
            self.train(epoch, train_loader)
            # save the checkpoint with the lowest validation loss
            valid_best_loss = self.validate(epoch, valid_loader, valid_best_loss)

    def train(self, epoch, loader):
        """Run one optimisation pass over ``loader`` and print the mean loss and accuracy."""
        self.model.train()
        running_loss, total, correct = 0.0, 0, 0
        with tqdm(loader, mininterval=10) as tepoch:
            for inputs, labels in tepoch:
                # inputs: tensor, (batch_size, predictors_size)
                # labels: tensor, (batch_size, 1)
                inputs, labels = inputs.to(device), labels.to(device)
                self.optimizer.zero_grad()

                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)
                loss.backward()
                self.optimizer.step()

                # the metric needs no gradient, so hand it a detached view
                match, number = self.cal_metric(outputs.detach(), labels)

                # gather statistics
                total += number
                correct += match
                running_loss += loss.item()
                tepoch.set_postfix(loss=loss.item(), accuracy=100. * correct / total)

        # mean of the per-batch losses
        running_loss /= len(loader)

        print('Training | Epoch: {}| Loss: {:.3f} | Accuracy on train: {:.1f}%'.format \
              (epoch+1, running_loss, 100 * correct / total))

    def validate(self, epoch, loader, best_loss=None):
        """Evaluate on ``loader`` and checkpoint the model when the loss improves.

        Args:
            epoch: zero-based epoch index (``-1`` for the pre-training pass);
                printed and used in the checkpoint file name as ``epoch + 1``.
            loader: validation data loader.
            best_loss: lowest validation loss so far, or ``None``.

        Returns:
            The lowest validation loss seen, including this pass.
        """
        # switch to the evaluation mode, do not need to calculate the gradient
        self.model.eval()
        running_loss, total, correct = 0.0, 0, 0
        # no_grad() is what actually disables autograd bookkeeping; eval()
        # alone only changes the behaviour of layers such as dropout.
        with torch.no_grad():
            for inputs, labels in tqdm(loader):
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)

                # calculate the metric
                match, number = self.cal_metric(outputs, labels)

                # gather statistics
                total += number
                correct += match
                running_loss += loss.item()

        running_loss /= len(loader)

        if best_loss is None or running_loss < best_loss:
            # if a better loss appears, save the checkpoint
            # NOTE(review): torch.save(model) pickles the whole module object;
            # kept as is because the checkpoint format is part of the output.
            save_file = 'best_epoch{}_loss{:.2f}_accu{:.2f}.pt'.format(epoch+1, running_loss, 100 * correct / total)
            print('Save to file: ', save_file)
            torch.save(self.model, save_file)

            # overwrite the best_checkpoint.pt file
            torch.save(self.model, 'best_checkpoint.pt')

            best_loss = running_loss

        print('Validation | Epoch: {}| Loss: {:.3f} | Accuracy on val: {:.1f}%'.format \
              (epoch+1, running_loss,100 * correct / total))

        return best_loss

    def cal_metric(self, outputs, labels):
        """Count correct predictions in a batch.

        With more than one output column the prediction is the arg-max over
        dim 1. With a single output the value is thresholded: at 0 when the
        criterion is ``nn.BCEWithLogitsLoss`` (outputs are logits), otherwise
        at 0.5 (outputs are probabilities). An arg-max over a single column
        would always predict class 0.

        Returns:
            ``(correct, number)``: matching predictions and batch size.
        """
        if outputs.dim() > 1 and outputs.size(1) > 1:
            predicted = outputs.argmax(dim=1, keepdim=True)
        else:
            uses_logits = isinstance(getattr(self, "criterion", None), nn.BCEWithLogitsLoss)
            predicted = outputs > (0.0 if uses_logits else 0.5)
        number = labels.size(0)
        # Give labels the shape of the predictions so that the comparison is
        # element-wise and never broadcasts to (batch, batch).
        targets = labels.reshape(predicted.shape) if labels.numel() == predicted.numel() else labels
        correct = (predicted.to(targets.dtype) == targets).sum().item()
        return correct, number

## 4. Define the Model Structure

In [None]:
class MLP(nn.Module):
    """Binary classifier: one hidden layer of 256 ReLU units and one sigmoid output."""

    def __init__(self, input_dim: int):
        """
        Args:
            input_dim: number of features in each input row.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.fc2 = nn.Linear(256, 1)

    def forward(self, x):
        """Map ``(batch, input_dim)`` features to ``(batch, 1)`` probabilities."""
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        # A softmax over a dimension of size 1 is constant (log_softmax == 0),
        # which blocks every gradient; a single-unit output needs a sigmoid,
        # which also yields the [0, 1] probability that nn.BCELoss expects.
        return torch.sigmoid(x)

## 5. Training

In [None]:
# Optimisation hyper-parameters.
NUM_EPOCH = 20
LEARNING_RATE = 0.001

# The width of the network's input layer is the number of feature columns
# of the training set.
input_shape = train_ds.shape
model = MLP(input_dim=input_shape[-1])

# Binary cross-entropy loss.
criterion = nn.BCELoss()
trainer = Trainer(
    model=model,
    criterion=criterion,
    lr_rate=LEARNING_RATE,
    max_epoch=NUM_EPOCH,
)