In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Dict, List, Optional

import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# set the random seed for reproduction
SEED=190
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

#checking if GPU is available or not
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## 1. Split into Train and Validation Dataset

In [2]:
def get_category(df):
    data_category = {}
    for col in df.columns:
        if df.dtypes[[col]][0] == object:
            data_category[col] = {}
            for (i, cat) in enumerate(df[[col]].drop_duplicates().values.flatten().tolist()):
                data_category[col][cat] = i
    
    return data_category


In [6]:
df = pd.read_csv('predict_df.csv', low_memory=False)
df.dropna(subset=["prevEndnote", "prev1", "prev2", "prev3", "currLength", "init1", "init2", "init3"])
data_category = get_category(df)

msk = np.random.rand(len(df)) > 0.2
train_df = df[msk]
valid_df = df[~msk]
# train_df.to_csv('sbc_train.csv', index=False)
# valid_df.to_csv('sbc_valid.csv', index=False)



  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == object:
  if df.dtypes[[col]][0] == 

## 2. Dataset for Prediction

In [7]:
from torch.utils.data.dataset import Dataset
from sklearn.preprocessing import OneHotEncoder

class SBCDataset(Dataset):
    def __init__(self, raw_data, cat_dict: dict, predictors: List[str], response: str, inference=False):
        self.cat_dict = cat_dict
        self.pred_data = self.one_hot(self.encode(raw_data, cat_dict, predictors))
        self.labels = raw_data[[response]].to_numpy(dtype=float)
        self.inference = inference
        self.shape = self.pred_data.shape

    def encode(self, raw_data, cat_dict, predictors):
        data = pd.DataFrame()
        for col in predictors:
            if "currLength" not in col and ":" not in col:
                data[[col]] = raw_data[[col]].replace(cat_dict)
        return data
    
    def one_hot(self, raw_data):
        data = np.zeros(shape=(len(raw_data), 1))
        for col in raw_data.columns:
            if "currLength" not in col and ":" not in col:
                data = np.append(data, 
                                pd.get_dummies(raw_data[[col]], drop_first=True, dtype=float), 
                                axis=1)
            elif ":" in col:
                temp_col = col.split(":")
                data = np.append(data, 
                                pd.get_dummies(raw_data[[temp_col[0]]], drop_first=True, dtype=float).to_numpy() * raw_data[[temp_col[1]]].to_numpy(), 
                                axis=1)
            else: 
                data = np.append(data, raw_data[[col]], axis=1)
        return data[:, 1:]

    def __len__(self):
        return len(self.pred_data)
    
    def __getitem__(self, idx):
        if not self.inference:
            label = self.labels[idx]
            return torch.tensor(self.pred_data[idx], dtype=torch.float), torch.tensor(label, dtype=torch.float)
        else:
            return torch.tensor(self.pred_data[idx], dtype=torch.float)


In [8]:
BATCH_SIZE = 2048
TEST_BATCH_SIZE = 2048

# create the dataset
predictors = ["prevEndnote", "prev1", "prev2", "prev3", "currLength", "init1", "init2", "init3"]
train_ds = SBCDataset(train_df, cat_dict=data_category, predictors=predictors, response="preBoundary")
valid_ds = SBCDataset(valid_df, cat_dict=data_category, predictors=predictors, response="preBoundary")

# build the dataloader
train_loader = torch.utils.data.DataLoader(
    train_ds, batch_size=BATCH_SIZE, shuffle=True
)
valid_loader = torch.utils.data.DataLoader(
    valid_ds, batch_size=TEST_BATCH_SIZE
)

## 3. Define the Training and Validation Loops

In [18]:
class Trainer(object):
    def __init__(self, model, criterion, lr_rate, max_epoch):
        self.model = model.to(device)
        self.criterion = criterion.to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr_rate)
        self.max_epoch = max_epoch

    def run(self, train_loader, valid_loader):
        # calculate the inital loss and accu on validation set
        valid_best_loss = self.validate(-1, valid_loader, best_loss=None)
        for epoch in range(self.max_epoch):
            self.train(epoch, train_loader)
            # save the checkpoint with the lowest validation loss
            valid_best_loss = self.validate(epoch, valid_loader, valid_best_loss)

    def train(self, epoch, loader):            
        self.model.train()
        running_loss = 0.0
        with tqdm(enumerate(loader, 0), mininterval=10) as tepoch:
            for i, data in tepoch:
                # get the inputs; data is a list of [inputs, labels]
                # inputs: tensor, (batch_size, predictors_size)
                # labels: tensor, (batch_size, 1)
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)
                self.optimizer.zero_grad()

                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)
                loss.backward()
                self.optimizer.step()

                # gather statistics
                running_loss += loss.item()
                tepoch.set_postfix(loss=loss.item())

        running_loss /= len(loader)
        
        print('Training | Epoch: {}| Loss: {:.3f}'.format(epoch+1, running_loss))

    def validate(self, epoch, loader, best_loss=None):
        # switch to the evaluation mode, do not need to calculate the gradient
        self.model.eval()
        running_loss = 0.0
        for i, data in tqdm(enumerate(loader)):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # replace the outputs and loss
            outputs = self.model(inputs)
            loss = self.criterion(outputs, labels)

            # gather statistics
            running_loss += loss.item()

        running_loss /= len(loader)

        if best_loss is None or running_loss < best_loss:
            # if a better loss appears, save the checkpoint
            save_file = 'best_epoch{}_loss{:.2f}.pt'.format(epoch+1, running_loss)
            print('Save to file: ', save_file)
            torch.save(self.model, save_file)

            # overwrite the best_checkpoint.pt file
            torch.save(self.model, 'best_checkpoint.pt')

            best_loss = running_loss

        print('Validation | Epoch: {}| Loss: {:.3f}'.format(epoch+1, running_loss))

        return best_loss

## 4. Define the Model Structure

In [19]:
class MLP(nn.Module):
    def __init__(self, input_dim: int):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.fc2 = nn.Linear(256, 1)

        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

## 5. Training

In [20]:
NUM_EPOCH = 20
LEARNING_RATE = 0.001
input_shape = train_ds.shape
model = MLP(input_shape[-1])

criterion = nn.MSELoss()
trainer = Trainer(model, criterion, LEARNING_RATE, max_epoch=NUM_EPOCH)

In [21]:
trainer.run(train_loader, valid_loader)

26it [00:00, 59.75it/s]


Save to file:  best_epoch0_loss47.74.pt
Validation | Epoch: 0| Loss: 47.743


103it [00:02, 44.05it/s, loss=0.344]


Training | Epoch: 1| Loss: 2.626


26it [00:00, 52.28it/s]


Save to file:  best_epoch1_loss0.35.pt
Validation | Epoch: 1| Loss: 0.349


103it [00:02, 44.63it/s, loss=0.26]


Training | Epoch: 2| Loss: 0.285


26it [00:00, 52.26it/s]


Save to file:  best_epoch2_loss0.25.pt
Validation | Epoch: 2| Loss: 0.246


103it [00:02, 44.16it/s, loss=0.233]


Training | Epoch: 3| Loss: 0.235


26it [00:00, 58.12it/s]


Save to file:  best_epoch3_loss0.22.pt
Validation | Epoch: 3| Loss: 0.221


103it [00:02, 43.18it/s, loss=0.214]


Training | Epoch: 4| Loss: 0.217


26it [00:00, 49.60it/s]


Save to file:  best_epoch4_loss0.21.pt
Validation | Epoch: 4| Loss: 0.210


103it [00:02, 44.53it/s, loss=0.212]


Training | Epoch: 5| Loss: 0.208


26it [00:00, 49.65it/s]


Save to file:  best_epoch5_loss0.20.pt
Validation | Epoch: 5| Loss: 0.203


103it [00:02, 44.06it/s, loss=0.197]


Training | Epoch: 6| Loss: 0.203


26it [00:00, 45.54it/s]


Save to file:  best_epoch6_loss0.20.pt
Validation | Epoch: 6| Loss: 0.200


103it [00:02, 43.43it/s, loss=0.189]


Training | Epoch: 7| Loss: 0.200


26it [00:00, 59.69it/s]


Save to file:  best_epoch7_loss0.20.pt
Validation | Epoch: 7| Loss: 0.198


103it [00:02, 42.42it/s, loss=0.209]


Training | Epoch: 8| Loss: 0.198


26it [00:00, 55.90it/s]


Save to file:  best_epoch8_loss0.19.pt
Validation | Epoch: 8| Loss: 0.195


103it [00:02, 44.15it/s, loss=0.208]


Training | Epoch: 9| Loss: 0.195


26it [00:00, 49.20it/s]


Validation | Epoch: 9| Loss: 0.196


103it [00:02, 43.20it/s, loss=0.196]


Training | Epoch: 10| Loss: 0.194


26it [00:00, 48.99it/s]


Save to file:  best_epoch10_loss0.19.pt
Validation | Epoch: 10| Loss: 0.193


103it [00:02, 43.49it/s, loss=0.193]


Training | Epoch: 11| Loss: 0.194


26it [00:00, 49.38it/s]


Validation | Epoch: 11| Loss: 0.193


103it [00:02, 43.51it/s, loss=0.19]


Training | Epoch: 12| Loss: 0.192


26it [00:00, 49.20it/s]


Save to file:  best_epoch12_loss0.19.pt
Validation | Epoch: 12| Loss: 0.190


103it [00:02, 39.59it/s, loss=0.195]


Training | Epoch: 13| Loss: 0.190


26it [00:00, 46.06it/s]


Validation | Epoch: 13| Loss: 0.191


103it [00:02, 43.57it/s, loss=0.189]


Training | Epoch: 14| Loss: 0.191


26it [00:00, 46.28it/s]


Save to file:  best_epoch14_loss0.19.pt
Validation | Epoch: 14| Loss: 0.190


103it [00:02, 41.94it/s, loss=0.191]


Training | Epoch: 15| Loss: 0.191


26it [00:00, 48.38it/s]


Validation | Epoch: 15| Loss: 0.193


103it [00:02, 42.64it/s, loss=0.187]


Training | Epoch: 16| Loss: 0.189


26it [00:00, 52.03it/s]


Save to file:  best_epoch16_loss0.19.pt
Validation | Epoch: 16| Loss: 0.188


103it [00:02, 42.91it/s, loss=0.192]


Training | Epoch: 17| Loss: 0.189


26it [00:00, 49.46it/s]


Validation | Epoch: 17| Loss: 0.189


103it [00:02, 44.71it/s, loss=0.192]


Training | Epoch: 18| Loss: 0.189


26it [00:00, 50.28it/s]


Validation | Epoch: 18| Loss: 0.190


103it [00:02, 44.86it/s, loss=0.192]


Training | Epoch: 19| Loss: 0.190


26it [00:00, 49.26it/s]


Save to file:  best_epoch19_loss0.19.pt
Validation | Epoch: 19| Loss: 0.188


103it [00:02, 43.58it/s, loss=0.19]


Training | Epoch: 20| Loss: 0.188


26it [00:00, 55.55it/s]

Validation | Epoch: 20| Loss: 0.198



