In [None]:
# Root directory holding the datasets and model checkpoints used below (relative path).
RESOURCES_PATH = '../../../resources'

In [None]:
# Experiment identifier; it names the checkpoint directory and the report file.
NAME = 'categorical_ffnn'

# Upper bound for the epoch counter of the training loop.
MAX_EPOCHS = 300
# Number of epochs without validation-loss improvement that is tolerated before training stops early.
EARLY_STOP_PATIENCE = 30

In [None]:
from pathlib import Path
from time import time, strftime, gmtime
import pickle
import multiprocessing
from collections import namedtuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from scipy.special import softmax

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

import torch
from torch import tensor
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [None]:
# Make sure the checkpoint directory of this experiment exists before anything is written to it.
Path(f'{RESOURCES_PATH}/model_checkpoint/budget/{NAME}/').mkdir(parents=True, exist_ok=True)

## Load & prepare data

In [None]:
# Load the cleared budget dataset (tab-separated) and replace missing values with empty strings.
df = pd.read_csv(f'{RESOURCES_PATH}/dataset/budget/cleared.tsv', sep='\t').fillna('')

In [None]:
# Target encoder: its class list is read from targets.txt instead of being fitted on the data.
# NOTE(review): split('\n') yields a trailing '' entry when the file ends with a newline —
# confirm that this is intended.
budget_classes = Path(f'{RESOURCES_PATH}/dataset/budget/targets.txt').read_text().split('\n')
budget_le = LabelEncoder()
budget_le.classes_ = np.array(budget_classes)

# Feature encoders are fitted on the cleared dataset (fit() returns the encoder itself).
object_le = LabelEncoder().fit(df.object)
project_le = LabelEncoder().fit(df.project)
financing_le = LabelEncoder().fit(df.financing)

In [None]:
# Groups the three encoded input features that are fed to the network together.
NetInput = namedtuple('NetInput', 'object project financing')

In [None]:
def to_vectors(df):
    """Encode a budget DataFrame into network inputs and integer targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``object``, ``project``, ``financing`` and ``budget`` columns.

    Returns
    -------
    (NetInput, torch.Tensor)
        One-hot encoded feature tensors and the label-encoded targets.
    """
    def one_hot(encoder, column):
        # Pin the width to the encoder's vocabulary. When left to infer it, to_categorical
        # uses the largest id present, so a frame that lacks the highest class ids would
        # yield a narrower matrix than the model's input layers expect.
        return to_categorical(encoder.transform(column), num_classes=len(encoder.classes_))

    y = budget_le.transform(df.budget)
    x_object = one_hot(object_le, df.object)
    x_project = one_hot(project_le, df.project)
    x_financing = one_hot(financing_le, df.financing)

    return NetInput(tensor(x_object), tensor(x_project), tensor(x_financing)), tensor(y)

In [None]:
# Encode the whole cleared dataset; the train/validation splits are taken from these tensors later.
x, y = to_vectors(df)

## Model

In [None]:
class DatasetImpl(Dataset):
    """Pairs an input exposing ``object``/``project``/``financing`` fields with its targets."""

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        # The targets define the number of samples.
        return len(self.y)

    def __getitem__(self, index):
        features = self.x
        sample = NetInput(
            object=features.object[index],
            project=features.project[index],
            financing=features.financing[index],
        )
        return sample, self.y[index]

In [None]:
class ModelImpl(nn.Module):
    """Feed-forward classifier over three one-hot encoded categorical inputs.

    Each input passes through its own Linear -> BatchNorm -> ReLU -> Dropout branch. The branch
    outputs are concatenated, passed through one more block of the same shape and finally
    projected to class logits.

    Layer sizes are read from the notebook-level ``x`` and ``y`` at construction time, so both
    must be defined before the model is instantiated.

    Parameters
    ----------
    optimizer_fn : callable, optional
        Called with the freshly built model; its return value is stored as ``self.optimizer``.
        When omitted, ``self.optimizer`` is ``None``.
    loss : optional
        Stored unchanged as ``self.loss``.
    """

    def __init__(self, optimizer_fn=None, loss=None):
        super().__init__()

        hidden_size = 512
        dropout_rate = 0.2

        self.object_linear = nn.Linear(x.object.shape[1], hidden_size)
        self.object_batch_norm = nn.BatchNorm1d(hidden_size)
        self.object_dropout = nn.Dropout(dropout_rate)

        self.project_linear = nn.Linear(x.project.shape[1], hidden_size)
        self.project_batch_norm = nn.BatchNorm1d(hidden_size)
        self.project_dropout = nn.Dropout(dropout_rate)

        self.financing_linear = nn.Linear(x.financing.shape[1], hidden_size)
        self.financing_batch_norm = nn.BatchNorm1d(hidden_size)
        self.financing_dropout = nn.Dropout(dropout_rate)

        self.common_linear = nn.Linear(hidden_size * 3, hidden_size)
        self.common_batch_norm = nn.BatchNorm1d(hidden_size)
        self.common_dropout = nn.Dropout(dropout_rate)

        # One output unit per target id seen in the notebook-level ``y``.
        self.cls_linear = nn.Linear(hidden_size, int(y.max()+1))

        self.loss = loss
        # Identity comparison is the idiomatic "argument not supplied" check.
        self.optimizer = optimizer_fn(self) if optimizer_fn is not None else None

    @staticmethod
    def _block(inputs, linear, batch_norm, dropout):
        """Apply Linear -> BatchNorm -> ReLU -> Dropout to ``inputs``."""
        return dropout(F.relu(batch_norm(linear(inputs))))

    def forward(self, x):
        """Return class logits for a batch ``x`` exposing ``object``/``project``/``financing``."""
        obj_branch = self._block(
            x.object, self.object_linear, self.object_batch_norm, self.object_dropout)
        prj_branch = self._block(
            x.project, self.project_linear, self.project_batch_norm, self.project_dropout)
        fin_branch = self._block(
            x.financing, self.financing_linear, self.financing_batch_norm, self.financing_dropout)

        concatenated_branches = torch.cat((obj_branch, prj_branch, fin_branch), dim=1)

        common_branch = self._block(
            concatenated_branches, self.common_linear, self.common_batch_norm, self.common_dropout)

        return self.cls_linear(common_branch)

## Train

In [None]:
# Seed PyTorch's global random number generator so that runs are repeatable.
torch.manual_seed(42);

In [None]:
# Only the first fold of a stratified, seeded 10-fold split is used to separate train from validation.
splitter = StratifiedKFold(10, shuffle=True, random_state=42)
train_i, val_i = next(splitter.split(x.object, y))

# Index the full dataset once per split; each lookup returns (NetInput, targets).
full_dataset = DatasetImpl(x, y)
x_train, y_train = full_dataset[train_i]
x_val, y_val = full_dataset[val_i]

In [None]:
def log_metrics(model, epoch, history, train_losses, started_at):
    """Evaluate ``model`` on the validation split, record the epoch metrics and print them.

    Parameters
    ----------
    model : nn.Module
        Model to evaluate; its training/eval mode is restored before returning.
    epoch : int
        Epoch number, used only in the printed line.
    history : list
        Receives one dict per call with validation accuracy, validation loss and train loss.
    train_losses : sequence of float
        Batch losses of the epoch; their mean is reported as the train loss.
    started_at : float
        ``time()`` value taken when training started, used to print the elapsed time.

    Returns
    -------
    float
        Validation log loss of this epoch.
    """
    # Evaluate in inference mode. In training mode dropout stays active and batch norm both
    # normalises with, and updates its running statistics from, the validation data.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            y_val_logits = model(x_val)
    finally:
        # Hand the model back in whatever mode the caller had set.
        model.train(was_training)

    # Softmax has to be taken per sample; scipy's default (axis=None) normalises over the
    # whole matrix instead.
    y_val_proba = softmax(y_val_logits.numpy(), axis=1)

    val_acc = accuracy_score(y_val, y_val_proba.argmax(axis=1))
    val_loss = log_loss(y_val, y_val_proba, labels=y.unique())
    train_loss = np.array(train_losses).mean()

    history.append({
        'Validation Accuracy': val_acc,
        'Validation Loss': val_loss,
        'Train Loss': train_loss
    })

    print(f'Epoch #{epoch}: Val. Loss -- {val_loss}, Train Loss -- {train_loss}, Spent time -- {strftime("%Hh %Mm %Ss", gmtime(time() - started_at))}')

    return val_loss

In [None]:
def fit(model, dataloader):
    """Train ``model`` with early stopping on the validation loss.

    After every epoch the validation loss is computed through ``log_metrics``. Whenever it
    improves, the whole model is saved to the experiment's checkpoint file. Training stops once
    more than ``EARLY_STOP_PATIENCE`` epochs have passed since the best epoch, or after
    ``MAX_EPOCHS`` epochs.

    Parameters
    ----------
    model : nn.Module
        Must expose ``loss`` (callable on predictions and targets) and ``optimizer``.
    dataloader : iterable
        Yields ``(inputs, targets)`` batches.

    Returns
    -------
    (list, float)
        Per-epoch metric history and the wall-clock training time in seconds.
    """
    started_at = time()
    history = []

    best_epoch = 0
    best_loss = float('inf')
    checkpoint_path = f'{RESOURCES_PATH}/model_checkpoint/budget/{NAME}/model.pt'

    # Epochs are numbered from 1, so the bound must be inclusive to run MAX_EPOCHS epochs.
    for epoch in range(1, MAX_EPOCHS + 1):
        # Guarantee training behaviour of dropout / batch norm whatever mode the model was left in.
        model.train()
        train_losses = []

        for batch_x, batch_y in dataloader:
            y_pred = model(batch_x)

            loss = model.loss(y_pred, batch_y)
            loss.backward()

            model.optimizer.step()
            model.optimizer.zero_grad()

            train_losses.append(float(loss))

        val_loss = log_metrics(model, epoch, history, train_losses, started_at)

        if val_loss < best_loss:
            best_loss = val_loss
            best_epoch = epoch
            torch.save(model, checkpoint_path)
        elif epoch - best_epoch > EARLY_STOP_PATIENCE:
            print(f'    Early stop training. Best validation loss - {best_loss} of epoch #{best_epoch}')
            break
        else:
            print(f"    Validation loss hasn't improved. Current best value - {best_loss} of epoch #{best_epoch}")

    training_time = time() - started_at

    return history, training_time

In [None]:
# Adam with a learning rate of 1e-4 as optimizer, cross-entropy as training loss.
model = ModelImpl(
    optimizer_fn=lambda net: optim.Adam(net.parameters(), lr=1e-4),
    loss=nn.CrossEntropyLoss(),
)

In [None]:
# Mini-batches of 64 samples, reshuffled by the loader.
# NOTE(review): a trailing batch holding a single sample would make BatchNorm1d fail in
# training mode — confirm that the training-set size rules this out.
train_dataset = DatasetImpl(x_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
history, training_time = fit(model, train_loader)

In [None]:
# Persist the per-epoch metrics next to the model checkpoint.
pd.DataFrame(history).to_csv(f'{RESOURCES_PATH}/model_checkpoint/budget/{NAME}/history.tsv', index=False, sep='\t')

## Evaluation

In [None]:
# Read the stored history back from disk and plot validation loss against train loss per epoch.
history = pd.read_csv(f'{RESOURCES_PATH}/model_checkpoint/budget/{NAME}/history.tsv', sep='\t')
loss_columns = ['Validation Loss', 'Train Loss']
ax = history[loss_columns].plot()
ax.set_xlabel('epoch');

In [None]:
# Encode the original dataset with the same encoders that were set up for the cleared one.
# NOTE(review): LabelEncoder.transform raises on labels it has not seen — presumably every
# object/project/financing value of original.tsv also occurs in cleared.tsv; confirm.
orig_df = pd.read_csv(f'{RESOURCES_PATH}/dataset/budget/original.tsv', sep='\t').fillna('')
x_orig, y_orig = to_vectors(orig_df)

In [None]:
def to_orig_shape(y_pred_proba):
    """Right-pad the class axis of a 2-D score matrix with zeros and return it as a tensor.

    The number of appended columns is ``len(budget_le.classes_) - y_pred_proba.shape[1] - 1``.
    NOTE(review): the ``- 1`` presumably compensates for an empty trailing entry in
    ``budget_le.classes_`` — confirm against targets.txt.
    """
    extra_columns = len(budget_le.classes_) - y_pred_proba.shape[1] - 1
    pad_width = ((0, 0), (0, extra_columns))  # rows untouched, columns extended on the right
    padded = np.pad(y_pred_proba, pad_width, 'constant', constant_values=(0, 0))
    return tensor(padded)

In [None]:
# Reload the checkpoint written at the best validation loss and switch it to inference mode.
# NOTE(review): the checkpoint is a whole pickled module; recent PyTorch releases default
# torch.load to weights_only=True, which rejects such files — confirm against the pinned version.
model = torch.load(f'{RESOURCES_PATH}/model_checkpoint/budget/{NAME}/model.pt')
model.eval();

In [None]:
# Logits of the reloaded model on the validation split of the cleared dataset.
with torch.no_grad():
    y_val_logits = model(x_val)

In [None]:
# Logits of the reloaded model on the original dataset, widened to the original class space.
with torch.no_grad():
    y_orig_logits = model(x_orig)
    n_model_classes = y_orig_logits.shape[1]
    y_orig_logits = to_orig_shape(y_orig_logits)
    # to_orig_shape fills the added columns with 0, which is a neutral *probability* but not a
    # neutral *logit*: after softmax it would hand probability mass to classes the model
    # cannot predict. Mask those columns so that their softmax probability is exactly zero.
    y_orig_logits[:, n_model_classes:] = float('-inf')

In [None]:
def accuracy_report(y_true, y_pred_logits):
    """Return the accuracy of the arg-max class predictions, rounded to 4 decimals.

    Parameters
    ----------
    y_true : array-like
        True class ids.
    y_pred_logits : torch.Tensor
        Logits with one row per sample and one column per class.
    """
    # Per-row softmax; scipy's default (axis=None) would normalise over the whole matrix.
    y_pred_proba = softmax(y_pred_logits.numpy(), axis=1)
    return round(accuracy_score(y_true, y_pred_proba.argmax(axis=1)), 4)

def logloss_report(y_true, y_pred_logits):
    """Return the log loss of the softmax probabilities, rounded to 4 decimals.

    Parameters
    ----------
    y_true : array-like
        True class ids.
    y_pred_logits : torch.Tensor
        Logits with one row per sample and one column per class; column ``i`` is treated as
        the score of class id ``i``.
    """
    # Each row must be a probability distribution of its own, so normalise along the class
    # axis; scipy's default (axis=None) normalises over the whole matrix.
    y_pred_proba = softmax(y_pred_logits.numpy(), axis=1)
    return round(log_loss(y_true, y_pred_proba, labels=range(y_pred_logits.shape[1])), 4)

In [None]:
# Headline metrics on the validation split ("Cleared") and on the original dataset ("Original").
report = {
    'Name': 'Categorical Feed-Forward NN',
    '[Cleared] Accuracy': accuracy_report(y_val, y_val_logits),
    '[Cleared] Log Loss': logloss_report(y_val, y_val_logits),
    '[Original] Accuracy': accuracy_report(y_orig, y_orig_logits),
    '[Original] Log Loss': logloss_report(y_orig, y_orig_logits),
    'Training time (sec)': training_time
}

# One row per metric.
report_df = pd.DataFrame(report, index=['']).T

# The checkpoint directory is created explicitly earlier in the notebook, but nothing creates
# reports/ — do it here so that the export cannot fail on a missing directory.
Path('reports').mkdir(parents=True, exist_ok=True)
report_df.to_csv(f'reports/{NAME}.tsv', sep='\t', header=False)

report_df