In [None]:
# Root folder for datasets, pretrained weights and checkpoints when running locally.
# The Colab-only cells further down overwrite this with a Google Drive path.
RESOURCES_PATH = '../../../../resources'

In [None]:
# Google Colab Only {

# Mount Google Drive at /content/drive; the install and path cells that follow read from it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install/upgrade the project's dependencies from the requirements file on the mounted Drive.
!pip install --upgrade -r "/content/drive/My Drive/SHARE/Financial-Analytics-Classifier/requirements.txt"

In [None]:
# Replace the relative local path with the resources folder on the mounted Drive.
RESOURCES_PATH = '/content/drive/My Drive/SHARE/Financial-Analytics-Classifier/resources'
# } Google Colab Only

In [None]:
from pathlib import Path
from time import time, strftime, gmtime
import multiprocessing
import pickle
import json
from collections import namedtuple
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.special import softmax
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix
from transformers import BertModel, BertTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

In [None]:
# Names of optional extra metrics for get_report(); 'confusion_matrix' is the only name it checks for.
ADDITIONAL_REPORT_METRICS = []

In [None]:
# Passed to the tokenizer as max_length; every example is padded to this many tokens.
MAX_SENTENCE_LEN = 50
# Upper bound on the number of epochs fit() will run.
MAX_EPOCHS = 100
# fit() stops once more than this many epochs have passed since the best validation loss.
EARLY_STOP_PATIENCE = 10

In [None]:
# Make sure the checkpoint directory exists before model.pt and history.tsv are written into it.
Path(f'{RESOURCES_PATH}/model_checkpoint/turnover/bert/').mkdir(parents=True, exist_ok=True)

## Load dataset

In [None]:
def load_dfs():
    """Read the train, cleared-test and original-test splits and encode their labels.

    Every split is read from a tab-separated file under
    ``{RESOURCES_PATH}/dataset/turnover``, has its missing values replaced by
    empty strings, and has its ``turnover`` column transformed with the label
    encoder unpickled from ``label_encoder.pkl`` in the same folder.

    Returns:
        tuple: ``(train_df, test_df, original_test_df)``.
    """
    dataset_dir = f'{RESOURCES_PATH}/dataset/turnover'
    split_files = ('cleared_train.tsv', 'cleared_test.tsv', 'original_test.tsv')

    # Read all three files first, then clean them, in the same order as the splits are returned.
    frames = [pd.read_csv(f'{dataset_dir}/{file_name}', sep='\t') for file_name in split_files]
    for frame in frames:
        frame.fillna('', inplace=True)

    # NOTE(review): pickle.load executes arbitrary code from the file; only use a trusted encoder file.
    with open(f'{dataset_dir}/label_encoder.pkl', 'rb') as fin:
        le = pickle.load(fin)

    for frame in frames:
        frame.turnover = le.transform(frame.turnover)

    return tuple(frames)

In [None]:
# Load the three splits once; load_dfs() has already encoded the turnover column.
train_df, test_df, original_test_df = load_dfs()

# Preview the first training rows.
train_df.head()

In [None]:
# Bundles the two tensors the model consumes: token ids and the matching attention mask.
NetInput = namedtuple('NetInput', 'word_tokens attention_mask')

In [None]:
# Tokenizer loaded from the local pretrained/rubert folder; to_vectors() reads it as a module-level global.
tokenizer = BertTokenizer.from_pretrained(f'{RESOURCES_PATH}/pretrained/rubert')

def to_vectors(df):
    """Tokenize a data frame into GPU tensors for the model.

    Each row's ``nomenclature`` and ``description`` are encoded together as a
    text pair and padded to ``MAX_SENTENCE_LEN`` tokens.

    Args:
        df: Frame with ``nomenclature``, ``description`` and ``turnover``
            columns. Rows are read by position, so the frame does not need a
            default ``0..n-1`` index.

    Returns:
        tuple: ``(NetInput(word_tokens, attention_mask), labels)``, all CUDA
        tensors, in the row order of ``df``.
    """
    word_tokens_list = []
    attention_masks = []
    # Iterate over the column values instead of looking rows up by index label:
    # `df.nomenclature[i]` fails on a frame whose index is not 0..n-1
    # (for example after filtering or shuffling).
    for nomenclature, description in zip(df.nomenclature, df.description):
        tokenized = tokenizer.encode_plus(
            nomenclature,
            text_pair=description,
            max_length=MAX_SENTENCE_LEN,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_token_type_ids=False
        )
        word_tokens_list.append(tokenized['input_ids'])
        attention_masks.append(tokenized['attention_mask'])

    net_input = NetInput(torch.tensor(word_tokens_list).cuda(), torch.tensor(attention_masks).cuda())
    # Hand torch the underlying array so the labels are also read positionally.
    labels = torch.tensor(df.turnover.values).cuda()
    return net_input, labels

In [None]:
# Tokenize every split up front; to_vectors() places the resulting tensors on the GPU.
x_train, y_train = to_vectors(train_df)
x_test, y_test = to_vectors(test_df)
x_original_test, y_original_test = to_vectors(original_test_df)

# Sanity-check the tensor shapes of the training split.
x_train.word_tokens.shape, x_train.attention_mask.shape, y_train.shape

In [None]:
class DatasetImpl(Dataset):
    """Map-style dataset that pairs a ``NetInput`` of inputs with their labels."""

    def __init__(self, x, y):
        # x: object exposing `word_tokens` and `attention_mask`; y: indexable labels.
        self.x, self.y = x, y

    def __len__(self):
        # The label container defines the number of samples.
        return len(self.y)

    def __getitem__(self, index):
        # Slice both input fields and the labels with the same index.
        sample = NetInput(
            word_tokens=self.x.word_tokens[index],
            attention_mask=self.x.attention_mask[index],
        )
        return sample, self.y[index]

In [None]:
class ModelImpl(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        num_classes: Number of output classes. When omitted it is derived from
            the module-level ``y_train`` tensor as ``y_train.max() + 1``, which
            is what the zero-argument constructor has always done.
    """

    def __init__(self, num_classes=None):
        super(ModelImpl, self).__init__()
        if num_classes is None:
            num_classes = int(y_train.max()+1)
        # TODO: avoid loading the pretrained weights when the instance is only
        # created to receive a saved state dict right afterwards.
        self.bert_layer = BertModel.from_pretrained(f'{RESOURCES_PATH}/pretrained/rubert')
        self.dropout = nn.Dropout(0.1)
        # Size the head from the encoder's own config instead of hard-coding 768.
        self.cls_layer = nn.Linear(self.bert_layer.config.hidden_size, num_classes)

    def forward(self, x):
        """Return unnormalized class scores (logits) for a ``NetInput`` batch."""
        outputs = self.bert_layer(x.word_tokens, attention_mask=x.attention_mask)
        # Take the pooled output by position. Unpacking `a, b = outputs` only
        # works when the encoder returns a plain tuple; position 1 is valid for
        # both a tuple and a transformers ModelOutput.
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.cls_layer(pooled_output)
        return logits

## Train

In [None]:
def log_metrics(model, epoch, history, train_losses, started_at):
    """Evaluate the model on the module-level test split and record the epoch's metrics.

    Args:
        model: Model to evaluate; called once on the global ``x_test``.
        epoch: Epoch number, used only in the printed line.
        history: List that receives one dict per call with the keys
            'Validation Accuracy', 'Validation Loss' and 'Train Loss'.
        train_losses: Per-batch training losses of this epoch; their mean is
            reported as the train loss.
        started_at: ``time()`` timestamp of the start of training, used to
            print the elapsed time.

    Returns:
        The validation log loss.
    """
    # Evaluate with dropout disabled, then restore whatever mode the caller had,
    # so validation metrics are deterministic and training is unaffected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            y_pred_logits = model(x_test)
    finally:
        model.train(was_training)

    # axis=1 normalizes each row (sample) separately; without it scipy
    # normalizes over the whole matrix and the rows do not sum to one.
    y_pred_proba = softmax(y_pred_logits.cpu().numpy(), axis=1)

    y_true = y_test.cpu()
    val_acc = accuracy_score(y_true, y_pred_proba.argmax(axis=1))
    val_loss = log_loss(y_true, y_pred_proba)
    train_loss = np.array(train_losses).mean()

    history.append({
        'Validation Accuracy': val_acc,
        'Validation Loss': val_loss,
        'Train Loss': train_loss
    })

    # gmtime/strftime formatting wraps around after 24 hours of training.
    formated_training_time = strftime("%Hh %Mm %Ss", gmtime(time() - started_at))

    print(f'Epoch #{epoch}: Val. Loss -- {val_loss}, Val. accuracy -- {val_acc}, Train Loss -- {train_loss}, Spent time -- {formated_training_time}')

    return val_loss

In [None]:
def fit(model, dataloader, optimizer, criterion):
    """Train the model with early stopping on the validation loss.

    Runs at most ``MAX_EPOCHS`` epochs. After each epoch ``log_metrics`` is
    called; when the validation loss improves, the model's state dict is saved
    to ``model_checkpoint/turnover/bert/model.pt`` under ``RESOURCES_PATH``.
    Training stops once more than ``EARLY_STOP_PATIENCE`` epochs have passed
    since the best epoch.

    Args:
        model: Module to train; updated in place.
        dataloader: Iterable yielding ``(x, y)`` training batches.
        optimizer: Optimizer built over the model's parameters.
        criterion: Loss called as ``criterion(model(x), y)``.

    Returns:
        tuple: ``(history, training_time)`` - the list of per-epoch metric
        dicts filled by ``log_metrics`` and the elapsed time in seconds.
    """
    started_at = time()
    history = []
    checkpoint_path = f'{RESOURCES_PATH}/model_checkpoint/turnover/bert/model.pt'

    best_epoch = 0
    best_loss = float('inf')

    for epoch in range(1, MAX_EPOCHS+1):
        # Set train mode explicitly each epoch: the model may have been left in
        # eval mode (e.g. by an evaluation cell run earlier in the same kernel),
        # which would silently train with dropout disabled.
        model.train()
        train_losses = []

        for x, y in dataloader:
            optimizer.zero_grad()

            y_pred = model(x)

            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()

            # Store a plain float so the computation graph is not kept alive.
            train_losses.append(loss.item())

        val_loss = log_metrics(model, epoch, history, train_losses, started_at)

        if val_loss < best_loss:
            best_loss = val_loss
            best_epoch = epoch
            # Only the best-so-far weights are kept on disk.
            torch.save(model.state_dict(), checkpoint_path)
        elif epoch - best_epoch > EARLY_STOP_PATIENCE:
            print(f'    Early stop training. Best validation loss - {best_loss} of epoch #{best_epoch}')
            break
        else:
            print(f"    Validation loss hasn't improved. Current best value - {best_loss} of epoch #{best_epoch}")

    training_time = time() - started_at

    return history, training_time

In [None]:
# Model instance used for training, moved to the GPU.
model = ModelImpl().cuda()

In [None]:
# Shuffled mini-batches of 128 examples drawn from the training tensors.
train_dataloader = DataLoader(DatasetImpl(x_train, y_train), batch_size=128, shuffle=True)

# Train all model parameters with Adam (lr=2e-4) and cross-entropy loss.
# fit() returns the per-epoch history and the elapsed time in seconds.
history, training_time = fit(model, train_dataloader, optim.Adam(model.parameters(), lr=2e-4), nn.CrossEntropyLoss())

In [None]:
# Persist the per-epoch metrics next to the checkpoint so the evaluation section can reload them.
pd.DataFrame(history).to_csv(f'{RESOURCES_PATH}/model_checkpoint/turnover/bert/history.tsv', index=False, sep='\t')

## Evaluation

In [None]:
# Reload the saved metrics; from here on `history` is a DataFrame, not the list returned by fit().
history = pd.read_csv(f'{RESOURCES_PATH}/model_checkpoint/turnover/bert/history.tsv', sep='\t')

# One row per epoch; the x axis is the 0-based row index of the history frame.
history[['Validation Loss', 'Train Loss']].plot()
plt.xlabel('epoch');

In [None]:
def get_report(y_true, y_pred_logits):
    """Compute evaluation metrics from true labels and predicted logits.

    Args:
        y_true: Tensor of integer class labels.
        y_pred_logits: Tensor of unnormalized scores, one row per sample and
            one column per class.

    Returns:
        dict: 'accuracy' and 'log_loss', each rounded to 4 decimals, plus
        'confusion_matrix' when that name is listed in
        ``ADDITIONAL_REPORT_METRICS``.
    """
    # axis=1 turns each row into its own probability distribution; without it
    # scipy normalizes over the whole matrix and the rows do not sum to one.
    y_pred_proba = softmax(y_pred_logits.cpu().numpy(), axis=1)
    y_pred = y_pred_proba.argmax(axis=1)
    y_true_cpu = y_true.cpu()

    report = {}

    report['accuracy'] = round(accuracy_score(y_true_cpu, y_pred), 4)
    report['log_loss'] = round(log_loss(y_true_cpu, y_pred_proba), 4)

    if 'confusion_matrix' in ADDITIONAL_REPORT_METRICS:
        report['confusion_matrix'] = confusion_matrix(y_true_cpu, y_pred)

    return report

In [None]:
def expand_to_original_dataset_size(y_pred_logits):
    """Widen a logits matrix so it has one column per class of the original test labels.

    The target width is ``y_original_test.max() + 1`` (module-level tensor).
    Missing columns are appended on the right and filled with zeros.

    Args:
        y_pred_logits: Tensor of logits, one row per sample.

    Returns:
        A new CPU tensor holding the padded logits.
    """
    n_original_classes = int(y_original_test.max()+1)
    n_missing = n_original_classes - y_pred_logits.shape[1]
    # Rows stay untouched; only the class axis grows, on its right-hand side.
    pad_width = ((0, 0), (0, n_missing))
    # NOTE(review): a zero logit is not a zero probability after softmax - the
    # added classes receive some probability mass. Confirm this is intended.
    widened = np.pad(y_pred_logits.cpu(), pad_width, mode='constant', constant_values=(0, 0))
    return torch.tensor(widened)

In [None]:
# Rebuild the architecture on the GPU and load the weights of the checkpoint saved by fit().
model = ModelImpl().cuda()
model.load_state_dict(torch.load(f'{RESOURCES_PATH}/model_checkpoint/turnover/bert/model.pt'))
# Switch to inference mode (disables dropout); the trailing ';' hides the module repr.
model.eval();

In [None]:
# Logits for the cleared test split, computed in one forward pass without gradient tracking.
with torch.no_grad():
    y_pred_logits = model(x_test)

In [None]:
# Logits for the original test split, computed in batches of 1024 and concatenated afterwards.
with torch.no_grad():
    y_pred_parts = []
    # The labels yielded by the loader are not used here; only the inputs are fed to the model.
    for x, y in DataLoader(DatasetImpl(x_original_test, y_original_test), batch_size=1024):
        y_pred_parts.append(model(x))

    y_original_pred_logits = torch.cat(y_pred_parts)

In [None]:
# Metrics on the cleared test split, and on the original split after widening
# the logits to the original label range.
cleared_report = get_report(y_test, y_pred_logits)
original_report = get_report(y_original_test, expand_to_original_dataset_size(y_original_pred_logits))

# Summary of this model's results.
# `training_time` is produced by the training cell and is not reloaded from disk,
# so that cell must have run in the current kernel session.
report = {
    'Name': f'Fine-Tunned BERT',
    '[Cleared Test] Accuracy': cleared_report['accuracy'],
    '[Cleared Test] Log Loss': cleared_report['log_loss'],
    '[Original Test] Accuracy': original_report['accuracy'],
    '[Original Test] Log Loss': original_report['log_loss'],
    'Training time': strftime("%Hh %Mm %Ss", gmtime(training_time)),
    'Training time (sec)': int(training_time),
    # 1-based number of the epoch with the lowest validation loss.
    'Model epoch': history["Validation Loss"].idxmin()+1,
    # Number of epochs recorded in the history.
    'Epochs': len(history)
}

print(json.dumps(report, indent=4))