In [None]:
from pathlib import Path
from time import time
import multiprocessing
import pickle
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Concatenate, Lambda, Bidirectional, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.backend as K

In [None]:
# Root of the shared resources tree (dataset/, cache/, model_checkpoint/ are read from
# or written under it). Relative path, so it resolves against the current working directory.
RESOURCES_PATH = '../../../../../resources'

In [None]:
# Which pre-computed embedding to use ('BERT' or 'fastText'); selects the size
# settings below and the cached embedding map file name.
EMBEDDING = 'BERT'
# Optional extra metrics for get_report(); 'confusion_matrix' is the only value it checks for.
ADDITIONAL_REPORT_METRICS = []

In [None]:
# Per-embedding sizes, in order:
# (max nomenclature sequence length, max description sequence length, embedding vector length).
_EMBEDDING_SIZES = {
    'BERT': (23, 45, 768),
    'fastText': (17, 30, 300),
}
MAX_NOMENCLATURE_LEN, MAX_DESCRIPTION_LEN, EMBEDDING_VEC_LEN = _EMBEDDING_SIZES[EMBEDDING]

# Training budget: upper bound on epochs and early-stopping patience (in epochs).
MAX_EPOCHS = 300
EARLY_STOP_PATIENCE = 15
# All CPU cores but one.
WORKERS = multiprocessing.cpu_count() - 1

## Load dataset

In [None]:
# TODO Move to commons

def load_dfs():
    """Load the train, cleared-test and original-test turnover datasets.

    Each TSV under ``{RESOURCES_PATH}/dataset/turnover`` is read with a tab
    separator, missing values are replaced by empty strings, and the
    ``turnover`` column is converted with the pickled label encoder stored in
    the same directory.

    Returns:
        tuple: ``(train_df, test_df, original_test_df)``.
    """
    dataset_dir = f'{RESOURCES_PATH}/dataset/turnover'

    frames = [
        pd.read_csv(f'{dataset_dir}/{file_name}', sep='\t').fillna('')
        for file_name in ('cleared_train.tsv', 'cleared_test.tsv', 'original_test.tsv')
    ]

    # NOTE: pickle executes arbitrary code on load; only use encoder files from a trusted source.
    with open(f'{dataset_dir}/label_encoder.pkl', 'rb') as encoder_file:
        label_encoder = pickle.load(encoder_file)

    for frame in frames:
        frame['turnover'] = label_encoder.transform(frame['turnover'])

    train_df, test_df, original_test_df = frames
    return train_df, test_df, original_test_df

In [None]:
# Load the three label-encoded splits once; later cells reuse these frames.
train_df, test_df, original_test_df = load_dfs()

# Preview the first rows of the training split.
train_df.head()

In [None]:
# Cached lookup of pre-computed embeddings; the file name depends on EMBEDDING.
# NOTE: pickle executes arbitrary code on load; only use cache files from a trusted source.
embedding_map_file = Path(f'{RESOURCES_PATH}/cache/{EMBEDDING.lower()}_embedding_map.pkl')
embedding_map = pickle.loads(embedding_map_file.read_bytes())

# load_dfs() replaces missing values with '', so give the empty description an
# empty embedding sequence to keep the lookup in to_vectors() from failing.
embedding_map['description'][''] = []

def to_vectors(df):
    """Turn a dataset frame into model inputs and one-hot targets.

    Args:
        df: frame with ``nomenclature``, ``description`` and ``turnover``
            columns; every nomenclature/description value must be a key of the
            corresponding table in ``embedding_map``.

    Returns:
        tuple: ``(x, y)`` where ``x`` is the two-element list
        ``[nomenclature_batch, description_batch]`` (embedding sequences run
        through ``pad_sequences`` with ``MAX_NOMENCLATURE_LEN`` /
        ``MAX_DESCRIPTION_LEN`` as ``maxlen``, dtype float32) and ``y`` is
        ``to_categorical(df.turnover)``.
    """
    targets = to_categorical(df.turnover)

    nomenclature_lookup = embedding_map['nomenclature']
    description_lookup = embedding_map['description']

    nomenclature_batch = pad_sequences(
        [nomenclature_lookup[text] for text in df.nomenclature],
        maxlen=MAX_NOMENCLATURE_LEN,
        dtype='float32',
    )
    description_batch = pad_sequences(
        [description_lookup[text] for text in df.description],
        maxlen=MAX_DESCRIPTION_LEN,
        dtype='float32',
    )

    return [nomenclature_batch, description_batch], targets

In [None]:
# Vectorise every split with the same embedding lookup and padding lengths.
x_train, y_train = to_vectors(train_df)
x_test, y_test = to_vectors(test_df)
x_original_test, y_original_test = to_vectors(original_test_df)

# Sanity check: shapes of the two input arrays and of the one-hot targets.
x_train[0].shape, x_train[1].shape, y_train.shape

## Train

In [None]:
# Fix the NumPy and TensorFlow global seeds before the model is built and trained.
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
# Nomenclature branch: average the embedding sequence over its time axis, then
# Dense -> BatchNorm -> Dropout.
# NOTE(review): the mean runs over all MAX_NOMENCLATURE_LEN positions, so zero
# padding added by pad_sequences is included in the average - confirm this is intended.
nomenclature_input = Input(shape=(MAX_NOMENCLATURE_LEN, EMBEDDING_VEC_LEN))
nomenclature_mean_input = Lambda(lambda it: K.mean(it, axis=1))(nomenclature_input)

nomenclature_branch = Dense(512, activation="relu")(nomenclature_mean_input)
nomenclature_branch = BatchNormalization()(nomenclature_branch)
nomenclature_branch = Dropout(0.2)(nomenclature_branch)


# Description branch: same structure as the nomenclature branch.
description_input = Input(shape=(MAX_DESCRIPTION_LEN, EMBEDDING_VEC_LEN))
description_mean_input = Lambda(lambda it: K.mean(it, axis=1))(description_input)

description_branch = Dense(512, activation="relu")(description_mean_input)
description_branch = BatchNormalization()(description_branch)
description_branch = Dropout(0.2)(description_branch)


# Merge both branches along the feature axis.
common_branch = Concatenate(axis=1)([nomenclature_branch, description_branch])


common_branch = Dense(512, activation="relu")(common_branch)
common_branch = BatchNormalization()(common_branch)
common_branch = Dropout(0.2)(common_branch)

# The softmax width has to match the one-hot targets passed to fit().
# to_categorical() emits max(label) + 1 columns, which equals the number of
# distinct labels only when the encoded train labels are contiguous, so take the
# width from y_train instead of counting unique labels.
n_output_classes = y_train.shape[1]
common_branch = Dense(n_output_classes, activation='softmax')(common_branch)


model = Model(inputs=[nomenclature_input, description_input], outputs=common_branch)
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

model.summary()

In [None]:
# Make sure the checkpoint directory exists before training writes model.h5 and history.tsv into it.
Path(f'{RESOURCES_PATH}/model_checkpoint/turnover/nn/mean_fc_nn/').mkdir(parents=True, exist_ok=True)

In [None]:
# Wall-clock timer around the whole training run.
training_started_at = time()

# Stop once val_loss has not improved for EARLY_STOP_PATIENCE epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=EARLY_STOP_PATIENCE)
# Keep only the weights of the epoch with the best val_loss on disk.
best_model_checkpoint = ModelCheckpoint(
    f'{RESOURCES_PATH}/model_checkpoint/turnover/nn/mean_fc_nn/model.h5',
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)

fit_report = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=MAX_EPOCHS,
    verbose=1,
    callbacks=[early_stopping, best_model_checkpoint],
    workers=WORKERS,
)

training_time_sec = time() - training_started_at

In [None]:
# Persist the per-epoch losses next to the checkpoint so the evaluation cells
# can be run without the in-memory fit_report.
loss_history = pd.DataFrame({
    'Train Loss': fit_report.history['loss'],
    'Validation Loss': fit_report.history['val_loss'],
})
loss_history.to_csv(
    f'{RESOURCES_PATH}/model_checkpoint/turnover/nn/mean_fc_nn/history.tsv',
    index=False,
    sep='\t',
)

## Evaluation

In [None]:
# Read the saved loss curves back from disk and plot them per epoch.
history_file = f'{RESOURCES_PATH}/model_checkpoint/turnover/nn/mean_fc_nn/history.tsv'
history = pd.read_csv(history_file, sep='\t')

loss_axes = history.plot()
loss_axes.set_xlabel('epoch');

In [None]:
def get_report(y_true_onehot, y_pred_proba):
    """Compute evaluation metrics from one-hot targets and predicted probabilities.

    Args:
        y_true_onehot: 2-D array of one-hot targets (one row per sample).
        y_pred_proba: 2-D array of predicted class probabilities, where column
            ``j`` holds the probability of class index ``j``.

    Returns:
        dict: ``'accuracy'`` and ``'log_loss'``, plus ``'confusion_matrix'``
        when that name is listed in ``ADDITIONAL_REPORT_METRICS``.
    """
    y_true = y_true_onehot.argmax(axis=1)
    y_pred = y_pred_proba.argmax(axis=1)

    # Pass the label set explicitly. Without it log_loss infers the classes from
    # y_true and raises when the evaluated split does not contain every class
    # that has a probability column.
    class_labels = list(range(y_pred_proba.shape[1]))

    report = {}

    report['accuracy'] = accuracy_score(y_true, y_pred)
    report['log_loss'] = log_loss(y_true, y_pred_proba, labels=class_labels)

    if 'confusion_matrix' in ADDITIONAL_REPORT_METRICS:
        report['confusion_matrix'] = confusion_matrix(y_true, y_pred)

    return report

In [None]:
def expand_to_original_dataset_size(y_pred_proba, n_classes=None):
    """Right-pad predicted probabilities with zero columns up to a target class count.

    Args:
        y_pred_proba: 2-D array of predicted class probabilities (one row per sample).
        n_classes: target number of columns. Defaults to the width of the global
            ``y_original_test`` one-hot matrix.

    Returns:
        numpy.ndarray: ``y_pred_proba`` with ``n_classes`` columns; the added
        trailing columns are filled with zeros.

    Raises:
        ValueError: if ``y_pred_proba`` already has more than ``n_classes`` columns.
    """
    if n_classes is None:
        n_classes = y_original_test.shape[1]

    # Use the argument itself, not the notebook-global prediction array.
    missing_columns = n_classes - y_pred_proba.shape[1]
    if missing_columns < 0:
        raise ValueError(
            f'predictions have {y_pred_proba.shape[1]} columns, more than the target of {n_classes}'
        )

    return np.pad(y_pred_proba, ((0, 0), (0, missing_columns)), 'constant', constant_values=(0, 0))


In [None]:
# Replace the in-memory model with the checkpoint saved by ModelCheckpoint (save_best_only=True on val_loss).
model = load_model(f'{RESOURCES_PATH}/model_checkpoint/turnover/nn/mean_fc_nn/model.h5')

In [None]:
# Class-probability predictions for the cleared and the original test splits.
y_pred_proba = model.predict(x_test, workers=WORKERS)
y_original_pred_proba = model.predict(x_original_test, workers=WORKERS)

In [None]:
def _to_jsonable(value):
    """Convert numpy arrays/scalars to plain Python objects for ``json.dump``."""
    if isinstance(value, (np.ndarray, np.generic)):
        return value.tolist()
    raise TypeError(f'Object of type {type(value).__name__} is not JSON serializable')


# Metrics on both test splits plus training cost. Predictions for the original
# split are zero-padded to the width of y_original_test before scoring.
report = {
    'cleared_test': get_report(y_test, y_pred_proba),
    'original_test': get_report(y_original_test, expand_to_original_dataset_size(y_original_pred_proba)),
    'epochs': len(history),
    'training_time_sec': training_time_sec
}

# `default` handles numpy values, e.g. the ndarray confusion matrix that
# get_report() adds when 'confusion_matrix' is enabled; json cannot encode it as is.
with open('report.json', 'w') as fout:
    json.dump(report, fout, default=_to_jsonable)