In [None]:
import json
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Concatenate, Lambda, Bidirectional, BatchNormalization, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.backend as K

In [None]:
# Embedding family used throughout the notebook: 'BERT' or 'fastText'.
EMBEDDING = 'BERT'

# Per-embedding settings, picked by EMBEDDING:
#   (max nomenclature steps, max description steps, embedding vector width)
MAX_NOMENCLATURE_LEN, MAX_DESCRIPTION_LEN, EMBEDDING_VEC_LEN = {
    'BERT': (23, 45, 768),
    'fastText': (17, 30, 300),
}[EMBEDDING]

# Training schedule: upper bound on epochs and early-stopping patience.
MAX_EPOCHS = 300
EARLY_STOP_PATIENCE = 30

## Load dataset

In [None]:
# TODO Move to commons

def load_dfs(dataset_dir='../../../../../resources/dataset/turnover'):
    """Load the train, test and original-test splits with encoded labels.

    Each split is read from a tab-separated file in ``dataset_dir``, missing
    values are replaced by empty strings, and the ``turnover`` column is
    passed through the pickled label encoder stored next to the data.

    Args:
        dataset_dir: Directory holding ``cleared_train.tsv``,
            ``cleared_test.tsv``, ``original_test.tsv`` and
            ``label_encoder.pkl``. Defaults to the project dataset folder.

    Returns:
        Tuple ``(train_df, test_df, original_test_df)`` of DataFrames.
    """
    dataset_dir = Path(dataset_dir)

    # Same preprocessing for every split, so read them in one loop instead
    # of three copy-pasted stanzas. fillna() returns a new frame; no
    # in-place mutation is needed.
    frames = [
        pd.read_csv(dataset_dir / file_name, sep='\t').fillna('')
        for file_name in ('cleared_train.tsv', 'cleared_test.tsv', 'original_test.tsv')
    ]

    # NOTE(review): pickle.load can execute arbitrary code — only load an
    # encoder file produced by this project.
    with open(dataset_dir / 'label_encoder.pkl', 'rb') as fin:
        le = pickle.load(fin)

    for df in frames:
        df['turnover'] = le.transform(df['turnover'])

    train_df, test_df, original_test_df = frames
    return train_df, test_df, original_test_df

In [None]:
# Load the three splits; load_dfs() has already replaced NaNs with '' and
# run `turnover` through the pickled label encoder.
train_df, test_df, original_test_df = load_dfs()

# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Read the cached embedding lookup for the selected EMBEDDING; it is indexed
# as embedding_map[field][text].
# NOTE(review): pickle.load can execute arbitrary code — only use a trusted cache file.
embedding_cache_path = f'../../../../../resources/cache/{EMBEDDING.lower()}_embedding_map.pkl'
with open(embedding_cache_path, 'rb') as cache_file:
    embedding_map = pickle.load(cache_file)

# Missing descriptions were replaced by '' when the frames were loaded;
# register that key with an empty sequence so the lookup does not fail.
embedding_map['description'][''] = []

def to_vectors(df):
    """Turn a data frame into model inputs and one-hot targets.

    Args:
        df: Frame with ``nomenclature``, ``description`` and an
            integer-encoded ``turnover`` column.

    Returns:
        ``(x, y)`` where ``x`` is a two-element list of float32 arrays
        (nomenclature sequences, description sequences), each padded or
        truncated by ``pad_sequences`` to its ``MAX_*_LEN``, and ``y`` is
        ``to_categorical(df.turnover)``.
    """
    targets = to_categorical(df.turnover)

    def _padded(field, keys, max_len):
        # Look up the embedding sequence of every text, then pad to max_len.
        sequences = [embedding_map[field][key] for key in keys]
        return pad_sequences(sequences, maxlen=max_len, dtype='float32')

    features = [
        _padded('nomenclature', df.nomenclature, MAX_NOMENCLATURE_LEN),
        _padded('description', df.description, MAX_DESCRIPTION_LEN),
    ]

    return features, targets

In [None]:
# Vectorise every split from its own frame. All three were previously built
# from test_df, which meant training on the test set and never using
# original_test_df.
x_train, y_train = to_vectors(train_df)
x_test, y_test = to_vectors(test_df)
x_original_test, y_original_test = to_vectors(original_test_df)

# Sanity check: shapes of both test inputs and of the one-hot targets.
x_test[0].shape, x_test[1].shape, y_test.shape

## Train

In [None]:
# Two-branch network: each text field is averaged over its sequence axis,
# passed through Dense -> BatchNorm -> Dropout, then both branches are
# concatenated and classified with a softmax layer.
#
# GlobalAveragePooling1D averages over axis 1 (the steps axis). A built-in
# layer is used instead of a Lambda wrapping a Python lambda, because Lambda
# layers are stored as serialized bytecode and may not reload reliably from
# the saved model.h5.

nomenclature_input = Input(shape=(MAX_NOMENCLATURE_LEN, EMBEDDING_VEC_LEN))
nomenclature_mean_input = GlobalAveragePooling1D()(nomenclature_input)

nomenclature_branch = Dense(512, activation="relu")(nomenclature_mean_input)
nomenclature_branch = BatchNormalization()(nomenclature_branch)
nomenclature_branch = Dropout(0.2)(nomenclature_branch)


description_input = Input(shape=(MAX_DESCRIPTION_LEN, EMBEDDING_VEC_LEN))
description_mean_input = GlobalAveragePooling1D()(description_input)

description_branch = Dense(512, activation="relu")(description_mean_input)
description_branch = BatchNormalization()(description_branch)
description_branch = Dropout(0.2)(description_branch)


# Merge the two field representations along the feature axis.
common_branch = Concatenate(axis=1)([nomenclature_branch, description_branch])


common_branch = Dense(512, activation="relu")(common_branch)
common_branch = BatchNormalization()(common_branch)
common_branch = Dropout(0.2)(common_branch)

# One output unit per distinct encoded turnover class seen in the training frame.
common_branch = Dense(len(train_df.turnover.unique()), activation='softmax')(common_branch)


model = Model(inputs=[nomenclature_input, description_input], outputs=common_branch)
model.compile(loss='categorical_crossentropy', optimizer=Adam())

model.summary()

In [None]:
# Make sure the checkpoint folder exists before training writes into it.
checkpoint_dir = Path("../../../../../resources/model_checkpoint/turnover/nn/mean_fc_nn/")
checkpoint_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Train until val_loss stops improving. The best weights are restored in
# memory by EarlyStopping and also written to model.h5 by ModelCheckpoint.
# NOTE(review): the test split is used as validation data here, so it drives
# early stopping and checkpoint selection — confirm this is intended.
fit_report = model.fit(
    x_train, y_train,  # the comma after y_train was missing (SyntaxError)
    validation_data=(x_test, y_test),
    epochs=MAX_EPOCHS,
    verbose=1,
    callbacks=[
        EarlyStopping(monitor='val_loss', patience=EARLY_STOP_PATIENCE, restore_best_weights=True),
        ModelCheckpoint("../../../../../resources/model_checkpoint/turnover/nn/mean_fc_nn/model.h5", monitor='val_loss', save_best_only=True, verbose=1)
    ]
)

In [None]:
# Persist the per-epoch validation loss as one space-separated line.
# The file must be opened for writing: the default mode 'r' fails when the
# file does not exist and rejects the write when it does.
with open('../../../../../resources/model_checkpoint/turnover/nn/mean_fc_nn/history.txt', 'w') as fout:
    print(*fit_report.history['val_loss'], file=fout)

In [None]:
# Plot history

## Evaluation

In [None]:
# Reload the checkpoint that ModelCheckpoint wrote during training
# (save_best_only=True, monitored on val_loss) before evaluating.
model = load_model('../../../../../resources/model_checkpoint/turnover/nn/mean_fc_nn/model.h5')