In [None]:
# Root of the shared resources tree, given relative to the notebook's working directory.
# The label encoder and the dataset TSV files are read from '<RESOURCES_PATH>/dataset/turnover/'.
RESOURCES_PATH = '../../../../resources'

In [None]:
from pathlib import Path
from time import time, strftime, gmtime
import multiprocessing
import re
import pickle
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import hstack
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
import pymorphy2
import nltk
from nltk.corpus import stopwords

In [None]:
# Upper bound on training epochs; passed to SGDClassifier as max_iter.
MAX_EPOCHS = 500
# Early-stopping patience; passed to SGDClassifier as n_iter_no_change and
# subtracted from n_iter_ when the report's 'Model epoch' is computed.
EARLY_STOP_PATIENCE = 50

In [None]:
# One-time setup: uncomment and run once to fetch the NLTK stop-word corpus
# that stopwords.words('russian') reads when the TF-IDF vectorizer is built.
# nltk.download('stopwords')

## Load datasets

In [None]:
# Restore the label encoder (`le`) that load_df uses to transform the `turnover` column.
# Presumably a fitted sklearn LabelEncoder — confirm against the script that produced the file.
# NOTE(review): pickle.load can execute arbitrary code; only load this file from a trusted source.
with open(f'{RESOURCES_PATH}/dataset/turnover/label_encoder.pkl', 'rb') as fin:
    le = pickle.load(fin)

In [None]:
def load_df(name):
    """Read one turnover dataset split and encode its target column.

    Args:
        name: file stem of the split; '<RESOURCES_PATH>/dataset/turnover/<name>.tsv' is read.

    Returns:
        DataFrame with missing values replaced by '' and `turnover`
        replaced by the output of `le.transform`.
    """
    tsv_path = f'{RESOURCES_PATH}/dataset/turnover/{name}.tsv'
    # Blank out missing cells so the text columns always hold strings.
    frame = pd.read_csv(tsv_path, sep='\t').fillna('')
    frame.turnover = le.transform(frame.turnover)
    return frame

In [None]:
# Load the three splits through load_df: the cleared train/test pair and the original test set.
train_df = load_df('cleared_train')
test_df = load_df('cleared_test')
orig_test_df = load_df('original_test')

# Preview the first rows as a sanity check of the parsed columns.
train_df.head()

## Preprocess inputs

In [None]:
def clear_phrase(phrase):
    """Normalise a raw text field before lemmatisation.

    The phrase is lower-cased, every character other than a Latin letter,
    a Russian letter, a digit or a space is removed, runs of spaces are
    collapsed to one, and leading/trailing spaces are stripped.

    Args:
        phrase: raw string (may be empty).

    Returns:
        The cleaned string; '' for empty or punctuation-only input.
    """
    lower_cased = phrase.lower()
    # 'ё' (U+0451) sits outside the contiguous а-я range (U+0430-U+044F), so it
    # must be listed explicitly; otherwise it is deleted and words such as
    # "зелёная" are corrupted to "зелная" before they reach the lemmatiser.
    without_special_chars = re.sub(r"[^a-zа-яё0-9 ]", '', lower_cased)
    without_excess_spaces = re.sub(r" {2,}", ' ', without_special_chars)
    stripped = without_excess_spaces.strip()
    return stripped

In [None]:
# Shared analyzer instance, created once and reused for every token.
morph = pymorphy2.MorphAnalyzer()

def stem_phrase(phrase):
    """Replace every whitespace-separated token with a normal form from pymorphy2.

    For each token the first entry returned by `morph.parse` is used.

    Args:
        phrase: string of tokens separated by whitespace.

    Returns:
        The normal forms joined with single spaces ('' for an empty phrase).
    """
    normal_forms = []
    for token in phrase.split():
        first_parse = morph.parse(token)[0]
        normal_forms.append(first_parse.normal_form)
    return ' '.join(normal_forms)

In [None]:
# One TF-IDF vocabulary (1- to 3-grams, L2-normalised rows) shared by both text columns.
# A float min_df is interpreted by scikit-learn as a minimum document-frequency proportion.
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('russian'), ngram_range=(1, 3), min_df=1e-3, norm='l2')

# Fit on the nomenclature and description texts together, after the same
# clear_phrase + stem_phrase preprocessing that to_vectors applies.
train_corpus = [stem_phrase(clear_phrase(i)) for i in list(train_df.nomenclature) + list(train_df.description)]

# The trailing ';' suppresses the estimator repr in the cell output.
tfidf_vectorizer.fit(train_corpus);

In [None]:
def to_vectors(df):
    """Turn a dataset frame into a TF-IDF feature matrix and a label list.

    The `nomenclature` and `description` columns are each cleaned, lemmatised
    and transformed with the shared `tfidf_vectorizer`; the two sparse
    matrices are then stacked side by side.

    Args:
        df: frame with `nomenclature`, `description` and `turnover` columns.

    Returns:
        Tuple (features, labels): the horizontally stacked sparse matrix and
        the `turnover` values as a list.
    """
    def vectorize(column):
        # Same preprocessing pipeline the vectorizer was fitted on.
        prepared = [stem_phrase(clear_phrase(text)) for text in column]
        return tfidf_vectorizer.transform(prepared)

    nom_x = vectorize(df.nomenclature)
    desc_x = vectorize(df.description)
    features = hstack((nom_x, desc_x))
    labels = list(df.turnover)
    return features, labels

In [None]:
# Vectorise every split with the vectorizer fitted on the training corpus.
x_train, y_train = to_vectors(train_df)
x_test, y_test = to_vectors(test_df)
x_orig_test, y_orig_test = to_vectors(orig_test_df)

## Train

In [None]:
# Linear classifier trained with SGD on the logistic loss and an elastic-net penalty.
# early_stopping=True makes scikit-learn hold out part of the training data and stop
# once the validation score has not improved for n_iter_no_change epochs.
# NOTE(review): scikit-learn 1.1 renamed this loss to 'log_loss' and 1.3 removed 'log';
# confirm the pinned scikit-learn version before upgrading.
model = SGDClassifier(
    loss='log',
    penalty='elasticnet',
    shuffle=True,
    max_iter=MAX_EPOCHS,
    early_stopping=True,
    n_iter_no_change=EARLY_STOP_PATIENCE,
    random_state=42
)

In [None]:
# Wall-clock timestamp taken right before fitting.
training_started_at = time()

model.fit(x_train, y_train)

# Elapsed fitting time in seconds; reported in the final JSON.
training_time = time() - training_started_at

## Evaluation

In [None]:
def get_report(y_true, y_pred_proba):
    """Compute accuracy and log loss for predicted class probabilities.

    Args:
        y_true: true class labels; they are compared with the column index of
            the highest probability, so labels must match column positions.
        y_pred_proba: 2-D array, one row per sample and one column per class.

    Returns:
        dict with keys 'accuracy' and 'log_loss', each rounded to 4 decimals.
    """
    y_pred = y_pred_proba.argmax(axis=1)
    return {
        'accuracy': round(accuracy_score(y_true, y_pred), 4),
        # The precision belongs to round(), not to log_loss(): previously the 4 was
        # passed into log_loss (as `eps` on old scikit-learn, a TypeError on new
        # versions) and the loss itself was rounded to a whole number.
        'log_loss': round(log_loss(y_true, y_pred_proba), 4)
    }

In [None]:
def expand_to_orig_size(y_pred_proba, n_classes=None):
    """Right-pad a probability matrix with zero columns up to a target class count.

    Args:
        y_pred_proba: 2-D array, one row per sample and one column per class.
        n_classes: desired number of columns. When omitted it defaults to
            max(y_orig_test) + 1, i.e. the module-level original-test labels
            determine the width (the previous, hard-wired behaviour).

    Returns:
        A new array whose extra trailing columns are all 0. If the input
        already has at least `n_classes` columns it is returned unpadded
        (as a copy) instead of raising.
    """
    if n_classes is None:
        n_classes = max(y_orig_test) + 1
    # np.pad rejects negative widths, so clamp at 0 when the matrix is already wide enough.
    size_diff = max(0, n_classes - y_pred_proba.shape[1])
    return np.pad(y_pred_proba, ((0, 0), (0, size_diff)), 'constant', constant_values=(0, 0))

In [None]:
# Per-class probabilities for both test sets; one column per class the model was trained on.
y_test_pred_proba = model.predict_proba(x_test)
y_orig_test_pred_proba = model.predict_proba(x_orig_test)

In [None]:
# Metrics on the cleared test set use the model's probabilities as they are; for the
# original test set the matrix is first zero-padded to the original label space.
cleared_report = get_report(y_test, y_test_pred_proba)
orig_report = get_report(y_orig_test, expand_to_orig_size(y_orig_test_pred_proba))

report = {
    'Name': 'TF-IDF Logistic Regression',
    '[Cleared Test] Accuracy': cleared_report['accuracy'],
    '[Cleared Test] Log Loss': cleared_report['log_loss'],
    '[Original Test] Accuracy': orig_report['accuracy'],
    '[Original Test] Log Loss': orig_report['log_loss'],
    'Training time': strftime("%Hh %Mm %Ss", gmtime(training_time)),
    'Training time (sec)': int(training_time),
    # NOTE(review): assumes training ended through early stopping; if MAX_EPOCHS was
    # reached instead, this is not the epoch of the best model — confirm.
    'Model epoch': model.n_iter_ - EARLY_STOP_PATIENCE,
    'Epochs': model.n_iter_
}

# Create the output directory first so a missing folder cannot discard the
# results of a finished training run at the very last step.
report_path = Path('report') / 'base.json'
report_path.parent.mkdir(parents=True, exist_ok=True)
with open(report_path, 'w') as fout:
    json.dump(report, fout, indent=4)