In [None]:
import pickle
import json
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix

In [None]:
# Relative path to the shared resources directory; load_dfs reads the
# turnover dataset files and the pickled label encoder from beneath it.
RESOURCES_PATH = '../../../../resources'

In [None]:
# Names of optional metrics to add to each report. get_report includes a
# confusion matrix only when this list contains 'confusion_matrix'.
ADDITIONAL_REPORT_METRICS = []

In [None]:
# TODO Move to commons

def load_dfs():
    """Load the turnover train / test splits with label-encoded targets.

    Reads the three tab-separated files under
    ``RESOURCES_PATH/dataset/turnover``, replaces every missing value with an
    empty string, and maps the ``turnover`` column of each frame through the
    label encoder unpickled from ``label_encoder.pkl``.

    Returns:
        A 3-tuple ``(train_df, test_df, original_test_df)`` of DataFrames
        built from ``cleared_train.tsv``, ``cleared_test.tsv`` and
        ``original_test.tsv`` respectively.
    """
    tsv_paths = (
        f'{RESOURCES_PATH}/dataset/turnover/cleared_train.tsv',
        f'{RESOURCES_PATH}/dataset/turnover/cleared_test.tsv',
        f'{RESOURCES_PATH}/dataset/turnover/original_test.tsv',
    )
    frames = [pd.read_csv(tsv_path, sep='\t') for tsv_path in tsv_paths]

    # Missing cells become empty strings in every column of every frame.
    for frame in frames:
        frame.fillna('', inplace=True)

    # NOTE: unpickling runs arbitrary code; only point this at trusted files.
    with open(f'{RESOURCES_PATH}/dataset/turnover/label_encoder.pkl', 'rb') as fin:
        le = pickle.load(fin)

    # Replace the raw turnover labels with the encoder's output.
    for frame in frames:
        frame.turnover = le.transform(frame.turnover)

    return tuple(frames)

In [None]:
# Load the train, cleared-test and original-test frames; load_dfs has already
# label-encoded their turnover column.
train_df, test_df, original_test_df = load_dfs()

# Preview the first rows of the training frame.
train_df.head()

## Train

In [None]:
# Seed NumPy's global RNG so the np.random.choice draws in the next cell are
# repeatable when the notebook is run top to bottom.
np.random.seed(42)

In [None]:
# Baseline "model": ignore the features entirely and draw every predicted
# label at random, weighted by that label's relative frequency in train_df.
class_freq = train_df.turnover.value_counts(normalize=True)

# Both draws consume the global NumPy RNG, cleared split first.
cleared_test_pred = np.random.choice(class_freq.index, size=len(test_df), p=class_freq)

original_test_pred = np.random.choice(class_freq.index, size=len(original_test_df), p=class_freq)

## Evaluation

In [None]:
def get_report(y_pred, test_df):
    """Build a metrics dict for predictions against a labelled frame.

    Args:
        y_pred: Predicted labels, compared against ``test_df.turnover``.
        test_df: DataFrame whose ``turnover`` column holds the true labels.

    Returns:
        A dict with ``'accuracy'`` (rounded to 4 decimals) and ``'log_loss'``
        (always ``-1`` here; presumably a placeholder, since only hard labels
        are passed in). When ``'confusion_matrix'`` is listed in
        ``ADDITIONAL_REPORT_METRICS``, a ``'confusion_matrix'`` entry holding
        the matrix as nested lists is added.
    """
    targets = test_df.turnover.to_numpy()

    metrics = {
        'accuracy': round(accuracy_score(targets, y_pred), 4),
        'log_loss': -1,
    }

    wants_confusion = 'confusion_matrix' in ADDITIONAL_REPORT_METRICS
    if wants_confusion:
        # .tolist() turns the matrix into plain nested lists.
        metrics['confusion_matrix'] = confusion_matrix(targets, y_pred).tolist()

    return metrics

In [None]:
# Score the random-baseline predictions on each test split.
cleared_report = get_report(cleared_test_pred, test_df)
original_report = get_report(original_test_pred, original_test_df)

# Only the accuracy of each split goes into the persisted summary.
report = {'Name': 'Baseline'}
report['[Cleared Test] Accuracy'] = cleared_report['accuracy']
report['[Original Test] Accuracy'] = original_report['accuracy']

# Write the summary as indented JSON; the path is relative to the working directory.
with open('report/base.json', 'w') as fout:
    json.dump(report, fout, indent=4)