In [None]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## Load dataset

In [None]:
# Read the raw dataset from disk; the bare `df` on the last line renders it
# as the cell output.
original_csv_path = "../../resources/dataset/original.csv"
df = pd.read_csv(original_csv_path)
df

In [None]:
# Keep only the three columns the rest of the notebook works with.
# `.copy()` detaches the result from the frame returned by `read_csv`, so
# later cells can modify `df` without operating on a slice of another frame
# (which is what triggers pandas' SettingWithCopyWarning).
df = df[['nomenclature', 'description', 'turnover']].copy()
df

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Rows without a `turnover` value have no target, so they are removed.
# `dropna(subset=...)` builds a new frame instead of mutating `df` in place
# through an index-label lookup; the result is the same set of rows, and the
# cell no longer depends on `df` being safe to modify in place.
df = df.dropna(subset=['turnover'])
df.isnull().sum()

In [None]:
# Replace every remaining missing value with an empty string.
# Rebinding `df` (rather than `inplace=True`) avoids mutating a frame that may
# still be a slice of the originally loaded data.
# NOTE(review): empty strings written to TSV are parsed back as NaN by
# `pd.read_csv` with default settings, so consumers of the exported files may
# need `keep_default_na=False` or their own `fillna('')` - confirm downstream.
df = df.fillna('')
df.isnull().sum()

## Split into (original) train/test

In [None]:
# Remove every row whose `turnover` value occurs exactly once in `df`.
# Presumably required by the stratified split in the next cell, which needs
# at least two rows per class to place one on each side.
# The class frequencies are computed once and the rows are filtered with a
# boolean mask instead of an in-place drop by index label.
turnover_counts = df.turnover.value_counts()
singleton_turnovers = turnover_counts[turnover_counts == 1].index
df = df[~df.turnover.isin(singleton_turnovers)]

In [None]:
# Hold out a stratified 10% of the rows as the "original" test set. The other
# 90% is bound back to `df`, so re-running this cell without re-running the
# cells above would split the already reduced frame again.
df, original_test_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df['turnover'],
)
len(df), len(original_test_df)

In [None]:
# Persist the held-out original test rows as a tab-separated file, without
# the index column.
original_test_df.to_csv(
    '../../resources/dataset/turnover/original_test.tsv',
    sep='\t',
    index=False,
)

## Reduce train to unique (nomenclature, description) tuples

In [None]:
# Collapse duplicate rows: one row per distinct
# (nomenclature, description, turnover) triple, with the number of times the
# triple occurred stored in the `count` column.
triple_sizes = df.groupby(['nomenclature', 'description', 'turnover']).size()
partly_unique_df = triple_sizes.reset_index(name='count')
partly_unique_df

In [None]:
# For every (nomenclature, description) pair keep exactly one row: the
# turnover that was observed most often for that pair.
#
# `idxmax` returns a single index label per group, so a tie between two
# turnovers no longer leaves the same pair in the result with conflicting
# targets. Ties are resolved deterministically in favour of the first row of
# the group in `partly_unique_df`. It also avoids `groupby(...).apply` over a
# frame that still contains the grouping columns, which recent pandas
# versions deprecate.
majority_row_labels = (
    partly_unique_df
    .groupby(['nomenclature', 'description'])['count']
    .idxmax()
)
unique_df = partly_unique_df.loc[majority_row_labels].reset_index(drop=True)

# Each (nomenclature, description) pair must now appear exactly once.
assert not unique_df.duplicated(['nomenclature', 'description']).any()
unique_df

In [None]:
# Share of rows (weighted by their occurrence counts) that did not survive
# the reduction from `partly_unique_df` to `unique_df`.
kept_rows = unique_df["count"].sum()
total_rows = partly_unique_df["count"].sum()
removed_percent = round((1 - kept_rows / total_rows) * 100, 2)
print(f'Removed from original dataset: {removed_percent}%')

## Remove targets that are too rare

In [None]:
# Number of distinct values in each column of the de-duplicated frame.
unique_df.nunique(axis=0)

In [None]:
# Per-turnover statistics, indexed by turnover:
#   count          - number of rows in `unique_df` carrying this turnover
#   original_count - sum of their `count` values, i.e. how many rows they
#                    stood for before duplicates were collapsed
rare_turnover_df = unique_df.groupby('turnover')['count'].agg(
    count='count',
    original_count='sum',
)
rare_turnover_df

In [None]:
# A turnover is treated as rare when it has fewer than 7 rows in `unique_df`
# AND those rows stood for fewer than 150 rows before de-duplication.
has_few_unique_rows = rare_turnover_df['count'].lt(7)
has_few_original_rows = rare_turnover_df['original_count'].lt(150)
rare_turnovers = rare_turnover_df.loc[has_few_unique_rows & has_few_original_rows].index
len(rare_turnovers)

In [None]:
# Keep only the rows whose turnover is not in the rare set.
is_common_turnover = ~unique_df['turnover'].isin(rare_turnovers)
cleared_df = unique_df.loc[is_common_turnover]
cleared_df

## Split the cleared_df dataset into train/test

In [None]:
# Stratified 80/20 split of the cleared rows, seeded for reproducibility.
cleared_train_df, cleared_test_df = train_test_split(
    cleared_df,
    test_size=0.2,
    random_state=42,
    stratify=cleared_df['turnover'],
)
len(cleared_train_df), len(cleared_test_df)

In [None]:
# Write both cleared splits as tab-separated files, without the index column.
for split_df, tsv_path in (
    (cleared_train_df, '../../resources/dataset/turnover/cleared_train.tsv'),
    (cleared_test_df, '../../resources/dataset/turnover/cleared_test.tsv'),
):
    split_df.to_csv(tsv_path, index=False, sep='\t')

## List unique targets for label encoding

In [None]:
# Re-read the three exported splits from disk, so the encoder is built from
# the values as they are parsed back from the TSV files.
train_df = pd.read_csv('../../resources/dataset/turnover/cleared_train.tsv', sep='\t')
test_df = pd.read_csv('../../resources/dataset/turnover/cleared_test.tsv', sep='\t')
original_test_df = pd.read_csv('../../resources/dataset/turnover/original_test.tsv', sep='\t')

# Class order: all targets present in the cleared splits first (sorted), then
# the targets that only occur in the original test set (sorted). The cleared
# targets therefore occupy codes 0 .. len(cleared_targets) - 1.
cleared_targets = sorted(set(train_df.turnover.unique()).union(set(test_df.turnover.unique())))
additional_original_targets = sorted(set(original_test_df.turnover.unique()).difference(set(cleared_targets)))

le = LabelEncoder()
le.classes_ = np.array(cleared_targets + additional_original_targets)

# `classes_` is assigned by hand and is generally NOT globally sorted, whereas
# `fit` would always produce a sorted array. NOTE(review): for non-string
# labels scikit-learn's `transform` looks codes up in a way that assumes
# sorted classes, so verify that every target really encodes to its position
# in `classes_` before the encoder is persisted.
expected_code = {target: code for code, target in enumerate(cleared_targets + additional_original_targets)}
all_targets = pd.concat([train_df.turnover, test_df.turnover, original_test_df.turnover]).unique()
assert list(le.transform(all_targets)) == [expected_code[target] for target in all_targets], (
    'LabelEncoder with hand-assigned classes_ does not map targets to their positions'
)

with open('../../resources/dataset/turnover/label_encoder.pkl', 'wb') as fout:
    pickle.dump(le, fout, pickle.HIGHEST_PROTOCOL)

# Highest code used by each split (train, test, original test).
le.transform(train_df.turnover.unique()).max(), le.transform(test_df.turnover.unique()).max(), le.transform(original_test_df.turnover.unique()).max()

In [None]:
# One cleared target per line.
print("\n".join(str(target) for target in cleared_targets))