In [None]:
# Base directory for project resources (a relative path). The cells below read the
# source CSV from, and write the TSV outputs to, its `dataset/` subdirectory.
RESOURCES_PATH = '../../resources'

In [None]:
from pathlib import Path
from datetime import datetime
import pandas as pd
import numpy as np

## Load dataset

In [None]:
# Read the source CSV into a DataFrame and display it.
source_csv = f'{RESOURCES_PATH}/dataset/original.csv'
df = pd.read_csv(source_csv)
df

In [None]:
# Keep only the four columns used in the rest of the notebook. `.copy()` detaches
# the result from the loaded frame, so later modifications act on an independent
# DataFrame rather than on a slice (avoids pandas' SettingWithCopyWarning).
df = df[['object', 'financing', 'project', 'budget']].copy()
df

In [None]:
# Number of missing values per column (`isna` is the same check as `isnull`).
df.isna().sum()

In [None]:
# Discard rows that have no `budget` value. `dropna` filters on the null mask
# directly and returns a new frame, instead of dropping by index label in place
# on a frame that may be a slice of another one.
df = df.dropna(subset=['budget'])
df.isnull().sum()

In [None]:
# Treat the placeholder value 'БЕЗ ВЦС' in `financing` as missing.
# The replaced column is assigned back to the frame: an in-place `replace` on the
# `df.financing` accessor modifies a temporary Series and does not update `df`
# when pandas Copy-on-Write is active. `np.nan` is used because the `np.NaN`
# alias was removed in NumPy 2.0.
df = df.assign(financing=df['financing'].replace('БЕЗ ВЦС', np.nan))
df.isnull().sum()

In [None]:
# Share of missing values per column (null count divided by the row count).
df.isna().sum().div(len(df))

In [None]:
# Distinct values per column; missing values are not counted (pandas default).
df.nunique(dropna=True)

## Replace year-specific targets with the current-year target

In [None]:
# Two-digit suffix of the current calendar year (year minus 2000).
current_year = datetime.now().year - 2000

# Budget labels that carry a year suffix. Every variant from '... 2010' up to the
# year before the current one is mapped onto the current-year label.
year_specific_prefixes = ('Ппкс', 'Субсидия на ИЦ_ОЗОБ')
budget_renames = {
    f'{prefix} 20{i}': f'{prefix} 20{current_year}'
    for prefix in year_specific_prefixes
    for i in range(10, current_year)
}

# One mapping-based replace, assigned back to the frame: an in-place `replace` on
# the `df.budget` accessor does not update `df` under pandas Copy-on-Write.
df = df.assign(budget=df['budget'].replace(budget_renames))

#### Define original dataset

In [None]:
# Independent snapshot of the frame at this point; later changes to `df` do not affect it.
orig_df = df.copy(deep=True)

## Reduce to unique (object, financing, project) tuples

In [None]:
# Replace missing values with the string 'None' so that these rows are kept by the
# following groupby (pandas drops rows with NaN group keys by default).
# Rebinding `df` to the filled frame avoids mutating the existing object in place.
df = df.fillna('None')

In [None]:
# Count how many rows share each (object, financing, project, budget) combination;
# the group sizes are stored in a `count` column.
key_columns = ['object', 'financing', 'project', 'budget']
grouped_df = df.groupby(key_columns).size().reset_index(name='count')
grouped_df

In [None]:
# For every (object, financing, project) tuple keep only the budget row(s) whose
# `count` equals that tuple's maximum; tied maxima keep all tied rows.
# `transform('max')` broadcasts the per-tuple maximum back onto each row, so a plain
# boolean mask selects the rows. This avoids `groupby.apply` operating on the
# grouping columns, which is deprecated since pandas 2.2. Row order is unchanged
# because `grouped_df` comes out of a sorted groupby on these same keys.
max_count_per_tuple = (
    grouped_df.groupby(['object', 'financing', 'project'])['count'].transform('max')
)
unique_df = grouped_df[grouped_df['count'] == max_count_per_tuple].reset_index(drop=True)
unique_df

In [None]:
# Percentage of `df` rows not represented in `unique_df` (via the summed `count` column).
kept_share_unique = unique_df["count"].sum() / len(df)
removed_pct_unique = round((1 - kept_share_unique) * 100, 2)
print(f'Removed from original dataset (in current step): {removed_pct_unique}%')

In [None]:
# Distinct values per column of the de-duplicated frame (missing values are not counted).
unique_df.nunique(dropna=True)

## Clear too-rare targets (these targets are obviously deprecated by the business)

In [None]:
# Per-budget statistics, indexed by `budget`:
#   count          - number of `unique_df` rows with this budget
#   original_count - sum of those rows' `count` values
budget_count_df = unique_df.groupby('budget').agg(
    count=('count', 'count'),
    original_count=('count', 'sum'),
)
budget_count_df

In [None]:
# A budget is considered too rare when it appears in fewer than 3 rows of
# `unique_df` or its summed `count` is below 150.
has_few_tuples = budget_count_df['count'].lt(3)
has_few_rows = budget_count_df['original_count'].lt(150)
too_rare_budgets_df = budget_count_df.loc[has_few_tuples | has_few_rows]
too_rare_budgets_df

In [None]:
# Statistics of the budgets that are kept, i.e. all budgets except the too-rare ones.
budget_count_df.drop(index=too_rare_budgets_df.index)

In [None]:
# Drop every row whose budget was flagged as too rare.
is_too_rare = unique_df['budget'].isin(too_rare_budgets_df.index)
cleared_df = unique_df.loc[~is_too_rare]
cleared_df

In [None]:
# Percentage of `df` rows not represented in `cleared_df` (via the summed `count` column).
kept_share_cleared = cleared_df["count"].sum() / len(df)
removed_pct_cleared = round((1 - kept_share_cleared) * 100, 2)
print(f'Removed from original dataset (in current step): {removed_pct_cleared}%')

## Save to TSV

In [None]:
# Make sure the output directory exists (parents are created; an existing directory is fine).
budget_dir = Path(f'{RESOURCES_PATH}/dataset/budget')
budget_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Write both frames as tab-separated files without the index column.
tsv_options = dict(index=False, sep='\t')
orig_df.to_csv(f'{RESOURCES_PATH}/dataset/budget/original.tsv', **tsv_options)
cleared_df.to_csv(f'{RESOURCES_PATH}/dataset/budget/cleared.tsv', **tsv_options)