# 1. SETTINGS

In [97]:
# libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
import scipy.stats
from datetime import datetime
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas('PROGRESS>>>')

In [98]:
%matplotlib inline

In [99]:
# import datasets
#test  = pd.read_csv("../data/raw/Challenge_20180423.csv")
cust  = pd.read_csv("../data/raw/Customer.csv")
#bond  = pd.read_csv("../data/raw/Isin.csv")
#markt = pd.read_csv("../data/raw/Market.csv")
#macro = pd.read_csv("../data/raw/MarketData_Macro.csv")
trade = pd.read_csv("../data/raw/Trade.csv")

In [100]:
# pandas options
pd.set_option("display.max_columns", None)

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# garbage collection
import gc
gc.enable()

# Merge Trade with Isin

In [101]:
trade_new = trade.merge(cust, on='CustomerIdx', how='left')
tradedf = trade_new[['TradeDateKey','CustomerIdx', 'NotionalEUR', 'CustomerInterest', 'Sector', 'Subsector', 'Region', 'Country']]
tradedf['TradeDateKey'] = pd.to_datetime(tradedf['TradeDateKey'], format='%Y%m%d', errors='ignore')

### Customer Features

In [116]:
def smooth_mean(dataset, grouping_feature): 
    K = dataset.groupby(grouping_feature).size()
    mean_feat_y = dataset.groupby(grouping_feature)['CustomerInterest'].mean()
    global_mean_y = dataset['CustomerInterest'].mean()
    smooth_mean = (mean_feat_y*K + global_mean_y*10)/(K+10)
    return smooth_mean

def smooth_min(dataset, grouping_feature): 
    K = dataset.groupby(grouping_feature).size()
    min_feat_y = dataset.groupby(grouping_feature)['CustomerInterest'].min()
    global_min_y = dataset['CustomerInterest'].min()
    smooth_min = (min_feat_y*K + global_min_y*10)/(K+10)
    return smooth_min

def smooth_max(dataset, grouping_feature): 
    K = dataset.groupby(grouping_feature).size()
    max_feat_y = dataset.groupby(grouping_feature)['CustomerInterest'].max()
    global_max_y = dataset['CustomerInterest'].max()
    smooth_max = (max_feat_y*K + global_max_y*10)/(K+10)
    return smooth_max

def smooth_std(dataset, grouping_feature): 
    K = dataset.groupby(grouping_feature).size()
    std_feat_y = dataset.groupby(grouping_feature, as_index = False)['CustomerInterest'].std()
    std_feat_y[std_feat_y.isnull()] = 0
    global_std_y = dataset['CustomerInterest'].std()
    smooth_std = (std_feat_y*K + global_std_y*10)/(K+10)
    return smooth_std

In [104]:
def smooth_stat(dataset, grouping_features, type_of_stat, target_feature='CustomerInterest', alpha=10):
    '''dataset = set for the grouping
       grouping_feature = what we want to group on
       target_feature = what value should be calculated, here our target variable is by default,
       however, you can use other features, in the best case continous and highly correlated to the target value,
       it can help to prevent overfitting
       type_of_stat: mean min max std
       
       
       
       Note: for grouping variables where one has only one observation, 
       I decided to put zero instead f NA when we calculate standard deviation
       however, we can change it if needed
       '''
    K = dataset.groupby(grouping_feature).size()
    if (type_of_stat=='std'):
        stat_feat_y = dataset.groupby(grouping_features)[target_feature].std()
        stat_feat_y[stat_feat_y.isnull()] = 0
        global_stat_y = dataset[target_feature].std()
    elif (type_of_stat=='min'):
        stat_feat_y = dataset.groupby(grouping_features)[target_feature].min()
        global_stat_y = dataset[target_feature].min()
    elif (type_of_stat=='max'):
        stat_feat_y = dataset.groupby(grouping_features)[target_feature].max()
        global_stat_y = dataset[target_feature].max()
    elif (type_of_stat=='mean'):
        stat_feat_y = dataset.groupby(grouping_features)[target_feature].mean()
        global_stat_y = dataset[target_feature].mean()
    smooth_stat = (stat_feat_y*K + global_stat_y*alpha)/(K+alpha)
    return smooth_stat

# Merging stats to the data and wrapper

In [105]:
def group_and_merge(data_for_calculation, data_to_merge, grouping_features='CustomerIdx'):
    for stat in ['min', 'max', 'std', 'mean']:
        temp = smooth_stat(data_for_calculation, grouping_features, type_of_stat=stat)          
        if len(grouping_features) > 1:
            pass
            '''temp_df = pd.DataFrame(columns=eval(f'{grouping_features}') + [f'{grouping_features}_{stat}'])
            temp_df[eval(f'{grouping_features}')] 
            temp_df[f'{grouping_features}_{stat}'] = temp.values
            data_to_merge = data_to_merge.merge(temp_df, left_on=eval(f'{grouping_features}'), right_on=f'{grouping_features}', how='left')'''
        else:
            temp_df = pd.DataFrame(columns=[f'{grouping_features}',f'{grouping_features}_{stat}'])
            temp_df[f'{grouping_features}'] = temp.index.values 
            temp_df[f'{grouping_features}_{stat}'] = temp
            data_to_merge = data_to_merge.merge(temp_df, on=f'{grouping_features}', how='left')
    return data_to_merge

#### Tests

In [192]:
f"mean : {all(smooth_stat(tradedf, 'CustomerIdx', type_of_stat='mean') == smooth_mean(tradedf, 'CustomerIdx'))} \
  min :  {all(smooth_stat(tradedf, 'CustomerIdx', type_of_stat='min') == smooth_min(tradedf, 'CustomerIdx'))}  \
  max : {all(smooth_stat(tradedf, 'CustomerIdx', type_of_stat='max') == smooth_max(tradedf, 'CustomerIdx'))}  \
  std : {all(smooth_stat(tradedf, 'CustomerIdx', type_of_stat='std') == smooth_std(tradedf, 'CustomerIdx'))} "

'mean : True   min :  True    max : True    std : True '

In [106]:
new_d = group_and_merge(tradedf, tradedf, grouping_feature)
new_d

KeyError: "['CustomerIdx']"