# 1. SETTINGS

In [181]:
# libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
import scipy.stats
from datetime import datetime
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas('PROGRESS>>>')

In [182]:
%matplotlib inline

In [183]:
# import datasets
# Each raw CSV under ../data/raw is read into its own DataFrame using the
# pandas defaults (no explicit dtypes or date parsing are requested here).
# Name -> file: test = Challenge_20180423, cust = Customer, bond = Isin,
# markt = Market, macro = MarketData_Macro, trade = Trade.
test  = pd.read_csv("../data/raw/Challenge_20180423.csv")
cust  = pd.read_csv("../data/raw/Customer.csv")
bond  = pd.read_csv("../data/raw/Isin.csv")
markt = pd.read_csv("../data/raw/Market.csv")
macro = pd.read_csv("../data/raw/MarketData_Macro.csv")
trade = pd.read_csv("../data/raw/Trade.csv")

In [184]:
# pandas options
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

# ignore warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including pandas' SettingWithCopyWarning and deprecation notices.
import warnings
warnings.filterwarnings("ignore")

# garbage collection
# NOTE(review): the collector is normally enabled by default, so this call is
# presumably a no-op unless something disabled it earlier -- confirm.
import gc
gc.enable()

# Merge Trade with Customer

In [185]:
# Left join on CustomerIdx: every trade row is kept and gains the columns of
# the matching customer record (missing where no customer matches).
trade_new = pd.merge(trade, cust, how='left', on='CustomerIdx')

In [186]:
# Keep only the columns used for the customer-level features.
# .copy() makes tradedf an independent frame, so overwriting or adding columns
# on it later never writes into a slice of trade_new (the situation pandas
# reports as SettingWithCopyWarning).
tradedf = trade_new[['TradeDateKey','CustomerIdx', 'NotionalEUR', 'CustomerInterest', 'Sector', 'Subsector', 'Region', 'Country']].copy()

In [187]:
# Parse the %Y%m%d trade-date keys into datetimes.
# - assign() returns a new frame instead of writing a column into a frame that
#   may be a slice of trade_new.
# - errors='ignore' is intentionally not passed: with it, a single malformed key
#   makes to_datetime hand back the input unparsed without any signal (and the
#   option is deprecated in recent pandas). A bad key now raises here instead.
tradedf = tradedf.assign(
    TradeDateKey=pd.to_datetime(tradedf['TradeDateKey'], format='%Y%m%d')
)

### Customer Features

In [188]:
def smooth_mean(dataset, grouping_feature, alpha=10):
    '''Per-group mean of CustomerInterest, shrunk towards the global mean.

       dataset = DataFrame holding grouping_feature and 'CustomerInterest'
       grouping_feature = column (name) to group on
       alpha = weight given to the global mean; the group mean is weighted by
       the group size K, so small groups are pulled harder towards the global
       value. alpha=0 returns the plain group mean. Default 10.

       Returns a Series indexed by the group keys:
       (group_mean*K + global_mean*alpha) / (K + alpha)
       '''
    grouped = dataset.groupby(grouping_feature)
    K = grouped.size()
    mean_feat_y = grouped['CustomerInterest'].mean()
    global_mean_y = dataset['CustomerInterest'].mean()
    return (mean_feat_y*K + global_mean_y*alpha)/(K+alpha)

def smooth_min(dataset, grouping_feature, alpha=10):
    '''Per-group minimum of CustomerInterest, shrunk towards the global minimum.

       dataset = DataFrame holding grouping_feature and 'CustomerInterest'
       grouping_feature = column (name) to group on
       alpha = weight given to the global minimum; the group minimum is
       weighted by the group size K. alpha=0 returns the plain group minimum.
       Default 10.

       Returns a Series indexed by the group keys:
       (group_min*K + global_min*alpha) / (K + alpha)
       '''
    grouped = dataset.groupby(grouping_feature)
    K = grouped.size()
    min_feat_y = grouped['CustomerInterest'].min()
    global_min_y = dataset['CustomerInterest'].min()
    return (min_feat_y*K + global_min_y*alpha)/(K+alpha)

def smooth_max(dataset, grouping_feature, alpha=10):
    '''Per-group maximum of CustomerInterest, shrunk towards the global maximum.

       dataset = DataFrame holding grouping_feature and 'CustomerInterest'
       grouping_feature = column (name) to group on
       alpha = weight given to the global maximum; the group maximum is
       weighted by the group size K. alpha=0 returns the plain group maximum.
       Default 10.

       Returns a Series indexed by the group keys:
       (group_max*K + global_max*alpha) / (K + alpha)
       '''
    grouped = dataset.groupby(grouping_feature)
    K = grouped.size()
    max_feat_y = grouped['CustomerInterest'].max()
    global_max_y = dataset['CustomerInterest'].max()
    return (max_feat_y*K + global_max_y*alpha)/(K+alpha)

def smooth_std(dataset, grouping_feature, alpha=10):
    '''Per-group standard deviation of CustomerInterest, shrunk towards the
       global standard deviation.

       dataset = DataFrame holding grouping_feature and 'CustomerInterest'
       grouping_feature = column (name) to group on
       alpha = weight given to the global standard deviation; the group value
       is weighted by the group size K. Default 10.

       Note: a group whose standard deviation is undefined (NaN, e.g. a single
       observation) is treated as having a standard deviation of 0 before
       smoothing.

       Returns a Series indexed by the group keys:
       (group_std*K + global_std*alpha) / (K + alpha)
       '''
    grouped = dataset.groupby(grouping_feature)
    K = grouped.size()
    # NaN std (undefined for the group) is replaced by 0 so the smoothed value
    # is driven by the global term only.
    std_feat_y = grouped['CustomerInterest'].std().fillna(0)
    global_std_y = dataset['CustomerInterest'].std()
    return (std_feat_y*K + global_std_y*alpha)/(K+alpha)

In [189]:
smooth_std(tradedf, 'CustomerIdx')

CustomerIdx
0       0.086881
1       0.146612
2       0.156386
3       0.426507
4       0.156386
5       0.037235
6       0.126799
7       0.335113
8       0.079518
9       0.101991
10      0.027153
11      0.101991
12      0.426507
13      0.360891
14      0.004797
15      0.426507
16      0.091992
17      0.260643
18      0.293224
19      0.498442
20      0.312772
21      0.024563
22      0.002748
23      0.390965
24      0.123463
25      0.360891
26      0.390965
27      0.390965
28      0.106627
29      0.426507
          ...   
3441    0.390965
3442    0.012186
3443    0.120297
3444    0.360891
3445    0.008231
3446    0.390965
3447    0.109107
3448    0.360891
3449    0.426507
3450    0.470851
3451    0.018398
3452    0.426507
3453    0.187663
3454    0.180445
3455    0.029140
3456    0.203982
3457    0.053313
3458    0.246925
3459    0.161779
3460    0.027598
3461    0.060148
3462    0.036089
3463    0.028783
3464    0.035012
3465    0.011117
3466    0.360891
3467    0.426507
34

In [190]:
def smooth_stat(dataset, grouping_feature, type_of_stat, target_feature='CustomerInterest', alpha=10):
    '''Per-group statistic of target_feature, shrunk towards the same statistic
       computed over the whole dataset.

       dataset = DataFrame used for the grouping
       grouping_feature = column (name) we want to group on
       type_of_stat = one of 'mean', 'min', 'max', 'std'
       target_feature = column whose statistic is calculated; defaults to the
       target variable 'CustomerInterest'. Other features can be used too,
       ideally continuous ones that are highly correlated with the target,
       which can help to prevent overfitting.
       alpha = weight given to the global statistic; the group statistic is
       weighted by the group size K. alpha=0 returns the plain group statistic.

       Returns a Series indexed by the group keys:
       (group_stat*K + global_stat*alpha) / (K + alpha)

       Raises ValueError if type_of_stat is not one of the supported names.

       Note: for groups with only one observation the standard deviation is
       undefined (NaN); it is replaced by zero before smoothing. This can be
       changed if needed.
       '''
    supported_stats = ('mean', 'min', 'max', 'std')
    if type_of_stat not in supported_stats:
        # Without this guard an unknown name would fall through every branch
        # and fail later with an unrelated "variable referenced before
        # assignment" error.
        raise ValueError(
            f"type_of_stat must be one of {supported_stats}, got {type_of_stat!r}"
        )

    grouped = dataset.groupby(grouping_feature)
    K = grouped.size()
    # The supported names are exactly the pandas method names, on both the
    # grouped column and the full column.
    stat_feat_y = getattr(grouped[target_feature], type_of_stat)()
    global_stat_y = getattr(dataset[target_feature], type_of_stat)()
    if type_of_stat == 'std':
        stat_feat_y = stat_feat_y.fillna(0)
    return (stat_feat_y*K + global_stat_y*alpha)/(K+alpha)

# Merging stats to the data and wrapper

In [191]:
def group_and_merge(data_for_calculation, data_to_merge, grouping_feature='CustomerIdx'):
    '''Add the smoothed min / max / std / mean per group as new columns.

       data_for_calculation = DataFrame the smoothed statistics are computed on
       (passed to smooth_stat with its default target and alpha)
       data_to_merge = DataFrame that receives the new columns; it must contain
       grouping_feature
       grouping_feature = column (name) to group and join on

       Returns data_to_merge left-joined with four extra columns named
       '<grouping_feature>_min', '_max', '_std' and '_mean'. Rows whose group
       key does not occur in data_for_calculation get NaN in those columns.
       '''
    for stat in ['min', 'max', 'std', 'mean']:
        stat_column = f'{grouping_feature}_{stat}'
        smoothed = smooth_stat(data_for_calculation, grouping_feature, type_of_stat=stat)
        # rename + reset_index keeps every group key paired with its own value,
        # whatever the key dtype and even if the keys have gaps. Assigning the
        # Series into a separately built frame would instead align on index
        # labels and only be right when the keys are exactly 0..n-1.
        stat_df = smoothed.rename(stat_column).reset_index()
        data_to_merge = data_to_merge.merge(stat_df, on=grouping_feature, how='left')
    return data_to_merge

#### Tests

In [192]:
# Regression check: the generic smooth_stat must reproduce each of the four
# single-purpose helpers (smooth_mean / smooth_min / smooth_max / smooth_std)
# on tradedf grouped by CustomerIdx. The cell's value is a one-line summary.
# NOTE(review): the comparison is exact float equality (==), so a NaN in either
# result would make that statistic report False.
f"mean : {all(smooth_stat(tradedf, 'CustomerIdx', type_of_stat='mean') == smooth_mean(tradedf, 'CustomerIdx'))} \
  min :  {all(smooth_stat(tradedf, 'CustomerIdx', type_of_stat='min') == smooth_min(tradedf, 'CustomerIdx'))}  \
  max : {all(smooth_stat(tradedf, 'CustomerIdx', type_of_stat='max') == smooth_max(tradedf, 'CustomerIdx'))}  \
  std : {all(smooth_stat(tradedf, 'CustomerIdx', type_of_stat='std') == smooth_std(tradedf, 'CustomerIdx'))} "

'mean : True   min :  True    max : True    std : True '

In [193]:
new_d = group_and_merge(tradedf, tradedf, grouping_feature='CustomerIdx')
new_d

Unnamed: 0,TradeDateKey,CustomerIdx,NotionalEUR,CustomerInterest,Sector,Subsector,Region,Country,CustomerIdx_min,CustomerIdx_max,CustomerIdx_std,CustomerIdx_mean
0,2016-12-07,2789,653168.0,1.0,Asset Managers & Hedge Funds,Hedge Fund,Americas,USA,0.090909,1.0,0.426507,0.388282
1,2017-03-29,2574,1656487.0,1.0,Asset Managers & Hedge Funds,Independent Asset Manager,Americas,USA,0.000000,1.0,0.448672,0.717892
2,2017-04-18,2574,939673.0,1.0,Asset Managers & Hedge Funds,Independent Asset Manager,Americas,USA,0.000000,1.0,0.448672,0.717892
3,2017-03-10,2574,708082.0,1.0,Asset Managers & Hedge Funds,Independent Asset Manager,Americas,USA,0.000000,1.0,0.448672,0.717892
4,2016-11-16,2574,1147709.0,1.0,Asset Managers & Hedge Funds,Independent Asset Manager,Americas,USA,0.000000,1.0,0.448672,0.717892
5,2017-02-06,2398,492245.0,1.0,Asset Managers & Hedge Funds,Asset Mgr owned by Bank/Insur.,Americas,USA,0.000000,1.0,0.069991,0.005626
6,2016-07-18,2398,681613.0,1.0,Asset Managers & Hedge Funds,Asset Mgr owned by Bank/Insur.,Americas,USA,0.000000,1.0,0.069991,0.005626
7,2016-08-29,2398,586343.0,1.0,Asset Managers & Hedge Funds,Asset Mgr owned by Bank/Insur.,Americas,USA,0.000000,1.0,0.069991,0.005626
8,2016-07-05,2398,632673.0,1.0,Asset Managers & Hedge Funds,Asset Mgr owned by Bank/Insur.,Americas,USA,0.000000,1.0,0.069991,0.005626
9,2016-07-05,2398,632673.0,1.0,Asset Managers & Hedge Funds,Asset Mgr owned by Bank/Insur.,Americas,USA,0.000000,1.0,0.069991,0.005626
