In [1]:
import os
import sys
import gc
import warnings

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from tqdm import tqdm
from functools import partial
from scipy.stats import skew, kurtosis, iqr
from sklearn.externals import joblib

%matplotlib inline

In [2]:
pd.options.display.max_columns = None

In [4]:
PATH = os.path.join('..', 'input')

train = pd.read_csv(os.path.join(PATH, 'train.csv'))
test = pd.read_csv(os.path.join(PATH, 'test.csv'))

In [5]:
categorical_columns = [col for col in train.columns if train[col].dtype == 'object']

In [6]:
categorical_columns

['first_active_month', 'card_id']

In [26]:
train = train.sort_values('first_active_month').reset_index(drop=True)

In [27]:
train.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,outliers_mean,first_active_month_year,first_active_month_weekday,first_active_month_month,first_active_month_weekofyear,first_active_month_quarter,elapsed_time,days_feature_1,days_feature_2,days_feature_3
0,2011-11-01,C_ID_f25b3d3f13,3,1,1,0.645766,0.00707,2011,1,11,44,4,2284,6852,2284,2284
1,2011-11-01,C_ID_2dbbc2b7fd,3,3,1,-0.06956,0.016459,2011,1,11,44,4,2284,6852,6852,2284
2,2011-11-01,C_ID_0a70866829,3,1,1,-2.544361,0.00707,2011,1,11,44,4,2284,6852,2284,2284
3,2011-11-01,C_ID_08cb8e0333,3,3,1,-3.573636,0.016459,2011,1,11,44,4,2284,6852,6852,2284
4,2011-11-01,C_ID_d26de4d1bf,3,3,1,-0.693447,0.016459,2011,1,11,44,4,2284,6852,6852,2284


In [28]:
test = test.sort_values('first_active_month').reset_index(drop=True)

In [29]:
test.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,outliers_mean,first_active_month_year,first_active_month_weekday,first_active_month_month,first_active_month_weekofyear,first_active_month_quarter,elapsed_time,days_feature_1,days_feature_2,days_feature_3
0,2011-11-01,C_ID_b3334831be,3,1,1,0.00707,2011,1,11,44,4,2284,6852,2284,2284
1,2011-11-01,C_ID_af9b01c5b0,3,1,1,0.00707,2011,1,11,44,4,2284,6852,2284,2284
2,2011-11-01,C_ID_22f6678098,3,1,1,0.00707,2011,1,11,44,4,2284,6852,2284,2284
3,2011-12-01,C_ID_1ed74e7415,3,3,1,0.016459,2011,3,12,48,4,2254,6762,6762,2254
4,2011-12-01,C_ID_dcdf88cb4d,3,1,1,0.00707,2011,3,12,48,4,2254,6762,2254,2254


In [30]:
train.nunique()

first_active_month                   75
card_id                          201917
feature_1                             5
feature_2                             3
feature_3                             2
target                           197110
outliers_mean                        14
first_active_month_year               8
first_active_month_weekday            7
first_active_month_month             12
first_active_month_weekofyear        21
first_active_month_quarter            4
elapsed_time                         75
days_feature_1                      224
days_feature_2                      177
days_feature_3                       75
dtype: int64

In [31]:
test.nunique()

first_active_month                   75
card_id                          123623
feature_1                             5
feature_2                             3
feature_3                             2
outliers_mean                        14
first_active_month_year               8
first_active_month_weekday            7
first_active_month_month             12
first_active_month_weekofyear        21
first_active_month_quarter            4
elapsed_time                         75
days_feature_1                      223
days_feature_2                      171
days_feature_3                       76
dtype: int64

In [17]:
historical_transactions = pd.read_csv('../remove_outlier_data/historical_transactions.csv')

In [33]:
historical_transactions.nunique()

authorized_flag                2
card_id                   325540
city_id                      308
category_1                     2
installments                  14
category_3                     4
merchant_category_id         327
merchant_id               326311
month_lag                     14
purchase_amount           215013
purchase_date           16395299
category_2                     5
state_id                      25
subsector_id                  41
dtype: int64

In [35]:
historical_transactions = historical_transactions.sort_values('purchase_date').reset_index(drop=True)

In [36]:
historical_transactions.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,1,C_ID_da2090f28e,69,1,0,0,623,M_ID_f001319a61,-11,-0.686802,2017-01-01 00:00:08,1.0,9,4
1,1,C_ID_efced389a0,76,1,1,1,842,M_ID_18038b5ae7,-12,-0.56659,2017-01-01 00:00:59,3.0,2,37
2,1,C_ID_83561fe74a,233,1,1,1,661,M_ID_52d3026407,-13,-0.559227,2017-01-01 00:01:41,1.0,9,8
3,1,C_ID_479fd6392a,-1,0,1,1,839,M_ID_e5374dabc0,-1,-0.737892,2017-01-01 00:02:03,,-1,29
4,1,C_ID_1cf6056088,69,1,0,0,278,M_ID_2cf6dc1f6f,-4,0.004418,2017-01-01 00:02:12,1.0,9,37


In [24]:
new_merchant_transactions = pd.read_csv(os.path.join('../input', 'new_merchant_transactions.csv'))

In [8]:
new_merchant_transactions.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_415bb3a509,107,N,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,1.0,9,19
1,Y,C_ID_415bb3a509,140,N,1,B,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,1.0,9,19
2,Y,C_ID_415bb3a509,330,N,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,1.0,9,14
3,Y,C_ID_415bb3a509,-1,Y,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,,-1,8
4,Y,C_ID_ef55cf8d4b,-1,Y,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,,-1,29


In [9]:
new_merchant_transactions.nunique()

authorized_flag               1
card_id                  290001
city_id                     308
category_1                    2
installments                 15
category_3                    3
merchant_category_id        314
merchant_id              226129
month_lag                     2
purchase_amount           75190
purchase_date           1667025
category_2                    5
state_id                     25
subsector_id                 41
dtype: int64

In [6]:
new_merchant_transactions.authorized_flag.unique()

array([1])

In [4]:
merchants = pd.read_csv('../remove_outlier_data/merchants.csv')

In [5]:
merchants.head()

Unnamed: 0,merchant_id,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2
0,M_ID_838061e48c,8353,792,9,-0.05746,-0.05746,0,1,1,-0.4,9.666667,3,-2.25,18.666667,6,-2.32,13.916667,12,0,242,9,1.0
1,M_ID_9339d880ad,3184,840,20,-0.05746,-0.05746,0,1,1,-0.72,1.75,3,-0.74,1.291667,6,-0.57,1.6875,12,0,22,16,1.0
2,M_ID_e726bbae1e,447,690,1,-0.05746,-0.05746,0,1,1,-82.13,260.0,2,-82.13,260.0,2,-82.13,260.0,2,0,-1,5,5.0
3,M_ID_a70e9c5f81,5026,792,9,-0.05746,-0.05746,1,1,1,,1.666667,3,,4.666667,6,,3.833333,12,1,-1,-1,
4,M_ID_64456c37ce,2228,222,21,-0.05746,-0.05746,1,1,1,,0.5,3,,0.361111,6,,0.347222,12,1,-1,-1,


In [6]:
merchants.nunique()

merchant_id                    334633
merchant_group_id              109391
merchant_category_id              324
subsector_id                       41
numerical_1                       950
numerical_2                       944
category_1                          2
most_recent_sales_range             5
most_recent_purchases_range         5
avg_sales_lag3                   3372
avg_purchases_lag3             100003
active_months_lag3                  3
avg_sales_lag6                   4507
avg_purchases_lag6             135202
active_months_lag6                  6
avg_sales_lag12                  5009
avg_purchases_lag12            172917
active_months_lag12                12
category_4                          2
city_id                           271
state_id                           25
category_2                          5
dtype: int64

In [7]:
features = []

features += [f'f10{i}.pkl' for i in (2, 4)]
features += [f'f11{i}_{j}.pkl' for i in (1, 2) 
                               for j in ('Y', 'N')]
features += [f'f12{i}.pkl' for i in (1,)]
features += [f'f13{i}.pkl' for i in (1, 2)]

features += [f'f20{i}.pkl' for i in (2,)]
features += [f'f23{i}.pkl' for i in (1, 2)]

features += [f'f30{i}.pkl' for i in (2, 3, 4,)]

In [9]:
KEY = 'card_id'

train = pd.read_csv(os.path.join(PATH, 'train.csv'))
test = pd.read_csv(os.path.join(PATH, 'test.csv'))

for f in tqdm(features):
    t = pd.read_pickle(os.path.join('..', 'remove_outlier_feature', f))
    train = pd.merge(train, t, on=KEY, how='left')
    test = pd.merge(test, t, on=KEY, how='left')

100%|██████████| 15/15 [00:09<00:00,  1.60it/s]


In [10]:
cols = train.columns.values
for f in [
    'new_purchase_date_max', 'new_purchase_date_min',
    'hist_purchase_date_max', 'hist_purchase_date_min', 
    'Y_hist_auth_purchase_date_max', 'Y_hist_auth_purchase_date_min', 
    'N_hist_auth_purchase_date_max', 'N_hist_auth_purchase_date_min',
    'Y_new_auth_purchase_date_max', 'Y_new_auth_purchase_date_min', 
    'N_new_auth_purchase_date_max', 'N_new_auth_purchase_date_min',
]:
    if f in cols:
        train[f] = train[f].astype(np.int64) * 1e-9
        test[f] = test[f].astype(np.int64) * 1e-9

In [14]:
y = train['target']
del train['target']

In [11]:
train.nunique()

first_active_month                                 75
card_id                                        201917
feature_1                                           5
feature_2                                           3
feature_3                                           2
target                                          19435
hist_transactions_count                          1023
hist_category_1_sum                              1014
hist_category_1_mean                             2669
hist_category_2_nunique                             6
hist_category_3_nunique                             4
hist_merchant_id_nunique                          313
hist_state_id_nunique                              20
hist_subsector_id_nunique                          34
hist_city_id_nunique                               58
hist_merchant_category_id_nunique                  92
hist_installments_nunique                          13
hist_installments_mean                           7208
hist_installments_std       

In [15]:
for f in train.columns:
    print(f, train[f].nunique(), test[f].nunique())

first_active_month 75 75
card_id 201917 123623
feature_1 5 5
feature_2 3 3
feature_3 2 2
hist_transactions_count 1023 943
hist_category_1_sum 1014 922
hist_category_1_mean 2669 2356
hist_category_2_nunique 6 6
hist_category_3_nunique 4 4
hist_merchant_id_nunique 313 296
hist_state_id_nunique 20 19
hist_subsector_id_nunique 34 34
hist_city_id_nunique 58 49
hist_merchant_category_id_nunique 92 87
hist_installments_nunique 13 13
hist_installments_mean 7208 6294
hist_installments_std 6692 6460
hist_purchase_amount_sum 199346 122654
hist_purchase_amount_mean 10618 8478
hist_purchase_amount_max 74898 51993
hist_purchase_amount_min 1099 913
hist_purchase_amount_std 10885 9858
hist_purchase_month_median 23 23
hist_purchase_month_max 11 11
hist_purchase_month_min 11 11
hist_purchase_month_std 3718 3609
hist_purchase_date_max 186997 117307
hist_purchase_date_min 193984 119481
hist_month_diff_median 27 27
hist_month_diff_max 14 14
hist_month_diff_min 14 14
hist_month_diff_std 2107 1998
hist_purch

new_purchase_date_diff 62 62
new_purchase_date_average 1252 1160
new_purchase_date_uptonow 406 401
new_cumusum_sum_purchase_amount0 72398 49986
new_cumusum_sum_purchase_amount1 105796 72351
new_cumsum_count_purchase_amount0 60 53
new_cumsum_count_purchase_amount1 87 80
new_mean_installments_1 591 498
new_mean_installments_2 543 441
sum_numerical_sum 198140 121484
sum_numerical_mean 197611 121128
rate_lag3_sum 201280 123266
rate_lag3_mean 201031 123090
rate_lag6_sum 201274 123266
rate_lag6_mean 201001 123075
rate_lag12_sum 201279 123266
rate_lag12_mean 201026 123091
hist_merchants_merchant_group_id_nunique 219 215
hist_merchants_merchant_category_id_nunique 87 82
hist_merchants_numerical_1_sum 21362 19186
hist_merchants_numerical_1_mean 16841 15285
hist_merchants_numerical_2_sum 21254 19077
hist_merchants_numerical_2_mean 16823 15317
hist_merchants_category_1_sum 383 336
hist_merchants_category_2_mean 2247 2173
hist_merchants_category_4_sum 854 795
new_merchants_merchant_group_id_nuniqu

In [23]:
historical_transactions.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,installments_exception
0,1,C_ID_4e6213e9bc,88,1,0,0,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37,0
1,1,C_ID_4e6213e9bc,88,1,0,0,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16,0
2,1,C_ID_4e6213e9bc,88,1,0,0,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37,0
3,1,C_ID_4e6213e9bc,88,1,0,0,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34,0
4,1,C_ID_4e6213e9bc,88,1,0,0,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37,0


In [22]:
historical_transactions['installments_exception'] = historical_transactions['installments'].apply(lambda x: np.where(x == -1, 1, 0))

In [25]:
new_merchant_transactions.query('installments == -1')

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
10,Y,C_ID_ef55cf8d4b,69,N,-1,,45,M_ID_3ffd43b4cd,1,4.452265,2018-03-31 09:55:40,1.0,9,18
189,Y,C_ID_fd55871fd8,187,N,-1,,367,M_ID_2a7d376053,1,-0.329366,2018-02-10 09:33:55,1.0,15,16
270,Y,C_ID_6bf003900f,69,N,-1,,884,M_ID_1b3e5a9b05,1,-0.596673,2018-03-02 18:00:10,1.0,9,27
306,Y,C_ID_e4d60467c2,69,N,-1,,274,M_ID_2da56d089d,1,-0.596643,2018-03-22 13:49:08,1.0,9,36
402,Y,C_ID_cb2269c52d,213,N,-1,,34,M_ID_6972fc7624,2,-0.416325,2018-04-26 14:36:11,1.0,9,38
410,Y,C_ID_cb2269c52d,213,N,-1,,834,M_ID_59f6ac5d37,1,-0.410524,2018-03-09 14:26:28,1.0,9,27
447,Y,C_ID_b25d492593,308,N,-1,,422,M_ID_9b6a46e720,2,-0.686802,2018-04-13 12:01:02,1.0,16,27
448,Y,C_ID_b25d492593,308,N,-1,,45,M_ID_81dcc31929,2,-0.536537,2018-04-21 07:29:12,1.0,16,18
468,Y,C_ID_2bba08c2c4,107,N,-1,,518,M_ID_f95f2434d4,2,-0.611804,2018-04-20 08:34:02,4.0,4,27
641,Y,C_ID_c5bf64ecbd,19,N,-1,,434,M_ID_3dfd6d6de7,1,2.108129,2018-03-10 08:12:11,1.0,9,32


In [26]:
historical_transactions.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,installments_exception
0,1,C_ID_4e6213e9bc,88,1,0,0,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37,0
1,1,C_ID_4e6213e9bc,88,1,0,0,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16,0
2,1,C_ID_4e6213e9bc,88,1,0,0,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37,0
3,1,C_ID_4e6213e9bc,88,1,0,0,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34,0
4,1,C_ID_4e6213e9bc,88,1,0,0,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37,0


In [27]:
train.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,hist_transactions_count,hist_category_1_sum,hist_category_1_mean,hist_category_2_nunique,hist_category_3_nunique,hist_merchant_id_nunique,hist_state_id_nunique,hist_subsector_id_nunique,hist_city_id_nunique,hist_merchant_category_id_nunique,hist_installments_nunique,hist_installments_mean,hist_installments_std,hist_purchase_amount_sum,hist_purchase_amount_mean,hist_purchase_amount_max,hist_purchase_amount_min,hist_purchase_amount_std,hist_purchase_month_median,hist_purchase_month_max,hist_purchase_month_min,hist_purchase_month_std,hist_purchase_date_max,hist_purchase_date_min,hist_month_diff_median,hist_month_diff_max,hist_month_diff_min,hist_month_diff_std,hist_purchase_date_diff,hist_purchase_date_average,hist_purchase_date_uptonow,hist_last_180_installments_mean,hist_last_180_installments_std,hist_last_180_purchase_amount_mean,hist_last_180_purchase_amount_std,hist_last_180_purchase_amount_sum,hist_last_30_installments_mean,hist_last_30_installments_std,hist_last_30_purchase_amount_mean,hist_last_30_purchase_amount_std,hist_last_30_purchase_amount_sum,hist_last_7_installments_mean,hist_last_7_installments_std,hist_last_7_purchase_amount_mean,hist_last_7_purchase_amount_std,hist_last_7_purchase_amount_sum,hist_last_90_installments_mean,hist_last_90_installments_std,hist_last_90_purchase_amount_mean,hist_last_90_purchase_amount_std,hist_last_90_purchase_amount_sum,Y_hist_auth_category_1_sum,Y_hist_auth_category_1_mean,Y_hist_auth_category_2_nunique,Y_hist_auth_category_3_nunique,Y_hist_auth_merchant_id_nunique,Y_hist_auth_state_id_nunique,Y_hist_auth_subsector_id_nunique,Y_hist_auth_city_id_nunique,Y_hist_auth_merchant_category_id_nunique,Y_hist_auth_installments_nunique,Y_hist_auth_installments_mean,Y_hist_auth_installments_std,Y_hist_auth_purchase_amount_sum,Y_hist_auth_purchase_amount_mean,Y_hist_auth_purchase_amount_max,Y_hist_auth_purchase_amount_min,Y_hist_auth_purchase_amount_std,Y_hist_auth_purchase_month_median,Y_hist_auth_purchase_month_max,Y_hist_auth_purchase_month_min,Y_hist_auth_purchase_month_std,Y_hist_auth_purchase_date_max,Y_hist_auth_purchase_date_min,Y_hist_auth_month_diff_median,Y_hist_auth_month_diff_max,Y_hist_auth_month_diff_min,Y_hist_auth_month_diff_std,Y_hist_auth_purchase_date_diff,Y_hist_auth_purchase_date_average,Y_hist_auth_purchase_date_uptonow,N_hist_auth_category_1_sum,N_hist_auth_category_1_mean,N_hist_auth_category_2_nunique,N_hist_auth_category_3_nunique,N_hist_auth_merchant_id_nunique,N_hist_auth_state_id_nunique,N_hist_auth_subsector_id_nunique,N_hist_auth_city_id_nunique,N_hist_auth_merchant_category_id_nunique,N_hist_auth_installments_nunique,N_hist_auth_installments_mean,N_hist_auth_installments_std,N_hist_auth_purchase_amount_sum,N_hist_auth_purchase_amount_mean,N_hist_auth_purchase_amount_max,N_hist_auth_purchase_amount_min,N_hist_auth_purchase_amount_std,N_hist_auth_purchase_month_median,N_hist_auth_purchase_month_max,N_hist_auth_purchase_month_min,N_hist_auth_purchase_month_std,N_hist_auth_purchase_date_max,N_hist_auth_purchase_date_min,N_hist_auth_month_diff_median,N_hist_auth_month_diff_max,N_hist_auth_month_diff_min,N_hist_auth_month_diff_std,N_hist_auth_purchase_date_diff,N_hist_auth_purchase_date_average,N_hist_auth_purchase_date_uptonow,hist_Y_month_lag_mean,hist_Y_month_lag_std,hist_Y_purchase_amount_min_mean,hist_Y_purchase_amount_min_std,hist_Y_purchase_amount_max_mean,hist_Y_purchase_amount_max_std,hist_Y_purchase_amount_mean_mean,hist_Y_purchase_amount_mean_std,hist_Y_purchase_amount_std_mean,hist_Y_purchase_amount_std_std,hist_Y_installments_mean_mean,hist_Y_installments_mean_std,hist_Y_installments_sum_mean,hist_Y_installments_sum_std,hist_Y_installments_std_mean,hist_Y_installments_std_std,hist_N_month_lag_mean,hist_N_month_lag_std,hist_N_purchase_amount_min_mean,hist_N_purchase_amount_min_std,hist_N_purchase_amount_max_mean,hist_N_purchase_amount_max_std,hist_N_purchase_amount_mean_mean,hist_N_purchase_amount_mean_std,hist_N_purchase_amount_std_mean,hist_N_purchase_amount_std_std,hist_N_installments_mean_mean,hist_N_installments_mean_std,hist_N_installments_sum_mean,hist_N_installments_sum_std,hist_N_installments_std_mean,hist_N_installments_std_std,hist_1_2017_rate,hist_2_2017_rate,hist_3_2017_rate,hist_4_2017_rate,hist_5_2017_rate,hist_6_2017_rate,hist_7_2017_rate,hist_8_2017_rate,hist_9_2017_rate,hist_10_2017_rate,hist_11_2017_rate,hist_12_2017_rate,hist_1_2018_rate,hist_2_2018_rate,hist_cumusum_sum_purchase_amount0,hist_cumusum_sum_purchase_amount1,hist_cumusum_sum_purchase_amount2,hist_cumusum_sum_purchase_amount3,hist_cumusum_sum_purchase_amount4,hist_cumusum_sum_purchase_amount5,hist_cumusum_sum_purchase_amount6,hist_cumusum_sum_purchase_amount7,hist_cumusum_sum_purchase_amount8,hist_cumusum_sum_purchase_amount9,hist_cumusum_sum_purchase_amount10,hist_cumusum_sum_purchase_amount11,hist_cumusum_sum_purchase_amount12,hist_cumusum_sum_purchase_amount13,hist_cumsum_count_purchase_amount0,hist_cumsum_count_purchase_amount1,hist_cumsum_count_purchase_amount2,hist_cumsum_count_purchase_amount3,hist_cumsum_count_purchase_amount4,hist_cumsum_count_purchase_amount5,hist_cumsum_count_purchase_amount6,hist_cumsum_count_purchase_amount7,hist_cumsum_count_purchase_amount8,hist_cumsum_count_purchase_amount9,hist_cumsum_count_purchase_amount10,hist_cumsum_count_purchase_amount11,hist_cumsum_count_purchase_amount12,hist_cumsum_count_purchase_amount13,hist_mean_installments_13,hist_mean_installments_12,hist_mean_installments_11,hist_mean_installments_10,hist_mean_installments_9,hist_mean_installments_8,hist_mean_installments_7,hist_mean_installments_6,hist_mean_installments_5,hist_mean_installments_4,hist_mean_installments_3,hist_mean_installments_2,hist_mean_installments_1,hist_mean_installments_0,new_transactions_count,new_category_1_sum,new_category_1_mean,new_category_2_nunique,new_category_3_nunique,new_merchant_id_nunique,new_state_id_nunique,new_subsector_id_nunique,new_city_id_nunique,new_merchant_category_id_nunique,new_installments_nunique,new_installments_mean,new_installments_std,new_purchase_amount_sum,new_purchase_amount_mean,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_std,new_purchase_month_median,new_purchase_month_max,new_purchase_month_min,new_purchase_month_std,new_purchase_date_max,new_purchase_date_min,new_month_diff_median,new_month_diff_max,new_month_diff_min,new_month_diff_std,new_purchase_date_diff,new_purchase_date_average,new_purchase_date_uptonow,new_cumusum_sum_purchase_amount0,new_cumusum_sum_purchase_amount1,new_cumsum_count_purchase_amount0,new_cumsum_count_purchase_amount1,new_mean_installments_1,new_mean_installments_2,sum_numerical_sum,sum_numerical_mean,rate_lag3_sum,rate_lag3_mean,rate_lag6_sum,rate_lag6_mean,rate_lag12_sum,rate_lag12_mean,hist_merchants_merchant_group_id_nunique,hist_merchants_merchant_category_id_nunique,hist_merchants_numerical_1_sum,hist_merchants_numerical_1_mean,hist_merchants_numerical_2_sum,hist_merchants_numerical_2_mean,hist_merchants_category_1_sum,hist_merchants_category_2_mean,hist_merchants_category_4_sum,new_merchants_merchant_group_id_nunique,new_merchants_merchant_category_id_nunique,new_merchants_numerical_1_sum,new_merchants_numerical_1_mean,new_merchants_numerical_2_sum,new_merchants_numerical_2_mean,new_merchants_category_1_sum,new_merchants_category_2_mean,new_merchants_category_4_sum
0,2017-06,C_ID_92a2005557,5,2,1,260,260,1.0,2,2,94,3,21,7,41,2,0.015381,0.123291,-165.968735,-0.638184,2.258394,-0.739258,0.212158,8.0,12,1,3.474609,1519551000.0,1498573000.0,-1.0,0,-1,0.342041,242,0.930664,3,0.018293,0.134418,-0.610294,0.258191,-100.088244,0.0,0.0,-0.675903,0.063957,-16.22166,0.0,0.0,-0.577446,0.06472,-2.309785,0.0,0.0,-0.589275,0.319941,-60.106062,247,1.0,2,1,93,3,21,7,41,1,0.0,0.0,-157.375,-0.637207,2.257812,-0.739258,0.216553,8.0,12,1,3.529297,1519551000.0,1498573000.0,0.0,1,-1,0.209595,242,0.930664,3,13.0,1.0,1.0,2.0,12.0,1.0,7.0,2.0,10.0,2.0,0.307617,0.480469,-8.571724,-0.659363,-0.431922,-0.737892,0.098851,10.0,12.0,7.0,1.664062,1514385000.0,1500131000.0,0.0,0.0,0.0,0.0,164.0,0.630859,63.0,-4.0,2.738281,-0.73291,0.008797,-0.158813,0.916016,-0.650391,0.052277,0.109558,0.119202,0.0,0.0,0.0,0.0,0.0,0.0,-4.5,1.871094,-0.66476,0.118497,-0.608962,0.127612,-0.637972,0.119344,0.051086,0.050203,0.319336,0.423096,0.666504,0.816406,0.269287,0.3125,0.0,0.0,0.0,0.0,0.0,1.0,1.979492,2.888672,3.837891,4.703125,5.605469,6.570312,7.570312,8.570312,0.0,0.0,0.0,0.0,0.0,-2.0713,-34.898127,-65.880495,-78.665487,-93.076859,-106.591553,-136.876331,-150.418854,-165.96874,0.0,0.0,0.0,0.0,0.0,3.0,52.0,96.0,116.0,138.0,159.0,216.0,237.0,260.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02272,0.049988,0.090881,0.0,0.0,0.0,0.0,23.0,23.0,1.0,1.0,1.0,23.0,1.0,10.0,3.0,14.0,1.0,0.0,0.0,-13.242188,-0.575684,-0.296143,-0.724609,0.135742,3.0,4.0,3.0,0.510742,1525001000.0,1520259000.0,-1.0,-1.0,-1.0,0.0,54.0,2.347656,-60.0,-6.6902,-13.2439,12.0,23.0,0.0,0.0,7962.109942,31.34689,246.906145,0.972071,244.205152,0.961438,244.044564,0.960805,73,38,4056.0,15.96875,3906.0,15.382812,19.0,1.050781,242.0,17.0,14.0,634.0,27.5625,626.0,27.21875,0.0,1.0,19.0
1,2017-01,C_ID_3d0044924f,4,1,0,350,319,0.911621,1,3,142,3,24,9,57,9,1.551758,1.510742,-210.006332,-0.600098,4.6303,-0.742188,0.38501,7.0,12,1,3.847656,1517438000.0,1483720000.0,0.0,1,0,0.353271,390,1.114258,28,1.469136,1.383904,-0.6288,0.26911,-101.86552,1.0,0.0,-0.730304,0.002758,-2.921214,,,,,0.0,1.206186,0.611319,-0.684021,0.130739,-66.350014,310,0.914551,1,3,141,3,24,9,57,9,1.477539,1.350586,-208.875,-0.616211,4.628906,-0.742188,0.355469,6.0,12,1,3.859375,1517438000.0,1483720000.0,1.0,2,0,0.315674,390,1.114258,28,9.0,0.818359,1.0,2.0,9.0,2.0,9.0,2.0,9.0,6.0,3.818359,3.488281,-1.122886,-0.102081,1.942838,-0.740897,0.785906,9.0,12.0,3.0,2.697266,1514467000.0,1488576000.0,1.0,2.0,1.0,0.404541,299.0,0.854492,62.0,-6.0,3.894531,-0.736816,0.005142,0.510742,1.327148,-0.59375,0.09552,0.282227,0.283936,1.625977,0.446533,38.6875,13.570312,1.354492,0.803223,-5.332031,3.326172,-0.292969,0.471576,0.15432,0.970537,-0.148523,0.438508,0.535645,0.75779,4.035156,3.521484,7.0,7.640625,1.782227,2.519531,1.0,2.0,2.923828,3.923828,4.855469,5.855469,6.835938,7.835938,8.625,9.585938,10.585938,11.546875,12.546875,12.546875,0.0,-7.402525,-19.876619,-25.484272,-35.846976,-44.902147,-67.445431,-99.083886,-107.024767,-116.430702,-132.638779,-143.234588,-174.948162,-210.006336,0.0,21.0,43.0,56.0,74.0,89.0,123.0,172.0,186.0,210.0,236.0,252.0,299.0,350.0,0.0,1.571289,2.136719,2.615234,2.054688,1.933594,1.323242,1.183594,1.713867,2.75,1.345703,1.3125,1.260742,1.160156,6.0,6.0,1.0,1.0,1.0,6.0,1.0,4.0,1.0,5.0,1.0,1.0,0.0,-4.355469,-0.726074,-0.70166,-0.739258,0.014397,2.5,3.0,2.0,0.547852,1522393000.0,1517505000.0,0.0,1.0,0.0,0.408203,56.0,9.335938,-30.0,-2.2047,-4.3557,3.0,6.0,1.0,1.0,19707.081308,56.305947,338.471943,0.967063,337.172427,0.96335,337.773534,0.965067,102,54,9960.0,28.453125,9752.0,27.859375,47.0,1.0,321.0,6.0,5.0,3.273438,0.545898,3.224609,0.537598,0.0,1.0,6.0
2,2016-08,C_ID_d639edf6cd,2,2,0,43,43,1.0,2,1,13,2,7,5,8,1,0.0,0.0,-29.167391,-0.678223,-0.145847,-0.72998,0.087402,4.0,12,1,3.275391,1519759000.0,1484123000.0,-1.0,0,-1,0.293945,412,9.578125,1,0.0,0.0,-0.635395,0.163704,-6.989342,0.0,,-0.661287,,-0.661287,0.0,,-0.661287,,-0.661287,0.0,0.0,-0.550586,0.270497,-2.202346,41,1.0,2,1,13,2,7,5,8,1,0.0,0.0,-27.828125,-0.678711,-0.145874,-0.72998,0.089233,4.0,12,1,3.330078,1519759000.0,1484123000.0,0.0,1,0,0.156128,412,9.578125,1,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,-1.338967,-0.669484,-0.637515,-0.701453,0.045211,3.0,4.0,2.0,1.414062,1492801000.0,1487878000.0,0.0,0.0,0.0,0.0,56.0,1.302734,313.0,-6.667969,4.398438,-0.706543,0.018799,-0.621094,0.151733,-0.669434,0.080688,0.065918,0.119019,0.0,0.0,0.0,0.0,0.0,0.0,-11.0,1.414062,-0.669484,0.045211,-0.669484,0.045211,-0.669484,0.045211,,,0.0,0.0,0.0,0.0,,,1.0,1.833008,2.833984,3.691406,4.691406,4.691406,5.691406,6.691406,7.691406,8.6875,8.6875,9.6875,10.6875,11.6875,-4.17127,-8.271328,-11.831229,-16.648263,-19.397363,-19.397363,-20.791433,-22.17805,-22.880088,-26.965046,-26.965046,-27.672854,-28.506104,-29.167391,6.0,12.0,17.0,24.0,28.0,28.0,30.0,32.0,33.0,39.0,39.0,40.0,42.0,43.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,,-0.700195,-0.700195,-0.700195,-0.700195,,4.0,4.0,4.0,,1524937000.0,1524937000.0,-1.0,-1.0,-1.0,,0.0,0.0,-59.0,0.0,-0.7,0.0,1.0,0.0,0.0,100.00909,2.325793,41.752072,0.970978,41.337093,0.961328,41.213346,0.95845,9,9,53.84375,1.251953,46.15625,1.073242,1.0,4.714844,40.0,1.0,1.0,-0.027725,-0.027725,-0.037628,-0.037628,0.0,5.0,1.0
3,2017-09,C_ID_186d6a6901,4,3,0,77,65,0.844238,2,3,50,5,13,7,25,4,1.09082,0.588867,-49.491364,-0.642578,1.445596,-0.740723,0.261719,10.0,12,1,3.904297,1519818000.0,1506443000.0,-1.0,0,-1,0.269775,154,2.0,0,1.146667,0.484722,-0.64798,0.262926,-48.598534,1.176471,0.528594,-0.674845,0.115199,-11.472366,1.0,0.0,-0.719186,0.017411,-4.315118,1.111111,0.423659,-0.670583,0.102048,-18.10573,65,0.844238,2,3,50,5,13,7,25,4,1.09082,0.588867,-49.5,-0.642578,1.445312,-0.740723,0.261719,10.0,12,1,3.904297,1519818000.0,1506443000.0,0.0,0,-1,0.113953,154,2.0,0,,,,,,,,,,,,,,,,,,,,,,-9223372000.0,-9223372000.0,,,,,,,,-2.5,1.871094,-0.734375,0.009071,-0.137451,0.790039,-0.606445,0.186035,0.201782,0.311279,1.164062,0.258789,14.335938,11.273438,0.310059,0.356445,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-7.849759,-29.119531,-30.999318,-34.936369,-38.735853,-49.491364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,42.0,49.0,55.0,61.0,77.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.128906,1.666992,1.0,1.0,1.1875,7.0,6.0,0.856934,1.0,2.0,7.0,2.0,5.0,2.0,6.0,2.0,0.714355,0.755859,-4.65625,-0.665039,-0.566895,-0.734375,0.065918,4.0,4.0,3.0,0.488037,1524049000.0,1520424000.0,-1.0,-1.0,-1.0,0.0,41.0,5.855469,-49.0,-1.2822,-4.655,2.0,7.0,1.0,0.600098,583.18738,7.573862,71.504955,0.928636,71.699208,0.931159,71.579492,0.929604,39,23,294.75,3.830078,288.25,3.744141,21.0,3.839844,75.0,5.0,6.0,0.98584,0.140869,0.946289,0.135132,1.0,3.427734,7.0
4,2017-11,C_ID_cdbd2c0db2,1,3,0,133,118,0.887207,3,2,66,6,17,6,26,4,1.368164,1.896484,-48.687656,-0.365967,7.193041,-0.746094,1.352539,2.0,12,1,5.003906,1519850000.0,1510445000.0,-1.0,0,-1,0.252197,108,0.812012,0,1.368421,1.896862,-0.366073,1.352094,-48.687656,2.128205,3.380902,0.184569,2.374459,7.198195,3.588235,4.80961,1.127303,3.411363,19.164151,1.401786,2.050971,-0.346547,1.448614,-38.813225,116,0.90625,3,2,65,6,17,6,26,4,1.125,1.003906,-69.0625,-0.539551,6.992188,-0.746094,0.737305,2.0,12,1,5.042969,1519850000.0,1510445000.0,0.0,0,-1,0.151855,108,0.812012,0,2.0,0.399902,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,7.601562,6.023438,20.35281,4.070562,7.193041,-0.512945,4.18495,2.0,2.0,1.0,0.547852,1519759000.0,1516485000.0,0.0,0.0,0.0,0.0,37.0,0.278076,1.0,-1.5,1.291016,-0.731934,0.016266,2.259766,3.34375,-0.525391,0.137329,0.568359,0.578125,1.141602,0.158447,36.0,9.125,0.681641,0.878906,-0.5,0.707031,3.239836,5.307234,3.340048,5.448955,3.306644,5.401715,0.057861,0.081823,6.5,7.777344,19.0,24.046875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.957031,3.871094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-9.874432,-29.780157,-58.643021,-48.687656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,52.0,98.0,133.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.19043,1.032227,1.0,2.257812,36.0,34.0,0.944336,3.0,3.0,36.0,5.0,10.0,5.0,17.0,3.0,0.972168,0.376953,-19.921875,-0.553711,0.450928,-0.739258,0.223877,4.0,4.0,3.0,0.503906,1524941000.0,1519992000.0,-1.0,0.0,-1.0,0.37793,57.0,1.583008,-59.0,-9.6057,-19.9269,16.0,36.0,0.875,1.049805,854.73954,6.4753,126.997041,0.962099,128.085896,0.970348,126.791312,0.96054,52,26,430.75,3.263672,423.75,3.210938,26.0,3.943359,125.0,30.0,19.0,51.25,1.423828,49.9375,1.387695,4.0,3.277344,26.0


In [29]:
train.card_id.nunique(), len(train.card_id.unique())

(201917, 201917)

In [36]:
import os
import gc

import pandas as pd
import numpy as np
import pickle as pkl
from datetime import date

from keras.layers.normalization import BatchNormalization
from keras.models import Sequential, Model
from keras.layers import Input, Embedding, Dense, Flatten, Concatenate, Dot, Reshape, Add, Subtract
from keras import backend as K
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.regularizers import l2

PREF = 'f503'

KEY = 'card_id'

SEED = 18
np.random.seed(SEED)

# =============================================================================
# def
# =============================================================================
def get_embed(x_input, x_size, k_latent):
    if x_size > 0:  
        embed = Embedding(x_size, k_latent, input_length=1,
                          embeddings_regularizer=l2(embedding_reg))(x_input)
        embed = Flatten()(embed)
    else:
        embed = Dense(k_latent, kernel_regularizer=l2(embedding_reg))(x_input)
    return embed


def build_model_1(X, fsize):
    dim_input = len(fsize)

    input_x = [Input(shape=(1,)) for i in range(dim_input)]

    biases = [get_embed(x, size, 1) for (x, size) in zip(input_x, fsize)]

    factors = [get_embed(x, size, k_latent)
               for (x, size) in zip(input_x, fsize)]

    s = Add()(factors)

    diffs = [Subtract()([s, x]) for x in factors]

    dots = [Dot(axes=1)([d, x]) for d, x in zip(diffs, factors)]

    x = Concatenate()(biases + dots)
    x = BatchNormalization()(x)
    output = Dense(1, activation='relu', kernel_regularizer=l2(kernel_reg))(x)
    model = Model(inputs=input_x, outputs=[output])
    opt = Adam(clipnorm=0.5)
    model.compile(optimizer=opt, loss='mean_squared_error')
    output_f = factors + biases
    model_features = Model(inputs=input_x, outputs=output_f)

    return model, model_features

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [40]:
df = pd.read_csv(os.path.join(PATH, 'historical_transactions.csv'))
df['purchase_date'] = pd.to_datetime(df['purchase_date'])

In [56]:
features = ['city_id', 'merchant_category_id', 'state_id', 'subsector_id']
fsize = [int(df[f].max()) + 1 for f in features]

X = df.groupby(features)['card_id'].count()

X = X.unstack().fillna(0)
X = X.stack().astype('float32')
X = np.log1p(X).reset_index()
X.columns = features + ['num']

X_train = np.array([X[f].values for f in features])
y_train = (X[['num']].values).astype('float32')

In [57]:
# X_train = X_train.transpose((1, 0))

In [61]:
y_train = y_train.transpose((1, 0))

In [62]:
X.nunique()

city_id                  308
merchant_category_id     327
state_id                  25
subsector_id              41
num                     3388
dtype: int64

In [63]:
X_train.shape, y_train.shape

((4, 2280297), (1, 2280297))

In [64]:
k_latent = 1
embedding_reg = 0.0002
kernel_reg = 0.1

model, model_features = build_model_1(X_train, fsize)

n_epochs = 1000

batch_size = 2 ** 17
model, model_features = build_model_1(X_train, fsize)
earlystopper = EarlyStopping(patience=0, verbose=50)

history = model.fit(
    X_train,  y_train,
    epochs=n_epochs, batch_size=batch_size, verbose=1, shuffle=True,
    validation_data=(X_train, y_train),
    callbacks=[earlystopper],
)

X_pred = model_features.predict(X_train, batch_size=batch_size)

factors = X_pred[:len(features)]

biases = X_pred[len(features):2*len(features)]

for f, X_p in zip(features, factors):
    for i in range(k_latent):
        X['%s_fm_factor_%d' % (f, i)] = X_p[:, i]

for f, X_p in zip(features, biases):
    X['%s_fm_bias' % (f)] = X_p[:, 0]

ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 4 array(s), but instead got the following list of 1 arrays: [array([[ -1,  -1,  -1, ..., 347, 347, 347],
       [ -1,  -1,  -1, ..., 891, 891, 891],
       [ -1,  -1,  -1, ...,  20,  20,  20],
       [ -1,   1,   2, ...,  39,  40,  41]])]...

In [70]:
historical_transactions = pd.read_csv('../input/historical_transactions.csv')

In [71]:
historical_transactions[historical_transactions.category_2.isna()].shape

(2652864, 14)

In [72]:
historical_transactions.shape

(29112361, 14)

In [73]:
historical_transactions[historical_transactions.category_3.isna()].shape

(178159, 14)

In [74]:
historical_transactions.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37


In [75]:
historical_transactions[historical_transactions.merchant_id.isna()].shape

(138481, 14)

In [78]:
historical_transactions[['category_1', 'category_2']].apply('max', axis=1)

0           1.0
1           1.0
2           1.0
3           1.0
4           1.0
5           1.0
6           1.0
7           1.0
8           1.0
9           1.0
10          1.0
11          NaN
12          1.0
13          1.0
14          1.0
15          1.0
16          1.0
17          1.0
18          1.0
19          1.0
20          1.0
21          1.0
22          1.0
23          1.0
24          1.0
25          1.0
26          1.0
27          1.0
28          1.0
29          1.0
           ... 
29112331    1.0
29112332    NaN
29112333    NaN
29112334    NaN
29112335    1.0
29112336    1.0
29112337    3.0
29112338    3.0
29112339    1.0
29112340    NaN
29112341    NaN
29112342    NaN
29112343    4.0
29112344    4.0
29112345    4.0
29112346    3.0
29112347    3.0
29112348    1.0
29112349    NaN
29112350    NaN
29112351    1.0
29112352    1.0
29112353    2.0
29112354    NaN
29112355    2.0
29112356    NaN
29112357    NaN
29112358    1.0
29112359    1.0
29112360    1.0
Length: 29112361, dtype: