# ELO MERCHANT CATEGORY RECOMMENDATION 
### COMPETITION ON KAGGLE
## Data Transformations of historical_transactions.csv
### 30-12-2018

In [1]:
## Variables specific for competition

ID = 'card_id'                                            
TARGET = 'target'    

RAW_DIRECTORY = 'C:/Users/judit/Documents/learning/kaggle/Elo_201812/rawdata/'  
DIRECTORY = 'C:/Users/judit/Documents/learning/kaggle/Elo_201812/data/'
HIST_TRANS_FILE = RAW_DIRECTORY + 'historical_transactions.csv'
MERCHANTS_FILE = RAW_DIRECTORY + 'merchants.csv'
NEW_MERCH_TRANS_FILE = RAW_DIRECTORY + 'new_merchant_transactions.csv'
TRAIN_FILE = RAW_DIRECTORY + 'train.csv'    
TEST_FILE = RAW_DIRECTORY +'test.csv'
SAMPLE_SUBMISSION_FILE = RAW_DIRECTORY + 'sample_submission.csv'

                the kernel may be left running.  Please let us know
                about your system (bitness, Python, etc.) at
                ipython-dev@scipy.org
  ipython-dev@scipy.org""")


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import random
#import missingno
import pickle

random.seed(1)

In [3]:
def reduce_mem_usage(df, verbose = True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Starting memory usage: {:5.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Reduced memory usage: {:5.2f} MB ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem\
                                                                                                   ) / start_mem))
    return df

def create_date_features(df, source_column = 'purchase_date', preposition_of_new = 'purchase'):
    df[preposition_of_new + '_year'] = df[source_column].dt.year
    df[preposition_of_new + '_month'] = df[source_column].dt.month
    df[preposition_of_new + '_day'] = df[source_column].dt.day
    df[preposition_of_new + '_hour'] = df[source_column].dt.hour
    df[preposition_of_new + '_weekofyear'] = df[source_column].dt.weekofyear
    df[preposition_of_new + '_dayofweek'] = df[source_column].dt.dayofweek
    df[preposition_of_new + '_quarter'] = df[source_column].dt.quarter
    
    return df

In [4]:
hist = pd.read_csv(HIST_TRANS_FILE, parse_dates=["purchase_date"])
hist.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37


In [5]:
hist.shape

(29112361, 14)

In [6]:
hist['authorized_flag'].unique()

array(['Y', 'N'], dtype=object)

In [7]:
hist[hist['authorized_flag'].isnull()].shape

(0, 14)

In [8]:
hist['authorized_flag'] = hist['authorized_flag'].apply(lambda x : 1 if x == 'Y' else 0)

In [9]:
hist['authorized_flag'].describe()

count    2.911236e+07
mean     9.135450e-01
std      2.810348e-01
min      0.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
Name: authorized_flag, dtype: float64

In [10]:
hist[hist['card_id'].isnull()].shape

(0, 14)

In [11]:
# card_id is a hexadecimal number. Convert it into decimal.
hist['card_id'] = hist['card_id'].apply(lambda s : int(s[5:], 16))
hist.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,1,336652921276,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,1,336652921276,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,1,336652921276,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,1,336652921276,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
4,1,336652921276,88,N,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37


In [12]:
hist[hist['city_id'].isnull()].shape, hist['city_id'].min(), hist['city_id'].max()

((0, 14), -1, 347)

In [13]:
hist['city_id'] = hist['city_id'].apply(lambda x : np.nan if x == -1 else x)

In [14]:
hist['category_1'].unique()

array(['N', 'Y'], dtype=object)

In [15]:
hist[hist['category_1'] == 'Y'].shape, hist[hist['category_1'] == 'N'].shape

((2084029, 14), (27028332, 14))

In [16]:
hist[hist['category_1'].isnull()].shape

(0, 14)

In [17]:
hist['category_1'] = hist['category_1'].apply(lambda x : 1 if x == 'Y' else 0)

In [18]:
hist[hist['installments'].isnull()].shape

(0, 14)

In [19]:
hist['installments'].describe()

count    2.911236e+07
mean     6.484954e-01
std      2.795577e+00
min     -1.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      1.000000e+00
max      9.990000e+02
Name: installments, dtype: float64

In [20]:
hist.loc[hist['installments'] < 999, 'installments'].describe()

count    2.911217e+07
mean     6.420483e-01
std      1.174203e+00
min     -1.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      1.000000e+00
max      1.200000e+01
Name: installments, dtype: float64

In [21]:
hist[hist['installments'] == 999].shape

(188, 14)

In [22]:
hist[hist['installments'] == -1].shape

(178159, 14)

In [23]:
hist['category_3'].unique()

array(['A', 'B', 'C', nan], dtype=object)

In [24]:
hist[hist['category_3'].isnull()].shape

(178159, 14)

In [25]:
hist.loc[hist['category_3'].isnull(), 'installments'].unique()

array([-1], dtype=int64)

In [26]:
hist.loc[hist['category_3'] == 'A', 'installments'].unique()

array([0], dtype=int64)

In [27]:
hist.loc[hist['category_3'] == 'B', 'installments'].unique()

array([1], dtype=int64)

In [28]:
hist.loc[hist['category_3'] == 'C', 'installments'].unique()

array([  5,   3,   4,   2,  10,   6,  12,   8,   7,   9,  11, 999], dtype=int64)

In [29]:
hist[hist['category_3'] == 'A'].shape, hist[hist['category_3'] == 'B'].shape, hist[hist['category_3'] == 'C'].shape

((15411747, 14), (11677522, 14), (1844933, 14))

In [30]:
hist.loc[hist['category_3'] == 'A', 'card_id'].nunique(), hist.loc[hist['category_3'] == 'B', 'card_id'].nunique(),\
hist.loc[hist['category_3'] == 'C', 'card_id'].nunique(), hist.loc[hist['category_3'].isnull(), 'card_id'].nunique()

(163790, 240811, 164176, 74839)

In [31]:
hist['card_id'].nunique()

325540

In [32]:
hist.loc[hist['category_3'] == 'A', 'card_id'].nunique() + hist.loc[hist['category_3'] == 'B', 'card_id'].nunique() +\
hist.loc[hist['category_3'] == 'C', 'card_id'].nunique() + hist.loc[hist['category_3'].isnull(), 'card_id'].nunique()

643616

For the same card_id, different values of category_3 can be found.

In [33]:
# Valeria Elo said that both 999 and -1 refer to missing values 
# (https://www.kaggle.com/c/elo-merchant-category-recommendation/discussion/72993)
hist['installments'] = hist['installments'].apply(lambda x : np.nan if x == -1 else np.nan if x == 999 else x)
hist[hist['installments'].isnull()].shape, hist[hist['installments'] == -1].shape, hist[hist['installments'] == 999].shape

((178347, 14), (0, 14), (0, 14))

In [34]:
hist['category_3'] = hist['category_3'].apply(lambda s : 0 if s == 'A' else 1 if s == 'B' else 2 if s == 'C' else np.nan)
hist.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,1,336652921276,88.0,0,0.0,0.0,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,1,336652921276,88.0,0,0.0,0.0,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,1,336652921276,88.0,0,0.0,0.0,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,1,336652921276,88.0,0,0.0,0.0,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
4,1,336652921276,88.0,0,0.0,0.0,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37


In [35]:
hist[hist['merchant_category_id'].isnull()].shape

(0, 14)

In [36]:
hist['merchant_category_id'].nunique(), hist['merchant_category_id'].min(), hist['merchant_category_id'].max()

(327, -1, 891)

In [37]:
hist['merchant_category_id'] = hist['merchant_category_id'].apply(lambda x : np.nan if x == -1 else x)

In [38]:
hist[hist['merchant_id'].isnull()].shape

(138481, 14)

In [39]:
# merchant_id is a hexadecimal number. Convert it into decimal.
hist['merchant_id'] = hist['merchant_id'].apply(lambda s : int(s[5:], 16) if pd.notnull(s) else np.nan)
hist.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,1,336652921276,88.0,0,0.0,0.0,80.0,962624900000.0,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,1,336652921276,88.0,0,0.0,0.0,367.0,579495000000.0,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,1,336652921276,88.0,0,0.0,0.0,80.0,651204900000.0,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,1,336652921276,88.0,0,0.0,0.0,560.0,991427500000.0,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
4,1,336652921276,88.0,0,0.0,0.0,80.0,962624900000.0,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37


In [40]:
hist['merchant_id'].describe()

count    2.897388e+07
mean     5.424526e+11
std      3.292712e+11
min      2.429567e+06
25%      2.614724e+11
50%      5.499674e+11
75%      8.297708e+11
max      1.099508e+12
Name: merchant_id, dtype: float64

In [41]:
sorted(hist['month_lag'].unique())

[-13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0]

In [42]:
hist[hist['month_lag'].isnull()].shape

(0, 14)

In [43]:
hist['purchase_amount'].describe()

count    2.911236e+07
mean     3.640090e-02
std      1.123522e+03
min     -7.469078e-01
25%     -7.203559e-01
50%     -6.883495e-01
75%     -6.032543e-01
max      6.010604e+06
Name: purchase_amount, dtype: float64

In [44]:
hist[hist['purchase_amount'].isnull()].shape

(0, 14)

In [45]:
hist['purchase_amount'].quantile([.000001, .00001, .0001, .001, .01, .1, .9, .99, .999, .9999, .99999, .999999, .9999999, 
                                  .99999995, .99999999, .999999995])

0.000001   -7.468928e-01
0.000010   -7.468928e-01
0.000100   -7.468928e-01
0.001000   -7.461360e-01
0.010000   -7.432413e-01
0.100000   -7.334140e-01
0.900000   -4.281955e-01
0.990000    1.220841e+00
0.999000    6.766348e+00
0.999900    2.805100e+01
0.999990    8.264703e+03
0.999999    8.247975e+04
1.000000    1.390453e+05
1.000000    1.570802e+05
1.000000    4.308898e+06
1.000000    5.159751e+06
Name: purchase_amount, dtype: float64

In [46]:
hist[hist['purchase_amount'] > 1000].shape, hist[hist['purchase_amount'] > 10000].shape, \
hist[hist['purchase_amount'] > 100000].shape, hist[hist['purchase_amount'] > 1000000].shape

((829, 14), (246, 14), (14, 14), (1, 14))

In [47]:
hist.loc[hist['purchase_amount'] > 1000, 'card_id'].nunique(), hist.loc[hist['purchase_amount'] > 10000, 'card_id'].nunique(),\
hist.loc[hist['purchase_amount'] > 100000, 'card_id'].nunique(), hist.loc[hist['purchase_amount'] > 1000000, 'card_id'].nunique()

(807, 243, 14, 1)

In [48]:
hist.loc[hist['purchase_amount'] > 100000]

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
6889081,0,468170464940,19.0,0,0.0,0.0,307.0,904870500000.0,-8,102185.3,2017-06-22 14:12:56,1.0,9,19
10018048,0,248842569016,69.0,0,1.0,1.0,454.0,549967400000.0,0,120215.1,2018-02-02 13:03:11,1.0,9,39
10946359,0,19587612463,283.0,0,0.0,0.0,823.0,505566700000.0,-6,105039.7,2017-08-12 19:09:21,1.0,9,25
11189165,0,670026812192,199.0,0,1.0,1.0,108.0,280562700000.0,-3,105187.5,2017-08-20 09:33:54,4.0,14,34
12097071,0,451801652232,291.0,0,0.0,0.0,299.0,246812300000.0,-4,135241.5,2017-10-21 00:00:00,1.0,9,41
15265708,0,756429525740,160.0,0,0.0,0.0,734.0,745979600000.0,-7,105186.5,2017-07-20 01:12:51,5.0,21,25
19397260,0,694115491240,232.0,0,0.0,0.0,108.0,604240200000.0,-9,119919.6,2017-05-20 09:46:32,4.0,13,34
19745877,0,370504714462,272.0,0,0.0,0.0,703.0,965560800000.0,-1,165298.7,2017-10-26 13:37:12,1.0,9,29
20537106,0,821513939280,87.0,0,1.0,1.0,166.0,818686300000.0,-12,147260.6,2017-02-24 12:07:25,3.0,11,29
22562437,0,240336263170,296.0,0,1.0,1.0,278.0,248865000000.0,-2,134758.8,2017-12-31 09:59:56,1.0,15,37


In [49]:
hist[hist['card_id'] == 468170464940]

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
6889004,1,468170464940,339.0,0,0.0,0.0,437.0,5.757736e+11,-9,-0.518234,2017-05-13 10:03:41,1.0,9,15
6889005,1,468170464940,231.0,0,0.0,0.0,557.0,1.083612e+12,-10,-0.183414,2017-04-24 10:05:08,1.0,9,29
6889006,1,468170464940,339.0,0,0.0,0.0,437.0,5.757736e+11,-1,0.174548,2018-01-25 11:27:54,1.0,9,15
6889007,1,468170464940,339.0,0,0.0,0.0,705.0,6.440884e+11,-11,-0.718598,2017-03-11 12:05:53,1.0,9,33
6889008,1,468170464940,69.0,0,0.0,0.0,171.0,1.066833e+12,-5,-0.717050,2017-09-30 13:55:10,1.0,9,34
6889009,1,468170464940,339.0,0,0.0,0.0,437.0,5.757736e+11,-10,-0.396009,2017-04-12 15:56:50,1.0,9,15
6889010,1,468170464940,339.0,0,0.0,0.0,705.0,1.153136e+11,-4,-0.733760,2017-10-14 16:16:04,1.0,9,33
6889011,1,468170464940,339.0,0,0.0,0.0,705.0,1.153136e+11,-4,-0.639618,2017-10-11 17:40:23,1.0,9,33
6889012,1,468170464940,339.0,0,0.0,0.0,367.0,6.066079e+11,-7,-0.664127,2017-07-08 13:15:16,1.0,9,16
6889013,1,468170464940,339.0,0,0.0,0.0,705.0,1.153136e+11,0,-0.636568,2018-02-22 09:15:03,1.0,9,33


In [50]:
hist.loc[hist['purchase_amount'] > 440, 'authorized_flag'].unique(), hist.loc[hist['purchase_amount'] > 440].shape

(array([0], dtype=int64), (935, 14))

In [51]:
hist.loc[((hist['purchase_amount'] <= 440) & (hist['authorized_flag'] == 0))].shape[0] / hist.loc[hist['purchase_amount'] <= 440
                                                                                                 ].shape[0]

0.08642565293778463

In [52]:
hist.loc[((hist['purchase_amount'] > 1.22) & (hist['authorized_flag'] == 0))].shape[0] / hist.loc[hist['purchase_amount'] > 1.22
                                                                                                 ].shape[0]

0.2681648248658558

In [53]:
hist.loc[((hist['purchase_amount'] > 6.77) & (hist['authorized_flag'] == 0))].shape[0] / hist.loc[hist['purchase_amount'] > 6.77
                                                                                                 ].shape[0]

0.35810049776550523

In [54]:
hist.loc[((hist['purchase_amount'] > 280) & (hist['authorized_flag'] == 0))].shape[0] / hist.loc[hist['purchase_amount'] > 280
                                                                                                 ].shape[0]

0.996993987975952

Should we use purchase data with very large purchase_amount? Only 1% of purchases is above 1.22, and 0.1% is above 6.77!  
Should we use the data of denied (authorized_flag == 'N' or 0) purchases?

In [55]:
hist['purchase_date'].describe()

count                29112361
unique               16395300
top       2017-11-24 00:00:00
freq                    26184
first     2017-01-01 00:00:08
last      2018-02-28 23:59:51
Name: purchase_date, dtype: object

In [56]:
hist[hist['purchase_date'].isnull()].shape

(0, 14)

In [57]:
hist = create_date_features(df = hist, source_column = 'purchase_date', preposition_of_new = 'purchase')

hist.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,...,category_2,state_id,subsector_id,purchase_year,purchase_month,purchase_day,purchase_hour,purchase_weekofyear,purchase_dayofweek,purchase_quarter
0,1,336652921276,88.0,0,0.0,0.0,80.0,962624900000.0,-8,-0.703331,...,1.0,16,37,2017,6,25,15,25,6,2
1,1,336652921276,88.0,0,0.0,0.0,367.0,579495000000.0,-7,-0.733128,...,1.0,16,16,2017,7,15,12,28,5,3
2,1,336652921276,88.0,0,0.0,0.0,80.0,651204900000.0,-6,-0.720386,...,1.0,16,37,2017,8,9,22,32,2,3
3,1,336652921276,88.0,0,0.0,0.0,560.0,991427500000.0,-5,-0.735352,...,1.0,16,34,2017,9,2,10,35,5,3
4,1,336652921276,88.0,0,0.0,0.0,80.0,962624900000.0,-11,-0.722865,...,1.0,16,37,2017,3,10,1,10,4,1


In [58]:
hist['reference_date'] = hist.apply(lambda row : row['purchase_date'] - pd.DateOffset(months = row['month_lag']), axis = 1)
hist.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,...,state_id,subsector_id,purchase_year,purchase_month,purchase_day,purchase_hour,purchase_weekofyear,purchase_dayofweek,purchase_quarter,reference_date
0,1,336652921276,88.0,0,0.0,0.0,80.0,962624900000.0,-8,-0.703331,...,16,37,2017,6,25,15,25,6,2,2018-02-25 15:33:07
1,1,336652921276,88.0,0,0.0,0.0,367.0,579495000000.0,-7,-0.733128,...,16,16,2017,7,15,12,28,5,3,2018-02-15 12:10:45
2,1,336652921276,88.0,0,0.0,0.0,80.0,651204900000.0,-6,-0.720386,...,16,37,2017,8,9,22,32,2,3,2018-02-09 22:04:29
3,1,336652921276,88.0,0,0.0,0.0,560.0,991427500000.0,-5,-0.735352,...,16,34,2017,9,2,10,35,5,3,2018-02-02 10:06:26
4,1,336652921276,88.0,0,0.0,0.0,80.0,962624900000.0,-11,-0.722865,...,16,37,2017,3,10,1,10,4,1,2018-02-10 01:14:19


In [59]:
hist = create_date_features(df = hist, source_column = 'reference_date', preposition_of_new = 'reference')

hist.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,...,purchase_dayofweek,purchase_quarter,reference_date,reference_year,reference_month,reference_day,reference_hour,reference_weekofyear,reference_dayofweek,reference_quarter
0,1,336652921276,88.0,0,0.0,0.0,80.0,962624900000.0,-8,-0.703331,...,6,2,2018-02-25 15:33:07,2018,2,25,15,8,6,1
1,1,336652921276,88.0,0,0.0,0.0,367.0,579495000000.0,-7,-0.733128,...,5,3,2018-02-15 12:10:45,2018,2,15,12,7,3,1
2,1,336652921276,88.0,0,0.0,0.0,80.0,651204900000.0,-6,-0.720386,...,2,3,2018-02-09 22:04:29,2018,2,9,22,6,4,1
3,1,336652921276,88.0,0,0.0,0.0,560.0,991427500000.0,-5,-0.735352,...,5,3,2018-02-02 10:06:26,2018,2,2,10,5,4,1
4,1,336652921276,88.0,0,0.0,0.0,80.0,962624900000.0,-11,-0.722865,...,4,1,2018-02-10 01:14:19,2018,2,10,1,6,5,1


In [60]:
hist['category_2'].unique()

array([  1.,  nan,   3.,   5.,   2.,   4.])

In [61]:
print(set(hist.loc[hist['category_2'] == 1,'state_id']))             # either Central-West or Southeast region
print(set(hist.loc[hist['category_2'] == 2,'state_id']))             # North region
print(set(hist.loc[hist['category_2'] == 3,'state_id']))             # Northeast region
print(set(hist.loc[hist['category_2'] == 4,'state_id']))             # either Central-West of Southeast region
print(set(hist.loc[hist['category_2'] == 5,'state_id']))             # South region
print(set(hist.loc[hist['category_2'].isnull(),'state_id']))         # unknown/abroad

{16, 9, 12, 15}
{6, 10, 18, 23, 24}
{1, 2, 3, 7, 8, 11, 17, 19}
{4, 13, 22, 14}
{21, 20, 5}
{-1}


In [62]:
hist.to_pickle(DIRECTORY + 'hist0.pkl')

In [63]:
hist.loc[hist['state_id'] == -1, 'category_2'].unique()

array([ nan])

In [64]:
hist.loc[hist['state_id'] == -1, 'city_id'].unique()

array([  nan,  331.,  170.,   20.,   75.,  179.,  315.,  244.,  322.])

In [65]:
hist.loc[hist['city_id'] == 322, 'state_id'].unique()

array([11,  4, 24, 14,  2,  5,  7, 20, -1], dtype=int64)

In [66]:
#print(hist.loc[hist['city_id'] == 322, 'state_id'].unique())
print(hist.loc[hist['city_id'] == 170, 'state_id'].unique())
print(hist.loc[hist['city_id'] == 75, 'state_id'].unique())
print(hist.loc[hist['city_id'] == 331, 'state_id'].unique())
print(hist.loc[hist['city_id'] == 179, 'state_id'].unique())
print(hist.loc[hist['city_id'] == 20, 'state_id'].unique())
print(hist.loc[hist['city_id'] == 244, 'state_id'].unique())
print(hist.loc[hist['city_id'] == 315, 'state_id'].unique())

[ 8 -1  5  4 20  9]
[-1]
[16  3 -1  9 20  5 13 24  7 15 23  1]
[-1]
[19 -1  9]
[20  2 23  5 -1  3  9]
[-1]


In [67]:
hist.loc[((hist['state_id'] == -1) & (hist['city_id'].isnull())), 'category_1'].unique()

array([0, 1], dtype=int64)

In [68]:
hist.loc[hist['category_1'] == 1, 'category_2'].unique()

array([ nan])

In [69]:
hist.loc[hist['category_1'] == 1, 'city_id'].unique()

array([ nan])

In [70]:
hist.loc[hist['category_1'] == 1, 'state_id'].unique()                     

array([-1], dtype=int64)

In [71]:
hist.loc[hist['category_2'].isnull(), 'category_1'].unique()

array([0, 1], dtype=int64)

In [72]:
# category_2 refers to the 5 regions of Brazil. There is also a federal district. NaN states both for the federal district 
# and for foreign purchases. Let's impute 6 for the federal district
hist['category_2'] = hist.apply(lambda row : row['category_2'] if (pd.notnull(row['category_2'])) else 
                                             6 if row['category_3'] == 0 else np.nan, axis = 1)

hist.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,...,purchase_dayofweek,purchase_quarter,reference_date,reference_year,reference_month,reference_day,reference_hour,reference_weekofyear,reference_dayofweek,reference_quarter
0,1,336652921276,88.0,0,0.0,0.0,80.0,962624900000.0,-8,-0.703331,...,6,2,2018-02-25 15:33:07,2018,2,25,15,8,6,1
1,1,336652921276,88.0,0,0.0,0.0,367.0,579495000000.0,-7,-0.733128,...,5,3,2018-02-15 12:10:45,2018,2,15,12,7,3,1
2,1,336652921276,88.0,0,0.0,0.0,80.0,651204900000.0,-6,-0.720386,...,2,3,2018-02-09 22:04:29,2018,2,9,22,6,4,1
3,1,336652921276,88.0,0,0.0,0.0,560.0,991427500000.0,-5,-0.735352,...,5,3,2018-02-02 10:06:26,2018,2,2,10,5,4,1
4,1,336652921276,88.0,0,0.0,0.0,80.0,962624900000.0,-11,-0.722865,...,4,1,2018-02-10 01:14:19,2018,2,10,1,6,5,1


In [73]:
hist.to_pickle(DIRECTORY + 'hist1.pkl')

In [74]:
hist[hist['state_id'].isnull()].shape, hist['state_id'].min()

((0, 29), -1)

In [75]:
hist['state_id'] = hist['state_id'].apply(lambda x : np.nan if x == -1 else x)

In [76]:
hist[hist['subsector_id'].isnull()].shape, hist['subsector_id'].min()

((0, 29), -1)

In [77]:
hist['subsector_id'] = hist['subsector_id'].apply(lambda x : np.nan if x == -1 else x)

In [78]:
hist = reduce_mem_usage(hist, verbose = True)

Starting memory usage: 6441.18 MB
Reduced memory usage: 1804.64 MB (72.0% reduction)


In [79]:
hist.to_pickle(DIRECTORY + 'hist2.pkl')

### Aggregates

In [103]:
def aggregate_features(df = hist, prefix = '', suffix = '', gb_cols = ['card_id'], 
                       agg_func = {'purchase_amount': ['sum', 'mean', 'max', 'min', 'std', 'size'],
                                   'installments': ['sum', 'median', 'mean', 'max', 'min', 'std']},
                       authorized = True, non_authorized = True, domestic = True, non_domestic = True,
                       max_month_lag = None, max_week_lag = None,
                       min_installments = None, max_installments = None): 
    if not authorized:
        df = df[df['authorized_flag'] == 0]
    if not non_authorized:
        df = df[df['authorized_flag'] == 1]
    if not domestic:
        df = df[df['category_2'].isnull()]
    if not non_domestic:
        df = df[df['category_2'].notnull()]
    if max_month_lag:
        df = df[df['month_lag'] <= max_month_lag]
    if max_week_lag:
        df = df[df['week_lag'] <= max_week_lag]
    if min_installments:
        df = df[df['installments'] >= min_installments]
    if max_installments:
        df = df[df['installments'] <= max_installments]
    agg_df = df.groupby(gb_cols).agg(agg_func)
    agg_df.columns = [prefix + '_'.join(col).strip() + suffix for col in agg_df.columns.values]  
    agg_df.reset_index(inplace = True)

    return agg_df

In [81]:
# First redefine reference dates as the earliest reference date for each card_id
agg_func = {'reference_date' : ['min']}
ref_dates = aggregate_features(df = hist, gb_cols = ['card_id'], agg_func = {'reference_date' : ['min']})
ref_dates.head()

Unnamed: 0,card_id,reference_date_min
0,7377857,2018-02-01 05:06:36
1,19103846,2018-02-01 19:42:58
2,22048496,2018-02-01 07:20:40
3,24721286,2017-10-01 02:08:20
4,25427418,2018-02-01 08:03:55


In [82]:
ref_dates.columns

Index(['card_id', 'reference_date_min'], dtype='object')

In [83]:
hist = hist.merge(ref_dates, how = 'left', on = 'card_id')
hist.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,...,purchase_quarter,reference_date,reference_year,reference_month,reference_day,reference_hour,reference_weekofyear,reference_dayofweek,reference_quarter,reference_date_min
0,1,336652921276,88.0,0,0.0,0.0,80.0,962624900000.0,-8,-0.703331,...,2,2018-02-25 15:33:07,2018,2,25,15,8,6,1,2018-02-01 01:33:39
1,1,336652921276,88.0,0,0.0,0.0,367.0,579495000000.0,-7,-0.733128,...,3,2018-02-15 12:10:45,2018,2,15,12,7,3,1,2018-02-01 01:33:39
2,1,336652921276,88.0,0,0.0,0.0,80.0,651204900000.0,-6,-0.720386,...,3,2018-02-09 22:04:29,2018,2,9,22,6,4,1,2018-02-01 01:33:39
3,1,336652921276,88.0,0,0.0,0.0,560.0,991427500000.0,-5,-0.735352,...,3,2018-02-02 10:06:26,2018,2,2,10,5,4,1,2018-02-01 01:33:39
4,1,336652921276,88.0,0,0.0,0.0,80.0,962624900000.0,-11,-0.722865,...,1,2018-02-10 01:14:19,2018,2,10,1,6,5,1,2018-02-01 01:33:39


In [84]:
hist['reference_date'] = hist['reference_date_min']
hist.drop('reference_date_min', inplace = True, axis = 1)
hist = create_date_features(hist, source_column = 'reference_date', preposition_of_new = 'reference')
hist.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,...,purchase_dayofweek,purchase_quarter,reference_date,reference_year,reference_month,reference_day,reference_hour,reference_weekofyear,reference_dayofweek,reference_quarter
0,1,336652921276,88.0,0,0.0,0.0,80.0,962624900000.0,-8,-0.703331,...,6,2,2018-02-01 01:33:39,2018,2,1,1,5,3,1
1,1,336652921276,88.0,0,0.0,0.0,367.0,579495000000.0,-7,-0.733128,...,5,3,2018-02-01 01:33:39,2018,2,1,1,5,3,1
2,1,336652921276,88.0,0,0.0,0.0,80.0,651204900000.0,-6,-0.720386,...,2,3,2018-02-01 01:33:39,2018,2,1,1,5,3,1
3,1,336652921276,88.0,0,0.0,0.0,560.0,991427500000.0,-5,-0.735352,...,5,3,2018-02-01 01:33:39,2018,2,1,1,5,3,1
4,1,336652921276,88.0,0,0.0,0.0,80.0,962624900000.0,-11,-0.722865,...,4,1,2018-02-01 01:33:39,2018,2,1,1,5,3,1


In [85]:
hist[['card_id', 'purchase_date', 'reference_date', 'month_lag']].head(10)

Unnamed: 0,card_id,purchase_date,reference_date,month_lag
0,336652921276,2017-06-25 15:33:07,2018-02-01 01:33:39,-8
1,336652921276,2017-07-15 12:10:45,2018-02-01 01:33:39,-7
2,336652921276,2017-08-09 22:04:29,2018-02-01 01:33:39,-6
3,336652921276,2017-09-02 10:06:26,2018-02-01 01:33:39,-5
4,336652921276,2017-03-10 01:14:19,2018-02-01 01:33:39,-11
5,336652921276,2018-02-24 08:45:05,2018-02-01 01:33:39,0
6,336652921276,2017-03-21 00:10:51,2018-02-01 01:33:39,-11
7,336652921276,2017-11-18 20:05:55,2018-02-01 01:33:39,-3
8,336652921276,2017-06-01 22:02:56,2018-02-01 01:33:39,-8
9,336652921276,2017-03-16 15:41:22,2018-02-01 01:33:39,-11


In [86]:
hist['month_lag'] = -1 * hist['month_lag']
hist['week_lag'] = (hist.reference_date.dt.year - hist.purchase_date.dt.year
                   ) * 52 + (hist.reference_date.dt.weekofyear - hist.purchase_date.dt.weekofyear)
hist[['card_id', 'purchase_date', 'reference_date', 'month_lag', 'week_lag']].head()

Unnamed: 0,card_id,purchase_date,reference_date,month_lag,week_lag
0,336652921276,2017-06-25 15:33:07,2018-02-01 01:33:39,8,32
1,336652921276,2017-07-15 12:10:45,2018-02-01 01:33:39,7,29
2,336652921276,2017-08-09 22:04:29,2018-02-01 01:33:39,6,25
3,336652921276,2017-09-02 10:06:26,2018-02-01 01:33:39,5,22
4,336652921276,2017-03-10 01:14:19,2018-02-01 01:33:39,11,47


In [87]:
hist.to_pickle(DIRECTORY + 'hist3.pkl')

In [88]:
hist.columns

Index(['authorized_flag', 'card_id', 'city_id', 'category_1', 'installments',
       'category_3', 'merchant_category_id', 'merchant_id', 'month_lag',
       'purchase_amount', 'purchase_date', 'category_2', 'state_id',
       'subsector_id', 'purchase_year', 'purchase_month', 'purchase_day',
       'purchase_hour', 'purchase_weekofyear', 'purchase_dayofweek',
       'purchase_quarter', 'reference_date', 'reference_year',
       'reference_month', 'reference_day', 'reference_hour',
       'reference_weekofyear', 'reference_dayofweek', 'reference_quarter',
       'week_lag'],
      dtype='object')

In [89]:
def mode(series):
    if len(series.mode()) > 0:
        return series.mode().iloc[0]
    else:
        return np.nan

def nancnt(series):
    '''Returns the count of NaN values'''
    return series.isnull().sum()

def nanperc(series):
    '''Returns the percentile of NaN values'''
    return 100 * series.isnull().sum() / len(series)

agg_func = {
    'authorized_flag'      : ['sum', 'mean'],
    'city_id'              : ['nunique', mode, nancnt, nanperc],
    'category_1'           : ['sum', 'mean'],
    'installments'         : ['sum', 'median', 'mean', 'max', 'min', 'std', mode, nancnt, nanperc],
    'category_3'           : ['nunique', mode, nancnt, nanperc],
    'merchant_category_id' : ['nunique', mode, nancnt, nanperc],
    'merchant_id'          : ['nunique', mode, nancnt, nanperc],
    'month_lag'            : ['median', 'mean', 'max', 'min', 'std', mode],
    'purchase_amount'      : ['sum', 'median', 'mean', 'max', 'min', 'std', mode, 'size'],
    'category_2'           : ['nunique', mode, nancnt, nanperc],
    'state_id'             : ['nunique', mode, nancnt, nanperc],
    'subsector_id'         : ['nunique', mode, nancnt, nanperc],
    'purchase_year'        : ['mean', 'median', 'max', 'min', 'std', mode],
    'purchase_month'       : ['mean', 'median', 'max', 'min', 'std', mode],
    'purchase_day'         : ['mean', 'median', 'max', 'min', 'std', mode],
    'purchase_hour'        : ['mean', 'median', 'max', 'min', 'std', mode],
    'purchase_weekofyear'  : ['mean', 'median', 'max', 'min', 'std', mode],
    'purchase_dayofweek'   : ['mean', 'median', 'max', 'min', 'std', mode],
    'purchase_quarter'     : ['mean', 'median', 'max', 'min', 'std', mode],
    'week_lag'             : ['median', 'mean', 'max', 'min', 'std', mode]
}

# Aggregate features, use 
# - both authorized and non-authorized transactions, 
# - both domestic and non-domestic transactions,
# - for the whole historic period.

hist_agg_1 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_all', gb_cols = ['card_id'], agg_func = agg_func,
                                authorized = True, non_authorized = True, 
                                domestic = True, non_domestic = True,
                                max_month_lag = None, max_week_lag = None,
                                min_installments = None, max_installments = None)

In [90]:
hist_agg_1.head()

Unnamed: 0,card_id,hist_authorized_flag_sum_all,hist_authorized_flag_mean_all,hist_city_id_nunique_all,hist_city_id_mode_all,hist_city_id_nancnt_all,hist_city_id_nanperc_all,hist_category_1_sum_all,hist_category_1_mean_all,hist_installments_sum_all,...,hist_purchase_quarter_max_all,hist_purchase_quarter_min_all,hist_purchase_quarter_std_all,hist_purchase_quarter_mode_all,hist_week_lag_median_all,hist_week_lag_mean_all,hist_week_lag_max_all,hist_week_lag_min_all,hist_week_lag_std_all,hist_week_lag_mode_all
0,7377857,114.0,0.765101,3,244.0,28.0,18.796875,28.0,0.187919,192.0,...,4,1,1.10027,2,26.0,23.771812,50,-4,14.899719,41
1,19103846,120.0,0.97561,17,314.0,8.0,6.503906,2.0,0.01626,201.0,...,4,1,1.471485,4,6.0,5.894309,18,-4,5.521661,6
2,22048496,62.0,0.939394,3,137.0,0.0,0.0,0.0,0.0,1.0,...,4,1,1.405948,1,12.0,19.666667,55,-2,18.372499,8
3,24721286,189.0,0.875,9,179.0,2.0,0.925781,2.0,0.009259,5.0,...,4,1,0.811616,3,9.5,11.865741,36,-5,10.160825,2
4,25427418,137.0,0.951389,8,161.0,4.0,2.777344,4.0,0.027778,268.0,...,4,1,1.361874,4,8.0,8.895833,25,-3,8.034644,3


In [91]:
hist_agg_1.to_pickle(DIRECTORY + 'hist_agg_1.pkl')

In [92]:
# Aggregate features, use 
# - both authorized and non-authorized transactions, 
# - both domestic and non-domestic transactions,
# - for the last 1 month of historic period.

hist_agg_2 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_lag0m', gb_cols = ['card_id'], agg_func = agg_func,
                                authorized = True, non_authorized = True, 
                                domestic = True, non_domestic = True,
                                max_month_lag = 0, max_week_lag = None,
                                min_installments = None, max_installments = None)

In [93]:
hist_agg_2.to_pickle(DIRECTORY + 'hist_agg_2.pkl')
hist_agg_2.head()

Unnamed: 0,card_id,hist_authorized_flag_sum_lag0m,hist_authorized_flag_mean_lag0m,hist_city_id_nunique_lag0m,hist_city_id_mode_lag0m,hist_city_id_nancnt_lag0m,hist_city_id_nanperc_lag0m,hist_category_1_sum_lag0m,hist_category_1_mean_lag0m,hist_installments_sum_lag0m,...,hist_purchase_quarter_max_lag0m,hist_purchase_quarter_min_lag0m,hist_purchase_quarter_std_lag0m,hist_purchase_quarter_mode_lag0m,hist_week_lag_median_lag0m,hist_week_lag_mean_lag0m,hist_week_lag_max_lag0m,hist_week_lag_min_lag0m,hist_week_lag_std_lag0m,hist_week_lag_mode_lag0m
0,7377857,114.0,0.765101,3,244.0,28.0,18.796875,28.0,0.187919,192.0,...,4,1,1.10027,2,26.0,23.771812,50,-4,14.899719,41
1,19103846,120.0,0.97561,17,314.0,8.0,6.503906,2.0,0.01626,201.0,...,4,1,1.471485,4,6.0,5.894309,18,-4,5.521661,6
2,22048496,62.0,0.939394,3,137.0,0.0,0.0,0.0,0.0,1.0,...,4,1,1.405948,1,12.0,19.666667,55,-2,18.372499,8
3,24721286,189.0,0.875,9,179.0,2.0,0.925781,2.0,0.009259,5.0,...,4,1,0.811616,3,9.5,11.865741,36,-5,10.160825,2
4,25427418,137.0,0.951389,8,161.0,4.0,2.777344,4.0,0.027778,268.0,...,4,1,1.361874,4,8.0,8.895833,25,-3,8.034644,3


In [94]:
# Aggregate features, use 
# - both authorized and non-authorized transactions, 
# - both domestic and non-domestic transactions,
# - for the last 2 months of historic period.

hist_agg_3 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_lag1m', gb_cols = ['card_id'], agg_func = agg_func,
                                authorized = True, non_authorized = True, 
                                domestic = True, non_domestic = True,
                                max_month_lag = 1, max_week_lag = None,
                                min_installments = None, max_installments = None)

In [95]:
hist_agg_3.to_pickle(DIRECTORY + 'hist_agg_3.pkl')

In [96]:
# Aggregate features, use 
# - both authorized and non-authorized transactions, 
# - both domestic and non-domestic transactions,
# - for the last 3 months of historic period.

hist_agg_4 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_lag2m', gb_cols = ['card_id'], agg_func = agg_func,
                                authorized = True, non_authorized = True, 
                                domestic = True, non_domestic = True,
                                max_month_lag = 2, max_week_lag = None,
                                min_installments = None, max_installments = None)
hist_agg_4.to_pickle(DIRECTORY + 'hist_agg_4.pkl')

In [97]:
# Aggregate features, use 
# - both authorized and non-authorized transactions, 
# - both domestic and non-domestic transactions,
# - for the last 4 months of historic period.

hist_agg_5 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_lag3m', gb_cols = ['card_id'], agg_func = agg_func,
                                authorized = True, non_authorized = True, 
                                domestic = True, non_domestic = True,
                                max_month_lag = 3, max_week_lag = None,
                                min_installments = None, max_installments = None)
hist_agg_5.to_pickle(DIRECTORY + 'hist_agg_5.pkl')

In [98]:
# Aggregate features, use 
# - both authorized and non-authorized transactions, 
# - both domestic and non-domestic transactions,
# - for the last 6 months of historic period.

hist_agg_6 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_lag5m', gb_cols = ['card_id'], agg_func = agg_func,
                                authorized = True, non_authorized = True, 
                                domestic = True, non_domestic = True,
                                max_month_lag = 5, max_week_lag = None,
                                min_installments = None, max_installments = None)
hist_agg_6.to_pickle(DIRECTORY + 'hist_agg_6.pkl')

In [99]:
hist_agg = hist_agg_1.merge(hist_agg_2, how = 'left', on = 'card_id')
hist_agg = hist_agg.merge(hist_agg_3, how = 'left', on = 'card_id')
hist_agg = hist_agg.merge(hist_agg_4, how = 'left', on = 'card_id')
hist_agg = hist_agg.merge(hist_agg_5, how = 'left', on = 'card_id')
hist_agg = hist_agg.merge(hist_agg_6, how = 'left', on = 'card_id')

hist_agg.to_pickle(DIRECTORY + 'hist_agg_1_6.pkl')

del hist_agg_1, hist_agg_2, hist_agg_3, hist_agg_4, hist_agg_5, hist_agg_6

In [100]:
# Aggregate features, use 
# - both authorized and non-authorized transactions, 
# - both domestic and non-domestic transactions,
# - for the last 9 months of historic period.

hist_agg_7 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_lag8m', gb_cols = ['card_id'], agg_func = agg_func,
                                authorized = True, non_authorized = True, 
                                domestic = True, non_domestic = True,
                                max_month_lag = 8, max_week_lag = None,
                                min_installments = None, max_installments = None)
hist_agg_7.to_pickle(DIRECTORY + 'hist_agg_7.pkl')

In [101]:
# Aggregate features, use 
# - both authorized and non-authorized transactions, 
# - both domestic and non-domestic transactions,
# - for the last 12 months of historic period.

hist_agg_8 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_lag11m', gb_cols = ['card_id'], agg_func = agg_func,
                                authorized = True, non_authorized = True, 
                                domestic = True, non_domestic = True,
                                max_month_lag = 11, max_week_lag = None,
                                min_installments = None, max_installments = None)
hist_agg_8.to_pickle(DIRECTORY + 'hist_agg_8.pkl')

In [104]:
# Aggregate features, use 
# - only authorized transactions, 
# - both domestic and non-domestic transactions,
# - for the whole historic period.

hist_agg_9 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_all', gb_cols = ['card_id'], agg_func = agg_func,
                                authorized = True, non_authorized = False, 
                                domestic = True, non_domestic = True,
                                max_month_lag = None, max_week_lag = None,
                                min_installments = None, max_installments = None)
hist_agg_9.to_pickle(DIRECTORY + 'hist_agg_9.pkl')

In [105]:
# Aggregate features, use 
# - only authorized transactions, 
# - both domestic and non-domestic transactions,
# - for the last month of historic period.

hist_agg_10 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_lag0m', gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = False, 
                                 domestic = True, non_domestic = True,
                                 max_month_lag = 0, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_10.to_pickle(DIRECTORY + 'hist_agg_10.pkl')

In [106]:
# Aggregate features, use 
# - only authorized transactions, 
# - both domestic and non-domestic transactions,
# - for the last 2 months of historic period.

hist_agg_11 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_lag1m', gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = False, 
                                 domestic = True, non_domestic = True,
                                 max_month_lag = 1, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_11.to_pickle(DIRECTORY + 'hist_agg_11.pkl')

In [107]:
hist_agg = hist_agg.merge(hist_agg_7, how = 'left', on = 'card_id')
hist_agg = hist_agg.merge(hist_agg_8, how = 'left', on = 'card_id')
hist_agg = hist_agg.merge(hist_agg_9, how = 'left', on = 'card_id')
hist_agg = hist_agg.merge(hist_agg_10, how = 'left', on = 'card_id')
hist_agg = hist_agg.merge(hist_agg_11, how = 'left', on = 'card_id')

hist_agg.to_pickle(DIRECTORY + 'hist_agg_1_11.pkl')

del hist_agg_7, hist_agg_8, hist_agg_9, hist_agg_10, hist_agg_11

In [108]:
# Aggregate features, use 
# - only authorized transactions, 
# - both domestic and non-domestic transactions,
# - for the last 3 months of historic period.

hist_agg_12 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_lag2m', gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = False, 
                                 domestic = True, non_domestic = True,
                                 max_month_lag = 2, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_12.to_pickle(DIRECTORY + 'hist_agg_12.pkl')
hist_agg = hist_agg.merge(hist_agg_12, how = 'left', on = 'card_id')
del hist_agg_12

In [109]:
# Aggregate features, use 
# - only authorized transactions, 
# - both domestic and non-domestic transactions,
# - for the last 4 months of historic period.

hist_agg_13 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_lag3m', gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = False, 
                                 domestic = True, non_domestic = True,
                                 max_month_lag = 3, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_13.to_pickle(DIRECTORY + 'hist_agg_13.pkl')
hist_agg = hist_agg.merge(hist_agg_13, how = 'left', on = 'card_id')
del hist_agg_13

In [110]:
# Aggregate features, use 
# - only authorized transactions, 
# - both domestic and non-domestic transactions,
# - for the last 6 months of historic period.

hist_agg_14 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_lag5m', gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = False, 
                                 domestic = True, non_domestic = True,
                                 max_month_lag = 5, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_14.to_pickle(DIRECTORY + 'hist_agg_14.pkl')
hist_agg = hist_agg.merge(hist_agg_14, how = 'left', on = 'card_id')
del hist_agg_14

In [111]:
# Aggregate features, use 
# - only authorized transactions, 
# - both domestic and non-domestic transactions,
# - for the last 9 months of historic period.

hist_agg_15 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_lag8m', gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = False, 
                                 domestic = True, non_domestic = True,
                                 max_month_lag = 8, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_15.to_pickle(DIRECTORY + 'hist_agg_15.pkl')
hist_agg = hist_agg.merge(hist_agg_15, how = 'left', on = 'card_id')
del hist_agg_15

In [112]:
# Aggregate features, use 
# - only authorized transactions, 
# - both domestic and non-domestic transactions,
# - for the last 12 months of historic period.

hist_agg_16 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_lag11m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = False, 
                                 domestic = True, non_domestic = True,
                                 max_month_lag = 11, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_16.to_pickle(DIRECTORY + 'hist_agg_16.pkl')
hist_agg = hist_agg.merge(hist_agg_16, how = 'left', on = 'card_id')
del hist_agg_16

In [113]:
# Aggregate features, use 
# - only authorized transactions, 
# - only domestic transactions,
# - for the whole historic period.

hist_agg_17 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_dom_all', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = False, 
                                 domestic = True, non_domestic = False,
                                 max_month_lag = None, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_17.to_pickle(DIRECTORY + 'hist_agg_17.pkl')
hist_agg = hist_agg.merge(hist_agg_17, how = 'left', on = 'card_id')
del hist_agg_17

In [114]:
# Aggregate features, use 
# - only authorized transactions, 
# - only domestic transactions,
# - for last month of historic period.

hist_agg_18 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_dom_lag0m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = False, 
                                 domestic = True, non_domestic = False,
                                 max_month_lag = 0, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_18.to_pickle(DIRECTORY + 'hist_agg_18.pkl')
hist_agg = hist_agg.merge(hist_agg_18, how = 'left', on = 'card_id')
del hist_agg_18

In [115]:
# Aggregate features, use 
# - only authorized transactions, 
# - only domestic transactions,
# - for last 2 months of historic period.

hist_agg_19 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_dom_lag1m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = False, 
                                 domestic = True, non_domestic = False,
                                 max_month_lag = 1, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_19.to_pickle(DIRECTORY + 'hist_agg_19.pkl')
hist_agg = hist_agg.merge(hist_agg_19, how = 'left', on = 'card_id')
del hist_agg_19

In [116]:
# Aggregate features, use 
# - only authorized transactions, 
# - only domestic transactions,
# - for last 3 months of historic period.

hist_agg_20 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_dom_lag2m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = False, 
                                 domestic = True, non_domestic = False,
                                 max_month_lag = 2, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_20.to_pickle(DIRECTORY + 'hist_agg_20.pkl')
hist_agg = hist_agg.merge(hist_agg_20, how = 'left', on = 'card_id')
del hist_agg_20

In [117]:
# Aggregate features, use 
# - only authorized transactions, 
# - only domestic transactions,
# - for last 4 months of historic period.

hist_agg_21 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_dom_lag3m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = False, 
                                 domestic = True, non_domestic = False,
                                 max_month_lag = 3, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_21.to_pickle(DIRECTORY + 'hist_agg_21.pkl')
hist_agg = hist_agg.merge(hist_agg_21, how = 'left', on = 'card_id')
del hist_agg_21

In [118]:
# Aggregate features, use 
# - only authorized transactions, 
# - only domestic transactions,
# - for last 6 months of historic period.

hist_agg_22 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_dom_lag5m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = False, 
                                 domestic = True, non_domestic = False,
                                 max_month_lag = 5, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_22.to_pickle(DIRECTORY + 'hist_agg_22.pkl')
hist_agg = hist_agg.merge(hist_agg_22, how = 'left', on = 'card_id')
del hist_agg_22

In [119]:
# Aggregate features, use 
# - only authorized transactions, 
# - only domestic transactions,
# - for last 9 months of historic period.

hist_agg_23 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_dom_lag8m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = False, 
                                 domestic = True, non_domestic = False,
                                 max_month_lag = 8, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_23.to_pickle(DIRECTORY + 'hist_agg_23.pkl')
hist_agg = hist_agg.merge(hist_agg_23, how = 'left', on = 'card_id')
del hist_agg_23

In [120]:
# Aggregate features, use 
# - only authorized transactions, 
# - only domestic transactions,
# - for last 12 months of historic period.

hist_agg_24 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_dom_lag11m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = False, 
                                 domestic = True, non_domestic = False,
                                 max_month_lag = 11, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_24.to_pickle(DIRECTORY + 'hist_agg_24.pkl')
hist_agg = hist_agg.merge(hist_agg_24, how = 'left', on = 'card_id')
del hist_agg_24

In [121]:
# Aggregate features, use 
# - only authorized transactions, 
# - only nondomestic transactions,
# - for the whole historic period.

hist_agg_25 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_nondom_all', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = False, 
                                 domestic = False, non_domestic = True,
                                 max_month_lag = None, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_25.to_pickle(DIRECTORY + 'hist_agg_25.pkl')
hist_agg = hist_agg.merge(hist_agg_25, how = 'left', on = 'card_id')
del hist_agg_25

In [122]:
# Aggregate features, use 
# - only authorized transactions, 
# - only nondomestic transactions,
# - for the last month of historic period.

hist_agg_26 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_nondom_lag0m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = False, 
                                 domestic = False, non_domestic = True,
                                 max_month_lag = 0, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_26.to_pickle(DIRECTORY + 'hist_agg_26.pkl')
hist_agg = hist_agg.merge(hist_agg_26, how = 'left', on = 'card_id')
del hist_agg_26

In [123]:
# Aggregate features, use 
# - only authorized transactions, 
# - only nondomestic transactions,
# - for the last 2 months historic period.

hist_agg_27 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_nondom_lag1m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = False, 
                                 domestic = False, non_domestic = True,
                                 max_month_lag = 1, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_27.to_pickle(DIRECTORY + 'hist_agg_27.pkl')
hist_agg = hist_agg.merge(hist_agg_27, how = 'left', on = 'card_id')
del hist_agg_27

In [124]:
# Aggregate features, use 
# - only authorized transactions, 
# - only nondomestic transactions,
# - for the last 3 months historic period.

hist_agg_28 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_nondom_lag2m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = False, 
                                 domestic = False, non_domestic = True,
                                 max_month_lag = 2, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_28.to_pickle(DIRECTORY + 'hist_agg_28.pkl')
hist_agg = hist_agg.merge(hist_agg_28, how = 'left', on = 'card_id')
del hist_agg_28

In [125]:
# Aggregate features, use 
# - only authorized transactions, 
# - only nondomestic transactions,
# - for the last 4 months historic period.

hist_agg_29 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_nondom_lag3m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = False, 
                                 domestic = False, non_domestic = True,
                                 max_month_lag = 3, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_29.to_pickle(DIRECTORY + 'hist_agg_29.pkl')
hist_agg = hist_agg.merge(hist_agg_29, how = 'left', on = 'card_id')
del hist_agg_29

In [126]:
# Aggregate features, use 
# - only authorized transactions, 
# - only nondomestic transactions,
# - for the last 6 months historic period.

hist_agg_30 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_nondom_lag5m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = False, 
                                 domestic = False, non_domestic = True,
                                 max_month_lag = 5, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_30.to_pickle(DIRECTORY + 'hist_agg_30.pkl')
hist_agg = hist_agg.merge(hist_agg_30, how = 'left', on = 'card_id')
del hist_agg_30

In [127]:
# Aggregate features, use 
# - only authorized transactions, 
# - only nondomestic transactions,
# - for the last 9 months historic period.

hist_agg_31 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_nondom_lag8m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = False, 
                                 domestic = False, non_domestic = True,
                                 max_month_lag = 8, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_31.to_pickle(DIRECTORY + 'hist_agg_31.pkl')
hist_agg = hist_agg.merge(hist_agg_31, how = 'left', on = 'card_id')
del hist_agg_31

In [128]:
# Aggregate features, use 
# - only authorized transactions, 
# - only nondomestic transactions,
# - for the last 12 months historic period.

hist_agg_32 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_auth_nondom_lag11m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = False, 
                                 domestic = False, non_domestic = True,
                                 max_month_lag = 11, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_32.to_pickle(DIRECTORY + 'hist_agg_32.pkl')
hist_agg = hist_agg.merge(hist_agg_32, how = 'left', on = 'card_id')
del hist_agg_32

In [129]:
# Aggregate features, use 
# - both authorized and nonautorized transactions, 
# - only domestic transactions,
# - for the whole historic period.

hist_agg_33 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_dom_all', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = True, 
                                 domestic = True, non_domestic = False,
                                 max_month_lag = None, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_33.to_pickle(DIRECTORY + 'hist_agg_33.pkl')
hist_agg = hist_agg.merge(hist_agg_33, how = 'left', on = 'card_id')
del hist_agg_33

In [130]:
# Aggregate features, use 
# - both authorized and nonautorized transactions, 
# - only domestic transactions,
# - for the last month of historic period.

hist_agg_34 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_dom_lag0m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = True, 
                                 domestic = True, non_domestic = False,
                                 max_month_lag = 0, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_34.to_pickle(DIRECTORY + 'hist_agg_34.pkl')
hist_agg = hist_agg.merge(hist_agg_34, how = 'left', on = 'card_id')
del hist_agg_34

In [131]:
# Aggregate features, use 
# - both authorized and nonautorized transactions, 
# - only domestic transactions,
# - for the last 2 months of historic period.

hist_agg_35 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_dom_lag1m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = True, 
                                 domestic = True, non_domestic = False,
                                 max_month_lag = 1, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_35.to_pickle(DIRECTORY + 'hist_agg_35.pkl')
hist_agg = hist_agg.merge(hist_agg_35, how = 'left', on = 'card_id')
del hist_agg_35

In [132]:
# Aggregate features, use 
# - both authorized and nonautorized transactions, 
# - only domestic transactions,
# - for the last 3 months of historic period.

hist_agg_36 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_dom_lag2m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = True, 
                                 domestic = True, non_domestic = False,
                                 max_month_lag = 2, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_36.to_pickle(DIRECTORY + 'hist_agg_36.pkl')
hist_agg = hist_agg.merge(hist_agg_36, how = 'left', on = 'card_id')
del hist_agg_36

In [133]:
# Aggregate features, use 
# - both authorized and nonautorized transactions, 
# - only domestic transactions,
# - for the last 4 months of historic period.

hist_agg_37 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_dom_lag3m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = True, 
                                 domestic = True, non_domestic = False,
                                 max_month_lag = 3, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_37.to_pickle(DIRECTORY + 'hist_agg_37.pkl')
hist_agg = hist_agg.merge(hist_agg_37, how = 'left', on = 'card_id')
del hist_agg_37

In [134]:
# Aggregate features, use 
# - both authorized and nonautorized transactions, 
# - only domestic transactions,
# - for the last 6 months of historic period.

hist_agg_38 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_dom_lag5m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = True, 
                                 domestic = True, non_domestic = False,
                                 max_month_lag = 5, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_38.to_pickle(DIRECTORY + 'hist_agg_38.pkl')
hist_agg = hist_agg.merge(hist_agg_38, how = 'left', on = 'card_id')
del hist_agg_38

In [135]:
# Aggregate features, use 
# - both authorized and nonautorized transactions, 
# - only domestic transactions,
# - for the last 9 months of historic period.

hist_agg_39 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_dom_lag8m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = True, 
                                 domestic = True, non_domestic = False,
                                 max_month_lag = 8, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_39.to_pickle(DIRECTORY + 'hist_agg_39.pkl')
hist_agg = hist_agg.merge(hist_agg_39, how = 'left', on = 'card_id')
del hist_agg_39

In [136]:
# Aggregate features, use 
# - both authorized and nonautorized transactions, 
# - only domestic transactions,
# - for the last 12 months of historic period.

hist_agg_40 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_dom_lag11m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = True, 
                                 domestic = True, non_domestic = False,
                                 max_month_lag = 11, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_40.to_pickle(DIRECTORY + 'hist_agg_40.pkl')
hist_agg = hist_agg.merge(hist_agg_40, how = 'left', on = 'card_id')
del hist_agg_40

In [137]:
# Aggregate features, use 
# - both authorized and nonautorized transactions, 
# - only nondomestic transactions,
# - for the whole historic period.

hist_agg_41 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_nondom_all', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = True, 
                                 domestic = False, non_domestic = True,
                                 max_month_lag = None, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_41.to_pickle(DIRECTORY + 'hist_agg_41.pkl')
hist_agg = hist_agg.merge(hist_agg_41, how = 'left', on = 'card_id')
del hist_agg_41

In [138]:
# Aggregate features, use 
# - both authorized and nonautorized transactions, 
# - only nondomestic transactions,
# - for the last month historic period.

hist_agg_42 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_nondom_lag0m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = True, 
                                 domestic = False, non_domestic = True,
                                 max_month_lag = 0, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_42.to_pickle(DIRECTORY + 'hist_agg_42.pkl')
hist_agg = hist_agg.merge(hist_agg_42, how = 'left', on = 'card_id')
del hist_agg_42

In [139]:
# Aggregate features, use 
# - both authorized and nonautorized transactions, 
# - only nondomestic transactions,
# - for the last 2 months historic period.

hist_agg_43 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_nondom_lag1m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = True, 
                                 domestic = False, non_domestic = True,
                                 max_month_lag = 1, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_43.to_pickle(DIRECTORY + 'hist_agg_43.pkl')
hist_agg = hist_agg.merge(hist_agg_43, how = 'left', on = 'card_id')
del hist_agg_43

In [140]:
# Aggregate features, use 
# - both authorized and nonautorized transactions, 
# - only nondomestic transactions,
# - for the last 3 months historic period.

hist_agg_44 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_nondom_lag2m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = True, 
                                 domestic = False, non_domestic = True,
                                 max_month_lag = 2, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_44.to_pickle(DIRECTORY + 'hist_agg_44.pkl')
hist_agg = hist_agg.merge(hist_agg_44, how = 'left', on = 'card_id')
del hist_agg_44

In [141]:
# Aggregate features, use 
# - both authorized and nonautorized transactions, 
# - only nondomestic transactions,
# - for the last 4 months historic period.

hist_agg_45 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_nondom_lag3m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = True, 
                                 domestic = False, non_domestic = True,
                                 max_month_lag = 3, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_45.to_pickle(DIRECTORY + 'hist_agg_45.pkl')
hist_agg = hist_agg.merge(hist_agg_45, how = 'left', on = 'card_id')
del hist_agg_45

In [142]:
# Aggregate features, use 
# - both authorized and nonautorized transactions, 
# - only nondomestic transactions,
# - for the last 6 months historic period.

hist_agg_46 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_nondom_lag5m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = True, 
                                 domestic = False, non_domestic = True,
                                 max_month_lag = 5, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_46.to_pickle(DIRECTORY + 'hist_agg_46.pkl')
hist_agg = hist_agg.merge(hist_agg_46, how = 'left', on = 'card_id')
del hist_agg_46

In [143]:
# Aggregate features, use 
# - both authorized and nonautorized transactions, 
# - only nondomestic transactions,
# - for the last 9 months historic period.

hist_agg_47 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_nondom_lag8m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = True, 
                                 domestic = False, non_domestic = True,
                                 max_month_lag = 8, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_47.to_pickle(DIRECTORY + 'hist_agg_47.pkl')
hist_agg = hist_agg.merge(hist_agg_47, how = 'left', on = 'card_id')
del hist_agg_47

In [144]:
# Aggregate features, use 
# - both authorized and nonautorized transactions, 
# - only nondomestic transactions,
# - for the last 12 months historic period.

hist_agg_48 = aggregate_features(df = hist, prefix = 'hist_', suffix = '_nondom_lag11m', 
                                 gb_cols = ['card_id'], agg_func = agg_func,
                                 authorized = True, non_authorized = True, 
                                 domestic = False, non_domestic = True,
                                 max_month_lag = 11, max_week_lag = None,
                                 min_installments = None, max_installments = None)
hist_agg_48.to_pickle(DIRECTORY + 'hist_agg_48.pkl')
hist_agg = hist_agg.merge(hist_agg_48, how = 'left', on = 'card_id')
del hist_agg_48

In [145]:
hist_agg.to_pickle(DIRECTORY + 'hist_agg_1_48.pkl')

In [146]:
ref_dates.to_pickle(DIRECTORY + 'ref_dates.pkl')