# RE FEATURE ENGINEERING OF DATA

In [1]:
import numpy as np
import pandas as panda
from dateutil import relativedelta
import datetime

In [26]:
train_data_path = 'all/train.csv'
test_data_path = 'all/test.csv'
new_transaction_path = 'all/new_merchant_transactions.csv'
historical_transaction_path = 'all/historical_transactions.csv'
new_treated_transaction_path = 'all_new/treated_new_transaction.csv'
treated_historical_transaction_path = 'all_new/treated_historical_transactions.csv'



In [3]:
train_data = panda.read_csv(train_data_path)
train_data_types = train_data.dtypes
train_data_columns ={}
for index, name in enumerate(train_data.columns.tolist()):
    train_data_columns[name] = train_data_types[index]
train_data_columns

{'first_active_month': dtype('O'),
 'card_id': dtype('O'),
 'feature_1': dtype('int64'),
 'feature_2': dtype('int64'),
 'feature_3': dtype('int64'),
 'target': dtype('float64')}

In [4]:
del train_data

In [5]:

## simply passed the columns to reduce memory footprint
train_data = panda.read_csv(train_data_path, dtype = train_data_columns)

In [6]:
train_data.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-06,C_ID_92a2005557,5,2,1,-0.820283
1,2017-01,C_ID_3d0044924f,4,1,0,0.392913
2,2016-08,C_ID_d639edf6cd,2,2,0,0.688056
3,2017-09,C_ID_186d6a6901,4,3,0,0.142495
4,2017-11,C_ID_cdbd2c0db2,1,3,0,-0.159749


In [7]:
test_data = panda.read_csv(test_data_path)
test_data_types = test_data.dtypes
test_data_columns ={}
for index, name in enumerate(test_data.columns.tolist()):
    test_data_columns[name] = test_data_types[index]
test_data_columns

{'first_active_month': dtype('O'),
 'card_id': dtype('O'),
 'feature_1': dtype('int64'),
 'feature_2': dtype('int64'),
 'feature_3': dtype('int64')}

In [8]:
del test_data

In [9]:

## simply passed the columns to reduce memory footprint
test_data = panda.read_csv(test_data_path, dtype = test_data_columns)

In [10]:
test_data.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3
0,2017-04,C_ID_0ab67a22ab,3,3,1
1,2017-01,C_ID_130fd0cbdd,2,3,0
2,2017-08,C_ID_b709037bc5,5,1,1
3,2017-12,C_ID_d27d835a9f,2,1,0
4,2015-12,C_ID_2b5e3df5c2,5,1,1


In [11]:
print(train_data.feature_1.value_counts(),'\n',test_data.feature_1.value_counts())

3    73573
2    55797
5    40625
4    19885
1    12037
Name: feature_1, dtype: int64 
 3    44719
2    34115
5    25051
4    12332
1     7406
Name: feature_1, dtype: int64


In [12]:
train_data['enc_feature_1_1'] = (train_data.feature_1 ==1) + 0
train_data['enc_feature_1_2'] = (train_data.feature_1 ==2) + 0
train_data['enc_feature_1_3'] = (train_data.feature_1 ==3) + 0
train_data['enc_feature_1_4'] = (train_data.feature_1 ==4) + 0
train_data['enc_feature_1_5'] = (train_data.feature_1 ==5) + 0


test_data['enc_feature_1_1'] = (test_data.feature_1 ==1) + 0
test_data['enc_feature_1_2'] = (test_data.feature_1 ==2) + 0
test_data['enc_feature_1_3'] = (test_data.feature_1 ==3) + 0
test_data['enc_feature_1_4'] = (test_data.feature_1 ==4) + 0
test_data['enc_feature_1_5'] = (test_data.feature_1 ==5) + 0

## at this point we have label encoded our categorical variables feature1,2,3. we will drop the feature columns , again to
## reduce memory footprint

In [13]:
test_data.drop(columns = ['feature_1','feature_2','feature_3'], inplace = True)
train_data.drop(columns = ['feature_1','feature_2','feature_3'], inplace = True)

In [14]:
train_data.head()

Unnamed: 0,first_active_month,card_id,target,enc_feature_1_1,enc_feature_1_2,enc_feature_1_3,enc_feature_1_4,enc_feature_1_5
0,2017-06,C_ID_92a2005557,-0.820283,0,0,0,0,1
1,2017-01,C_ID_3d0044924f,0.392913,0,0,0,1,0
2,2016-08,C_ID_d639edf6cd,0.688056,0,1,0,0,0
3,2017-09,C_ID_186d6a6901,0.142495,0,0,0,1,0
4,2017-11,C_ID_cdbd2c0db2,-0.159749,1,0,0,0,0


In [15]:



def get_months_passed( given):
    
    current = datetime.datetime.now()
    given = datetime.datetime.strptime(given + '-01', '%Y-%m-%d')
    
    r = relativedelta.relativedelta(current, given)
    return r.months + r.years*12


    
given_date = '2015-12'

get_months_passed( given_date)   

37

In [16]:
train_data['months_passed'] = train_data.first_active_month.map(lambda x: get_months_passed(x))


In [17]:
test_data.first_active_month.isnull().sum()
# test_data['months_passed'] = test_data.first_active_month.map(lambda x: get_months_passed(x))

1

In [18]:
test_data.first_active_month.value_counts()

2017-09    8805
2017-10    8451
2017-08    8299
2017-11    7807
2017-07    7344
2017-12    6233
2017-06    6147
2017-01    5974
2016-12    5751
2017-05    5598
2017-04    5371
2017-03    5058
2017-02    4903
2016-11    4444
2016-10    3335
2016-08    3108
2016-09    2663
2016-07    2610
2016-06    2407
2016-05    2096
2016-04    1740
2015-12    1393
2016-03    1258
2016-01     996
2016-02     987
2015-11     955
2015-10     936
2015-08     882
2015-07     862
2015-09     818
           ... 
2014-03     141
2014-02      97
2013-12      95
2014-01      90
2013-11      80
2013-10      80
2013-09      75
2013-08      73
2013-07      72
2013-06      56
2013-05      44
2013-04      40
2013-03      34
2018-01      28
2013-02      28
2012-08      21
2012-11      18
2012-09      18
2012-12      16
2012-07      15
2013-01      14
2012-06      14
2012-10      13
2012-05      12
2012-04      11
2012-03      10
2012-02       4
2011-11       3
2011-12       2
2012-01       1
Name: first_active_month

In [19]:
# one particular test data has null frist active month. we replace it with most appearing date
most_appearing = test_data.first_active_month.value_counts().idxmax()
test_data.first_active_month.fillna(value = most_appearing, inplace= True)
test_data.first_active_month.isnull().sum()

0

In [20]:
test_data['months_passed'] = test_data.first_active_month.map(lambda x: get_months_passed(x))

In [21]:
train_data.drop(columns = ['first_active_month'], inplace = True)
test_data.drop(columns = ['first_active_month'], inplace = True)


In [24]:
train_data.head()

Unnamed: 0,card_id,target,enc_feature_1_1,enc_feature_1_2,enc_feature_1_3,enc_feature_1_4,enc_feature_1_5,months_passed
0,C_ID_92a2005557,-0.820283,0,0,0,0,1,19
1,C_ID_3d0044924f,0.392913,0,0,0,1,0,24
2,C_ID_d639edf6cd,0.688056,0,1,0,0,0,29
3,C_ID_186d6a6901,0.142495,0,0,0,1,0,16
4,C_ID_cdbd2c0db2,-0.159749,1,0,0,0,0,14


In [23]:
## deres nothing else we can do now...with this data
## lets look at new transactions first , since it is a smaller data sets

new_transaction_data = panda.read_csv(new_transaction_path)
new_transaction_data_types = new_transaction_data.dtypes
new_transaction_column_types = {}
for index, column_name in enumerate(new_transaction_data.columns.tolist()):
    new_transaction_column_types[column_name] = new_transaction_data_types[index]
del new_transaction_data
new_transaction_data = panda.read_csv(new_transaction_path, dtype = new_transaction_column_types)
new_transaction_data.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_415bb3a509,107,N,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,1.0,9,19
1,Y,C_ID_415bb3a509,140,N,1,B,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,1.0,9,19
2,Y,C_ID_415bb3a509,330,N,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,1.0,9,14
3,Y,C_ID_415bb3a509,-1,Y,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,,-1,8
4,Y,C_ID_ef55cf8d4b,-1,Y,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,,-1,29


In [31]:
## lets deal with the easy categorical data first: authorized_flag, category_1,category_2,category_3
new_transaction_data.authorized_flag.isnull().sum()
new_transaction_data['authorized_flag'] = new_transaction_data.authorized_flag.map({'Y':1,'N':0})

In [35]:
new_transaction_data.category_1.isnull().sum()
new_transaction_data.category_1.value_counts()
new_transaction_data['category_1'] = new_transaction_data.category_1.map({'Y':1, 'N':0})

In [42]:
new_transaction_data.category_2.isnull().sum()
##quite a few null values in category 2
new_transaction_data.category_2.value_counts()
most_appearing = new_transaction_data.category_2.value_counts().idxmax()
new_transaction_data.category_2.fillna(value= most_appearing, inplace=True)
new_transaction_data.category_2.value_counts()

1.0    1169987
3.0     289525
5.0     259266
4.0     178590
2.0      65663
Name: category_2, dtype: int64

In [43]:
new_transaction_data['enc_category_2_1'] = (new_transaction_data.category_2==1) + 0
new_transaction_data['enc_category_2_2'] = (new_transaction_data.category_2==2) + 0
new_transaction_data['enc_category_2_3'] = (new_transaction_data.category_2==3) + 0
new_transaction_data['enc_category_2_4'] = (new_transaction_data.category_2==4) + 0
new_transaction_data['enc_category_2_5'] = (new_transaction_data.category_2==5) + 0

new_transaction_data.drop(columns = ['category_2'], inplace=True)
new_transaction_data.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,state_id,subsector_id,enc_category_2_1,enc_category_2_2,enc_category_2_3,enc_category_2_4,enc_category_2_5
0,1,C_ID_415bb3a509,107,0,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,9,19,1,0,0,0,0
1,1,C_ID_415bb3a509,140,0,1,B,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,9,19,1,0,0,0,0
2,1,C_ID_415bb3a509,330,0,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,9,14,1,0,0,0,0
3,1,C_ID_415bb3a509,-1,1,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,-1,8,1,0,0,0,0
4,1,C_ID_ef55cf8d4b,-1,1,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,-1,29,1,0,0,0,0


In [46]:
new_transaction_data.category_3.isnull().sum()
## again quite a few null values for category
most_appearing = new_transaction_data.category_3.value_counts().idxmax()
new_transaction_data.category_3.fillna(value = most_appearing, inplace = True)
new_transaction_data.category_3.value_counts(),new_transaction_data.category_3.isnull().sum()

(A    978166
 B    836178
 C    148687
 Name: category_3, dtype: int64, 0)

In [50]:
new_transaction_data['enc_category_3_A'] = (new_transaction_data.category_3=='A') +0
new_transaction_data['enc_category_3_B'] = (new_transaction_data.category_3=='B') +0
new_transaction_data['enc_category_3_C'] = (new_transaction_data.category_3=='C') +0
new_transaction_data.drop(columns = ['category_3'], inplace = True)

In [51]:
new_transaction_data.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,state_id,subsector_id,enc_category_2_1,enc_category_2_2,enc_category_2_3,enc_category_2_4,enc_category_2_5,enc_category_3_A,enc_category_3_B,enc_category_3_C
0,1,C_ID_415bb3a509,107,0,1,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,9,19,1,0,0,0,0,0,1,0
1,1,C_ID_415bb3a509,140,0,1,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,9,19,1,0,0,0,0,0,1,0
2,1,C_ID_415bb3a509,330,0,1,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,9,14,1,0,0,0,0,0,1,0
3,1,C_ID_415bb3a509,-1,1,1,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,-1,8,1,0,0,0,0,0,1,0
4,1,C_ID_ef55cf8d4b,-1,1,1,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,-1,29,1,0,0,0,0,0,1,0


In [57]:
new_transaction_data['enc_purchase_date'] = panda.DatetimeIndex(new_transaction_data.purchase_date).astype(np.int64)*1e-9
new_transaction_data.drop(columns = ['purchase_date'], inplace = True)


In [61]:
## we will take the 
## sum,mean,max of purchase amount, 
## sum,mean,max of month lag by card_id
## mean of authorized_flag,
## num of authorized ==1
## num of authorized ==0
## sum, mean, max of installments
## mean of category_1
## sum of enc_category_2* and enc_category_3*

def count_authorized_one(x):
    return x.eq(1).sum()
def count_authorized_zero(x):
    return x.eq(0).sum()
def count_category_1_one(x):
    return x.eq(1).sum()
def count_category_1_zero(x):
    return x.eq(0).sum()


aggregate = {
    
'purchase_amount' : ['sum', 'mean','max'] ,
'month_lag' : ['sum', 'mean','max'] ,
'enc_category_2_1':['sum'],
'enc_category_2_2':['sum'],
'enc_category_2_3':['sum'],
'enc_category_2_4':['sum'],
'enc_category_2_5':['sum'],    
'enc_category_3_A':['sum'],
'enc_category_3_B':['sum'],
'enc_category_3_C':['sum'],
'installments': ['sum','mean','max'],
'authorized_flag': ['mean', count_authorized_one, count_authorized_zero],
'category_1' :['mean', count_category_1_one,count_category_1_zero],
'enc_purchase_date' : ['mean','min','max']
    
}



In [76]:
group_by = new_transaction_data.groupby(['card_id'])

t = group_by.agg(aggregate)

In [77]:
t.columns.tolist()

t_cols = ['card_id']

In [78]:
t_cols.extend(['_'.join(list(i)) for i in t.columns.tolist() ])


In [79]:
t_cols

['card_id',
 'purchase_amount_sum',
 'purchase_amount_mean',
 'purchase_amount_max',
 'month_lag_sum',
 'month_lag_mean',
 'month_lag_max',
 'enc_category_2_1_sum',
 'enc_category_2_2_sum',
 'enc_category_2_3_sum',
 'enc_category_2_4_sum',
 'enc_category_2_5_sum',
 'enc_category_3_A_sum',
 'enc_category_3_B_sum',
 'enc_category_3_C_sum',
 'installments_sum',
 'installments_mean',
 'installments_max',
 'authorized_flag_mean',
 'authorized_flag_count_authorized_one',
 'authorized_flag_count_authorized_zero',
 'category_1_mean',
 'category_1_count_category_1_one',
 'category_1_count_category_1_zero',
 'enc_purchase_date_mean',
 'enc_purchase_date_min',
 'enc_purchase_date_max']

In [81]:
t.reset_index(inplace=True)

t.head()

Unnamed: 0_level_0,card_id,purchase_amount,purchase_amount,purchase_amount,month_lag,month_lag,month_lag,enc_category_2_1,enc_category_2_2,enc_category_2_3,...,installments,authorized_flag,authorized_flag,authorized_flag,category_1,category_1,category_1,enc_purchase_date,enc_purchase_date,enc_purchase_date
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,max,sum,mean,max,sum,sum,sum,...,max,mean,count_authorized_one,count_authorized_zero,mean,count_category_1_one,count_category_1_zero,mean,min,max
0,C_ID_00007093c1,-1.328524,-0.664262,-0.656749,4,2.0,2,1,0,1,...,1,1,2,0,0.0,0,2,1523023000.0,1522754000.0,1523291000.0
1,C_ID_0001238066,-14.850055,-0.571156,-0.078318,35,1.346154,2,23,0,0,...,10,1,26,0,0.076923,2,24,1522335000.0,1519923000.0,1525118000.0
2,C_ID_0001506ef0,-1.447354,-0.723677,-0.715352,2,1.0,1,0,0,2,...,0,1,2,0,0.0,0,2,1521474000.0,1521239000.0,1521710000.0
3,C_ID_0001793786,-0.22962,-0.007407,3.129932,41,1.322581,2,17,8,5,...,0,1,31,0,0.0,0,31,1512327000.0,1510761000.0,1514742000.0
4,C_ID_000183fdda,-6.590778,-0.599162,-0.10768,14,1.272727,2,0,0,11,...,4,1,11,0,0.0,0,11,1521578000.0,1519994000.0,1525100000.0


In [82]:
t.columns  = t_cols

In [83]:
t.head()

Unnamed: 0,card_id,purchase_amount_sum,purchase_amount_mean,purchase_amount_max,month_lag_sum,month_lag_mean,month_lag_max,enc_category_2_1_sum,enc_category_2_2_sum,enc_category_2_3_sum,...,installments_max,authorized_flag_mean,authorized_flag_count_authorized_one,authorized_flag_count_authorized_zero,category_1_mean,category_1_count_category_1_one,category_1_count_category_1_zero,enc_purchase_date_mean,enc_purchase_date_min,enc_purchase_date_max
0,C_ID_00007093c1,-1.328524,-0.664262,-0.656749,4,2.0,2,1,0,1,...,1,1,2,0,0.0,0,2,1523023000.0,1522754000.0,1523291000.0
1,C_ID_0001238066,-14.850055,-0.571156,-0.078318,35,1.346154,2,23,0,0,...,10,1,26,0,0.076923,2,24,1522335000.0,1519923000.0,1525118000.0
2,C_ID_0001506ef0,-1.447354,-0.723677,-0.715352,2,1.0,1,0,0,2,...,0,1,2,0,0.0,0,2,1521474000.0,1521239000.0,1521710000.0
3,C_ID_0001793786,-0.22962,-0.007407,3.129932,41,1.322581,2,17,8,5,...,0,1,31,0,0.0,0,31,1512327000.0,1510761000.0,1514742000.0
4,C_ID_000183fdda,-6.590778,-0.599162,-0.10768,14,1.272727,2,0,0,11,...,4,1,11,0,0.0,0,11,1521578000.0,1519994000.0,1525100000.0


In [85]:
t.to_csv('all_new/treated_new_transaction.csv', index = False)

In [7]:
##similar treatment for historical transaction, because i have seen from data dictionary that they both have same columns


In [4]:
historical_transaction_data = panda.read_csv(historical_transaction_path)
ht_tr_dtypes = historical_transaction_data.dtypes
ht_col_types = {}

for index, name in enumerate(historical_transaction_data.columns.tolist()):
    ht_col_types[name] = ht_tr_dtypes[index]

del historical_transaction_data



TypeError: parser_f() got an unexpected keyword argument 'dype'

In [5]:
historical_transaction_data = panda.read_csv(historical_transaction_path, dtype = ht_col_types)
len(historical_transaction_data)

29112361

In [6]:
historical_transaction_data.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37


In [9]:
## lets deal with the easy categorical data first: authorized_flag, category_1,category_2,category_3
historical_transaction_data.authorized_flag.isnull().sum()
historical_transaction_data['authorized_flag'] = historical_transaction_data.authorized_flag.map({'Y':1,'N':0})

In [12]:
historical_transaction_data.category_1.isnull().sum()
historical_transaction_data.category_1.value_counts()
historical_transaction_data['category_1'] = historical_transaction_data.category_1.map({'Y':1, 'N':0})

In [16]:
historical_transaction_data.category_2.isnull().sum()
##quite a few null values in category 2
historical_transaction_data.category_2.value_counts()
most_appearing = historical_transaction_data.category_2.value_counts().idxmax()
historical_transaction_data.category_2.fillna(value= most_appearing, inplace=True)
historical_transaction_data.category_2.isnull().sum()

0

In [17]:
historical_transaction_data['enc_category_2_1'] = (historical_transaction_data.category_2==1) + 0
historical_transaction_data['enc_category_2_2'] = (historical_transaction_data.category_2==2) + 0
historical_transaction_data['enc_category_2_3'] = (historical_transaction_data.category_2==3) + 0
historical_transaction_data['enc_category_2_4'] = (historical_transaction_data.category_2==4) + 0
historical_transaction_data['enc_category_2_5'] = (historical_transaction_data.category_2==5) + 0

historical_transaction_data.drop(columns = ['category_2'], inplace=True)
historical_transaction_data.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,state_id,subsector_id,enc_category_2_1,enc_category_2_2,enc_category_2_3,enc_category_2_4,enc_category_2_5
0,1,C_ID_4e6213e9bc,88,0,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,16,37,1,0,0,0,0
1,1,C_ID_4e6213e9bc,88,0,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,16,16,1,0,0,0,0
2,1,C_ID_4e6213e9bc,88,0,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,16,37,1,0,0,0,0
3,1,C_ID_4e6213e9bc,88,0,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,16,34,1,0,0,0,0
4,1,C_ID_4e6213e9bc,88,0,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,16,37,1,0,0,0,0


In [20]:
historical_transaction_data.category_3.isnull().sum()
## again quite a few null values for category
most_appearing = historical_transaction_data.category_3.value_counts().idxmax()
historical_transaction_data.category_3.fillna(value = most_appearing, inplace = True)
historical_transaction_data.category_3.value_counts(),historical_transaction_data.category_3.isnull().sum()

(A    15589906
 B    11677522
 C     1844933
 Name: category_3, dtype: int64, 0)

In [21]:
historical_transaction_data['enc_category_3_A'] = (historical_transaction_data.category_3=='A') +0
historical_transaction_data['enc_category_3_B'] = (historical_transaction_data.category_3=='B') +0
historical_transaction_data['enc_category_3_C'] = (historical_transaction_data.category_3=='C') +0
historical_transaction_data.drop(columns = ['category_3'], inplace = True)

In [22]:
historical_transaction_data.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,state_id,subsector_id,enc_category_2_1,enc_category_2_2,enc_category_2_3,enc_category_2_4,enc_category_2_5,enc_category_3_A,enc_category_3_B,enc_category_3_C
0,1,C_ID_4e6213e9bc,88,0,0,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,16,37,1,0,0,0,0,1,0,0
1,1,C_ID_4e6213e9bc,88,0,0,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,16,16,1,0,0,0,0,1,0,0
2,1,C_ID_4e6213e9bc,88,0,0,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,16,37,1,0,0,0,0,1,0,0
3,1,C_ID_4e6213e9bc,88,0,0,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,16,34,1,0,0,0,0,1,0,0
4,1,C_ID_4e6213e9bc,88,0,0,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,16,37,1,0,0,0,0,1,0,0


In [23]:
historical_transaction_data['enc_purchase_date'] = panda.DatetimeIndex(historical_transaction_data.purchase_date).astype(np.int64)*1e-9
historical_transaction_data.drop(columns = ['purchase_date'], inplace = True)


In [25]:
## we will take the 
## sum,mean,max of purchase amount, 
## sum,mean,max of month lag by card_id
## mean of authorized_flag,
## num of authorized ==1
## num of authorized ==0
## sum, mean, max of installments
## mean of category_1
## sum of enc_category_2* and enc_category_3*

def count_authorized_one(x):
    return x.eq(1).sum()
def count_authorized_zero(x):
    return x.eq(0).sum()
def count_category_1_one(x):
    return x.eq(1).sum()
def count_category_1_zero(x):
    return x.eq(0).sum()


aggregate = {
    
'purchase_amount' : ['sum', 'mean','max'] ,
'month_lag' : ['sum', 'mean','max'] ,
'enc_category_2_1':['sum'],
'enc_category_2_2':['sum'],
'enc_category_2_3':['sum'],
'enc_category_2_4':['sum'],
'enc_category_2_5':['sum'],    
'enc_category_3_A':['sum'],
'enc_category_3_B':['sum'],
'enc_category_3_C':['sum'],
'installments': ['sum','mean','max'],
'authorized_flag': ['mean', count_authorized_one, count_authorized_zero],
'category_1' :['mean', count_category_1_one,count_category_1_zero],
'enc_purchase_date' : ['mean','min','max']
    
}



In [26]:
groupby = historical_transaction_data.groupby(['card_id'])
t = groupby.agg(aggregate)

In [27]:
t.head()

Unnamed: 0_level_0,purchase_amount,purchase_amount,purchase_amount,month_lag,month_lag,month_lag,enc_category_2_1,enc_category_2_2,enc_category_2_3,enc_category_2_4,...,installments,authorized_flag,authorized_flag,authorized_flag,category_1,category_1,category_1,enc_purchase_date,enc_purchase_date,enc_purchase_date
Unnamed: 0_level_1,sum,mean,max,sum,mean,max,sum,sum,sum,sum,...,max,mean,count_authorized_one,count_authorized_zero,mean,count_category_1_one,count_category_1_zero,mean,min,max
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
C_ID_00007093c1,-76.845041,-0.515739,1.507069,-872,-5.852349,0,28,0,120,0,...,6,0.765101,114,35,0.187919,28,121,1503054000.0,1487081000.0,1519708000.0
C_ID_0001238066,-72.447201,-0.589002,0.768095,-223,-1.813008,0,103,0,0,0,...,10,0.97561,120,3,0.01626,2,121,1513977000.0,1506638000.0,1519748000.0
C_ID_0001506ef0,-34.601879,-0.524271,1.493545,-319,-4.833333,0,2,0,64,0,...,1,0.939394,62,4,0.0,0,66,1505646000.0,1484411000.0,1518871000.0
C_ID_0001793786,-36.786013,-0.170306,4.554145,-719,-3.328704,0,125,76,15,0,...,1,0.875,189,27,0.009259,2,214,1499412000.0,1484994000.0,1509481000.0
C_ID_000183fdda,-68.837938,-0.478041,2.764788,-353,-2.451389,0,11,1,131,0,...,10,0.951389,137,7,0.027778,4,140,1512119000.0,1502099000.0,1519592000.0


In [28]:
t_cols = ['card_id']
t_cols.extend(['_'.join(a) for a in t.columns.tolist()])
t_cols

['card_id',
 'purchase_amount_sum',
 'purchase_amount_mean',
 'purchase_amount_max',
 'month_lag_sum',
 'month_lag_mean',
 'month_lag_max',
 'enc_category_2_1_sum',
 'enc_category_2_2_sum',
 'enc_category_2_3_sum',
 'enc_category_2_4_sum',
 'enc_category_2_5_sum',
 'enc_category_3_A_sum',
 'enc_category_3_B_sum',
 'enc_category_3_C_sum',
 'installments_sum',
 'installments_mean',
 'installments_max',
 'authorized_flag_mean',
 'authorized_flag_count_authorized_one',
 'authorized_flag_count_authorized_zero',
 'category_1_mean',
 'category_1_count_category_1_one',
 'category_1_count_category_1_zero',
 'enc_purchase_date_mean',
 'enc_purchase_date_min',
 'enc_purchase_date_max']

In [29]:
t.reset_index(inplace=True)
t.columns = t_cols

In [30]:
t.head()

Unnamed: 0,card_id,purchase_amount_sum,purchase_amount_mean,purchase_amount_max,month_lag_sum,month_lag_mean,month_lag_max,enc_category_2_1_sum,enc_category_2_2_sum,enc_category_2_3_sum,...,installments_max,authorized_flag_mean,authorized_flag_count_authorized_one,authorized_flag_count_authorized_zero,category_1_mean,category_1_count_category_1_one,category_1_count_category_1_zero,enc_purchase_date_mean,enc_purchase_date_min,enc_purchase_date_max
0,C_ID_00007093c1,-76.845041,-0.515739,1.507069,-872,-5.852349,0,28,0,120,...,6,0.765101,114,35,0.187919,28,121,1503054000.0,1487081000.0,1519708000.0
1,C_ID_0001238066,-72.447201,-0.589002,0.768095,-223,-1.813008,0,103,0,0,...,10,0.97561,120,3,0.01626,2,121,1513977000.0,1506638000.0,1519748000.0
2,C_ID_0001506ef0,-34.601879,-0.524271,1.493545,-319,-4.833333,0,2,0,64,...,1,0.939394,62,4,0.0,0,66,1505646000.0,1484411000.0,1518871000.0
3,C_ID_0001793786,-36.786013,-0.170306,4.554145,-719,-3.328704,0,125,76,15,...,1,0.875,189,27,0.009259,2,214,1499412000.0,1484994000.0,1509481000.0
4,C_ID_000183fdda,-68.837938,-0.478041,2.764788,-353,-2.451389,0,11,1,131,...,10,0.951389,137,7,0.027778,4,140,1512119000.0,1502099000.0,1519592000.0


In [32]:
t.to_csv('all_new/treated_historical_transactions.csv', index = False)

In [33]:
# we dnt need historical transaction so deleting it to conserve memory
del historical_transaction_data

In [36]:
historical_card_ids = set(t.card_id.values.tolist())
len(historical_card_ids)

325540

In [66]:
train_card_ids = set(train_data.card_id.values.tolist())
test_card_ids = set(test_data.card_id.values.tolist())

# all train and test card ids are present in historical transactions
print('no of train card ids present in historical: ',len(historical_card_ids.intersection(train_card_ids)), \
      'no of train card ids: ',len(train_card_ids))

print('no of train card ids present in historical: ',len(historical_card_ids.intersection(test_card_ids)), \
      'no of train card ids: ',len(test_card_ids))

no of train card ids present in historical:  201917 no of train card ids:  201917
no of train card ids present in historical:  123623 no of train card ids:  123623


In [27]:
treated_new_transaction_path = 'all_new/treated_new_transaction.csv'
treated_new_transaction_data = panda.read_csv(treated_new_transaction_path)


treated_historical_transaction_data = panda.read_csv(treated_historical_transaction_path)




In [32]:
new_transaction_card_ids = set(treated_new_transaction_data.card_id.values.tolist())
historical_transaction_card_ids = set(treated_historical_transaction_data.card_id.values.tolist())
train_card_ids = set(train_data.card_id.values.tolist())
test_card_ids = set(test_data.card_id.values.tolist())



In [34]:

# all train and test card ids are present in historical transactions
# some train and test card ids are present in new transactions


print('no of train card ids present in new transaction: ',len(new_transaction_card_ids.intersection(train_card_ids)), \
      'no of train card ids: ',len(train_card_ids))

print('no of train card ids present in new transaction: ',len(new_transaction_card_ids.intersection(test_card_ids)), \
      'no of train card ids: ',len(test_card_ids))



print('no of train card ids present in history transaction: ',len(historical_transaction_card_ids.intersection(train_card_ids)), \
      'no of train card ids: ',len(train_card_ids))

print('no of train card ids present in history transaction: ',len(historical_transaction_card_ids.intersection(test_card_ids)), \
      'no of train card ids: ',len(test_card_ids))

no of train card ids present in new transaction:  179986 no of train card ids:  201917
no of train card ids present in new transaction:  110015 no of train card ids:  123623
no of train card ids present in history transaction:  201917 no of train card ids:  201917
no of train card ids present in history transaction:  123623 no of train card ids:  123623


In [35]:
len(treated_new_transaction_data), len(treated_historical_transaction_data)

(290001, 325540)

In [36]:
new_transaction_data = panda.read_csv('all/new_merchant_transactions.csv')
len(set(new_transaction_data.card_id.values.tolist()).intersection(set(train_data.card_id.values.tolist())))

179986

In [37]:
historical_transaction_data = panda.read_csv('all/historical_transactions.csv')

In [38]:
all_data = panda.concat([new_transaction_data, historical_transaction_data], axis =0)

In [39]:
del historical_transaction_data, treated_new_transaction_data,treated_historical_transaction_data,new_transaction_data

In [40]:
len(all_data), len(all_data.columns.tolist())

(31075392, 14)

In [42]:
## lets deal with the easy categorical data first: authorized_flag, category_1,category_2,category_3
all_data.authorized_flag.isnull().sum()
all_data['authorized_flag'] = all_data.authorized_flag.map({'Y':1,'N':0})

In [44]:
all_data.category_1.isnull().sum()
# all_data.category_1.value_counts()
all_data['category_1'] = all_data.category_1.map({'Y':1, 'N':0})

In [46]:
all_data.category_2.isnull().sum()
##quite a few null values in category 2
# all_data.category_2.value_counts()
most_appearing = all_data.category_2.value_counts().idxmax()
all_data.category_2.fillna(value= most_appearing, inplace=True)
all_data.category_2.isnull().sum()

0

In [47]:
all_data.category_2.value_counts()

1.0    19000050
3.0     4201320
5.0     3985181
4.0     2796643
2.0     1092198
Name: category_2, dtype: int64

In [48]:
all_data['enc_category_2_1'] = (all_data.category_2==1) + 0
all_data['enc_category_2_2'] = (all_data.category_2==2) + 0
all_data['enc_category_2_3'] = (all_data.category_2==3) + 0
all_data['enc_category_2_4'] = (all_data.category_2==4) + 0
all_data['enc_category_2_5'] = (all_data.category_2==5) + 0

all_data.drop(columns = ['category_2'], inplace=True)
all_data.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,state_id,subsector_id,enc_category_2_1,enc_category_2_2,enc_category_2_3,enc_category_2_4,enc_category_2_5
0,1,C_ID_415bb3a509,107,0,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,9,19,1,0,0,0,0
1,1,C_ID_415bb3a509,140,0,1,B,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,9,19,1,0,0,0,0
2,1,C_ID_415bb3a509,330,0,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,9,14,1,0,0,0,0
3,1,C_ID_415bb3a509,-1,1,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,-1,8,1,0,0,0,0
4,1,C_ID_ef55cf8d4b,-1,1,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,-1,29,1,0,0,0,0


In [50]:
all_data.category_3.isnull().sum()
## again quite a few null values for category
most_appearing = all_data.category_3.value_counts().idxmax()
all_data.category_3.fillna(value = most_appearing, inplace = True)
all_data.category_3.value_counts(),all_data.category_3.isnull().sum()

(A    16568072
 B    12513700
 C     1993620
 Name: category_3, dtype: int64, 0)

In [51]:
all_data['enc_category_3_A'] = (all_data.category_3=='A') +0
all_data['enc_category_3_B'] = (all_data.category_3=='B') +0
all_data['enc_category_3_C'] = (all_data.category_3=='C') +0
all_data.drop(columns = ['category_3'], inplace = True)

In [52]:
all_data.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,state_id,subsector_id,enc_category_2_1,enc_category_2_2,enc_category_2_3,enc_category_2_4,enc_category_2_5,enc_category_3_A,enc_category_3_B,enc_category_3_C
0,1,C_ID_415bb3a509,107,0,1,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,9,19,1,0,0,0,0,0,1,0
1,1,C_ID_415bb3a509,140,0,1,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,9,19,1,0,0,0,0,0,1,0
2,1,C_ID_415bb3a509,330,0,1,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,9,14,1,0,0,0,0,0,1,0
3,1,C_ID_415bb3a509,-1,1,1,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,-1,8,1,0,0,0,0,0,1,0
4,1,C_ID_ef55cf8d4b,-1,1,1,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,-1,29,1,0,0,0,0,0,1,0


In [53]:
all_data['enc_purchase_date'] = panda.DatetimeIndex(all_data.purchase_date).astype(np.int64)*1e-9
all_data.drop(columns = ['purchase_date'], inplace = True)


In [54]:
all_data.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,merchant_category_id,merchant_id,month_lag,purchase_amount,state_id,subsector_id,enc_category_2_1,enc_category_2_2,enc_category_2_3,enc_category_2_4,enc_category_2_5,enc_category_3_A,enc_category_3_B,enc_category_3_C,enc_purchase_date
0,1,C_ID_415bb3a509,107,0,1,307,M_ID_b0c793002c,1,-0.557574,9,19,1,0,0,0,0,0,1,0,1520780000.0
1,1,C_ID_415bb3a509,140,0,1,307,M_ID_88920c89e8,1,-0.56958,9,19,1,0,0,0,0,0,1,0,1521486000.0
2,1,C_ID_415bb3a509,330,0,1,507,M_ID_ad5237ef6b,2,-0.551037,9,14,1,0,0,0,0,0,1,0,1524752000.0
3,1,C_ID_415bb3a509,-1,1,1,661,M_ID_9e84cda3b1,1,-0.671925,-1,8,1,0,0,0,0,0,1,0,1520416000.0
4,1,C_ID_ef55cf8d4b,-1,1,1,166,M_ID_3c86fa3831,1,-0.659904,-1,29,1,0,0,0,0,0,1,0,1521753000.0


In [55]:
## we will take the 
## sum,mean,max of purchase amount, 
## sum,mean,max of month lag by card_id
## mean of authorized_flag,
## num of authorized ==1
## num of authorized ==0
## sum, mean, max of installments
## mean of category_1
## sum of enc_category_2* and enc_category_3*

def count_authorized_one(x):
    return x.eq(1).sum()
def count_authorized_zero(x):
    return x.eq(0).sum()
def count_category_1_one(x):
    return x.eq(1).sum()
def count_category_1_zero(x):
    return x.eq(0).sum()


aggregate = {
    
'purchase_amount' : ['sum', 'mean','max'] ,
'month_lag' : ['sum', 'mean','max'] ,
'enc_category_2_1':['sum'],
'enc_category_2_2':['sum'],
'enc_category_2_3':['sum'],
'enc_category_2_4':['sum'],
'enc_category_2_5':['sum'],    
'enc_category_3_A':['sum'],
'enc_category_3_B':['sum'],
'enc_category_3_C':['sum'],
'installments': ['sum','mean','max'],
'authorized_flag': ['mean', count_authorized_one, count_authorized_zero],
'category_1' :['mean', count_category_1_one,count_category_1_zero],
'enc_purchase_date' : ['mean','min','max']
    
}



In [56]:
groupby = all_data.groupby(['card_id'])
t = groupby.agg(aggregate)

In [57]:
t_cols = ['card_id']
t_cols.extend(['_'.join(a) for a in t.columns.tolist()])
t_cols

['card_id',
 'purchase_amount_sum',
 'purchase_amount_mean',
 'purchase_amount_max',
 'month_lag_sum',
 'month_lag_mean',
 'month_lag_max',
 'enc_category_2_1_sum',
 'enc_category_2_2_sum',
 'enc_category_2_3_sum',
 'enc_category_2_4_sum',
 'enc_category_2_5_sum',
 'enc_category_3_A_sum',
 'enc_category_3_B_sum',
 'enc_category_3_C_sum',
 'installments_sum',
 'installments_mean',
 'installments_max',
 'authorized_flag_mean',
 'authorized_flag_count_authorized_one',
 'authorized_flag_count_authorized_zero',
 'category_1_mean',
 'category_1_count_category_1_one',
 'category_1_count_category_1_zero',
 'enc_purchase_date_mean',
 'enc_purchase_date_min',
 'enc_purchase_date_max']

In [59]:
t.reset_index(inplace = True)
t.columns = t_cols

In [60]:
t.head()

Unnamed: 0,card_id,purchase_amount_sum,purchase_amount_mean,purchase_amount_max,month_lag_sum,month_lag_mean,month_lag_max,enc_category_2_1_sum,enc_category_2_2_sum,enc_category_2_3_sum,...,installments_max,authorized_flag_mean,authorized_flag_count_authorized_one,authorized_flag_count_authorized_zero,category_1_mean,category_1_count_category_1_one,category_1_count_category_1_zero,enc_purchase_date_mean,enc_purchase_date_min,enc_purchase_date_max
0,C_ID_00007093c1,-78.173565,-0.517706,1.507069,-868,-5.748344,2,29,0,121,...,6,0.768212,116,35,0.18543,28,123,1503319000.0,1487081000.0,1523291000.0
1,C_ID_0001238066,-87.297257,-0.585888,0.768095,-188,-1.261745,2,126,0,0,...,10,0.979866,146,3,0.026846,4,145,1515435000.0,1506638000.0,1525118000.0
2,C_ID_0001506ef0,-36.049233,-0.530136,1.493545,-317,-4.661765,1,2,0,66,...,1,0.941176,64,4,0.0,0,68,1506111000.0,1484411000.0,1521710000.0
3,C_ID_0001793786,-37.015633,-0.149861,4.554145,-678,-2.744939,2,142,84,20,...,1,0.890688,220,27,0.008097,2,245,1501033000.0,1484994000.0,1514742000.0
4,C_ID_000183fdda,-75.428717,-0.486637,2.764788,-339,-2.187097,2,11,1,142,...,10,0.954839,148,7,0.025806,4,151,1512790000.0,1502099000.0,1525100000.0


In [62]:
merged_train_data = t.merge(train_data, on='card_id', how='left')

In [65]:
merged_train_data.isnull().sum()
merged_train_data.dropna(inplace= True) # drop rows if any columns contains nan

In [66]:
len(merged_train_data), len(train_data)

(201917, 201917)

In [67]:
merged_test_data = t.merge(test_data, on ='card_id', how='left')

In [69]:
merged_test_data.isnull().sum()
merged_test_data.dropna(inplace= True) # drop rows if any columns contains nan

In [70]:
len(merged_test_data), len(test_data)

(123623, 123623)

In [71]:
merged_test_data.head()

Unnamed: 0,card_id,purchase_amount_sum,purchase_amount_mean,purchase_amount_max,month_lag_sum,month_lag_mean,month_lag_max,enc_category_2_1_sum,enc_category_2_2_sum,enc_category_2_3_sum,...,category_1_count_category_1_zero,enc_purchase_date_mean,enc_purchase_date_min,enc_purchase_date_max,enc_feature_1_1,enc_feature_1_2,enc_feature_1_3,enc_feature_1_4,enc_feature_1_5,months_passed
1,C_ID_0001238066,-87.297257,-0.585888,0.768095,-188,-1.261745,2,126,0,0,...,145,1515435000.0,1506638000.0,1525118000.0,0.0,0.0,0.0,1.0,0.0,16.0
3,C_ID_0001793786,-37.015633,-0.149861,4.554145,-678,-2.744939,2,142,84,20,...,245,1501033000.0,1484994000.0,1514742000.0,0.0,0.0,0.0,0.0,1.0,24.0
5,C_ID_00024e244b,-41.552753,-0.593611,2.258395,-444,-6.342857,0,3,0,67,...,70,1501653000.0,1483370000.0,1517910000.0,0.0,1.0,0.0,0.0,0.0,29.0
6,C_ID_0002709b5a,-49.678369,-0.653663,-0.336684,-312,-4.105263,2,8,53,0,...,70,1507859000.0,1484314000.0,1525075000.0,0.0,0.0,0.0,0.0,1.0,41.0
8,C_ID_000298032a,-16.634288,-0.53659,2.577843,-121,-3.903226,2,31,0,0,...,31,1505880000.0,1486988000.0,1522340000.0,0.0,1.0,0.0,0.0,0.0,25.0


In [72]:
merged_test_data.to_csv('all_new/merged_test_data.csv', index=False)
merged_train_data.to_csv('all_new/merged_train_data.csv', index=False)