# RE RE FEATURE ENGINEERING OF DATA

In [25]:
import numpy as np
import pandas as panda
from dateutil import relativedelta
import datetime

In [49]:
train_data_path = 'all_new/train.csv'
test_data_path = 'all_new/test.csv'
new_transaction_path = 'all_new/new_merchant_transactions.csv'
historical_transaction_path = 'all_new/historical_transactions.csv'
new_treated_transaction_path = 'all_new/treated_new_transaction.csv'
treated_historical_transaction_path = 'all_new/treated_historical_transactions.csv'



In [27]:
train_data = panda.read_csv(train_data_path)
train_data_types = train_data.dtypes
train_data_columns ={}
for index, name in enumerate(train_data.columns.tolist()):
    train_data_columns[name] = train_data_types[index]
train_data_columns

{'first_active_month': dtype('O'),
 'card_id': dtype('O'),
 'feature_1': dtype('int64'),
 'feature_2': dtype('int64'),
 'feature_3': dtype('int64'),
 'target': dtype('float64')}

In [28]:
del train_data

In [29]:

## simply passed the columns to reduce memory footprint
train_data = panda.read_csv(train_data_path, dtype = train_data_columns)

In [30]:
train_data.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-06,C_ID_92a2005557,5,2,1,-0.820283
1,2017-01,C_ID_3d0044924f,4,1,0,0.392913
2,2016-08,C_ID_d639edf6cd,2,2,0,0.688056
3,2017-09,C_ID_186d6a6901,4,3,0,0.142495
4,2017-11,C_ID_cdbd2c0db2,1,3,0,-0.159749


In [31]:
test_data = panda.read_csv(test_data_path)
test_data_types = test_data.dtypes
test_data_columns ={}
for index, name in enumerate(test_data.columns.tolist()):
    test_data_columns[name] = test_data_types[index]
test_data_columns

{'first_active_month': dtype('O'),
 'card_id': dtype('O'),
 'feature_1': dtype('int64'),
 'feature_2': dtype('int64'),
 'feature_3': dtype('int64')}

In [32]:
del test_data

In [33]:

## simply passed the columns to reduce memory footprint
test_data = panda.read_csv(test_data_path, dtype = test_data_columns)

In [34]:
test_data.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3
0,2017-04,C_ID_0ab67a22ab,3,3,1
1,2017-01,C_ID_130fd0cbdd,2,3,0
2,2017-08,C_ID_b709037bc5,5,1,1
3,2017-12,C_ID_d27d835a9f,2,1,0
4,2015-12,C_ID_2b5e3df5c2,5,1,1


In [35]:
print(train_data.feature_1.value_counts(),'\n',test_data.feature_1.value_counts())

3    73573
2    55797
5    40625
4    19885
1    12037
Name: feature_1, dtype: int64 
 3    44719
2    34115
5    25051
4    12332
1     7406
Name: feature_1, dtype: int64


In [36]:
train_data['enc_feature_1_1'] = (train_data.feature_1 ==1) + 0
train_data['enc_feature_1_2'] = (train_data.feature_1 ==2) + 0
train_data['enc_feature_1_3'] = (train_data.feature_1 ==3) + 0
train_data['enc_feature_1_4'] = (train_data.feature_1 ==4) + 0
train_data['enc_feature_1_5'] = (train_data.feature_1 ==5) + 0


test_data['enc_feature_1_1'] = (test_data.feature_1 ==1) + 0
test_data['enc_feature_1_2'] = (test_data.feature_1 ==2) + 0
test_data['enc_feature_1_3'] = (test_data.feature_1 ==3) + 0
test_data['enc_feature_1_4'] = (test_data.feature_1 ==4) + 0
test_data['enc_feature_1_5'] = (test_data.feature_1 ==5) + 0

## at this point we have label encoded our categorical variables feature1,2,3. we will drop the feature columns , again to
## reduce memory footprint

In [37]:
test_data.drop(columns = ['feature_1','feature_2','feature_3'], inplace = True)
train_data.drop(columns = ['feature_1','feature_2','feature_3'], inplace = True)

In [38]:
train_data.head()

Unnamed: 0,first_active_month,card_id,target,enc_feature_1_1,enc_feature_1_2,enc_feature_1_3,enc_feature_1_4,enc_feature_1_5
0,2017-06,C_ID_92a2005557,-0.820283,0,0,0,0,1
1,2017-01,C_ID_3d0044924f,0.392913,0,0,0,1,0
2,2016-08,C_ID_d639edf6cd,0.688056,0,1,0,0,0
3,2017-09,C_ID_186d6a6901,0.142495,0,0,0,1,0
4,2017-11,C_ID_cdbd2c0db2,-0.159749,1,0,0,0,0


In [39]:



def get_months_passed( given):
    
    current = datetime.datetime.now()
    given = datetime.datetime.strptime(given + '-01', '%Y-%m-%d')
    
    r = relativedelta.relativedelta(current, given)
    return r.months + r.years*12


    
given_date = '2015-12'

get_months_passed( given_date)   

37

In [40]:
train_data['months_passed'] = train_data.first_active_month.map(lambda x: get_months_passed(x))


In [41]:
test_data.first_active_month.isnull().sum()
# test_data['months_passed'] = test_data.first_active_month.map(lambda x: get_months_passed(x))

1

In [42]:
test_data.first_active_month.value_counts()

2017-09    8805
2017-10    8451
2017-08    8299
2017-11    7807
2017-07    7344
2017-12    6233
2017-06    6147
2017-01    5974
2016-12    5751
2017-05    5598
2017-04    5371
2017-03    5058
2017-02    4903
2016-11    4444
2016-10    3335
2016-08    3108
2016-09    2663
2016-07    2610
2016-06    2407
2016-05    2096
2016-04    1740
2015-12    1393
2016-03    1258
2016-01     996
2016-02     987
2015-11     955
2015-10     936
2015-08     882
2015-07     862
2015-09     818
           ... 
2014-03     141
2014-02      97
2013-12      95
2014-01      90
2013-11      80
2013-10      80
2013-09      75
2013-08      73
2013-07      72
2013-06      56
2013-05      44
2013-04      40
2013-03      34
2018-01      28
2013-02      28
2012-08      21
2012-11      18
2012-09      18
2012-12      16
2012-07      15
2012-06      14
2013-01      14
2012-10      13
2012-05      12
2012-04      11
2012-03      10
2012-02       4
2011-11       3
2011-12       2
2012-01       1
Name: first_active_month

In [44]:
# one particular test data has null frist active month. we replace it with most appearing date
most_appearing = test_data.first_active_month.value_counts().idxmax()
test_data.first_active_month.fillna(value = most_appearing, inplace= True)
test_data.first_active_month.isnull().sum()

0

In [45]:
test_data['months_passed'] = test_data.first_active_month.map(lambda x: get_months_passed(x))

In [46]:
train_data.drop(columns = ['first_active_month'], inplace = True)
test_data.drop(columns = ['first_active_month'], inplace = True)


In [47]:
train_data.head()

Unnamed: 0,card_id,target,enc_feature_1_1,enc_feature_1_2,enc_feature_1_3,enc_feature_1_4,enc_feature_1_5,months_passed
0,C_ID_92a2005557,-0.820283,0,0,0,0,1,19
1,C_ID_3d0044924f,0.392913,0,0,0,1,0,24
2,C_ID_d639edf6cd,0.688056,0,1,0,0,0,29
3,C_ID_186d6a6901,0.142495,0,0,0,1,0,16
4,C_ID_cdbd2c0db2,-0.159749,1,0,0,0,0,14


In [50]:
## deres nothing else we can do now...with this data
## lets look at new transactions first , since it is a smaller data sets

new_transaction_data = panda.read_csv(new_transaction_path)
new_transaction_data_types = new_transaction_data.dtypes
new_transaction_column_types = {}
for index, column_name in enumerate(new_transaction_data.columns.tolist()):
    new_transaction_column_types[column_name] = new_transaction_data_types[index]
del new_transaction_data
new_transaction_data = panda.read_csv(new_transaction_path, dtype = new_transaction_column_types)
new_transaction_data.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_415bb3a509,107,N,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,1.0,9,19
1,Y,C_ID_415bb3a509,140,N,1,B,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,1.0,9,19
2,Y,C_ID_415bb3a509,330,N,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,1.0,9,14
3,Y,C_ID_415bb3a509,-1,Y,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,,-1,8
4,Y,C_ID_ef55cf8d4b,-1,Y,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,,-1,29


In [51]:
## lets deal with the easy categorical data first: authorized_flag, category_1,category_2,category_3
new_transaction_data.authorized_flag.isnull().sum()
new_transaction_data['authorized_flag'] = new_transaction_data.authorized_flag.map({'Y':1,'N':0})

In [52]:
new_transaction_data.category_1.isnull().sum()
new_transaction_data.category_1.value_counts()
new_transaction_data['category_1'] = new_transaction_data.category_1.map({'Y':1, 'N':0})

In [53]:
new_transaction_data.category_2.isnull().sum()
##quite a few null values in category 2
new_transaction_data.category_2.value_counts()
most_appearing = new_transaction_data.category_2.value_counts().idxmax()
new_transaction_data.category_2.fillna(value= most_appearing, inplace=True)
new_transaction_data.category_2.value_counts()

1.0    1169987
3.0     289525
5.0     259266
4.0     178590
2.0      65663
Name: category_2, dtype: int64

In [54]:
new_transaction_data['enc_category_2_1'] = (new_transaction_data.category_2==1) + 0
new_transaction_data['enc_category_2_2'] = (new_transaction_data.category_2==2) + 0
new_transaction_data['enc_category_2_3'] = (new_transaction_data.category_2==3) + 0
new_transaction_data['enc_category_2_4'] = (new_transaction_data.category_2==4) + 0
new_transaction_data['enc_category_2_5'] = (new_transaction_data.category_2==5) + 0

new_transaction_data.drop(columns = ['category_2'], inplace=True)
new_transaction_data.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,state_id,subsector_id,enc_category_2_1,enc_category_2_2,enc_category_2_3,enc_category_2_4,enc_category_2_5
0,1,C_ID_415bb3a509,107,0,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,9,19,1,0,0,0,0
1,1,C_ID_415bb3a509,140,0,1,B,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,9,19,1,0,0,0,0
2,1,C_ID_415bb3a509,330,0,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,9,14,1,0,0,0,0
3,1,C_ID_415bb3a509,-1,1,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,-1,8,1,0,0,0,0
4,1,C_ID_ef55cf8d4b,-1,1,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,-1,29,1,0,0,0,0


In [55]:
new_transaction_data.category_3.isnull().sum()
## again quite a few null values for category
most_appearing = new_transaction_data.category_3.value_counts().idxmax()
new_transaction_data.category_3.fillna(value = most_appearing, inplace = True)
new_transaction_data.category_3.value_counts(),new_transaction_data.category_3.isnull().sum()

(A    978166
 B    836178
 C    148687
 Name: category_3, dtype: int64, 0)

In [56]:
new_transaction_data['enc_category_3_A'] = (new_transaction_data.category_3=='A') +0
new_transaction_data['enc_category_3_B'] = (new_transaction_data.category_3=='B') +0
new_transaction_data['enc_category_3_C'] = (new_transaction_data.category_3=='C') +0
new_transaction_data.drop(columns = ['category_3'], inplace = True)

In [57]:
new_transaction_data.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,state_id,subsector_id,enc_category_2_1,enc_category_2_2,enc_category_2_3,enc_category_2_4,enc_category_2_5,enc_category_3_A,enc_category_3_B,enc_category_3_C
0,1,C_ID_415bb3a509,107,0,1,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,9,19,1,0,0,0,0,0,1,0
1,1,C_ID_415bb3a509,140,0,1,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,9,19,1,0,0,0,0,0,1,0
2,1,C_ID_415bb3a509,330,0,1,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,9,14,1,0,0,0,0,0,1,0
3,1,C_ID_415bb3a509,-1,1,1,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,-1,8,1,0,0,0,0,0,1,0
4,1,C_ID_ef55cf8d4b,-1,1,1,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,-1,29,1,0,0,0,0,0,1,0


In [58]:
new_transaction_data['enc_purchase_date'] = panda.DatetimeIndex(new_transaction_data.purchase_date).astype(np.int64)*1e-9
new_transaction_data.drop(columns = ['purchase_date'], inplace = True)


In [66]:
new_transaction_data.city_id.isnull().sum()
new_transaction_data.city_id.value_counts()
most_appearing = new_transaction_data.city_id.value_counts().idxmax()
new_transaction_data.city_id.replace({-1:most_appearing}, inplace = True)
new_transaction_data.city_id.isnull().sum()

0

In [69]:
new_transaction_data.state_id.isnull().sum()
new_transaction_data.state_id.value_counts()
most_appearing = new_transaction_data.state_id.value_counts().idxmax()
new_transaction_data.state_id.replace({-1:most_appearing}, inplace = True)
new_transaction_data.state_id.value_counts()


9     844891
15    158304
16    139765
5     120158
19     87980
20     78953
22     65300
4      63532
21     60155
7      49342
11     46905
13     39432
3      38908
2      27814
12     27027
18     22139
1      17908
10     17273
23     13069
24     12825
8      12218
14     10326
17      8450
6        357
Name: state_id, dtype: int64

In [71]:
new_transaction_data.subsector_id.isnull().sum()
new_transaction_data.subsector_id.value_counts()
most_appearing = new_transaction_data.subsector_id.value_counts().idxmax()
new_transaction_data.subsector_id.replace({-1:most_appearing}, inplace = True)
new_transaction_data.subsector_id.value_counts()

37    340073
19    200528
27    194977
34    183700
33    172585
29    138039
16    119297
21     74176
1      66913
7      54145
18     42079
20     38433
36     36621
30     31443
15     29772
25     23805
17     23233
39     21487
12     21351
32     16673
22     16460
2      15851
31     15000
8      14979
38     13057
41     12848
10      9849
9       7590
3       6237
35      5233
4       4302
13      2419
26      2221
24      1907
14      1488
23      1423
5       1316
40       953
11       560
28         8
Name: subsector_id, dtype: int64

In [72]:
## we will take the 
## sum,mean,max of purchase amount, 
## sum,mean,max of month lag by card_id
## mean of authorized_flag,
## num of authorized ==1
## num of authorized ==0
## sum, mean, max of installments
## mean of category_1
## sum of enc_category_2* and enc_category_3*

def count_authorized_one(x):
    return x.eq(1).sum()
def count_authorized_zero(x):
    return x.eq(0).sum()
def count_category_1_one(x):
    return x.eq(1).sum()
def count_category_1_zero(x):
    return x.eq(0).sum()

def count_of_unique_cities(x):
    return x.nunique()

def count_of_unique_subsector_id(x):
    return x.nunique()

def count_of_unique_state_id(x):
    return x.nunique()

aggregate = {
    
'purchase_amount' : ['sum', 'mean','max'] ,
'month_lag' : ['sum', 'mean','max'] ,
'enc_category_2_1':['sum'],
'enc_category_2_2':['sum'],
'enc_category_2_3':['sum'],
'enc_category_2_4':['sum'],
'enc_category_2_5':['sum'],    
'enc_category_3_A':['sum'],
'enc_category_3_B':['sum'],
'enc_category_3_C':['sum'],
'installments': ['sum','mean','max'],
'authorized_flag': ['mean', count_authorized_one, count_authorized_zero],
'category_1' :['mean', count_category_1_one,count_category_1_zero],
'enc_purchase_date' : ['mean','min','max'],
'city_id' : [count_of_unique_cities],
'state_id': [count_of_unique_state_id],
'subsector_id' : [count_of_unique_subsector_id]
    
}



In [73]:
group_by = new_transaction_data.groupby(['card_id'])

t = group_by.agg(aggregate)

In [79]:
t.head()

Unnamed: 0_level_0,purchase_amount,purchase_amount,purchase_amount,month_lag,month_lag,month_lag,enc_category_2_1,enc_category_2_2,enc_category_2_3,enc_category_2_4,...,authorized_flag,category_1,category_1,category_1,enc_purchase_date,enc_purchase_date,enc_purchase_date,city_id,state_id,subsector_id
Unnamed: 0_level_1,sum,mean,max,sum,mean,max,sum,sum,sum,sum,...,count_authorized_zero,mean,count_category_1_one,count_category_1_zero,mean,min,max,count_of_unique_cities,count_of_unique_state_id,count_of_unique_subsector_id
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
C_ID_00007093c1,-1.328524,-0.664262,-0.656749,4,2.0,2,1,0,1,0,...,0,0.0,0,2,1523023000.0,1522754000.0,1523291000.0,2,2,2
C_ID_0001238066,-14.850055,-0.571156,-0.078318,35,1.346154,2,23,0,0,0,...,0,0.076923,2,24,1522335000.0,1519923000.0,1525118000.0,7,3,9
C_ID_0001506ef0,-1.447354,-0.723677,-0.715352,2,1.0,1,0,0,2,0,...,0,0.0,0,2,1521474000.0,1521239000.0,1521710000.0,1,1,2
C_ID_0001793786,-0.22962,-0.007407,3.129932,41,1.322581,2,17,8,5,0,...,0,0.0,0,31,1512327000.0,1510761000.0,1514742000.0,7,4,14
C_ID_000183fdda,-6.590778,-0.599162,-0.10768,14,1.272727,2,0,0,11,0,...,0,0.0,0,11,1521578000.0,1519994000.0,1525100000.0,2,2,6


In [74]:
t.columns.tolist()

t_cols = ['card_id']
new_names = ['_'.join(list(i)) for i in t.columns.tolist() ]


In [75]:
t_cols.extend(['new_'+i for i in new_names])
t_cols

['card_id',
 'new_purchase_amount_sum',
 'new_purchase_amount_mean',
 'new_purchase_amount_max',
 'new_month_lag_sum',
 'new_month_lag_mean',
 'new_month_lag_max',
 'new_enc_category_2_1_sum',
 'new_enc_category_2_2_sum',
 'new_enc_category_2_3_sum',
 'new_enc_category_2_4_sum',
 'new_enc_category_2_5_sum',
 'new_enc_category_3_A_sum',
 'new_enc_category_3_B_sum',
 'new_enc_category_3_C_sum',
 'new_installments_sum',
 'new_installments_mean',
 'new_installments_max',
 'new_authorized_flag_mean',
 'new_authorized_flag_count_authorized_one',
 'new_authorized_flag_count_authorized_zero',
 'new_category_1_mean',
 'new_category_1_count_category_1_one',
 'new_category_1_count_category_1_zero',
 'new_enc_purchase_date_mean',
 'new_enc_purchase_date_min',
 'new_enc_purchase_date_max',
 'new_city_id_count_of_unique_cities',
 'new_state_id_count_of_unique_state_id',
 'new_subsector_id_count_of_unique_subsector_id']

In [81]:
len(t.columns.tolist()), len(t_cols)

(30, 30)

In [82]:
t.columns  = t_cols

In [80]:
t.reset_index(inplace=True)

t.head()

Unnamed: 0_level_0,card_id,purchase_amount,purchase_amount,purchase_amount,month_lag,month_lag,month_lag,enc_category_2_1,enc_category_2_2,enc_category_2_3,...,authorized_flag,category_1,category_1,category_1,enc_purchase_date,enc_purchase_date,enc_purchase_date,city_id,state_id,subsector_id
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,max,sum,mean,max,sum,sum,sum,...,count_authorized_zero,mean,count_category_1_one,count_category_1_zero,mean,min,max,count_of_unique_cities,count_of_unique_state_id,count_of_unique_subsector_id
0,C_ID_00007093c1,-1.328524,-0.664262,-0.656749,4,2.0,2,1,0,1,...,0,0.0,0,2,1523023000.0,1522754000.0,1523291000.0,2,2,2
1,C_ID_0001238066,-14.850055,-0.571156,-0.078318,35,1.346154,2,23,0,0,...,0,0.076923,2,24,1522335000.0,1519923000.0,1525118000.0,7,3,9
2,C_ID_0001506ef0,-1.447354,-0.723677,-0.715352,2,1.0,1,0,0,2,...,0,0.0,0,2,1521474000.0,1521239000.0,1521710000.0,1,1,2
3,C_ID_0001793786,-0.22962,-0.007407,3.129932,41,1.322581,2,17,8,5,...,0,0.0,0,31,1512327000.0,1510761000.0,1514742000.0,7,4,14
4,C_ID_000183fdda,-6.590778,-0.599162,-0.10768,14,1.272727,2,0,0,11,...,0,0.0,0,11,1521578000.0,1519994000.0,1525100000.0,2,2,6


In [83]:
del new_transaction_data

In [84]:
t.head()

Unnamed: 0,card_id,new_purchase_amount_sum,new_purchase_amount_mean,new_purchase_amount_max,new_month_lag_sum,new_month_lag_mean,new_month_lag_max,new_enc_category_2_1_sum,new_enc_category_2_2_sum,new_enc_category_2_3_sum,...,new_authorized_flag_count_authorized_zero,new_category_1_mean,new_category_1_count_category_1_one,new_category_1_count_category_1_zero,new_enc_purchase_date_mean,new_enc_purchase_date_min,new_enc_purchase_date_max,new_city_id_count_of_unique_cities,new_state_id_count_of_unique_state_id,new_subsector_id_count_of_unique_subsector_id
0,C_ID_00007093c1,-1.328524,-0.664262,-0.656749,4,2.0,2,1,0,1,...,0,0.0,0,2,1523023000.0,1522754000.0,1523291000.0,2,2,2
1,C_ID_0001238066,-14.850055,-0.571156,-0.078318,35,1.346154,2,23,0,0,...,0,0.076923,2,24,1522335000.0,1519923000.0,1525118000.0,7,3,9
2,C_ID_0001506ef0,-1.447354,-0.723677,-0.715352,2,1.0,1,0,0,2,...,0,0.0,0,2,1521474000.0,1521239000.0,1521710000.0,1,1,2
3,C_ID_0001793786,-0.22962,-0.007407,3.129932,41,1.322581,2,17,8,5,...,0,0.0,0,31,1512327000.0,1510761000.0,1514742000.0,7,4,14
4,C_ID_000183fdda,-6.590778,-0.599162,-0.10768,14,1.272727,2,0,0,11,...,0,0.0,0,11,1521578000.0,1519994000.0,1525100000.0,2,2,6


In [85]:
t.to_csv('all_new/treated_new_transaction_1.csv', index = False)

In [7]:
##similar treatment for historical transaction, because i have seen from data dictionary that they both have same columns


In [86]:
historical_transaction_data = panda.read_csv(historical_transaction_path)
ht_tr_dtypes = historical_transaction_data.dtypes
ht_col_types = {}

for index, name in enumerate(historical_transaction_data.columns.tolist()):
    ht_col_types[name] = ht_tr_dtypes[index]

del historical_transaction_data



In [87]:
historical_transaction_data = panda.read_csv(historical_transaction_path, dtype = ht_col_types)
len(historical_transaction_data)

29112361

In [88]:
historical_transaction_data.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37


In [89]:
## lets deal with the easy categorical data first: authorized_flag, category_1,category_2,category_3
historical_transaction_data.authorized_flag.isnull().sum()
historical_transaction_data['authorized_flag'] = historical_transaction_data.authorized_flag.map({'Y':1,'N':0})

In [90]:
historical_transaction_data.category_1.isnull().sum()
historical_transaction_data.category_1.value_counts()
historical_transaction_data['category_1'] = historical_transaction_data.category_1.map({'Y':1, 'N':0})

In [91]:
historical_transaction_data.category_2.isnull().sum()
##quite a few null values in category 2
historical_transaction_data.category_2.value_counts()
most_appearing = historical_transaction_data.category_2.value_counts().idxmax()
historical_transaction_data.category_2.fillna(value= most_appearing, inplace=True)
historical_transaction_data.category_2.isnull().sum()

0

In [92]:
historical_transaction_data['enc_category_2_1'] = (historical_transaction_data.category_2==1) + 0
historical_transaction_data['enc_category_2_2'] = (historical_transaction_data.category_2==2) + 0
historical_transaction_data['enc_category_2_3'] = (historical_transaction_data.category_2==3) + 0
historical_transaction_data['enc_category_2_4'] = (historical_transaction_data.category_2==4) + 0
historical_transaction_data['enc_category_2_5'] = (historical_transaction_data.category_2==5) + 0

historical_transaction_data.drop(columns = ['category_2'], inplace=True)
historical_transaction_data.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,state_id,subsector_id,enc_category_2_1,enc_category_2_2,enc_category_2_3,enc_category_2_4,enc_category_2_5
0,1,C_ID_4e6213e9bc,88,0,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,16,37,1,0,0,0,0
1,1,C_ID_4e6213e9bc,88,0,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,16,16,1,0,0,0,0
2,1,C_ID_4e6213e9bc,88,0,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,16,37,1,0,0,0,0
3,1,C_ID_4e6213e9bc,88,0,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,16,34,1,0,0,0,0
4,1,C_ID_4e6213e9bc,88,0,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,16,37,1,0,0,0,0


In [93]:
historical_transaction_data.category_3.isnull().sum()
## again quite a few null values for category
most_appearing = historical_transaction_data.category_3.value_counts().idxmax()
historical_transaction_data.category_3.fillna(value = most_appearing, inplace = True)
historical_transaction_data.category_3.value_counts(),historical_transaction_data.category_3.isnull().sum()

(A    15589906
 B    11677522
 C     1844933
 Name: category_3, dtype: int64, 0)

In [94]:
historical_transaction_data['enc_category_3_A'] = (historical_transaction_data.category_3=='A') +0
historical_transaction_data['enc_category_3_B'] = (historical_transaction_data.category_3=='B') +0
historical_transaction_data['enc_category_3_C'] = (historical_transaction_data.category_3=='C') +0
historical_transaction_data.drop(columns = ['category_3'], inplace = True)

In [95]:
historical_transaction_data.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,state_id,subsector_id,enc_category_2_1,enc_category_2_2,enc_category_2_3,enc_category_2_4,enc_category_2_5,enc_category_3_A,enc_category_3_B,enc_category_3_C
0,1,C_ID_4e6213e9bc,88,0,0,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,16,37,1,0,0,0,0,1,0,0
1,1,C_ID_4e6213e9bc,88,0,0,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,16,16,1,0,0,0,0,1,0,0
2,1,C_ID_4e6213e9bc,88,0,0,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,16,37,1,0,0,0,0,1,0,0
3,1,C_ID_4e6213e9bc,88,0,0,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,16,34,1,0,0,0,0,1,0,0
4,1,C_ID_4e6213e9bc,88,0,0,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,16,37,1,0,0,0,0,1,0,0


In [96]:
historical_transaction_data['enc_purchase_date'] = panda.DatetimeIndex(historical_transaction_data.purchase_date).astype(np.int64)*1e-9
historical_transaction_data.drop(columns = ['purchase_date'], inplace = True)


In [98]:
historical_transaction_data.city_id.value_counts()
most_appearing = historical_transaction_data.city_id.value_counts().idxmax()
historical_transaction_data.city_id.replace({-1:most_appearing}, inplace = True)
historical_transaction_data.city_id.value_counts()

69     7314065
19     1151077
158    1113013
17      971714
331     714414
143     668769
88      659328
137     603023
333     561144
87      444940
212     406196
57      382232
149     359540
25      356757
233     355108
117     322414
160     317617
213     248371
277     247294
286     231877
76      216048
4       212403
107     210182
53      209668
20      207650
291     199983
140     190359
344     177694
161     168745
51      164222
        ...   
32        8382
81        8278
54        8201
235       7918
326       7912
293       7804
327       7499
112       7461
84        7367
237       7285
227       7233
254       7143
71        6781
204       6745
306       6128
221       6096
130       5657
122       5458
205       5385
215       4791
89        4760
178       4430
263       4003
207       3905
315       3845
319       1645
92         758
273        620
316        149
185         45
Name: city_id, Length: 307, dtype: int64

In [100]:
historical_transaction_data.subsector_id.value_counts()
most_appearing = historical_transaction_data.subsector_id.value_counts().idxmax()
historical_transaction_data.subsector_id.replace({-1:most_appearing}, inplace = True)
historical_transaction_data.subsector_id.value_counts()

33    5596398
37    4531755
19    3147440
29    2813095
34    2759431
27    1658301
16    1651384
21     837612
7      711977
1      695263
15     665634
8      541510
18     395679
36     331696
20     299301
30     292335
39     259206
25     259053
41     196118
12     192067
22     179252
31     142945
17     132915
32     129516
2      110596
38      89005
3       87853
10      81954
9       77760
4       59102
35      39269
26      25140
13      23388
5       23171
14      21825
23      19724
24      18134
40      13424
11       2009
28        124
Name: subsector_id, dtype: int64

In [102]:
historical_transaction_data.state_id.value_counts()
most_appearing = historical_transaction_data.state_id.value_counts().idxmax()
historical_transaction_data.state_id.replace({-1:most_appearing}, inplace = True)
historical_transaction_data.state_id.value_counts()

9     13221745
16     2143420
15     2079255
5      1655124
19     1204287
20     1141197
22      971714
21      929594
4       840820
7       640712
13      634006
11      631127
3       542220
12      385643
18      384876
2       371728
1       247753
10      222463
23      206011
24      205952
14      171513
8       157829
17      116139
6         7233
Name: state_id, dtype: int64

In [103]:
## we will take the 
## sum,mean,max of purchase amount, 
## sum,mean,max of month lag by card_id
## mean of authorized_flag,
## num of authorized ==1
## num of authorized ==0
## sum, mean, max of installments
## mean of category_1
## sum of enc_category_2* and enc_category_3*

def count_authorized_one(x):
    return x.eq(1).sum()
def count_authorized_zero(x):
    return x.eq(0).sum()
def count_category_1_one(x):
    return x.eq(1).sum()
def count_category_1_zero(x):
    return x.eq(0).sum()

def count_of_unique_cities(x):
    return x.nunique()

def count_of_unique_subsector_id(x):
    return x.nunique()

def count_of_unique_state_id(x):
    return x.nunique()

aggregate = {
    
'purchase_amount' : ['sum', 'mean','max'] ,
'month_lag' : ['sum', 'mean','max'] ,
'enc_category_2_1':['sum'],
'enc_category_2_2':['sum'],
'enc_category_2_3':['sum'],
'enc_category_2_4':['sum'],
'enc_category_2_5':['sum'],    
'enc_category_3_A':['sum'],
'enc_category_3_B':['sum'],
'enc_category_3_C':['sum'],
'installments': ['sum','mean','max'],
'authorized_flag': ['mean', count_authorized_one, count_authorized_zero],
'category_1' :['mean', count_category_1_one,count_category_1_zero],
'enc_purchase_date' : ['mean','min','max'],
'city_id' : [count_of_unique_cities],
'state_id': [count_of_unique_state_id],
'subsector_id' : [count_of_unique_subsector_id]
    
}





In [104]:
groupby = historical_transaction_data.groupby(['card_id'])
t = groupby.agg(aggregate)

In [105]:
t_cols = ['card_id']
new_cols = ['_'.join(a) for a in t.columns.tolist()]
t_cols.extend(['hist_'+a for a in new_cols])
len(t.columns.tolist()),len(t_cols)

(29, 30)

In [106]:

t_cols

['card_id',
 'hist_purchase_amount_sum',
 'hist_purchase_amount_mean',
 'hist_purchase_amount_max',
 'hist_month_lag_sum',
 'hist_month_lag_mean',
 'hist_month_lag_max',
 'hist_enc_category_2_1_sum',
 'hist_enc_category_2_2_sum',
 'hist_enc_category_2_3_sum',
 'hist_enc_category_2_4_sum',
 'hist_enc_category_2_5_sum',
 'hist_enc_category_3_A_sum',
 'hist_enc_category_3_B_sum',
 'hist_enc_category_3_C_sum',
 'hist_installments_sum',
 'hist_installments_mean',
 'hist_installments_max',
 'hist_authorized_flag_mean',
 'hist_authorized_flag_count_authorized_one',
 'hist_authorized_flag_count_authorized_zero',
 'hist_category_1_mean',
 'hist_category_1_count_category_1_one',
 'hist_category_1_count_category_1_zero',
 'hist_enc_purchase_date_mean',
 'hist_enc_purchase_date_min',
 'hist_enc_purchase_date_max',
 'hist_city_id_count_of_unique_cities',
 'hist_state_id_count_of_unique_state_id',
 'hist_subsector_id_count_of_unique_subsector_id']

In [107]:
t.reset_index(inplace=True)
t.columns = t_cols

In [108]:
t.head()

Unnamed: 0,card_id,hist_purchase_amount_sum,hist_purchase_amount_mean,hist_purchase_amount_max,hist_month_lag_sum,hist_month_lag_mean,hist_month_lag_max,hist_enc_category_2_1_sum,hist_enc_category_2_2_sum,hist_enc_category_2_3_sum,...,hist_authorized_flag_count_authorized_zero,hist_category_1_mean,hist_category_1_count_category_1_one,hist_category_1_count_category_1_zero,hist_enc_purchase_date_mean,hist_enc_purchase_date_min,hist_enc_purchase_date_max,hist_city_id_count_of_unique_cities,hist_state_id_count_of_unique_state_id,hist_subsector_id_count_of_unique_subsector_id
0,C_ID_00007093c1,-76.845041,-0.515739,1.507069,-872,-5.852349,0,28,0,120,...,35,0.187919,28,121,1503054000.0,1487081000.0,1519708000.0,4,3,13
1,C_ID_0001238066,-72.447201,-0.589002,0.768095,-223,-1.813008,0,103,0,0,...,3,0.01626,2,121,1513977000.0,1506638000.0,1519748000.0,17,5,17
2,C_ID_0001506ef0,-34.601879,-0.524271,1.493545,-319,-4.833333,0,2,0,64,...,4,0.0,0,66,1505646000.0,1484411000.0,1518871000.0,3,2,12
3,C_ID_0001793786,-36.786013,-0.170306,4.554145,-719,-3.328704,0,125,76,15,...,27,0.009259,2,214,1499412000.0,1484994000.0,1509481000.0,9,3,24
4,C_ID_000183fdda,-68.837938,-0.478041,2.764788,-353,-2.451389,0,11,1,131,...,7,0.027778,4,140,1512119000.0,1502099000.0,1519592000.0,8,6,21


In [109]:
t.to_csv('all_new/treated_historical_transactions_1.csv', index = False)