In [1]:
import pandas as panda

import numpy as np

import datetime, time

from matplotlib.pyplot import plot

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import seaborn as sns


from sklearn.linear_model import LinearRegression, RANSACRegressor, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline

In [2]:

## Location of the new-merchant transactions extract, relative to the notebook.
new_transaction_data_path = 'all/new_merchant_transactions.csv'
new_transaction_data = panda.read_csv(filepath_or_buffer=new_transaction_data_path)

## Peek at the first rows to confirm the file parsed as expected.
new_transaction_data.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_415bb3a509,107,N,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,1.0,9,19
1,Y,C_ID_415bb3a509,140,N,1,B,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,1.0,9,19
2,Y,C_ID_415bb3a509,330,N,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,1.0,9,14
3,Y,C_ID_415bb3a509,-1,Y,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,,-1,8
4,Y,C_ID_ef55cf8d4b,-1,Y,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,,-1,29


In [3]:
## Check for missing (NaN) values: first across every column, then for the two
## categorical columns on their own.
print(new_transaction_data.isnull().sum())
for categorical_column in ('category_2', 'category_3'):
    print(new_transaction_data[[categorical_column]].isnull().sum())
# print(new_transaction_data['merchant_id'].value_counts())

## category_2 and category_3 are encoded label values, so their missing entries will be
## replaced with the most frequently occurring value.

## merchant_id is a character value, so rows where it is empty will be skipped instead.

authorized_flag              0
card_id                      0
city_id                      0
category_1                   0
installments                 0
category_3               55922
merchant_category_id         0
merchant_id              26216
month_lag                    0
purchase_amount              0
purchase_date                0
category_2              111745
state_id                     0
subsector_id                 0
dtype: int64
category_2    111745
dtype: int64
category_3    55922
dtype: int64


In [4]:
## category_2 is a label-encoded column, so missing entries are imputed with its mode.
category_2_counts = new_transaction_data['category_2'].value_counts()
category_2_mode = category_2_counts.idxmax()
print(category_2_counts, 'num of null :', new_transaction_data['category_2'].isnull().sum(), ' max values: ', category_2_mode)

## Assign the filled column back to the frame. Calling fillna(..., inplace=True) on the
## attribute-accessed column is chained in-place mutation: pandas deprecates it and, with
## Copy-on-Write enabled, it no longer changes the parent DataFrame at all.
new_transaction_data['category_2'] = new_transaction_data['category_2'].fillna(category_2_mode)

## Should now be 0.
new_transaction_data['category_2'].isnull().sum()

1.0    1058242
3.0     289525
5.0     259266
4.0     178590
2.0      65663
Name: category_2, dtype: int64 num of null : 111745  max values:  1.0


0

In [5]:
## category_3 is a label-encoded column, so missing entries are imputed with its mode.
category_3_counts = new_transaction_data['category_3'].value_counts()
category_3_mode = category_3_counts.idxmax()
print(category_3_counts, 'num of null :', new_transaction_data['category_3'].isnull().sum(), ' max values: ', category_3_mode)

## Assign the filled column back to the frame. Calling fillna(..., inplace=True) on the
## attribute-accessed column is chained in-place mutation: pandas deprecates it and, with
## Copy-on-Write enabled, it no longer changes the parent DataFrame at all.
new_transaction_data['category_3'] = new_transaction_data['category_3'].fillna(category_3_mode)

## Should now be 0.
new_transaction_data['category_3'].isnull().sum()

A    922244
B    836178
C    148687
Name: category_3, dtype: int64 num of null : 55922  max values:  A


0

In [6]:
## NOTE(review): the merchant_id clean-up below is disabled, so rows with a missing
## merchant_id are kept (the null counts printed later still list 26216 for merchant_id).
## If it is re-enabled, np.NaN should become np.nan (the NaN alias was removed in NumPy 2.0).
# new_transaction_data['merchant_id'] = new_transaction_data.merchant_id.apply(lambda x: np.NaN if x=='' else x)

# print(len(new_transaction_data))

# new_transaction_data.dropna(subset=['merchant_id'], axis = 0, inplace = True)

In [7]:
## Total number of transaction rows currently held in the frame.
print(new_transaction_data.shape[0])

1963031


Now that we have dealt with the empty values, let's decide which values we want to add to our original data set.


1. I want a column for each card id that says how many times the card has been used, how many times it was approved, and how many times it was rejected.


2. I want another column stating the total value of the purchases made on that card.

 


In [8]:
## Goal: check whether every card id present in the training data also appears in
## new_transaction_data. This is done with an inner join followed by a length check;
## before comparing, the card ids in both data sets are upper-cased and stripped.

train_data_path = 'all/train.csv'
train_data = panda.read_csv(filepath_or_buffer=train_data_path)

train_preview = train_data.head()
print(train_preview)



  first_active_month          card_id  feature_1  feature_2  feature_3  \
0            2017-06  C_ID_92a2005557          5          2          1   
1            2017-01  C_ID_3d0044924f          4          1          0   
2            2016-08  C_ID_d639edf6cd          2          2          0   
3            2017-09  C_ID_186d6a6901          4          3          0   
4            2017-11  C_ID_cdbd2c0db2          1          3          0   

     target  
0 -0.820283  
1  0.392913  
2  0.688056  
3  0.142495  
4 -0.159749  


In [9]:
## Normalise card ids in both frames (upper-case, surrounding whitespace removed) so the
## join below compares like with like. The vectorised .str accessor replaces the per-row
## lambda: it passes missing values through as NaN instead of raising AttributeError.
train_data['card_id'] = train_data['card_id'].str.upper().str.strip()
new_transaction_data['card_id'] = new_transaction_data['card_id'].str.upper().str.strip()

## Number of distinct card ids in the training data, shown as a 1-tuple shape.
np.unique(train_data.card_id.values).shape

(201917,)

In [10]:
## Inner-join the two card_id columns; the result has one row per transaction whose
## card_id also occurs in the training data.
card_key = ['card_id']
merged_data = panda.merge(
    new_transaction_data[card_key],
    train_data[card_key],
    how='inner',
    on='card_id',
)
merged_data.shape

(1219685, 1)

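We can see that, out of 201917 cards in the training data, transaction data is present for only 179400. (Note that the merged shape above, 1219685, counts matching transaction rows, not distinct cards.)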

1. We will calculate the total amount of purchases for these cards.

2. The number of denied transactions for these cards.

3. The number of approved transactions for these cards.

In [11]:
## Number of transactions per card: one row per card_id with a no_of_transaction column.
## groupby(...).size() counts the rows in each group directly. The previous form indexed
## the groupby with a bare tuple of column names (['card_id','city_id']), which pandas
## rejects from 2.0 onwards, and it also selected the grouping key as a value column.
count_transaction = (
    new_transaction_data
    .groupby('card_id')
    .size()
    .reset_index(name='no_of_transaction')
)

count_transaction.head()

Unnamed: 0,card_id,no_of_transaction
0,C_ID_00007093C1,2
1,C_ID_0001238066,26
2,C_ID_0001506EF0,2
3,C_ID_0001793786,31
4,C_ID_000183FDDA,11


In [12]:
## Small toy frame used to prototype the per-id approved / rejected counts.
toy_rows = [
    (1, 'Y', 22),
    (1, 'Y', 33),
    (2, 'N', 44),
    (3, 'N', 55),
    (3, 'N', 66),
]
temp = panda.DataFrame(toy_rows, columns=['id', 'approved', 'c'])
temp

Unnamed: 0,id,approved,c
0,1,Y,22
1,1,Y,33
2,2,N,44
3,3,N,55
4,3,N,66


In [13]:
## One-hot encode the approval flag; each flag value becomes its own 0/1 column.
approval_dummies = temp['approved'].str.get_dummies()
temp['N'] = approval_dummies['N']
temp['Y'] = approval_dummies['Y']

temp

Unnamed: 0,id,approved,c,N,Y
0,1,Y,22,0,1
1,1,Y,33,0,1
2,2,N,44,1,0
3,3,N,55,1,0
4,3,N,66,1,0


In [14]:
## Sum the 'N' indicator per id to get the number of rejected rows for each id.
rejects_by_id = temp.groupby(['id'])['N'].sum()
print(rejects_by_id, ', type: ', type(rejects_by_id))

## Turn the Series into a frame and expose the group key as a regular column as well.
tt = rejects_by_id.to_frame()
tt['id'] = tt.index

tt

id
1    0
2    1
3    2
Name: N, dtype: int64 , type:  <class 'pandas.core.series.Series'>


Unnamed: 0_level_0,N,id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,1
2,1,2
3,2,3


In [15]:
## Show the toy frame followed by its per-id reject counts in a single print call.
frames_to_show = (temp, tt)
print(*frames_to_show)

   id approved   c  N  Y
0   1        Y  22  0  1
1   1        Y  33  0  1
2   2        N  44  1  0
3   3        N  55  1  0
4   3        N  66  1  0     N  id
id       
1   0   1
2   1   2
3   2   3


In [16]:
## Re-inspect the transactions now that card_id has been upper-cased and the category
## columns have been filled.
new_transaction_data.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_415BB3A509,107,N,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,1.0,9,19
1,Y,C_ID_415BB3A509,140,N,1,B,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,1.0,9,19
2,Y,C_ID_415BB3A509,330,N,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,1.0,9,14
3,Y,C_ID_415BB3A509,-1,Y,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,1.0,-1,8
4,Y,C_ID_EF55CF8D4B,-1,Y,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,1.0,-1,29


In [17]:
## Reminder of the per-card transaction counts that are joined onto the transactions next.
count_transaction.head()

Unnamed: 0,card_id,no_of_transaction
0,C_ID_00007093C1,2
1,C_ID_0001238066,26
2,C_ID_0001506EF0,2
3,C_ID_0001793786,31
4,C_ID_000183FDDA,11


In [18]:
## Attach the per-card transaction count to every transaction row. After this join the
## data set carries a no_of_transaction column giving the number of transactions made
## with that row's card.
merged_transaction = panda.merge(
    new_transaction_data,
    count_transaction,
    how='right',
    on='card_id',
)

merged_transaction




Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,no_of_transaction
0,Y,C_ID_415BB3A509,107,N,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,1.0,9,19,4
1,Y,C_ID_415BB3A509,140,N,1,B,307,M_ID_88920c89e8,1,-0.569580,2018-03-19 18:53:37,1.0,9,19,4
2,Y,C_ID_415BB3A509,330,N,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,1.0,9,14,4
3,Y,C_ID_415BB3A509,-1,Y,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,1.0,-1,8,4
4,Y,C_ID_EF55CF8D4B,-1,Y,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,1.0,-1,29,22
5,Y,C_ID_EF55CF8D4B,231,N,1,B,367,M_ID_8874615e00,2,-0.633007,2018-04-02 12:53:28,1.0,9,16,22
6,Y,C_ID_EF55CF8D4B,69,N,1,B,333,M_ID_6d061b5ddc,1,5.263697,2018-03-28 19:50:19,1.0,9,10,22
7,Y,C_ID_EF55CF8D4B,231,N,1,B,307,M_ID_df1e022f41,2,-0.553787,2018-04-05 08:06:52,1.0,9,19,22
8,Y,C_ID_EF55CF8D4B,69,N,1,B,278,M_ID_d15eae0468,2,-0.596643,2018-04-07 18:37:40,1.0,9,37,22
9,Y,C_ID_EF55CF8D4B,69,N,1,B,437,M_ID_5f9bffd028,1,-0.607191,2018-03-17 18:10:41,1.0,9,15,22


In [19]:
## Number of rejected transactions (authorized_flag == 'N') per card, broadcast back onto
## every row of that card. The count is derived from the data rather than hard-coded to 0:
## the earlier value_counts() call was not the cell's last expression, so its result was
## never displayed and the "all values are Y" assumption was not actually checked.
## When every flag is 'Y' this still yields 0 for every row.
rejected_indicator = merged_transaction['authorized_flag'].eq('N').astype('int64')
merged_transaction['no_of_rejects'] = (
    rejected_indicator
    .groupby(merged_transaction['card_id'])
    .transform('sum')
)

merged_transaction.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,no_of_transaction,no_of_rejects
0,Y,C_ID_415BB3A509,107,N,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,1.0,9,19,4,0
1,Y,C_ID_415BB3A509,140,N,1,B,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,1.0,9,19,4,0
2,Y,C_ID_415BB3A509,330,N,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,1.0,9,14,4,0
3,Y,C_ID_415BB3A509,-1,Y,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,1.0,-1,8,4,0
4,Y,C_ID_EF55CF8D4B,-1,Y,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,1.0,-1,29,22,0


In [20]:
## Number of approved transactions (authorized_flag == 'Y') per card, broadcast back onto
## every row of that card. Counting from the flag itself keeps the column correct even if
## some transactions are not approved; when all of them are approved it equals
## no_of_transaction, which is what was previously copied in directly.
approved_indicator = merged_transaction['authorized_flag'].eq('Y').astype('int64')
merged_transaction['no_of_approved'] = (
    approved_indicator
    .groupby(merged_transaction['card_id'])
    .transform('sum')
)

In [21]:
## Check the frame now that no_of_transaction, no_of_rejects and no_of_approved are present.
merged_transaction.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,no_of_transaction,no_of_rejects,no_of_approved
0,Y,C_ID_415BB3A509,107,N,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,1.0,9,19,4,0,4
1,Y,C_ID_415BB3A509,140,N,1,B,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,1.0,9,19,4,0,4
2,Y,C_ID_415BB3A509,330,N,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,1.0,9,14,4,0,4
3,Y,C_ID_415BB3A509,-1,Y,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,1.0,-1,8,4,0,4
4,Y,C_ID_EF55CF8D4B,-1,Y,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,1.0,-1,29,22,0,22


#### At this point we have the number of transactions for each card, as well as the number of times the card has been rejected and the number of times it has been approved.

We will add another column that shows the total purchase amount for each card.


After this, we will join our data with the training data.

In [22]:
## Sum purchase_amount per card and expose the result as a two-column frame
## (card_id, total_purchase).
total_amount_purchase = (
    merged_transaction
    .groupby(['card_id'])['purchase_amount']
    .sum()
    .rename('total_purchase')
    .reset_index()
)
total_amount_purchase.head()



Unnamed: 0,card_id,total_purchase
0,C_ID_00007093C1,-1.328524
1,C_ID_0001238066,-14.850055
2,C_ID_0001506EF0,-1.447354
3,C_ID_0001793786,-0.22962
4,C_ID_000183FDDA,-6.590778


In [23]:
## Attach each card's total purchase amount to its transaction rows.
## Remember that the purchase amounts are normalized.
purchase_merged = panda.merge(
    merged_transaction,
    total_amount_purchase,
    how='right',
    on='card_id',
)

purchase_merged.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,no_of_transaction,no_of_rejects,no_of_approved,total_purchase
0,Y,C_ID_415BB3A509,107,N,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,1.0,9,19,4,0,4,-2.350116
1,Y,C_ID_415BB3A509,140,N,1,B,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,1.0,9,19,4,0,4,-2.350116
2,Y,C_ID_415BB3A509,330,N,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,1.0,9,14,4,0,4,-2.350116
3,Y,C_ID_415BB3A509,-1,Y,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,1.0,-1,8,4,0,4,-2.350116
4,Y,C_ID_EF55CF8D4B,-1,Y,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,1.0,-1,29,22,0,22,5.653665


In [24]:
## Per-column null counts after both joins. The joins have not introduced any null
## values; the merchant_id nulls were already present in the loaded transactions.
purchase_merged.isna().sum()

authorized_flag             0
card_id                     0
city_id                     0
category_1                  0
installments                0
category_3                  0
merchant_category_id        0
merchant_id             26216
month_lag                   0
purchase_amount             0
purchase_date               0
category_2                  0
state_id                    0
subsector_id                0
no_of_transaction           0
no_of_rejects               0
no_of_approved              0
total_purchase              0
dtype: int64

In [25]:
## Summarise row count, column dtypes and memory usage of the enriched transaction frame.
purchase_merged.info()



<class 'pandas.core.frame.DataFrame'>
Int64Index: 1963031 entries, 0 to 1963030
Data columns (total 18 columns):
authorized_flag         object
card_id                 object
city_id                 int64
category_1              object
installments            int64
category_3              object
merchant_category_id    int64
merchant_id             object
month_lag               int64
purchase_amount         float64
purchase_date           object
category_2              float64
state_id                int64
subsector_id            int64
no_of_transaction       int64
no_of_rejects           int64
no_of_approved          int64
total_purchase          float64
dtypes: float64(3), int64(9), object(6)
memory usage: 284.6+ MB
