Learning exercise based on, with all credit to: https://www.kaggle.com/chauhuynh/my-first-kernel-3-699/notebook

In [2]:
import numpy as np
import pandas as pd
import datetime
import gc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
np.random.seed(23)

In [3]:
# Read the four raw CSV tables from the local data/ directory, in the same
# order as before: train, test, historical and new-merchant transactions.
df_train, df_test, df_hist_trans, df_new_merchant_trans = map(
    pd.read_csv,
    (
        'data/train.csv',
        'data/test.csv',
        'data/historical_transactions.csv',
        'data/new_merchant_transactions.csv',
    ),
)

In [4]:
# Impute missing values in both transaction tables with fixed defaults.
# NOTE(review): the defaults are presumably the most frequent value of each
# column -- TODO confirm against the data.
#
# The filled column is assigned back instead of calling
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary column
# selection, which raises a FutureWarning on pandas 2.x and silently leaves
# `df` unchanged once Copy-on-Write is enabled (default in pandas 3.0).
fill_values = {
    'category_2': 1.0,
    'category_3': 'A',
    'merchant_id': 'M_ID_00a6ca8a8a',
}
for df in [df_hist_trans, df_new_merchant_trans]:
    for col, value in fill_values.items():
        df[col] = df[col].fillna(value)

In [5]:
# Derive calendar, flag and recency features on both transaction tables.
#
# The reference timestamp is taken once, so `month_diff` is measured from the
# same instant for both tables.
# NOTE(review): `month_diff` still depends on the day the notebook is run, so
# it is not reproducible across runs -- consider pinning a fixed date.
reference_time = datetime.datetime.today()
for df in [df_hist_trans, df_new_merchant_trans]:
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])
    df['year'] = df['purchase_date'].dt.year
    df['month'] = df['purchase_date'].dt.month
    df['dayofweek'] = df['purchase_date'].dt.dayofweek
    # `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
    # `isocalendar().week` yields the same ISO week number (as nullable UInt32).
    df['weekofyear'] = df['purchase_date'].dt.isocalendar().week
    df['hour'] = df['purchase_date'].dt.hour
    # dayofweek is 0 (Monday) .. 6 (Sunday), so >= 5 marks Saturday and Sunday.
    df['weekend'] = (df['purchase_date'].dt.dayofweek >= 5).astype(int)
    # NOTE: these mappings are not idempotent -- re-running this cell on the
    # already-mapped 0/1 columns turns every value into NaN.
    df['authorized_flag'] = df['authorized_flag'].map({'Y': 1, 'N': 0})
    df['category_1'] = df['category_1'].map({'Y': 1, 'N': 0})
    # Whole 30-day periods elapsed since the purchase, shifted by `month_lag`.
    df['month_diff'] = ((reference_time - df['purchase_date']).dt.days // 30) + df['month_lag']

In [6]:
# Preview the first rows of the historical transactions, including the derived columns.
df_hist_trans.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,...,category_2,state_id,subsector_id,year,month,dayofweek,weekofyear,hour,weekend,month_diff
0,1,C_ID_4e6213e9bc,88,0,0,A,80,M_ID_e020e9b302,-8,-0.703331,...,1.0,16,37,2017,6,6,25,15,1,11
1,1,C_ID_4e6213e9bc,88,0,0,A,367,M_ID_86ec983688,-7,-0.733128,...,1.0,16,16,2017,7,5,28,12,1,12
2,1,C_ID_4e6213e9bc,88,0,0,A,80,M_ID_979ed661fc,-6,-0.720386,...,1.0,16,37,2017,8,2,32,22,0,12
3,1,C_ID_4e6213e9bc,88,0,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,...,1.0,16,34,2017,9,5,35,10,1,12
4,1,C_ID_4e6213e9bc,88,0,0,A,80,M_ID_e020e9b302,-11,-0.722865,...,1.0,16,37,2017,3,4,10,1,0,12


In [10]:
# Write both transaction tables to intermediate CSV files, without the
# row index, so the preprocessed state is available on disk.
for frame, out_path in (
    (df_hist_trans, 'data/hist_txn_tmp.csv'),
    (df_new_merchant_trans, 'data/new_merchant_txn_tmp.csv'),
):
    frame.to_csv(out_path, index=False, index_label=False)

In [9]:
def get_new_columns(name, aggs):
    """Build flat column names for a dict-style aggregation spec.

    Parameters
    ----------
    name : str
        Prefix put in front of every generated column name.
    aggs : dict
        Maps each source column name (str) to a list of aggregation names
        (str), e.g. ``{'purchase_amount': ['sum', 'max']}``.

    Returns
    -------
    list of str
        One ``'<name>_<column>_<agg>'`` entry per (column, aggregation) pair,
        in the dict's iteration order and, within a column, in list order.
        Callers assign this list to the columns of ``groupby(...).agg(aggs)``,
        so the ordering has to follow the spec exactly.
    """
    return [
        '_'.join((name, column, agg))
        for column, agg_names in aggs.items()
        for agg in agg_names
    ]

In [11]:
# Start the aggregation spec: for each of these columns, count the number of
# distinct values. Every column gets its own list object.
aggs = {
    col: ['nunique']
    for col in [
        'month',
        'hour',
        'weekofyear',
        'dayofweek',
        'year',
        'subsector_id',
        'merchant_id',
        'merchant_category_id',
    ]
}

In [14]:
# Inspect the aggregation spec built so far (column name -> list of aggregation names).
aggs

{'dayofweek': ['nunique'],
 'hour': ['nunique'],
 'merchant_category_id': ['nunique'],
 'merchant_id': ['nunique'],
 'month': ['nunique'],
 'subsector_id': ['nunique'],
 'weekofyear': ['nunique'],
 'year': ['nunique']}

In [15]:
# Extend the aggregation spec with numeric summaries, date range, flag
# counts/rates and the per-group row count (taken via `size` on `card_id`).
aggs.update({
    'purchase_amount': ['sum', 'max', 'min', 'mean', 'var'],
    'installments': ['sum', 'max', 'min', 'mean', 'var'],
    'purchase_date': ['max', 'min'],
    'month_lag': ['max', 'min', 'mean', 'var'],
    'month_diff': ['mean'],
    'authorized_flag': ['sum', 'mean'],
    'weekend': ['sum', 'mean'],
    'category_1': ['sum', 'mean'],
    'card_id': ['size'],
})

In [16]:
# Inspect the aggregation spec after adding the numeric, date and flag aggregations.
aggs

{'authorized_flag': ['sum', 'mean'],
 'card_id': ['size'],
 'category_1': ['sum', 'mean'],
 'dayofweek': ['nunique'],
 'hour': ['nunique'],
 'installments': ['sum', 'max', 'min', 'mean', 'var'],
 'merchant_category_id': ['nunique'],
 'merchant_id': ['nunique'],
 'month': ['nunique'],
 'month_diff': ['mean'],
 'month_lag': ['max', 'min', 'mean', 'var'],
 'purchase_amount': ['sum', 'max', 'min', 'mean', 'var'],
 'purchase_date': ['max', 'min'],
 'subsector_id': ['nunique'],
 'weekend': ['sum', 'mean'],
 'weekofyear': ['nunique'],
 'year': ['nunique']}

In [17]:
# For each categorical column, attach to every historical transaction the mean
# purchase_amount of its category level, and register that new column in the
# aggregation spec so it is averaged per group later.
for col in ('category_2', 'category_3'):
    encoded_col = col + '_mean'
    amount_by_level = df_hist_trans.groupby([col])['purchase_amount']
    df_hist_trans[encoded_col] = amount_by_level.transform('mean')
    aggs[encoded_col] = ['mean']

In [21]:
# Inspect the aggregation spec, now including the category_2_mean / category_3_mean entries.
aggs

{'authorized_flag': ['sum', 'mean'],
 'card_id': ['size'],
 'category_1': ['sum', 'mean'],
 'category_2_mean': ['mean'],
 'category_3_mean': ['mean'],
 'dayofweek': ['nunique'],
 'hour': ['nunique'],
 'installments': ['sum', 'max', 'min', 'mean', 'var'],
 'merchant_category_id': ['nunique'],
 'merchant_id': ['nunique'],
 'month': ['nunique'],
 'month_diff': ['mean'],
 'month_lag': ['max', 'min', 'mean', 'var'],
 'purchase_amount': ['sum', 'max', 'min', 'mean', 'var'],
 'purchase_date': ['max', 'min'],
 'subsector_id': ['nunique'],
 'weekend': ['sum', 'mean'],
 'weekofyear': ['nunique'],
 'year': ['nunique']}

In [None]:
# Aggregate the historical transactions per card_id and flatten the resulting
# column index into 'hist_<column>_<agg>' names.
new_columns = get_new_columns('hist', aggs)
df_hist_trans_group = df_hist_trans.groupby('card_id').agg(aggs)
df_hist_trans_group.columns = new_columns
df_hist_trans_group = df_hist_trans_group.reset_index(drop=False)

# Date-based features derived from each card's first and last purchase.
first_purchase = df_hist_trans_group['hist_purchase_date_min']
last_purchase = df_hist_trans_group['hist_purchase_date_max']

# Days between the first and the last purchase.
df_hist_trans_group['hist_purchase_date_diff'] = (last_purchase - first_purchase).dt.days
# That span divided by the number of transactions of the card.
df_hist_trans_group['hist_purchase_date_average'] = (
    df_hist_trans_group['hist_purchase_date_diff'] / df_hist_trans_group['hist_card_id_size']
)
# Days from the last purchase to the moment this cell is run.
df_hist_trans_group['hist_purchase_date_uptonow'] = (datetime.datetime.today() - last_purchase).dt.days