# MIDS w207 - Final Project
## Elo Merchant Category Recommendation Kaggle Challenge

### Team 3
- Vinicio De Sola
- Kevin Hanna
- Pri Nonis
- Bradley Nott

In [1]:
import numpy   as np # linear algebra
import pandas  as pd # data processing
import os.path as op # file system access
import os      as os
import gc      as gc
import time    as ti

In [40]:
def root_mean_squared_error(y_true, y_pred) :
    return np.sqrt(((y_pred - y_true) ** 2).mean())

def mean_squared_error(y_true, y_pred) :
    return ((y_pred - y_true) ** 2).mean()

def setup_environment() :
    globals()['csv_base'] = '../input'  if 'working' in os.getcwd() else './input'
    globals()['pkl_base'] = '../pickle' if 'working' in os.getcwd() else './pickle'
    
    os.makedirs(csv_base, exist_ok = True)
    os.makedirs(pkl_base, exist_ok = True)
    
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)
    
    import warnings
    warnings.filterwarnings('ignore')

setup_environment()

# Data Loading and Cleanup

In [157]:
def compress(df, verbose = True) :
    smu = df.memory_usage().sum() / 1024**2 / 8
    con = {'f' : {                                   np.finfo(np.float16).max : np.float16, np.finfo(np.float32).max : np.float32, np.finfo(np.float64).max : np.float64},
           'u' : {np.iinfo(np.uint8).max : np.uint8, np.iinfo(np.uint16).max  : np.uint16,  np.iinfo(np.uint32).max  : np.uint32,  np.iinfo(np.uint64).max  : np.uint64},
           'i' : {np.iinfo(np.int8).max  : np.int8,  np.iinfo(np.int16).max   : np.int16,   np.iinfo(np.int32).max   : np.int32,   np.iinfo(np.int64).max   : np.int64}}

    for c in df.columns :
        if  con.get(df[c].dtype.kind) :
            df[c] = df[c].astype(con[df[c].dtype.kind].get(min((n for n in con[df[c].dtype.kind].keys() if n > max(df[c].max(), abs(df[c].min()))))))

    emu = df.memory_usage().sum() / 1024**2 / 8

    if  verbose :
        print(f'Memory Use Decreased to {emu:5.2f} MB [{100 * (smu - emu) / emu:5.1f}% Reduction]')

    return df, 100 * (smu - emu) / emu

def read(csv_path, dates = [], brize = [], dummy = [], delna = False, index = None, regen = False) :
    pkl_path = op.join(pkl_base, csv_path).replace('.csv', '.pkl')
    csv_path = op.join(csv_base, csv_path)
    srt_time = ti.time()
    
    if  op.exists(pkl_path) and not regen :
        df       = pd.read_pickle(pkl_path)
        csv_path = pkl_path
        rp       = 0.0
    else                    :
        df = pd.read_csv(csv_path, parse_dates = dates, memory_map = True)
        if  index :
            df = df.set_index(index)
        df, rp = compress(df, verbose = False)

        df.to_pickle(pkl_path)

    print(f'Loading : {op.basename(csv_path):>29} in {ti.time()-srt_time:5.1f} Seconds, Shape is {str(df.shape):>14}, Memory Usage is {df.memory_usage().sum() / 1024**2 / 8:6.2f} MB [Reduction of {rp:5.1f} %].')

    return df

def load() :
    data           = {}
    data['train' ] = read('train.csv', dates = ['first_active_month'], index = 'card_id', regen = True)
    data['test'  ] = read('test.csv',  dates = ['first_active_month'], index = 'card_id', regen = True)
  # data['mercs' ] = read('merchants.csv', index = 'merchant_id')
    data['tx_new'] = read('new_merchant_transactions.csv', dates = ['purchase_date'])
    data['tx_old'] = read('historical_transactions.csv',   dates = ['purchase_date'])
    data['target'] = data['train'].pop('target')
    print(f'\nLoading : Done.')

    return data

def clean(data) :
  # replace missing first active month
    data['test'].loc['C_ID_c27b4f80f7', 'first_active_month'] = data['test']['first_active_month'].min()
    
    return data

In [158]:
data = clean(load())

Loading :                     train.csv in   0.5 Seconds, Shape is    (201917, 5), Memory Usage is   0.51 MB [Reduction of 128.6 %].
Loading :                      test.csv in   0.3 Seconds, Shape is    (123623, 4), Memory Usage is   0.28 MB [Reduction of 110.5 %].
Loading : new_merchant_transactions.pkl in   0.9 Seconds, Shape is  (1963031, 14), Memory Usage is  14.27 MB [Reduction of   0.0 %].
Loading :   historical_transactions.pkl in  10.1 Seconds, Shape is (29112361, 14), Memory Usage is 218.64 MB [Reduction of   0.0 %].

Loading : Done.


In [6]:
data['tx_new'].describe()

Unnamed: 0,city_id,installments,merchant_category_id,month_lag,purchase_amount,category_2,state_id,subsector_id
count,1963031.0,1963031.0,1963031.0,1963031.0,1963031.0,1851286.0,1963031.0,1963031.0
mean,134.3867,0.6829643,430.9701,1.476515,,,10.88067,25.97624
std,101.5152,1.584069,246.3385,0.4994483,,0.0,6.038542,10.12908
min,-1.0,-1.0,-1.0,1.0,-0.7470703,1.0,-1.0,-1.0
25%,69.0,0.0,278.0,1.0,-0.7167969,1.0,9.0,19.0
50%,110.0,1.0,367.0,1.0,-0.6748047,1.0,9.0,29.0
75%,212.0,1.0,683.0,2.0,-0.581543,3.0,15.0,34.0
max,347.0,999.0,891.0,2.0,263.25,5.0,24.0,41.0


# Feature Engineering

In [100]:
def draft(data) :

    np.random.seed(0)
    
    plan = {}
    
    plan['scorer'        ] = root_mean_squared_error
    plan['baseline_model'] = None
    plan['best_model'    ] = None
    plan['baseline_score'] =   0.0
    plan['best_score'    ] = 100.0
    plan['baseline_feats'] = []
    plan['best_feats'    ] = []

    plan['train_size'] = len(data['train' ])
    plan['train_pcnt'] = 0.8
    plan['train_mask'] = np.random.rand(plan['train_size']) < plan['train_pcnt']
    plan['devel_mask'] =               ~plan['train_mask']

    plan['y_train'] = data['target'][plan['train_mask']]
    plan['y_devel'] = data['target'][plan['devel_mask']]

    plan['first_active_month_max'] = max(data['train' ].first_active_month.max(),
                                         data['test'  ].first_active_month.max())
    plan['purchase_date_max'     ] = max(data['tx_old'].purchase_date.max(),
                                         data['tx_new'].purchase_date.max())
    plan['purchase_date_ref'     ] =     data['tx_old'].purchase_date.max()

    return plan

In [101]:
plan = draft(data)

In [133]:
def engineer_transactions(tf, prefix) :

  # binarize boolean Y/N flag variables
    tf['category_1'     ] = tf['category_1'     ].eq('Y').mul(1)
    tf['authorized_flag'] = tf['authorized_flag'].eq('Y').mul(1)

  # convert categorical variables to dummy/indicator, preserve original categorical variable
    tf = pd.concat([tf, pd.get_dummies(tf[['category_2', 'category_3']],
                                columns = ['category_2', 'category_3'])], axis = 1)

  # recover purchase history by denormolizing
    tf['month_diff'    ] = (plan['purchase_date_ref'] - tf['purchase_date']) // np.timedelta64(1, 'M') \
                         + (                            tf['month_lag'    ])

  # extract purchase_month from date
    tf['purchase_month'] = tf['purchase_date'].dt.month

  # convert datetime to numerical
    tf['purchase_ndate'] = tf['purchase_date'].astype(np.int64) * 1e-9

    return tf
    
def engineer_transactions_aggregated(tf, prefix, regen = False) :

    pkl_path = op.join(pkl_base, f'engineered_{prefix}_transactions_aggregated.pkl')
    srt_time = ti.time()
    
    if  op.exists(pkl_path) and not regen :

        df = pd.read_pickle(pkl_path)
        
    else :

        tf = engineer_transactions(tf, tf)

      # ─────────────────────────────────────────────────────────────────────────────────────────────────────────────
      # aggregate transactions per card
      # ─────────────────────────────────────────────────────────────────────────────────────────────────────────────

        aggregations = \
        {
            'category_1'           : ['mean', 'sum'],

            'category_2_1.0'       : ['mean'],
            'category_2_2.0'       : ['mean'],
            'category_2_3.0'       : ['mean'],
            'category_2_4.0'       : ['mean'],
            'category_2_5.0'       : ['mean'],
            'category_3_A'         : ['mean'],
            'category_3_B'         : ['mean'],
            'category_3_C'         : ['mean'],

            'merchant_id'          : ['nunique'],
            'merchant_category_id' : ['nunique'],
            'state_id'             : ['nunique'],
            'city_id'              : ['nunique'],
            'subsector_id'         : ['nunique'],

            'purchase_amount'      : ['mean', 'sum', 'max', 'min', 'std'],
            'installments'         : ['mean', 'sum', 'max', 'min', 'std'],
            'purchase_month'       : ['mean',        'max', 'min', 'std'],
            'purchase_ndate'       : [np.ptp,        'max', 'min'       ],
            'month_lag'            : ['mean',        'max', 'min', 'std'],
            'month_diff'           : ['mean'                            ]
        }

        aggregations_kev = \
        {
            'purchase_amount'      : ['sum', 'mean', 'median', 'min', 'max', 'std'],
            'subsector_id'         : ['nunique']
            'merchant_category_id' : ['nunique'],
            'merchant_id'          : ['nunique'],
            'installments'         : ['sum', 'mean'],
            'city_id'              : ['nunique'],
            'state_id'             : ['nunique'], 
            'category_1'           : ['sum'],
            'category_2_1.0'       : ['sum'],
            'category_2_2.0'       : ['sum'],
            'category_2_3.0'       : ['sum'],
            'category_2_4.0'       : ['sum'],
            'category_2_5.0'       : ['sum'],
            'category_3_A'         : ['sum'],
            'category_3_B'         : ['sum'],
            'category_3_C'         : ['sum'],

            'month_lag'           : ['mean', 'min', 'max', 'std'],
            'authorized_flag'     : ['sum'],
            'month_diff'          : ['mean', 'min', 'max', 'std'],
            'purchase_date'       : [np.ptp, 'min', 'max']
        }

      # add aggregations
        df = tf.groupby(['card_id']).agg(aggregations)
        df.columns = ['_'.join((prefix,) + c) for c in df.columns.values]
        df.reset_index(inplace = True)
       
      # add aggregated transaction count
        tc = tf.groupby('card_id').size().reset_index(name = f'{prefix}_transaction_count')
        df = pd.merge(tc, df, on = 'card_id', how = 'left')

        df.to_pickle(pkl_path)
    
    print(f'Engineering : {prefix:>5} Transactions in {ti.time()-srt_time:5.1f} Seconds.')
    
    return df

In [144]:
def engineer_cards(df, aggs, plan, prefix, regen = False) :

    pkl_path = op.join(pkl_base, f'engineered_{prefix}_cards.pkl')
    srt_time = ti.time()
    
    if  op.exists(pkl_path) and not regen :

        df = pd.read_pickle(pkl_path)

    else :
    
      # convert categorical variables to dummy/indicator, preserve original categorical variable
        df = pd.concat([df, pd.get_dummies(df[['feature_1', 'feature_2']],
                                    columns = ['feature_1', 'feature_2'])], axis = 1)

      # normalized active days of card from first date the shopper made a purchase through Elo
        df['active_days'] = (plan['first_active_month_max'] - df['first_active_month']).dt.days

        for agg in aggs :
            df = pd.merge( df, aggs[agg], on = 'card_id', how = 'left')

        df.to_pickle(pkl_path)
            
    print(f'Engineering : {prefix:>5} Cards        in {ti.time()-srt_time:5.1f} Seconds.')
        
    return df

In [145]:
def engineer(data, plan, delete = False) :

  # extract approved old transactions
    data['tx_app'] = data['tx_old'][data['tx_old']['authorized_flag'] == 'Y']

  # aggregate transaction features
    aggs           = {}
    aggs['tx_new'] = engineer_transactions_aggregated(data['tx_new'], prefix = 'new')
    aggs['tx_old'] = engineer_transactions_aggregated(data['tx_old'], prefix = 'old')
    aggs['tx_app'] = engineer_transactions_aggregated(data['tx_app'], prefix = 'app')

  # join aggregated features to train and test sets
    feat           = {}
    feat['train' ] = engineer_cards(data['train'], aggs, plan, prefix = 'train', regen = True)
    feat['test'  ] = engineer_cards(data['test' ], aggs, plan, prefix = 'test' , regen = True)
  
    if  delete :
        del aggs
        del data
        del globals()['data']
        d = gc.collect()
    
    print(f'\nEngineering : Done.')

    return feat

In [148]:
feat = engineer(data, plan)

Engineering :   new Transactions in   0.1 Seconds.
Engineering :   old Transactions in   0.1 Seconds.
Engineering :   app Transactions in   0.1 Seconds.
Engineering : train Cards        in   3.9 Seconds.
Engineering :  test Cards        in   2.4 Seconds.

Engineering : Done.


In [149]:
feat['train'].head()

Unnamed: 0,card_id,first_active_month,feature_1,feature_2,feature_3,feature_1_1,feature_1_2,feature_1_3,feature_1_4,feature_1_5,feature_2_1,feature_2_2,feature_2_3,active_days,new_transaction_count,new_category_1_mean,new_category_1_sum,new_category_2_1.0_mean,new_category_2_2.0_mean,new_category_2_3.0_mean,new_category_2_4.0_mean,new_category_2_5.0_mean,new_category_3_A_mean,new_category_3_B_mean,new_category_3_C_mean,new_merchant_id_nunique,new_merchant_category_id_nunique,new_state_id_nunique,new_city_id_nunique,new_subsector_id_nunique,new_purchase_amount_mean,new_purchase_amount_sum,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_std,new_installments_mean,new_installments_sum,new_installments_max,new_installments_min,new_installments_std,new_purchase_month_mean,new_purchase_month_max,new_purchase_month_min,new_purchase_month_std,new_purchase_ndate_ptp,new_purchase_ndate_max,new_purchase_ndate_min,new_month_lag_mean,new_month_lag_max,new_month_lag_min,new_month_lag_std,new_month_diff_mean,old_transaction_count,old_category_1_mean,old_category_1_sum,old_category_2_1.0_mean,old_category_2_2.0_mean,old_category_2_3.0_mean,old_category_2_4.0_mean,old_category_2_5.0_mean,old_category_3_A_mean,old_category_3_B_mean,old_category_3_C_mean,old_merchant_id_nunique,old_merchant_category_id_nunique,old_state_id_nunique,old_city_id_nunique,old_subsector_id_nunique,old_purchase_amount_mean,old_purchase_amount_sum,old_purchase_amount_max,old_purchase_amount_min,old_purchase_amount_std,old_installments_mean,old_installments_sum,old_installments_max,old_installments_min,old_installments_std,old_purchase_month_mean,old_purchase_month_max,old_purchase_month_min,old_purchase_month_std,old_purchase_ndate_ptp,old_purchase_ndate_max,old_purchase_ndate_min,old_month_lag_mean,old_month_lag_max,old_month_lag_min,old_month_lag_std,old_month_diff_mean,app_transaction_count,app_category_1_mean,app_category_1_sum,app_category_2_1.0_mean,app_category_2_2.0_mean,app_category_2_3.0_mean,app_category_2_4.0_mean,app_category_2_5.0_mean,app_category_3_A_mean,app_category_3_B_mean,app_category_3_C_mean,app_merchant_id_nunique,app_merchant_category_id_nunique,app_state_id_nunique,app_city_id_nunique,app_subsector_id_nunique,app_purchase_amount_mean,app_purchase_amount_sum,app_purchase_amount_max,app_purchase_amount_min,app_purchase_amount_std,app_installments_mean,app_installments_sum,app_installments_max,app_installments_min,app_installments_std,app_purchase_month_mean,app_purchase_month_max,app_purchase_month_min,app_purchase_month_std,app_purchase_ndate_ptp,app_purchase_ndate_max,app_purchase_ndate_min,app_month_lag_mean,app_month_lag_max,app_month_lag_min,app_month_lag_std,app_month_diff_mean
0,C_ID_92a2005557,2017-06-01,5,2,1,0,0,0,0,1,0,1,0,245,23.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,23.0,14.0,1.0,3.0,10.0,-0.575684,-13.242188,-0.296143,-0.724609,0.135742,0.0,0.0,0.0,0.0,0.0,3.478261,4.0,3.0,0.510754,4742309.0,1525001000.0,1520259000.0,1.478261,2.0,1.0,0.510754,-0.043478,260,0.0,0,0.988462,0.0,0.0,0.0,0.011538,0.984615,0.015385,0.0,94,41,3,7,21,-0.638341,-165.968735,2.258394,-0.739395,0.212139,0.015385,4,1,0,0.123314,8.057692,12,1,3.474193,20977987.0,1519551000.0,1498573000.0,-3.911538,0,-8,2.397687,-0.05,247,0.0,0,0.987854,0.0,0.0,0.0,0.012146,1.0,0.0,0.0,93,41,3,7,21,-0.637235,-157.397018,2.258394,-0.739395,0.216518,0.0,0,0,0,0.0,7.979757,12,1,3.52857,20977987.0,1519551000.0,1498573000.0,-3.882591,0,-8,2.429155,-0.048583
1,C_ID_3d0044924f,2017-01-01,4,1,0,0,0,0,1,0,1,0,0,396,6.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6.0,5.0,1.0,1.0,4.0,-0.726074,-4.355469,-0.70166,-0.739258,0.014381,1.0,6.0,1.0,1.0,0.0,2.5,3.0,2.0,0.547723,4887632.0,1522393000.0,1517505000.0,1.5,2.0,1.0,0.547723,1.0,350,0.088571,31,0.911429,0.0,0.0,0.0,0.0,0.0,0.788571,0.205714,142,57,3,9,24,-0.600018,-210.006332,4.6303,-0.7424,0.384967,1.551429,543,10,-1,1.510777,6.22,12,1,3.848142,33717687.0,1517438000.0,1483720000.0,-5.031429,0,-12,3.804934,0.957143,339,0.085546,29,0.914454,0.0,0.0,0.0,0.0,0.0,0.80236,0.19174,141,57,3,9,24,-0.616175,-208.883453,4.6303,-0.7424,0.355554,1.477876,501,10,-1,1.350634,6.144543,12,1,3.859177,33717687.0,1517438000.0,1483720000.0,-5.050147,0,-12,3.836969,0.961652
2,C_ID_d639edf6cd,2016-08-01,2,2,0,0,1,0,0,0,0,1,0,549,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,-0.700195,-0.700195,-0.700195,-0.700195,,0.0,0.0,0.0,0.0,,4.0,4.0,4.0,,0.0,1524937000.0,1524937000.0,2.0,2.0,2.0,,0.0,43,0.0,0,0.093023,0.0,0.0,0.0,0.906977,1.0,0.0,0.0,13,8,2,5,7,-0.678311,-29.167391,-0.145847,-0.730138,0.08738,0.0,0,0,0,0.0,4.55814,12,1,3.275467,35635623.0,1519759000.0,1484123000.0,-8.604651,0,-13,3.842987,-0.023256,41,0.0,0,0.097561,0.0,0.0,0.0,0.902439,1.0,0.0,0.0,13,8,2,5,7,-0.678742,-27.828424,-0.145847,-0.730138,0.08923,0.0,0,0,0,0.0,4.634146,12,1,3.329836,35635623.0,1519759000.0,1484123000.0,-8.487805,0,-13,3.893083,-0.02439
3,C_ID_186d6a6901,2017-09-01,4,3,0,0,0,0,1,0,0,0,1,153,7.0,0.142857,1.0,0.0,0.0,0.0,0.857143,0.0,0.0,0.857143,0.0,7.0,6.0,2.0,2.0,5.0,-0.665039,-4.65625,-0.566895,-0.734375,0.065918,0.714286,5.0,1.0,-1.0,0.755929,3.714286,4.0,3.0,0.48795,3625505.0,1524049000.0,1520424000.0,1.714286,2.0,1.0,0.48795,0.0,77,0.155844,12,0.155844,0.0,0.0,0.688312,0.0,0.0,0.883117,0.090909,50,25,5,7,13,-0.642745,-49.491364,1.445596,-0.740897,0.261624,1.090909,84,3,-1,0.588974,7.74026,12,1,3.904797,13375339.0,1519818000.0,1506443000.0,-2.831169,0,-5,1.802065,-0.038961,77,0.155844,12,0.155844,0.0,0.0,0.688312,0.0,0.0,0.883117,0.090909,50,25,5,7,13,-0.642745,-49.491364,1.445596,-0.740897,0.261624,1.090909,84,3,-1,0.588974,7.74026,12,1,3.904797,13375339.0,1519818000.0,1506443000.0,-2.831169,0,-5,1.802065,-0.038961
4,C_ID_cdbd2c0db2,2017-11-01,1,3,0,1,0,0,0,0,0,0,1,92,36.0,0.055556,2.0,0.055556,0.0,0.194444,0.694444,0.0,0.0,0.944444,0.027778,36.0,17.0,5.0,5.0,10.0,-0.553711,-19.921875,0.450928,-0.739258,0.223877,0.972222,35.0,2.0,-1.0,0.376913,3.555556,4.0,3.0,0.503953,4949682.0,1524941000.0,1519992000.0,1.555556,2.0,1.0,0.503953,-0.055556,133,0.112782,15,0.075188,0.0,0.0,0.804511,0.007519,0.0,0.947368,0.052632,66,26,6,6,17,-0.366073,-48.687656,7.193041,-0.746156,1.352094,1.368421,182,12,1,1.896862,5.406015,12,1,5.003086,9405641.0,1519850000.0,1510445000.0,-1.285714,0,-3,1.0267,-0.022556,128,0.09375,12,0.078125,0.0,0.0,0.820312,0.007812,0.0,0.96875,0.03125,65,26,6,6,17,-0.539379,-69.040466,6.992617,-0.746156,0.737087,1.125,144,12,1,1.003929,5.554688,12,1,5.041261,9405641.0,1519850000.0,1510445000.0,-1.320312,0,-3,1.02668,-0.023438


In [159]:
print('\n'.join(sorted(feat['train'].columns.values)))

active_days
app_category_1_mean
app_category_1_sum
app_category_2_1.0_mean
app_category_2_2.0_mean
app_category_2_3.0_mean
app_category_2_4.0_mean
app_category_2_5.0_mean
app_category_3_A_mean
app_category_3_B_mean
app_category_3_C_mean
app_city_id_nunique
app_installments_max
app_installments_mean
app_installments_min
app_installments_std
app_installments_sum
app_merchant_category_id_nunique
app_merchant_id_nunique
app_month_diff_mean
app_month_lag_max
app_month_lag_mean
app_month_lag_min
app_month_lag_std
app_purchase_amount_max
app_purchase_amount_mean
app_purchase_amount_min
app_purchase_amount_std
app_purchase_amount_sum
app_purchase_month_max
app_purchase_month_mean
app_purchase_month_min
app_purchase_month_std
app_purchase_ndate_max
app_purchase_ndate_min
app_purchase_ndate_ptp
app_state_id_nunique
app_subsector_id_nunique
app_transaction_count
card_id
feature_1
feature_1_1
feature_1_2
feature_1_3
feature_1_4
feature_1_5
feature_2
feature_2_1
feature_2_2
feature_2_3
feature_3
fi

In [18]:
feat['train'].describe()

Unnamed: 0,feature_1,feature_2,feature_3,feature_1_1,feature_1_2,feature_1_3,feature_1_4,feature_1_5,feature_2_1,feature_2_2,feature_2_3,active_days,new_transaction_count,new_category_1_mean,new_category_1_sum,new_category_2_1.0_mean,new_category_2_2.0_mean,new_category_2_3.0_mean,new_category_2_4.0_mean,new_category_2_5.0_mean,new_category_3_A_mean,new_category_3_B_mean,new_category_3_C_mean,new_merchant_id_nunique,new_merchant_category_id_nunique,new_state_id_nunique,new_city_id_nunique,new_subsector_id_nunique,new_purchase_amount_mean,new_purchase_amount_sum,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_std,new_installments_mean,new_installments_sum,new_installments_max,new_installments_min,new_installments_std,new_purchase_month_mean,new_purchase_month_max,new_purchase_month_min,new_purchase_month_std,new_purchase_ndate_ptp,new_purchase_ndate_max,new_purchase_ndate_min,new_month_lag_mean,new_month_lag_max,new_month_lag_min,new_month_lag_std,new_month_diff_mean,old_transaction_count,old_category_1_mean,old_category_1_sum,old_category_2_1.0_mean,old_category_2_2.0_mean,old_category_2_3.0_mean,old_category_2_4.0_mean,old_category_2_5.0_mean,old_category_3_A_mean,old_category_3_B_mean,old_category_3_C_mean,old_merchant_id_nunique,old_merchant_category_id_nunique,old_state_id_nunique,old_city_id_nunique,old_subsector_id_nunique,old_purchase_amount_mean,old_purchase_amount_sum,old_purchase_amount_max,old_purchase_amount_min,old_purchase_amount_std,old_installments_mean,old_installments_sum,old_installments_max,old_installments_min,old_installments_std,old_purchase_month_mean,old_purchase_month_max,old_purchase_month_min,old_purchase_month_std,old_purchase_ndate_ptp,old_purchase_ndate_max,old_purchase_ndate_min,old_month_lag_mean,old_month_lag_max,old_month_lag_min,old_month_lag_std,old_month_diff_mean
count,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,179986.0,179986.0,179986.0,179986.0,179986.0,179986.0,179986.0,179986.0,179986.0,179986.0,179986.0,179986.0,179986.0,179986.0,179986.0,179986.0,179986.0,179986.0,179986.0,179986.0,153199.0,179986.0,179986.0,179986.0,179986.0,153199.0,179986.0,179986.0,179986.0,153199.0,179986.0,179986.0,179986.0,179986.0,179986.0,179986.0,153199.0,179986.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0
mean,3.105311,1.74541,0.565569,0.059614,0.276336,0.364372,0.098481,0.201197,0.441974,0.370642,0.187384,381.978981,6.776555,0.049146,0.2173,0.524707,0.035994,0.15384,0.081576,0.128451,0.478555,0.385585,0.100245,6.683353,5.339232,1.582051,2.30943,4.453919,,,-0.131714,,0.231445,0.744392,4.62656,1.479748,0.328637,0.519733,4.084927,4.519679,3.648789,0.524177,2543707.0,1520773000.0,1518229000.0,1.476221,1.815386,1.158679,0.396778,1.11446,89.29416,0.10108,6.379359,0.494796,0.036169,0.1481,0.077481,0.121337,0.466704,0.411174,0.110311,33.679571,18.27803,2.73086,4.901628,11.460788,2.459435,14.42581,66.68344,-0.729992,12.64487,0.863678,57.854049,4.253163,0.055993,0.79586,6.51463,11.212201,1.552871,3.421927,21098360.0,1515851000.0,1494753000.0,-3.840994,-0.166058,-7.877777,2.408912,1.097642
std,1.18616,0.751362,0.495683,0.23677,0.447186,0.481255,0.297965,0.400896,0.496623,0.482978,0.39022,293.710176,6.694579,0.164295,0.593864,0.455389,0.170197,0.335172,0.253042,0.310203,0.497842,0.429808,0.218113,6.650089,4.255338,0.873557,1.670109,3.058439,0.0,0.0,0.0,0.0,0.592285,1.138139,8.195394,3.359907,1.036922,1.094625,2.280113,2.477625,2.258646,0.854921,1743585.0,6347417.0,6331257.0,0.3308,0.387985,0.365378,0.227721,2.360083,104.60598,0.208051,19.57249,0.431898,0.164164,0.316865,0.238988,0.291834,0.489023,0.405691,0.178446,30.694613,11.47046,1.516658,3.496264,5.18197,1115.16626,13448.94,13448.46,0.091895,3865.223,1.135805,100.708774,22.941158,0.723225,3.057892,1.834034,1.849911,1.624113,1.154434,10219700.0,6458842.0,10294180.0,2.123494,0.636972,3.843313,1.173817,2.361138
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,-0.74707,-70.8125,-0.74707,-0.74707,0.0,-1.0,-6.0,-1.0,-1.0,0.0,1.0,1.0,1.0,0.0,0.0,1488579000.0,1488339000.0,1.0,1.0,1.0,0.0,-1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,-0.745371,-2088.012,-0.745315,-0.746908,0.0,-0.692308,-9.0,0.0,-1.0,0.0,1.072727,2.0,1.0,0.122169,152065.0,1485935000.0,1483229000.0,-12.452381,-11.0,-13.0,0.122169,-0.833333
25%,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,153.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,1.0,1.0,2.0,-0.692871,-5.257812,-0.647949,-0.73584,0.039307,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,0.351866,746530.5,1520338000.0,1518432000.0,1.230769,2.0,1.0,0.351091,0.0,26.0,0.0,0.0,0.048872,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,10.0,2.0,3.0,8.0,-0.679118,-65.84727,-0.3472026,-0.743903,0.07668257,0.0,0.0,0.0,0.0,0.0,5.447368,12.0,1.0,2.818309,11928120.0,1514747000.0,1484342000.0,-5.520619,0.0,-12.0,1.376629,-0.051282
50%,3.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,306.0,5.0,0.0,0.0,0.631579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,4.0,1.0,2.0,4.0,-0.641602,-2.6875,-0.513184,-0.723633,0.08905,0.333333,1.0,1.0,0.0,0.0,3.5,4.0,3.0,0.5,2845612.0,1523342000.0,1520263000.0,1.5,2.0,1.0,0.5,0.0,55.0,0.0,0.0,0.433333,0.0,0.0,0.0,0.0,0.005988,0.307692,0.005348,24.0,16.0,2.0,4.0,11.0,-0.621435,-30.38694,0.123548,-0.740296,0.1604841,1.0,15.0,2.0,0.0,0.288104,6.566434,12.0,1.0,3.606741,20158730.0,1518897000.0,1492623000.0,-3.571429,0.0,-8.0,2.2734,-0.015625
75%,4.0,2.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,488.0,9.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.857143,0.090909,9.0,7.0,2.0,3.0,6.0,-0.531738,-1.211914,-0.181885,-0.691895,0.212036,1.0,7.0,2.0,1.0,0.755929,4.0,4.0,3.0,0.534522,4083677.0,1524643000.0,1521240000.0,1.714286,2.0,1.0,0.534522,1.0,111.0,0.090909,5.0,0.978495,0.0,0.013605,0.0,0.012658,1.0,0.842975,0.157895,44.0,24.0,3.0,6.0,15.0,-0.501713,-12.46937,1.277163,-0.731881,0.3617415,1.344262,77.0,6.0,1.0,1.152827,7.709677,12.0,1.0,4.198858,31255460.0,1519670000.0,1504185000.0,-2.034965,0.0,-4.0,3.446564,0.974026
max,5.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2284.0,109.0,1.0,16.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,109.0,40.0,14.0,28.0,24.0,44.1875,109.125,76.6875,44.1875,47.34375,34.62069,1004.0,999.0,12.0,185.476878,12.0,12.0,12.0,7.778175,5343723.0,1525133000.0,1525132000.0,2.0,2.0,2.0,0.707107,12.0,2912.0,1.0,1786.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,398.0,95.0,20.0,67.0,34.0,500883.0,6010596.0,6010604.0,28.55479,1735112.0,79.307692,2413.0,999.0,10.0,276.341994,11.985075,12.0,11.0,7.778175,36626920.0,1519862000.0,1514762000.0,-0.014925,0.0,-1.0,7.0,12.0


# Baseline - Linear Regression Model

In [174]:
def prep(plan, feat, features) :
    return feat['train'][features][plan['train_mask']].fillna(0),\
           feat['train'][features][plan['devel_mask']].fillna(0),\
           feat['test' ][features].fillna(0)

def grade(plan, kind, y_pred, y_test, tag = '', baseline = False) :
    if  baseline :
        plan[    'best_score'] = \
        plan['baseline_score'] = plan['scorer'](plan['y_devel'], y_pred)
        plan[    'best_ytest'] = \
        plan['baseline_ytest'] = y_test
        tag                   += '⭕'

    score   =  plan['scorer'](plan['y_devel'], y_pred)
    improve = (plan['baseline_score'] - score) / plan['baseline_score'] * 100

    if  score <= plan['best_score'] and not baseline :
        plan['best_ytest'] = y_test
        plan['best_score'] = score
        tag               += '⭐'

    print(f'{kind:<17} : Score is {score:6.3f} [{improve:+6.3f}%]' +
         (f' {tag}' if tag else ''))

    return plan

In [181]:
def regression(plan, feat, features, baseline = False) :
    kind    = 'Linear Regression'
    x_train,\
    x_devel,\
    x_test  = prep(plan, feat, features)

    from sklearn.linear_model  import LinearRegression

    model   = LinearRegression() \
                .fit(x_train, plan['y_train'])

    return grade(plan, kind, model.predict(x_devel), model.predict(x_test ), baseline = baseline)

In [182]:
def regression_play(plan, feat) :
    plan = regression(plan, feat, features = ['feature_1', 'feature_2', 'feature_3'               ], baseline = True ) # base features in train          - baseline
    plan = regression(plan, feat, features = ['old_purchase_amount_sum', 'new_purchase_amount_sum'], baseline = False) # old vs new purchase amounts     - hunch
    plan = regression(plan, feat, features = feat['train']._get_numeric_data().columns,              baseline = False) # all numeric engineered features - kitchen sink

    return plan

plan = regression_play(plan, feat)

Linear Regression : Score is  3.777 [+0.000%] ⭕
Linear Regression : Score is  3.789 [-0.306%]
Linear Regression : Score is  3.799 [-0.586%]


# Lasso Model

In [173]:
def lasso(plan, feat, features, alphas) :
    kind    = 'Linear Lasso'
    x_train,\
    x_devel,\
    x_test  = prep(plan, feat, features)

    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import KFold
    from sklearn.linear_model    import Lasso

    params  = [{'alpha' : alphas}]
    folds   = 5

    grid    = GridSearchCV(Lasso(random_state = 0), params, cv = folds, scoring = 'neg_mean_squared_error') \
                .fit(x_train, plan['y_train'])
    model   = grid.best_estimator_

    return grade(plan, kind, model.predict(x_devel), model.predict(x_test ), tag = f'(Alpha = {grid.best_estimator_.alpha:8.5f}) ')

In [183]:
def lasso_play(plan, feat) :
    plan = lasso(plan, feat, features = ['feature_1', 'feature_2', 'feature_3'               ], alphas = [0.00001, 0.001, 0.5, 10])
    plan = lasso(plan, feat, features = ['old_purchase_amount_sum', 'new_purchase_amount_sum'], alphas = np.logspace(-4, -0.5, 30))
  # plan = lasso(plan, feat, features = feat['train']._get_numeric_data().columns,              alphas = [0.001]                  )

    return plan

plan = lasso_play(plan, feat)

Linear Lasso      : Score is  3.777 [-0.000%] (Alpha =  0.00100) 
Linear Lasso      : Score is  3.789 [-0.307%] (Alpha =  0.00010) 


# XGBoost

In [206]:
def xgboost(plan, feat, features, params = {}) :
    kind    = 'XGBoost'
    x_train,\
    x_devel,\
    x_test  = prep(plan, feat, features)

    y_pred  = np.ones(len(x_devel)) * -0.3928
    y_test  = np.ones(len(x_test )) * -0.3928

    return grade(plan, kind, y_pred, y_test)

In [207]:
def xgboost_play(plan, feat) :
    plan = xgboost(plan, feat, features = ['feature_1', 'feature_2', 'feature_3'               ])
    
    return plan

plan = xgboost_play(plan, feat)

XGBoost           : Score is  3.778 [-0.014%]


In [205]:
plan['y_devel'].mean()

-0.3928

# Neural Network

In [198]:
def neural(plan, feat, features, params = {}) :
    kind    = 'Neural Network'
    x_train,\
    x_devel,\
    x_test  = prep(plan, feat, features)

    y_pred  = np.zeros(len(x_devel))
    y_test  = np.zeros(len(x_test ))

    return grade(plan, kind, y_pred, y_test)

In [199]:
def neural_play(plan, feat) :
    plan = neural(plan, feat, features = ['feature_1', 'feature_2', 'feature_3'               ])
    
    return plan

plan = neural_play(plan, feat)

Neural Network    : Score is  3.798 [-0.554%]


# Make Submission

In [200]:
def submit(plan, feat) :

    model  = plan['best_model']
    feats  = plan['best_feats']
    score  = plan['best_score']

    x_test = feat['test'][feats]
    y_pred = model.predict(x_test)
    
    submission = pd.DataFrame({ 'card_id' : feat['test']['card_id'].values,
                                'target'  : y_pred })
    
    display(submission.head())
    
    submission.to_csv('submission.csv', index = False)  

In [201]:
submit(plan, feat)

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-0.427957
1,C_ID_130fd0cbdd,-0.388911
2,C_ID_b709037bc5,-0.463057
3,C_ID_d27d835a9f,-0.314101
4,C_ID_2b5e3df5c2,-0.463057
