# MIDS w207 - Final Project
## Elo Merchant Category Recommendation Kaggle Challenge

### Team 3
- Vinicio De Sola
- Kevin Hanna
- Pri Nonis
- Bradley Nott

In [5]:
import numpy   as np # linear algebra
import pandas  as pd # data processing
import os.path as op # file system access
import os      as os
import gc      as gc
import time    as ti

In [6]:
def root_mean_squared_error(y_true, y_pred) :
    return np.sqrt(((y_pred - y_true) ** 2).mean())

def mean_squared_error(y_true, y_pred) :
    return ((y_pred - y_true) ** 2).mean()

def setup_environment() :
    globals()['csv_base'] = '../input'  if 'working' in os.getcwd() else './input'
    globals()['pkl_base'] = '../pickle' if 'working' in os.getcwd() else './pickle'
    
    os.makedirs(csv_base, exist_ok = True)
    os.makedirs(pkl_base, exist_ok = True)
    
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)
    
    import warnings
    warnings.filterwarnings('ignore')

setup_environment()

# Data Loading and Cleanup

In [3]:
def compress(df, verbose = True) :
    smu = df.memory_usage().sum() / 1024**2 / 8
    con = {'f' : {                                   np.finfo(np.float16).max : np.float16, np.finfo(np.float32).max : np.float32, np.finfo(np.float64).max : np.float64},
           'u' : {np.iinfo(np.uint8).max : np.uint8, np.iinfo(np.uint16).max  : np.uint16,  np.iinfo(np.uint32).max  : np.uint32,  np.iinfo(np.uint64).max  : np.uint64},
           'i' : {np.iinfo(np.int8).max  : np.int8,  np.iinfo(np.int16).max   : np.int16,   np.iinfo(np.int32).max   : np.int32,   np.iinfo(np.int64).max   : np.int64}}

    for c in df.columns :
        if  con.get(df[c].dtype.kind) :
            df[c] = df[c].astype(con[df[c].dtype.kind].get(min((n for n in con[df[c].dtype.kind].keys() if n > max(df[c].max(), abs(df[c].min()))))))

    emu = df.memory_usage().sum() / 1024**2 / 8

    if  verbose :
        print(f'Memory Use Decreased to {emu:5.2f} MB [{100 * (smu - emu) / emu:5.1f}% Reduction]')

    return df, 100 * (smu - emu) / emu

def read(csv_path, dates = [], brize = [], dummy = [], delna = False, index = None, regen = False) :
    pkl_path = op.join(pkl_base, csv_path).replace('.csv', '.pkl')
    csv_path = op.join(csv_base, csv_path)
    srt_time = ti.time()
    
    if  op.exists(pkl_path) and not regen :
        df       = pd.read_pickle(pkl_path)
        csv_path = pkl_path
        rp       = 0.0
    else                    :
        df = pd.read_csv(csv_path, parse_dates = dates, memory_map = True)
        if  index :
            df = df.set_index(index)
        df, rp = compress(df, verbose = False)

        df.to_pickle(pkl_path)

    print(f'Loading : {op.basename(csv_path):>29} in {ti.time()-srt_time:5.1f} Seconds, Shape is {str(df.shape):>14}, Memory Usage is {df.memory_usage().sum() / 1024**2 / 8:6.2f} MB [Reduction of {rp:5.1f} %].')

    return df

def load() :
    data           = {}
    data['train' ] = read('train.csv', dates = ['first_active_month'], index = 'card_id', regen = True)
    data['test'  ] = read('test.csv',  dates = ['first_active_month'], index = 'card_id', regen = True)
  # data['mercs' ] = read('merchants.csv', index = 'merchant_id')
    data['tx_new'] = read('new_merchant_transactions.csv', dates = ['purchase_date'])
    data['tx_old'] = read('historical_transactions.csv',   dates = ['purchase_date'])
    data['target'] = data['train'].pop('target')
    print(f'\nLoading : Done.')

    return data

def clean(data) :
  # replace missing first active month
    data['test'].loc['C_ID_c27b4f80f7', 'first_active_month'] = data['test']['first_active_month'].min()
    
    return data

In [7]:
data = clean(load())

Loading :                     train.csv in   0.7 Seconds, Shape is    (201917, 5), Memory Usage is   0.51 MB [Reduction of 128.6 %].
Loading :                      test.csv in   0.3 Seconds, Shape is    (123623, 4), Memory Usage is   0.28 MB [Reduction of 110.5 %].
Loading : new_merchant_transactions.csv in   8.9 Seconds, Shape is  (1963031, 14), Memory Usage is  14.27 MB [Reduction of  83.6 %].
Loading :   historical_transactions.csv in 112.7 Seconds, Shape is (29112361, 14), Memory Usage is 218.64 MB [Reduction of  77.8 %].

Loading : Done.


# Feature Engineering

In [8]:
def draft(data) :

    np.random.seed(0)
    
    plan = {}
    
    plan['scorer'        ] = root_mean_squared_error
    plan['baseline_model'] = None
    plan['best_model'    ] = None
    plan['baseline_score'] =   0.0
    plan['best_score'    ] = 100.0
    plan['baseline_feats'] = []
    plan['best_feats'    ] = []

    plan['train_size'] = len(data['train' ])
    plan['train_pcnt'] = 0.8
    plan['train_mask'] = np.random.rand(plan['train_size']) < plan['train_pcnt']
    plan['devel_mask'] =               ~plan['train_mask']

    plan['target' ] = data['target']
    plan['y_train'] = data['target'][plan['train_mask']]
    plan['y_devel'] = data['target'][plan['devel_mask']]

    plan['first_active_month_max'] = max(data['train' ].first_active_month.max(),
                                         data['test'  ].first_active_month.max())
    plan['purchase_date_max'     ] = max(data['tx_old'].purchase_date.max(),
                                         data['tx_new'].purchase_date.max())
    plan['purchase_date_ref'     ] =     data['tx_old'].purchase_date.max()

    return plan

In [9]:
plan = draft(data)

In [10]:
def engineer_transactions(tf, prefix) :

  # binarize boolean Y/N flag variables
    tf['category_1'     ] = tf['category_1'     ].eq('Y').mul(1)
    tf['authorized_flag'] = tf['authorized_flag'].eq('Y').mul(1)

  # convert categorical variables to dummy/indicator, preserve original categorical variable
    tf = pd.concat([tf, pd.get_dummies(tf[['category_2', 'category_3']],
                                columns = ['category_2', 'category_3'])], axis = 1)

  # recover purchase history by denormolizing
    tf['month_diff'    ] = (plan['purchase_date_ref'] - tf['purchase_date']) // np.timedelta64(1, 'M') \
                         + (                            tf['month_lag'    ])

  # extract purchase_month from date
    tf['purchase_month'] = tf['purchase_date'].dt.month

  # convert datetime to numerical
    tf['purchase_ndate'] = tf['purchase_date'].astype(np.int64) * 1e-9

    return tf
    
def engineer_transactions_aggregated(tf, prefix, regen = False) :

    pkl_path = op.join(pkl_base, f'engineered_{prefix}_transactions_aggregated.pkl')
    srt_time = ti.time()
    
    if  op.exists(pkl_path) and not regen :

        df = pd.read_pickle(pkl_path)
        
    else :

        tf = engineer_transactions(tf, tf)

      # ─────────────────────────────────────────────────────────────────────────────────────────────────────────────
      # aggregate transactions per card
      # ─────────────────────────────────────────────────────────────────────────────────────────────────────────────

        aggregations = \
        {
            'category_1'           : ['mean', 'sum'],

            'category_2_1.0'       : ['mean'],
            'category_2_2.0'       : ['mean'],
            'category_2_3.0'       : ['mean'],
            'category_2_4.0'       : ['mean'],
            'category_2_5.0'       : ['mean'],
            'category_3_A'         : ['mean'],
            'category_3_B'         : ['mean'],
            'category_3_C'         : ['mean'],

            'merchant_id'          : ['nunique'],
            'merchant_category_id' : ['nunique'],
            'state_id'             : ['nunique'],
            'city_id'              : ['nunique'],
            'subsector_id'         : ['nunique'],

            'purchase_amount'      : ['mean', 'sum', 'max', 'min', 'std'],
            'installments'         : ['mean', 'sum', 'max', 'min', 'std'],
            'purchase_month'       : ['mean',        'max', 'min', 'std'],
            'purchase_ndate'       : [np.ptp,        'max', 'min'       ],
            'month_lag'            : ['mean',        'max', 'min', 'std'],
            'month_diff'           : ['mean'                            ]
        }

        aggregations_kev = \
        {
            'purchase_amount'      : ['sum', 'mean', 'median', 'min', 'max', 'std'],
            'subsector_id'         : ['nunique'],
            'merchant_category_id' : ['nunique'],
            'merchant_id'          : ['nunique'],
            'installments'         : ['sum', 'mean'],
            'city_id'              : ['nunique'],
            'state_id'             : ['nunique'], 
            'category_1'           : ['sum'],
            'category_2_1.0'       : ['sum'],
            'category_2_2.0'       : ['sum'],
            'category_2_3.0'       : ['sum'],
            'category_2_4.0'       : ['sum'],
            'category_2_5.0'       : ['sum'],
            'category_3_A'         : ['sum'],
            'category_3_B'         : ['sum'],
            'category_3_C'         : ['sum'],

            'month_lag'           : ['mean', 'min', 'max', 'std'],
            'authorized_flag'     : ['sum'],
            'month_diff'          : ['mean', 'min', 'max', 'std'],
            'purchase_date'       : [np.ptp, 'min', 'max']
        }

      # add aggregations
        df = tf.groupby(['card_id']).agg(aggregations)
        df.columns = ['_'.join((prefix,) + c) for c in df.columns.values]
        df.reset_index(inplace = True)
       
      # add aggregated transaction count
        tc = tf.groupby('card_id').size().reset_index(name = f'{prefix}_transaction_count')
        df = pd.merge(tc, df, on = 'card_id', how = 'left')

        df.to_pickle(pkl_path)
    
    print(f'Engineering : {prefix:>5} Transactions in {ti.time()-srt_time:5.1f} Seconds.')
    
    return df

In [11]:
def engineer_cards(df, aggs, plan, prefix, regen = False) :

    pkl_path = op.join(pkl_base, f'engineered_{prefix}_cards.pkl')
    srt_time = ti.time()
    
    if  op.exists(pkl_path) and not regen :

        df = pd.read_pickle(pkl_path)

    else :
    
      # convert categorical variables to dummy/indicator, preserve original categorical variable
        df = pd.concat([df, pd.get_dummies(df[['feature_1', 'feature_2']],
                                    columns = ['feature_1', 'feature_2'])], axis = 1)

      # normalized active days of card from first date the shopper made a purchase through Elo
        df['active_days'] = (plan['first_active_month_max'] - df['first_active_month']).dt.days

        for agg in aggs :
            df = pd.merge( df, aggs[agg], on = 'card_id', how = 'left')

        df.to_pickle(pkl_path)
            
    print(f'Engineering : {prefix:>5} Cards        in {ti.time()-srt_time:5.1f} Seconds.')
        
    return df

In [12]:
def engineer(data, plan, delete = False) :

  # extract approved old transactions
    data['tx_app'] = data['tx_old'][data['tx_old']['authorized_flag'] == 'Y']

  # aggregate transaction features
    aggs           = {}
    aggs['tx_new'] = engineer_transactions_aggregated(data['tx_new'], prefix = 'new')
    aggs['tx_old'] = engineer_transactions_aggregated(data['tx_old'], prefix = 'old')
    aggs['tx_app'] = engineer_transactions_aggregated(data['tx_app'], prefix = 'app')

  # join aggregated features to train and test sets
    feat           = {}
    feat['train' ] = engineer_cards(data['train'], aggs, plan, prefix = 'train', regen = True)
    feat['test'  ] = engineer_cards(data['test' ], aggs, plan, prefix = 'test' , regen = True)
  
    if  delete :
        del aggs
        del data
        del globals()['data']
        d = gc.collect()
    
    print(f'\nEngineering : Done.')

    return feat

In [13]:
feat = engineer(data, plan)

Engineering :   new Transactions in  50.6 Seconds.
Engineering :   old Transactions in 218.4 Seconds.
Engineering :   app Transactions in 202.0 Seconds.
Engineering : train Cards        in   2.5 Seconds.
Engineering :  test Cards        in   1.9 Seconds.

Engineering : Done.


In [None]:
feat['train'].head()

In [None]:
print('\n'.join(sorted(feat['train'].columns.values)))

# Baseline - Linear Regression Model

In [53]:
def select(feat, include = [], exclude = []) :
    columns = feat['train'].columns.values
    
    if  include :
        columns = [c for c in columns if c     in include]
    if  exclude :
        columns = [c for c in columns if c not in exclude]
        
    return columns

def prep(plan, feat, include = [], exclude = []) :
    play            = {}
    play['feats'  ] = select(feat, include, exclude)

    play['target' ] = plan['target']                          # full train labels
    play['train'  ] = feat['train' ][play['feats']].fillna(0) # full train data
    play['test'   ] = feat['test'  ][play['feats']].fillna(0) # full test  data

    play['x_train'] = feat['train' ][play['feats']][plan['train_mask']].fillna(0) # train data   split 80%
    play['y_train'] = plan['target'][plan['train_mask']]                          # train labels split 80%

    play['x_devel'] = feat['train' ][play['feats']][plan['devel_mask']].fillna(0) # train data   split 20%
    play['y_devel'] = plan['target'][plan['devel_mask']]                          # train labels split 80%

    play['x_test' ] = feat['test'  ][play['feats'  ]].fillna(0)
    
    return play

def grade(plan, kind, y_pred, y_test, tag = '', baseline = False) :
    if  baseline :
        plan[    'best_score'] = \
        plan['baseline_score'] = plan['scorer'](plan['y_devel'], y_pred)
        plan[    'best_ytest'] = \
        plan['baseline_ytest'] = y_test
        tag                   += '⭕'

    score   =  plan['scorer'](plan['y_devel'], y_pred)
    improve = (plan['baseline_score'] - score) / plan['baseline_score'] * 100

    if  score <= plan['best_score'] and not baseline :
        plan['best_ytest'] = y_test
        plan['best_score'] = score
        tag               += '⭐'

    print(f'{kind:<17} : Score is {score:6.3f} [{improve:+6.3f}%]' +
         (f' {tag}' if tag else ''))

    return plan

In [34]:
def regression(plan, play, opts = {}, baseline = False) :
    kind    = 'Linear Regression'

    from sklearn.linear_model  import LinearRegression

    model   = LinearRegression() \
                .fit(play['x_train'], play['y_train'])

    return grade(plan, kind, model.predict(play['x_devel']), model.predict(play['x_test']), baseline = baseline)

In [35]:
def regression_play(plan, feat) :
    plan = regression(plan, play = prep(plan, feat, include = ['feature_1', 'feature_2', 'feature_3'               ]), baseline = True ) # base features in train          - baseline
    plan = regression(plan, play = prep(plan, feat, include = ['old_purchase_amount_sum', 'new_purchase_amount_sum']), baseline = False) # old vs new purchase amounts     - experiment
    plan = regression(plan, play = prep(plan, feat, exclude = ['card_id', 'first_active_month'                     ]), baseline = False) # all numeric engineered features - kitchen sink

    return plan

plan = regression_play(plan, feat)

Linear Regression : Score is  3.777 [+0.000%] ⭕
Linear Regression : Score is  3.789 [-0.306%]
Linear Regression : Score is  3.799 [-0.586%]


# Lasso Model

In [36]:
def lasso(plan, play, opts) :
    kind    = 'Linear Lasso'

    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import KFold
    from sklearn.linear_model    import Lasso

    params  = [{'alpha' : opts}]
    folds   = 5

    grid    = GridSearchCV(Lasso(random_state = 0), params, cv = folds, scoring = 'neg_mean_squared_error') \
                .fit(play['x_train'], play['y_train'])
    model   = grid.best_estimator_

    return grade(plan, kind, model.predict(play['x_devel']), model.predict(play['x_test']), tag = f'(Alpha = {grid.best_estimator_.alpha:8.5f}) ')

In [37]:
def lasso_play(plan, feat) :
    plan = lasso(plan, play = prep(plan, feat, include = ['feature_1', 'feature_2', 'feature_3'               ]), opts = [0.00001, 0.001, 0.5, 10])
    plan = lasso(plan, play = prep(plan, feat, include = ['old_purchase_amount_sum', 'new_purchase_amount_sum']), opts = np.logspace(-4, -0.5, 30))
  # plan = lasso(plan, play = prep(plan, feat, exclude = ['card_id'                                           ]), opts = [0.001]                  )

    return plan

plan = lasso_play(plan, feat)

Linear Lasso      : Score is  3.777 [-0.000%] (Alpha =  0.00100) 
Linear Lasso      : Score is  3.789 [-0.307%] (Alpha =  0.00010) 


# Light Gradient Boost


In [59]:
def lgm(plan, play, opts) :
    kind = 'Light Gradient Boost'

    from sklearn.model_selection import KFold
    from lightgbm                import Dataset, train
    
    rmse    = 0
    folds   = KFold(n_splits = opts['n_splits'], shuffle = opts['shuffle'], random_state = opts['r_state'])
    y_train = np.zeros(len(play['train']))
    y_test  = np.zeros(len(play['test' ]))

    for f, (tin, vin) in enumerate(folds.split(play['train'].values, play['target'].values)) :
        
        print(f'Fold {f}')
        
        tdf = Dataset(play['train'].iloc[tin], label = play['target'].iloc[tin], categorical_feature = ['feature_2', 'feature_3']) # train data fold
        vdf = Dataset(play['train'].iloc[vin], label = play['target'].iloc[vin], categorical_feature = ['feature_2', 'feature_3']) # valid data fold
    
        clf = train(params = opts['params'],
                    train_set = tdf,
                    num_boost_round = opts['rounds'],
                    valid_sets = [tdf, vdf],
                    verbose_eval = opts['v_eval'],
                    early_stopping_rounds = opts['e_stop'])

        y_train[vin] = clf.predict(play['train'].iloc[vin], num_iteration = clf.best_iteration)
        y_test      += clf.predict(play['test' ],           num_iteration = clf.best_iteration) / folds.n_splits
      # rmse        += mean_squared_error(play['target'].iloc[vin], y_pred) ** 0.5

    y_pred = y_train[plan['y_devel']]

    print(f"CV Score : {mean_squared_error(y_train, play['target']) ** 0.5:<8.5f}")

    return grade(plan, kind, y_pred, y_test)

In [None]:
def lgm_play(plan, feat) :

    opts_elo = \
    {   'n_splits': 5,
        'shuffle' : True,
        'r_state' : 15,
        'rounds'  : 10000,
        'e_stop'  : 200,
        'v_eval'  : 0,
        'params'  :
         {  'num_leaves'       : 111,
            'min_data_in_leaf' : 149, 
            'objective'        :'regression',
            'max_depth'        : 9,
            'learning_rate'    : 0.005,
            'boosting'         : 'gbdt',
            'feature_fraction' : 0.7522,
            'bagging_freq'     : 1,
            'bagging_fraction' : 0.7083 ,
            'bagging_seed'     : 11,
            'metric'           : 'rmse',
            'lambda_l1'        : 0.2634,
            'random_state'     : 133,
            'verbosity'        : -1
         }
    }

    opts_kev = \
    {   'n_splits': 10,
        'shuffle' : True,
        'r_state' : 15,
        'rounds'  : 5000,
        'e_stop'  : 100,
        'v_eval'  : 0,
        'params'  :
        {   'num_leaves' : 125,
            'num_trees'  : 150,
            'objective'  : 'regression',
            'metric'     : 'rmse'
        }
    }
    
    plan = lgm(plan, play = prep(plan, feat, exclude = ['card_id', 'first_active_month']), opts = opts_elo)
    plan = lgm(plan, play = prep(plan, feat, exclude = ['card_id', 'first_active_month']), opts = opts_kev)

    return plan

plan = lgm_play(plan, feat)

Fold 0


# XGBoost

In [None]:
def xgboost(plan, feat, features, params = {}) :
    kind    = 'XGBoost'
    x_train,\
    x_devel,\
    x_test  = prep(plan, feat, features)

    y_pred  = np.ones(len(x_devel)) * -0.3928
    y_test  = np.ones(len(x_test )) * -0.3928

    return grade(plan, kind, y_pred, y_test)

In [None]:
def xgboost_play(plan, feat) :
    plan = xgboost(plan, feat, features = ['feature_1', 'feature_2', 'feature_3'               ])
    
    return plan

plan = xgboost_play(plan, feat)

In [None]:
plan['y_devel'].mean()

# Neural Network

In [None]:
def neural(plan, feat, features, params = {}) :
    kind    = 'Neural Network'
    x_train,\
    x_devel,\
    x_test  = prep(plan, feat, features)

    y_pred  = np.zeros(len(x_devel))
    y_test  = np.zeros(len(x_test ))

    return grade(plan, kind, y_pred, y_test)

In [None]:
def neural_play(plan, feat) :
    plan = neural(plan, feat, features = ['feature_1', 'feature_2', 'feature_3'               ])
    
    return plan

plan = neural_play(plan, feat)

# Make Submission

In [None]:
def submit(plan, feat) :

    model  = plan['best_model']
    feats  = plan['best_feats']
    score  = plan['best_score']

    x_test = feat['test'][feats]
    y_pred = model.predict(x_test)
    
    submission = pd.DataFrame({ 'card_id' : feat['test']['card_id'].values,
                                'target'  : y_pred })
    
    display(submission.head())
    
    submission.to_csv('submission.csv', index = False)  

In [None]:
submit(plan, feat)