### Objective of the notebook:

In this notebook, we will try different parameter combinations for the final lightGBM classfier model to increase model performance.

### Importing libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm 

from sklearn import metrics
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV

import warnings
warnings.filterwarnings("ignore") 

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
break_point = datetime(2017, 2, 28)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


### Importing datasets

In [2]:
def read_data():
    
    print('Reading files...')    
    order_df = pd.read_csv('../input/machine_learning_challenge_order_data.csv')
    print('Order data has {} rows and {} columns'.format(order_df.shape[0], order_df.shape[1]))
    label_df = pd.read_csv('../input/machine_learning_challenge_labeled_data.csv')
    print('Label data has {} rows and {} columns'.format(label_df.shape[0], label_df.shape[1]))
    df = order_df.merge(label_df, on='customer_id')
    print('The final data has {} rows and {} columns'.format(df.shape[0], df.shape[1]))
    print("")
    return df

### Change data types and reduce memory usage

In [3]:
def reduce_mem_usage(df, verbose=False):
    
    start_mem = df.memory_usage().sum() / 1024 ** 2
    int_columns = df.select_dtypes(include=["int"]).columns
    float_columns = df.select_dtypes(include=["float"]).columns

    for col in int_columns:
        df[col] = pd.to_numeric(df[col], downcast="integer")

    for col in float_columns:
        df[col] = pd.to_numeric(df[col], downcast="float")

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    print("")
    return df

### Label encode categorical features

In [4]:
def transform_data(df):

    labelencoder = LabelEncoder()

    for i in ['restaurant_id', 'city_id', 'payment_id', 'platform_id', 'transmission_id']:
        df[i] = labelencoder.fit_transform(df[i])

    return df

### Convert raw data to a session format
- Fill order rank with the forward-filling method.
- Calculate recency and number of days from the first order.
- Get time-related features like the year, month, week, day, day of the week, weekend.
- Add day differences between consecutive orders.
- Calculate rolling features in 3 days, 1, 2, 4, 12, 24 weeks, and all time.
- Calculate unique customer count and churn rate by restaurant, city.
- Keep the last record of each customer.

In [5]:
def getWeeklyDates(df, break_point):

    df['order_date'] = pd.to_datetime(df['order_date'])
    three_day = df[df['order_date'] >= break_point - timedelta(days=3)]
    one_week = df[df['order_date'] >= break_point - timedelta(days=7)]
    two_week = df[df['order_date'] >= break_point - timedelta(days=14)]
    four_week = df[df['order_date'] >= break_point - timedelta(days=28)]
    twelve_week = df[df['order_date'] >= break_point - timedelta(days=84)]
    twenty_four_week = df[df['order_date'] >= break_point - timedelta(days=168)]
    all_week = df
    return three_day, one_week, two_week, four_week, twelve_week, twenty_four_week,all_week

In [6]:
def feature_engineering(df, break_point):
    
    df['customer_order_rank'] = df['customer_order_rank'].fillna(method='ffill')

    df['order_date'] = pd.to_datetime(df['order_date']) 
    df['recency'] = (break_point - df['order_date']) / np.timedelta64(1, 'D')
    df['first_order_date'] = df.groupby(['customer_id'])['order_date'].transform('first')
    df['age_of_user'] = (break_point - df['first_order_date']) / np.timedelta64(1, 'D')

    df['year'] = df['order_date'].dt.year
    df['month'] = df['order_date'].dt.month
    df['week'] = df['order_date'].dt.week
    df['day'] = df['order_date'].dt.day
    df['dayofweek'] = df['order_date'].dt.dayofweek
    df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(np.int8)
    
    df['demand'] = 1
    
    df['order_date_shift'] = df.groupby('customer_id')['order_date'].shift()
    df['date_diff'] = (df['order_date'] - df['order_date_shift']) / np.timedelta64(1, 'D')

    
    three_day, one_week, two_week, four_week, twelve_week, twenty_four_week,all_week = getWeeklyDates(df, break_point)
    
    col = ['demand', 'is_failed','voucher_amount','delivery_fee', 'amount_paid', 'date_diff']
    three_day = three_day.groupby('customer_id')[col].sum().add_prefix('three_day_').reset_index()
    one_week = one_week.groupby('customer_id')[col].sum().add_prefix('one_week_').reset_index()
    two_week = two_week.groupby('customer_id')[col].sum().add_prefix('two_week_').reset_index()
    four_week = four_week.groupby('customer_id')[col].sum().add_prefix('four_week_').reset_index()
    twelve_week = twelve_week.groupby('customer_id')[col].sum().add_prefix('twelve_week_').reset_index()
    twenty_four_week = twenty_four_week.groupby('customer_id')[col].sum().add_prefix('twenty_four_week_').reset_index()
    all_week = all_week.groupby('customer_id')[col].sum().add_prefix('all_week_').reset_index()
    
    df = df.groupby('customer_id').last().reset_index()
    df = df.merge(three_day, how='left').merge(one_week, how='left').merge(two_week, how='left').merge(four_week,
    'left').merge(twelve_week,'left').merge(twenty_four_week,'left').merge(all_week,'left').reset_index()

    df['city_count'] = df.groupby('city_id')['customer_id'].transform('nunique')
    df['rest_count'] = df.groupby('restaurant_id')['customer_id'].transform('nunique')
    
    df['city_mean'] = df.groupby('city_id')['is_returning_customer'].transform('mean')
    df['rest_mean'] = df.groupby('restaurant_id')['is_returning_customer'].transform('mean')

    return df

### Search best parameters for the final lightGBM model and features

- ROC-AUC: 0.8443 >> 0.846

In [9]:
def run_lgb(df):
    
    y = df['is_returning_customer']
    X = df.drop(columns=['customer_id', 'order_date', 'is_returning_customer',
                        'first_order_date', 'index', 'order_date_shift'])    
    
    clf = lightgbm.LGBMClassifier(random_state=42)
    
    param_dist = {
                    'max_depth': np.arange(3,15,3),
                    'min_child_weight': np.arange(1,8,1),
                    'colsample_bytree': np.arange(0.3,0.9,0.1),
                    'n_estimators': np.arange(100,1000,100),
                    'learning_rate': np.arange(0.05,0.3,0.05),
                    'num_leaves':  np.arange(10,100,10)
             }
    
    fit_params={"early_stopping_rounds":10,
               "eval_metric" : "auc", 
               "eval_set" : [[X, y]]}

    grid_search = RandomizedSearchCV(clf, 
                         param_distributions = param_dist,
                         cv = kfold,  
                         n_iter = 50,
                         verbose = 0, 
                         n_jobs = -1,
                         fit_params=fit_params)
    
    grid_result = grid_search.fit(X,y)

    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_[ 'mean_test_score' ]
    stds = grid_result.cv_results_[ 'std_test_score' ]
    params = grid_result.cv_results_[ 'params' ]

In [10]:
def execute_pipeline():
    
    df = read_data()
    df = reduce_mem_usage(df, True)
    df = transform_data(df)
    df = feature_engineering(df, break_point)
    run_lgb(df)

execute_pipeline()

Reading files...
Order data has 786600 rows and 13 columns
Label data has 245455 rows and 2 columns
The final data has 786600 rows and 14 columns

Mem. usage decreased to 42.76 Mb (52.5% reduction)

[1]	training's auc: 0.796564	training's binary_logloss: 0.520818
Training until validation scores don't improve for 10 rounds.
[2]	training's auc: 0.804678	training's binary_logloss: 0.508937
[3]	training's auc: 0.813406	training's binary_logloss: 0.498518
[4]	training's auc: 0.822379	training's binary_logloss: 0.489253
[5]	training's auc: 0.823235	training's binary_logloss: 0.481081
[6]	training's auc: 0.825287	training's binary_logloss: 0.473743
[7]	training's auc: 0.826134	training's binary_logloss: 0.467
[8]	training's auc: 0.826453	training's binary_logloss: 0.460963
[9]	training's auc: 0.826547	training's binary_logloss: 0.455425
[10]	training's auc: 0.826811	training's binary_logloss: 0.450358
[11]	training's auc: 0.827046	training's binary_logloss: 0.445746
[12]	training's auc: 0.82

[131]	training's auc: 0.844829	training's binary_logloss: 0.370759
[132]	training's auc: 0.844871	training's binary_logloss: 0.370698
[133]	training's auc: 0.844909	training's binary_logloss: 0.37065
[134]	training's auc: 0.844935	training's binary_logloss: 0.370604
[135]	training's auc: 0.844985	training's binary_logloss: 0.370558
[136]	training's auc: 0.845006	training's binary_logloss: 0.370516
[137]	training's auc: 0.845035	training's binary_logloss: 0.370478
[138]	training's auc: 0.845064	training's binary_logloss: 0.370431
[139]	training's auc: 0.845089	training's binary_logloss: 0.370381
[140]	training's auc: 0.845107	training's binary_logloss: 0.370337
[141]	training's auc: 0.845134	training's binary_logloss: 0.370298
[142]	training's auc: 0.845164	training's binary_logloss: 0.370254
[143]	training's auc: 0.845195	training's binary_logloss: 0.370202
[144]	training's auc: 0.845236	training's binary_logloss: 0.37016
[145]	training's auc: 0.845266	training's binary_logloss: 0.3701

[257]	training's auc: 0.847345	training's binary_logloss: 0.367177
[258]	training's auc: 0.847352	training's binary_logloss: 0.367162
[259]	training's auc: 0.847361	training's binary_logloss: 0.367147
[260]	training's auc: 0.847376	training's binary_logloss: 0.367129
[261]	training's auc: 0.847397	training's binary_logloss: 0.36711
[262]	training's auc: 0.847414	training's binary_logloss: 0.367085
[263]	training's auc: 0.847438	training's binary_logloss: 0.367072
[264]	training's auc: 0.847445	training's binary_logloss: 0.367054
[265]	training's auc: 0.847455	training's binary_logloss: 0.367039
[266]	training's auc: 0.847478	training's binary_logloss: 0.367019
[267]	training's auc: 0.847497	training's binary_logloss: 0.366996
[268]	training's auc: 0.847506	training's binary_logloss: 0.36698
[269]	training's auc: 0.847515	training's binary_logloss: 0.366964
[270]	training's auc: 0.847529	training's binary_logloss: 0.366947
[271]	training's auc: 0.847541	training's binary_logloss: 0.3669

[381]	training's auc: 0.848899	training's binary_logloss: 0.365199
[382]	training's auc: 0.848917	training's binary_logloss: 0.365176
[383]	training's auc: 0.848922	training's binary_logloss: 0.365165
[384]	training's auc: 0.848945	training's binary_logloss: 0.365149
[385]	training's auc: 0.848962	training's binary_logloss: 0.365138
[386]	training's auc: 0.848965	training's binary_logloss: 0.365129
[387]	training's auc: 0.848969	training's binary_logloss: 0.365114
[388]	training's auc: 0.848994	training's binary_logloss: 0.365095
[389]	training's auc: 0.849018	training's binary_logloss: 0.365081
[390]	training's auc: 0.84903	training's binary_logloss: 0.365068
[391]	training's auc: 0.849043	training's binary_logloss: 0.365053
[392]	training's auc: 0.849058	training's binary_logloss: 0.365036
[393]	training's auc: 0.84908	training's binary_logloss: 0.365019
[394]	training's auc: 0.849093	training's binary_logloss: 0.365004
[395]	training's auc: 0.849108	training's binary_logloss: 0.3649

[504]	training's auc: 0.850213	training's binary_logloss: 0.36355
[505]	training's auc: 0.850225	training's binary_logloss: 0.363535
[506]	training's auc: 0.850232	training's binary_logloss: 0.363524
[507]	training's auc: 0.850238	training's binary_logloss: 0.363509
[508]	training's auc: 0.850253	training's binary_logloss: 0.363498
[509]	training's auc: 0.85026	training's binary_logloss: 0.363486
[510]	training's auc: 0.850275	training's binary_logloss: 0.363477
[511]	training's auc: 0.850281	training's binary_logloss: 0.363464
[512]	training's auc: 0.850297	training's binary_logloss: 0.36345
[513]	training's auc: 0.850302	training's binary_logloss: 0.363442
[514]	training's auc: 0.850314	training's binary_logloss: 0.363429
[515]	training's auc: 0.850316	training's binary_logloss: 0.363421
[516]	training's auc: 0.850319	training's binary_logloss: 0.363414
[517]	training's auc: 0.850329	training's binary_logloss: 0.363397
[518]	training's auc: 0.850344	training's binary_logloss: 0.36338

[631]	training's auc: 0.851403	training's binary_logloss: 0.362013
[632]	training's auc: 0.85141	training's binary_logloss: 0.362001
[633]	training's auc: 0.851414	training's binary_logloss: 0.361992
[634]	training's auc: 0.851419	training's binary_logloss: 0.361981
[635]	training's auc: 0.851423	training's binary_logloss: 0.361972
[636]	training's auc: 0.851435	training's binary_logloss: 0.361963
[637]	training's auc: 0.851438	training's binary_logloss: 0.361955
[638]	training's auc: 0.851463	training's binary_logloss: 0.361939
[639]	training's auc: 0.851481	training's binary_logloss: 0.361928
[640]	training's auc: 0.851487	training's binary_logloss: 0.361918
[641]	training's auc: 0.851489	training's binary_logloss: 0.361913
[642]	training's auc: 0.851498	training's binary_logloss: 0.361904
[643]	training's auc: 0.851516	training's binary_logloss: 0.36189
[644]	training's auc: 0.851524	training's binary_logloss: 0.361882
[645]	training's auc: 0.851527	training's binary_logloss: 0.3618