### Objective of the notebook:

In this notebook, we will try different parameter combinations for the final lightGBM classfier model to increase model performance.

### Importing libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm 

from sklearn import metrics
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV

import warnings
warnings.filterwarnings("ignore") 

kfold = KFold(n_splits=5, random_state=42)
break_point = datetime(2017, 2, 28)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


### Importing datasets

In [2]:
def read_data():
    
    print('Reading files...')    
    order_df = pd.read_csv('../input/machine_learning_challenge_order_data.csv')
    print('Order data has {} rows and {} columns'.format(order_df.shape[0], order_df.shape[1]))
    label_df = pd.read_csv('../input/machine_learning_challenge_labeled_data.csv')
    print('Label data has {} rows and {} columns'.format(label_df.shape[0], label_df.shape[1]))
    df = order_df.merge(label_df, on='customer_id')
    print('The final data has {} rows and {} columns'.format(df.shape[0], df.shape[1]))
    print("")
    return df

### Change data types and reduce memory usage

In [3]:
def reduce_mem_usage(df, verbose=False):
    
    start_mem = df.memory_usage().sum() / 1024 ** 2
    int_columns = df.select_dtypes(include=["int"]).columns
    float_columns = df.select_dtypes(include=["float"]).columns

    for col in int_columns:
        df[col] = pd.to_numeric(df[col], downcast="integer")

    for col in float_columns:
        df[col] = pd.to_numeric(df[col], downcast="float")

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    print("")
    return df

### Label encode categorical features

In [4]:
def transform_data(df):

    labelencoder = LabelEncoder()

    for i in ['restaurant_id', 'city_id', 'payment_id', 'platform_id', 'transmission_id']:
        df[i] = labelencoder.fit_transform(df[i])

    return df

### Convert raw data to a session format
- Fill order rank with the forward-filling method. 
- Calculate recency and number of days from the first order.
- Get time-related features like the year, month, week, day, day of the week, weekend.
- Add day differences between consecutive orders. 
- Calculate rolling features in 3 days, 1, 2, 4, 12, 24 weeks, and all time.
- Keep the last record of each customer.

In [5]:
def getWeeklyDates(df, break_point):

    df['order_date'] = pd.to_datetime(df['order_date'])
    three_day = df[df['order_date'] >= break_point - timedelta(days=3)]
    one_week = df[df['order_date'] >= break_point - timedelta(days=7)]
    two_week = df[df['order_date'] >= break_point - timedelta(days=14)]
    four_week = df[df['order_date'] >= break_point - timedelta(days=28)]
    twelve_week = df[df['order_date'] >= break_point - timedelta(days=84)]
    twenty_four_week = df[df['order_date'] >= break_point - timedelta(days=168)]
    all_week = df
    return three_day, one_week, two_week, four_week, twelve_week, twenty_four_week,all_week

In [6]:
def feature_engineering(df, break_point):
    
    df['customer_order_rank'] = df['customer_order_rank'].fillna(method='ffill')

    df['date'] = pd.to_datetime(df['order_date']) 
    df['recency'] = (break_point - df['date']) / np.timedelta64(1, 'D')
    df['first_order_date'] = df.groupby(['customer_id'])['date'].transform('first')
    df['age_of_user'] = (break_point - df['first_order_date']) / np.timedelta64(1, 'D')

    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['week'] = df['date'].dt.week
    df['day'] = df['date'].dt.day
    df['dayofweek'] = df['date'].dt.dayofweek
    df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(np.int8)
    
    df['demand'] = 1
    
    df['order_date_shift'] = df.groupby('customer_id')['date'].shift()
    df['date_diff'] = (df['date'] - df['order_date_shift']) / np.timedelta64(1, 'D')

    three_day, one_week, two_week, four_week, twelve_week, twenty_four_week,all_week = getWeeklyDates(df, break_point)
    
    col = ['demand', 'is_failed','voucher_amount','delivery_fee', 'amount_paid', 'date_diff']
    three_day, one_week, two_week, four_week, twelve_week, twenty_four_week,all_week = getWeeklyDates(df, break_point)
    three_day = three_day.groupby('customer_id')[col].mean().add_prefix('three_day_').reset_index()
    one_week = one_week.groupby('customer_id')[col].mean().add_prefix('one_week_').reset_index()
    two_week = two_week.groupby('customer_id')[col].mean().add_prefix('two_week_').reset_index()
    four_week = four_week.groupby('customer_id')[col].mean().add_prefix('four_week_').reset_index()
    twelve_week = twelve_week.groupby('customer_id')[col].mean().add_prefix('twelve_week_').reset_index()
    twenty_four_week = twenty_four_week.groupby('customer_id')[col].mean().add_prefix('twenty_four_week_').reset_index()
    all_week = all_week.groupby('customer_id')[col].mean().add_prefix('all_week_').reset_index()
    
    df = df.groupby('customer_id').last().reset_index()
    df = df.merge(three_day, how='left').merge(one_week, how='left').merge(two_week, how='left').merge(four_week,
    'left').merge(twelve_week,'left').merge(twenty_four_week,'left').merge(all_week,'left').reset_index()

    return df

### Search best parameters for the final lightGBM model and features

In [7]:
def run_lgb(df):
    
    y = df['is_returning_customer']
    X = df.drop(columns=['customer_id', 'order_date', 'date', 'is_returning_customer',
                        'first_order_date', 'index', 'order_date_shift'])    
    
    clf = lightgbm.LGBMClassifier(n_jobs= -1, scale_pos_weight=2)
    
    param_dist = {
                    'max_depth': np.arange(3,15,3),
                    'min_child_weight': np.arange(1,8,1),
                    'colsample_bytree': np.arange(0.3,0.9,0.1),
                    'n_estimators': np.arange(100,1000,100),
                    'learning_rate': np.arange(0.05,0.3,0.05),
                    'num_leaves':  np.arange(10,100,10)
             }
    
    fit_params={"early_stopping_rounds":10,
               "eval_metric" : "auc", 
               "eval_set" : [[X, y]]}

    grid_search = RandomizedSearchCV(clf, 
                         param_distributions = param_dist,
                         cv = kfold,  
                         n_iter = 50,
                         verbose = 0, 
                         n_jobs = -1,
                         fit_params=fit_params)
    
    grid_result = grid_search.fit(X,y)

    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_[ 'mean_test_score' ]
    stds = grid_result.cv_results_[ 'std_test_score' ]
    params = grid_result.cv_results_[ 'params' ]

In [8]:
def execute_pipeline():
    
    df = read_data()
    df = reduce_mem_usage(df, True)
    df = transform_data(df)
    df = feature_engineering(df, break_point)
    run_lgb(df)

execute_pipeline()

Reading files...
Order data has 786600 rows and 13 columns
Label data has 245455 rows and 2 columns
The final data has 786600 rows and 14 columns

Mem. usage decreased to 42.76 Mb (52.5% reduction)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed: 35.7min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 53.2min finished


[1]	training's binary_logloss: 0.519346	training's auc: 0.812477
Training until validation scores don't improve for 10 rounds.
[2]	training's binary_logloss: 0.506669	training's auc: 0.816096
[3]	training's binary_logloss: 0.496034	training's auc: 0.81683
[4]	training's binary_logloss: 0.48717	training's auc: 0.817118
[5]	training's binary_logloss: 0.47935	training's auc: 0.817511
[6]	training's binary_logloss: 0.472704	training's auc: 0.81774
[7]	training's binary_logloss: 0.466823	training's auc: 0.817878
[8]	training's binary_logloss: 0.461705	training's auc: 0.818008
[9]	training's binary_logloss: 0.457119	training's auc: 0.818308
[10]	training's binary_logloss: 0.453052	training's auc: 0.818426
[11]	training's binary_logloss: 0.449482	training's auc: 0.818492
[12]	training's binary_logloss: 0.446303	training's auc: 0.818648
[13]	training's binary_logloss: 0.443463	training's auc: 0.818838
[14]	training's binary_logloss: 0.440929	training's auc: 0.818916
[15]	training's binary_logl

[129]	training's binary_logloss: 0.417009	training's auc: 0.826923
[130]	training's binary_logloss: 0.416953	training's auc: 0.826996
[131]	training's binary_logloss: 0.416901	training's auc: 0.827061
[132]	training's binary_logloss: 0.416842	training's auc: 0.827133
[133]	training's binary_logloss: 0.416784	training's auc: 0.827212
[134]	training's binary_logloss: 0.416733	training's auc: 0.827257
[135]	training's binary_logloss: 0.416676	training's auc: 0.827314
[136]	training's binary_logloss: 0.416627	training's auc: 0.827369
[137]	training's binary_logloss: 0.416574	training's auc: 0.827405
[138]	training's binary_logloss: 0.416514	training's auc: 0.827485
[139]	training's binary_logloss: 0.416473	training's auc: 0.827544
[140]	training's binary_logloss: 0.416421	training's auc: 0.827587
[141]	training's binary_logloss: 0.416362	training's auc: 0.827639
[142]	training's binary_logloss: 0.416294	training's auc: 0.827714
[143]	training's binary_logloss: 0.41624	training's auc: 0.827

[252]	training's binary_logloss: 0.410879	training's auc: 0.833077
[253]	training's binary_logloss: 0.410829	training's auc: 0.833142
[254]	training's binary_logloss: 0.410796	training's auc: 0.833175
[255]	training's binary_logloss: 0.410749	training's auc: 0.833231
[256]	training's binary_logloss: 0.41072	training's auc: 0.833247
[257]	training's binary_logloss: 0.410662	training's auc: 0.833285
[258]	training's binary_logloss: 0.410615	training's auc: 0.833305
[259]	training's binary_logloss: 0.410596	training's auc: 0.833321
[260]	training's binary_logloss: 0.410531	training's auc: 0.833345
[261]	training's binary_logloss: 0.410483	training's auc: 0.833399
[262]	training's binary_logloss: 0.410424	training's auc: 0.833467
[263]	training's binary_logloss: 0.410387	training's auc: 0.833481
[264]	training's binary_logloss: 0.410338	training's auc: 0.833527
[265]	training's binary_logloss: 0.410283	training's auc: 0.833627
[266]	training's binary_logloss: 0.410227	training's auc: 0.833

[376]	training's binary_logloss: 0.405368	training's auc: 0.838194
[377]	training's binary_logloss: 0.405316	training's auc: 0.838265
[378]	training's binary_logloss: 0.405271	training's auc: 0.838328
[379]	training's binary_logloss: 0.405229	training's auc: 0.838392
[380]	training's binary_logloss: 0.405196	training's auc: 0.838439
[381]	training's binary_logloss: 0.405158	training's auc: 0.838459
[382]	training's binary_logloss: 0.405099	training's auc: 0.838511
[383]	training's binary_logloss: 0.40505	training's auc: 0.838563
[384]	training's binary_logloss: 0.404997	training's auc: 0.838613
[385]	training's binary_logloss: 0.404946	training's auc: 0.838667
[386]	training's binary_logloss: 0.404889	training's auc: 0.838686
[387]	training's binary_logloss: 0.404837	training's auc: 0.838744
[388]	training's binary_logloss: 0.4048	training's auc: 0.838762
[389]	training's binary_logloss: 0.404764	training's auc: 0.838798
[390]	training's binary_logloss: 0.404711	training's auc: 0.83885

[502]	training's binary_logloss: 0.400168	training's auc: 0.843013
[503]	training's binary_logloss: 0.400146	training's auc: 0.843029
[504]	training's binary_logloss: 0.400093	training's auc: 0.843116
[505]	training's binary_logloss: 0.400037	training's auc: 0.843157
[506]	training's binary_logloss: 0.400003	training's auc: 0.843203
[507]	training's binary_logloss: 0.399952	training's auc: 0.843237
[508]	training's binary_logloss: 0.399912	training's auc: 0.843297
[509]	training's binary_logloss: 0.399867	training's auc: 0.843382
[510]	training's binary_logloss: 0.399805	training's auc: 0.843455
[511]	training's binary_logloss: 0.399752	training's auc: 0.843541
[512]	training's binary_logloss: 0.399704	training's auc: 0.843567
[513]	training's binary_logloss: 0.399664	training's auc: 0.843587
[514]	training's binary_logloss: 0.399625	training's auc: 0.843601
[515]	training's binary_logloss: 0.399567	training's auc: 0.843651
[516]	training's binary_logloss: 0.399528	training's auc: 0.84

[629]	training's binary_logloss: 0.394756	training's auc: 0.84801
[630]	training's binary_logloss: 0.39471	training's auc: 0.848068
[631]	training's binary_logloss: 0.394668	training's auc: 0.848131
[632]	training's binary_logloss: 0.394634	training's auc: 0.848163
[633]	training's binary_logloss: 0.394588	training's auc: 0.848219
[634]	training's binary_logloss: 0.394541	training's auc: 0.848242
[635]	training's binary_logloss: 0.394485	training's auc: 0.848289
[636]	training's binary_logloss: 0.394443	training's auc: 0.84833
[637]	training's binary_logloss: 0.394404	training's auc: 0.848379
[638]	training's binary_logloss: 0.394384	training's auc: 0.848392
[639]	training's binary_logloss: 0.394352	training's auc: 0.848409
[640]	training's binary_logloss: 0.394307	training's auc: 0.848447
[641]	training's binary_logloss: 0.394261	training's auc: 0.848471
[642]	training's binary_logloss: 0.394216	training's auc: 0.848497
[643]	training's binary_logloss: 0.394191	training's auc: 0.84852

[755]	training's binary_logloss: 0.389689	training's auc: 0.852548
[756]	training's binary_logloss: 0.389659	training's auc: 0.85257
[757]	training's binary_logloss: 0.389635	training's auc: 0.852578
[758]	training's binary_logloss: 0.389583	training's auc: 0.852605
[759]	training's binary_logloss: 0.389554	training's auc: 0.852621
[760]	training's binary_logloss: 0.38953	training's auc: 0.852632
[761]	training's binary_logloss: 0.389488	training's auc: 0.852674
[762]	training's binary_logloss: 0.389442	training's auc: 0.852716
[763]	training's binary_logloss: 0.389401	training's auc: 0.852767
[764]	training's binary_logloss: 0.389354	training's auc: 0.852833
[765]	training's binary_logloss: 0.389309	training's auc: 0.852882
[766]	training's binary_logloss: 0.389271	training's auc: 0.852901
[767]	training's binary_logloss: 0.389231	training's auc: 0.852936
[768]	training's binary_logloss: 0.38919	training's auc: 0.85297
[769]	training's binary_logloss: 0.389151	training's auc: 0.853005