### Objective of the notebook:

In this notebook, we will try different parameter combinations for the final lightGBM classfier model to increase model performance.

### Importing libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm 

from sklearn import metrics
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV

import warnings
warnings.filterwarnings("ignore") 

kfold = KFold(n_splits=5, random_state=42)
break_point = datetime(2017, 2, 28)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


### Importing datasets

In [2]:
def read_data():
    
    print('Reading files...')    
    order_df = pd.read_csv('../input/machine_learning_challenge_order_data.csv')
    print('Order data has {} rows and {} columns'.format(order_df.shape[0], order_df.shape[1]))
    label_df = pd.read_csv('../input/machine_learning_challenge_labeled_data.csv')
    print('Label data has {} rows and {} columns'.format(label_df.shape[0], label_df.shape[1]))
    df = order_df.merge(label_df, on='customer_id')
    print('The final data has {} rows and {} columns'.format(df.shape[0], df.shape[1]))
    print("")
    return df

### Change data types and reduce memory usage

In [3]:
def reduce_mem_usage(df, verbose=False):
    
    start_mem = df.memory_usage().sum() / 1024 ** 2
    int_columns = df.select_dtypes(include=["int"]).columns
    float_columns = df.select_dtypes(include=["float"]).columns

    for col in int_columns:
        df[col] = pd.to_numeric(df[col], downcast="integer")

    for col in float_columns:
        df[col] = pd.to_numeric(df[col], downcast="float")

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    print("")
    return df

### Label encode categorical features

In [4]:
def transform_data(df):

    labelencoder = LabelEncoder()

    for i in ['restaurant_id', 'city_id', 'payment_id', 'platform_id', 'transmission_id']:
        df[i] = labelencoder.fit_transform(df[i])

    return df

### Convert raw data to a session format
- Fill order rank with the forward-filling method. 
- Calculate recency and number of days from the first order.
- Get time-related features like the year, month, week, day, day of the week, weekend.
- Add day differences between consecutive orders. 
- Calculate rolling features in 3 days, 1, 2, 4, 12, 24 weeks, and all time.
- Keep the last record of each customer.

In [5]:
def getWeeklyDates(df, break_point):

    df['order_date'] = pd.to_datetime(df['order_date'])
    three_day = df[df['order_date'] >= break_point - timedelta(days=3)]
    one_week = df[df['order_date'] >= break_point - timedelta(days=7)]
    two_week = df[df['order_date'] >= break_point - timedelta(days=14)]
    four_week = df[df['order_date'] >= break_point - timedelta(days=28)]
    twelve_week = df[df['order_date'] >= break_point - timedelta(days=84)]
    twenty_four_week = df[df['order_date'] >= break_point - timedelta(days=168)]
    all_week = df
    return three_day, one_week, two_week, four_week, twelve_week, twenty_four_week,all_week

In [6]:
def feature_engineering(df, break_point):
    
    df['customer_order_rank'] = df['customer_order_rank'].fillna(method='ffill')

    df['date'] = pd.to_datetime(df['order_date']) 
    df['recency'] = (break_point - df['date']) / np.timedelta64(1, 'D')
    df['first_order_date'] = df.groupby(['customer_id'])['date'].transform('first')
    df['age_of_user'] = (break_point - df['first_order_date']) / np.timedelta64(1, 'D')

    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['week'] = df['date'].dt.week
    df['day'] = df['date'].dt.day
    df['dayofweek'] = df['date'].dt.dayofweek
    df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(np.int8)
    
    df['demand'] = 1
    
    df['order_date_shift'] = df.groupby('customer_id')['date'].shift()
    df['date_diff'] = (df['date'] - df['order_date_shift']) / np.timedelta64(1, 'D')

    three_day, one_week, two_week, four_week, twelve_week, twenty_four_week,all_week = getWeeklyDates(df, break_point)
    
    col = ['demand', 'is_failed','voucher_amount','delivery_fee', 'amount_paid', 'date_diff']
    three_day, one_week, two_week, four_week, twelve_week, twenty_four_week,all_week = getWeeklyDates(df, break_point)
    three_day = three_day.groupby('customer_id')[col].mean().add_prefix('three_day_').reset_index()
    one_week = one_week.groupby('customer_id')[col].mean().add_prefix('one_week_').reset_index()
    two_week = two_week.groupby('customer_id')[col].mean().add_prefix('two_week_').reset_index()
    four_week = four_week.groupby('customer_id')[col].mean().add_prefix('four_week_').reset_index()
    twelve_week = twelve_week.groupby('customer_id')[col].mean().add_prefix('twelve_week_').reset_index()
    twenty_four_week = twenty_four_week.groupby('customer_id')[col].mean().add_prefix('twenty_four_week_').reset_index()
    all_week = all_week.groupby('customer_id')[col].mean().add_prefix('all_week_').reset_index()
    
    df = df.groupby('customer_id').last().reset_index()
    df = df.merge(three_day, how='left').merge(one_week, how='left').merge(two_week, how='left').merge(four_week,
    'left').merge(twelve_week,'left').merge(twenty_four_week,'left').merge(all_week,'left').reset_index()

    return df

### Search best parameters for the final lightGBM model and features

We give more weights on class 1 using scale the pos weight parameter to label returned customers more correctly.

In [7]:
def run_lgb(df):
    
    y = df['is_returning_customer']
    X = df.drop(columns=['customer_id', 'order_date', 'date', 'is_returning_customer',
                        'first_order_date', 'index', 'order_date_shift'])    
    
    clf = lightgbm.LGBMClassifier(n_jobs= -1, scale_pos_weight=2)
    
    param_dist = {
                    'max_depth': np.arange(3,15,3),
                    'min_child_weight': np.arange(1,8,1),
                    'colsample_bytree': np.arange(0.3,0.9,0.1),
                    'n_estimators': np.arange(100,1000,100),
                    'learning_rate': np.arange(0.05,0.3,0.05),
                    'num_leaves':  np.arange(10,100,10)
             }
    
    fit_params={"early_stopping_rounds":10,
               "eval_metric" : "auc", 
               "eval_set" : [[X, y]]}

    grid_search = RandomizedSearchCV(clf, 
                         param_distributions = param_dist,
                         cv = kfold,  
                         n_iter = 50,
                         verbose = 0, 
                         n_jobs = -1,
                         fit_params=fit_params)
    
    grid_result = grid_search.fit(X,y)

    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_[ 'mean_test_score' ]
    stds = grid_result.cv_results_[ 'std_test_score' ]
    params = grid_result.cv_results_[ 'params' ]

In [8]:
def execute_pipeline():
    
    df = read_data()
    df = reduce_mem_usage(df, True)
    df = transform_data(df)
    df = feature_engineering(df, break_point)
    run_lgb(df)

execute_pipeline()

Reading files...
Order data has 786600 rows and 13 columns
Label data has 245455 rows and 2 columns
The final data has 786600 rows and 14 columns

Mem. usage decreased to 42.76 Mb (52.5% reduction)

[1]	training's binary_logloss: 0.519133	training's auc: 0.814505
Training until validation scores don't improve for 10 rounds.
[2]	training's binary_logloss: 0.506316	training's auc: 0.817914
[3]	training's binary_logloss: 0.495534	training's auc: 0.818823
[4]	training's binary_logloss: 0.486532	training's auc: 0.819195
[5]	training's binary_logloss: 0.478628	training's auc: 0.819598
[6]	training's binary_logloss: 0.471959	training's auc: 0.819793
[7]	training's binary_logloss: 0.465952	training's auc: 0.820079
[8]	training's binary_logloss: 0.46077	training's auc: 0.820255
[9]	training's binary_logloss: 0.45639	training's auc: 0.820438
[10]	training's binary_logloss: 0.452182	training's auc: 0.820737
[11]	training's binary_logloss: 0.448574	training's auc: 0.82085
[12]	training's binary_lo

[125]	training's binary_logloss: 0.410869	training's auc: 0.832641
[126]	training's binary_logloss: 0.410791	training's auc: 0.8327
[127]	training's binary_logloss: 0.410698	training's auc: 0.832842
[128]	training's binary_logloss: 0.410593	training's auc: 0.832928
[129]	training's binary_logloss: 0.410514	training's auc: 0.833046
[130]	training's binary_logloss: 0.410431	training's auc: 0.833122
[131]	training's binary_logloss: 0.410355	training's auc: 0.833241
[132]	training's binary_logloss: 0.410243	training's auc: 0.833339
[133]	training's binary_logloss: 0.410149	training's auc: 0.833466
[134]	training's binary_logloss: 0.410059	training's auc: 0.833539
[135]	training's binary_logloss: 0.410035	training's auc: 0.833576
[136]	training's binary_logloss: 0.409974	training's auc: 0.833612
[137]	training's binary_logloss: 0.409904	training's auc: 0.833671
[138]	training's binary_logloss: 0.409815	training's auc: 0.833818
[139]	training's binary_logloss: 0.409704	training's auc: 0.8339

[254]	training's binary_logloss: 0.401584	training's auc: 0.841756
[255]	training's binary_logloss: 0.401524	training's auc: 0.841817
[256]	training's binary_logloss: 0.401477	training's auc: 0.841854
[257]	training's binary_logloss: 0.401391	training's auc: 0.841937
[258]	training's binary_logloss: 0.401334	training's auc: 0.841958
[259]	training's binary_logloss: 0.40125	training's auc: 0.842053
[260]	training's binary_logloss: 0.401178	training's auc: 0.842106
[261]	training's binary_logloss: 0.40112	training's auc: 0.842128
[262]	training's binary_logloss: 0.401035	training's auc: 0.842233
[263]	training's binary_logloss: 0.40094	training's auc: 0.842307
[264]	training's binary_logloss: 0.400838	training's auc: 0.842425
[265]	training's binary_logloss: 0.400779	training's auc: 0.842455
[266]	training's binary_logloss: 0.400701	training's auc: 0.84251
[267]	training's binary_logloss: 0.400614	training's auc: 0.842607
[268]	training's binary_logloss: 0.400558	training's auc: 0.842631

[380]	training's binary_logloss: 0.392956	training's auc: 0.849465
[381]	training's binary_logloss: 0.392861	training's auc: 0.849572
[382]	training's binary_logloss: 0.392773	training's auc: 0.849683
[383]	training's binary_logloss: 0.392676	training's auc: 0.849753
[384]	training's binary_logloss: 0.392577	training's auc: 0.849827
[385]	training's binary_logloss: 0.392561	training's auc: 0.849835
[386]	training's binary_logloss: 0.392471	training's auc: 0.849908
[387]	training's binary_logloss: 0.392419	training's auc: 0.84993
[388]	training's binary_logloss: 0.392407	training's auc: 0.849935
[389]	training's binary_logloss: 0.392381	training's auc: 0.849946
[390]	training's binary_logloss: 0.3923	training's auc: 0.850039
[391]	training's binary_logloss: 0.392216	training's auc: 0.850125
[392]	training's binary_logloss: 0.392135	training's auc: 0.85022
[393]	training's binary_logloss: 0.392114	training's auc: 0.850246
[394]	training's binary_logloss: 0.39203	training's auc: 0.850311
