### Objective of the notebook:

In this notebook, we add the following features: recency, the number of days since the first order, year, month, week, day, day of the week, and an is-weekend flag.

### Importing libraries

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

import lightgbm 

from sklearn import metrics
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings("ignore") 

# Reference date passed to feature_engineering; recency and age_of_user are
# measured in days back from this date.
break_point = datetime(year=2017, month=2, day=28)

# Shared 5-fold splitter used for every cross-validation call in run_lgb;
# shuffling with a fixed seed keeps the folds identical between runs.
kfold = KFold(n_splits=5, random_state=42, shuffle=True)

### Importing datasets

In [2]:
def read_data(order_path='../input/machine_learning_challenge_order_data.csv',
              label_path='../input/machine_learning_challenge_labeled_data.csv'):
    """Load the order and label CSV files and join them on ``customer_id``.

    Parameters
    ----------
    order_path : str or path-like, optional
        Location of the order-level CSV. Defaults to the challenge file under
        ``../input``.
    label_path : str or path-like, optional
        Location of the label CSV. Defaults to the challenge file under
        ``../input``.

    Returns
    -------
    pandas.DataFrame
        The order rows merged with the label columns. The merge uses pandas'
        default inner join, so orders whose ``customer_id`` has no row in the
        label file are dropped.

    Notes
    -----
    Prints the shape of each input and of the merged frame as a progress log.
    """
    print('Reading files...')
    order_df = pd.read_csv(order_path)
    print('Order data has {} rows and {} columns'.format(order_df.shape[0], order_df.shape[1]))
    label_df = pd.read_csv(label_path)
    print('Label data has {} rows and {} columns'.format(label_df.shape[0], label_df.shape[1]))
    df = order_df.merge(label_df, on='customer_id')
    print('The final data has {} rows and {} columns'.format(df.shape[0], df.shape[1]))
    print("")
    return df

### Change data types and reduce memory usage

In [3]:
def reduce_mem_usage(df, verbose=False):
    """Downcast the integer and float columns of ``df`` to save memory.

    The frame is modified in place and also returned, so the call can be used
    either for its side effect or in an assignment.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``"int"`` and ``"float"`` columns should be downcast.
    verbose : bool, default False
        When true, print the resulting memory footprint and the percentage
        saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the numeric columns downcast.
    """
    bytes_per_mb = 1024 ** 2
    mem_before = df.memory_usage().sum() / bytes_per_mb

    # Resolve both column selections before touching any column, so the float
    # selection is made on the untouched frame.
    downcast_plan = (
        ("integer", df.select_dtypes(include=["int"]).columns),
        ("float", df.select_dtypes(include=["float"]).columns),
    )
    for target_kind, columns in downcast_plan:
        for name in columns:
            df[name] = pd.to_numeric(df[name], downcast=target_kind)

    mem_after = df.memory_usage().sum() / bytes_per_mb
    if verbose:
        saved_pct = 100 * (mem_before - mem_after) / mem_before
        print(
            "Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(
                mem_after, saved_pct
            )
        )
    # A blank separator line is printed regardless of `verbose`.
    print("")
    return df

### Label encode categorical features

In [4]:
def transform_data(df):
    """Replace the categorical ID columns of ``df`` with integer label codes.

    The columns ``restaurant_id``, ``city_id``, ``payment_id``,
    ``platform_id`` and ``transmission_id`` are overwritten in place with the
    output of ``LabelEncoder.fit_transform``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the five ID columns listed above.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the ID columns label-encoded.
    """
    categorical_columns = (
        'restaurant_id',
        'city_id',
        'payment_id',
        'platform_id',
        'transmission_id',
    )
    encoder = LabelEncoder()

    for column in categorical_columns:
        # The single encoder is refit on every column, so codes are assigned
        # independently per column.
        df[column] = encoder.fit_transform(df[column])

    return df

### Convert raw data to a session format

- Fill order rank with the forward-filling method.
- Calculate recency and number of days from the first order.
- Get time-related features like the year, month, week, day, day of the week, weekend.
- Keep the last record of each customer.

In [5]:
def feature_engineering(df, break_point):
    """Build per-customer features from order-level rows.

    Adds recency, customer age and calendar features to every order row, then
    collapses the frame to one row per customer using ``groupby(...).last()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Order-level data with at least ``customer_id``, ``order_date`` and
        ``customer_order_rank`` columns. The input frame is not modified.
    break_point : datetime.datetime
        Reference date; ``recency`` and ``age_of_user`` are the number of days
        from the order date / first order date up to this date.

    Returns
    -------
    pandas.DataFrame
        One row per ``customer_id`` (returned as a regular column), carrying
        the last value of every column for that customer.

    Notes
    -----
    ``groupby(...).last()`` takes the last row in frame order, so the rows of
    each customer are assumed to be in chronological order - TODO confirm
    against the input file.
    """
    # Work on a copy so the caller's frame is left untouched and the function
    # can be re-run on the same input.
    df = df.copy()

    # `fillna(method='ffill')` is deprecated in recent pandas; `.ffill()` is
    # the equivalent call.
    # NOTE(review): the forward fill runs over the whole frame in row order,
    # so a gap at the start of a customer's rows inherits the previous
    # customer's rank - confirm this is intended.
    df['customer_order_rank'] = df['customer_order_rank'].ffill()

    df['order_date'] = pd.to_datetime(df['order_date'])
    df['recency'] = (break_point - df['order_date']) / np.timedelta64(1, 'D')
    # 'min' gives the earliest order date even if the rows of a customer are
    # not in chronological order ('first' only did so for sorted input).
    df['first_order_date'] = df.groupby(['customer_id'])['order_date'].transform('min')
    df['age_of_user'] = (break_point - df['first_order_date']) / np.timedelta64(1, 'D')

    order_dt = df['order_date'].dt
    df['year'] = order_dt.year
    df['month'] = order_dt.month
    # `Series.dt.week` was removed in pandas 2.0; `isocalendar().week` is its
    # replacement. It returns a nullable UInt32 column, so cast back to a
    # plain NumPy dtype (float when missing dates are present, as before).
    iso_week = order_dt.isocalendar().week
    if iso_week.isna().any():
        df['week'] = iso_week.astype('float64')
    else:
        df['week'] = iso_week.astype('int64')
    df['day'] = order_dt.day
    df['dayofweek'] = order_dt.dayofweek
    # dayofweek is 0 for Monday, so 5 and 6 are Saturday and Sunday.
    df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(np.int8)

    # Keep a single record per customer.
    df = df.groupby('customer_id').last().reset_index()

    return df

### Run the lgb model and calculate scores


In [26]:
def run_lgb(df):
    """Cross-validate a LightGBM classifier and report ROC-AUC and accuracy.

    Parameters
    ----------
    df : pandas.DataFrame
        Customer-level frame containing the target ``is_returning_customer``
        and the columns ``customer_id``, ``order_date`` and
        ``first_order_date``, which are dropped before training. Every
        remaining column is used as a feature.

    Returns
    -------
    pandas.DataFrame
        A single-row frame (index ``'LGBM'``) with the mean and standard
        deviation of the ROC-AUC and accuracy scores across the folds of the
        module-level ``kfold`` splitter, rounded to 4 decimals. The same frame
        is also printed.
    """
    y = df['is_returning_customer']
    X = df.drop(columns=['customer_id', 'order_date', 'is_returning_customer',
                         'first_order_date'])

    clf = lightgbm.LGBMClassifier(random_state=42, scale_pos_weight=2)

    # Run each cross-validation once and derive both mean and std from the
    # same fold scores, instead of refitting all folds for every statistic.
    auc_scores = cross_val_score(clf, X, y, cv=kfold, scoring="roc_auc")
    acc_scores = cross_val_score(clf, X, y, cv=kfold, scoring="accuracy")

    result = pd.DataFrame(
        {
            'Roc-Auc Mean': auc_scores.mean(),
            'Roc-Auc Std': auc_scores.std(),
            'Accuracy Mean': acc_scores.mean(),
            'Accuracy Std': acc_scores.std(),
        },
        index=['LGBM'],
    ).round(4)
    print(result)
    return result

### Execute all pipeline

The accuracy and ROC-AUC scores are both approximately 0.82 (0.8222 and 0.8161 respectively; see the output below). Adding these features increases the model scores by 2-3%.

In [27]:
def execute_pipeline():
    """Run every stage of the notebook in order, from loading to scoring.

    The stages are: read and merge the CSV files, downcast numeric columns
    (with the memory report enabled), label-encode the ID columns, build the
    customer-level features relative to ``break_point``, and cross-validate
    the LightGBM model.
    """
    orders_raw = read_data()
    orders_compact = reduce_mem_usage(orders_raw, verbose=True)
    orders_encoded = transform_data(orders_compact)
    customers = feature_engineering(orders_encoded, break_point)
    run_lgb(customers)


execute_pipeline()

Reading files...
Order data has 786600 rows and 13 columns
Label data has 245455 rows and 2 columns
The final data has 786600 rows and 14 columns

Mem. usage decreased to 42.76 Mb (52.5% reduction)

      Roc-Auc Mean  Roc-Auc Std  Accuracy Mean  Accuracy Std
LGBM        0.8161       0.0015         0.8222        0.0012


### What is next? 

We will add the day differences between consecutive orders, as well as rolling features over windows of 3 days; 1, 2, 4, 12, and 24 weeks; and all time.