In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

import xgboost
from sklearn import metrics
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

kfold = KFold(n_splits=5, random_state=42)
break_point = datetime(2017, 2, 28)

In [2]:
def read_data():
    
    print('Reading files...')    
    order_df = pd.read_csv('../input/machine_learning_challenge_order_data.csv')
    print('Order data has {} rows and {} columns'.format(order_df.shape[0], order_df.shape[1]))
    label_df = pd.read_csv('../input/machine_learning_challenge_labeled_data.csv')
    print('Label data has {} rows and {} columns'.format(label_df.shape[0], label_df.shape[1]))
    df = order_df.merge(label_df, on='customer_id')
    return df

In [3]:
def reduce_mem_usage(df, verbose=False):
    
    start_mem = df.memory_usage().sum() / 1024 ** 2
    int_columns = df.select_dtypes(include=["int"]).columns
    float_columns = df.select_dtypes(include=["float"]).columns

    for col in int_columns:
        df[col] = pd.to_numeric(df[col], downcast="integer")

    for col in float_columns:
        df[col] = pd.to_numeric(df[col], downcast="float")

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

In [4]:
def transform_data(df):

    labelencoder = LabelEncoder()

    for i in ['restaurant_id', 'city_id', 'payment_id', 'platform_id', 'transmission_id']:
        df[i] = labelencoder.fit_transform(df[i])

    return df

In [13]:
def feature_engineering(df, break_point):

    df['customer_order_rank'] = df['customer_order_rank'].fillna(method='ffill')

    df['order_date'] = pd.to_datetime(df['order_date']) 
    df['recency'] = (break_point - df['order_date']) / np.timedelta64(1, 'D')
    df['first_order_date'] = df.groupby(['customer_id'])['order_date'].transform('first')
    df['age_of_user'] = (break_point - df['first_order_date']) / np.timedelta64(1, 'D')

    df['year'] = df['order_date'].dt.year
    df['month'] = df['order_date'].dt.month
    df['week'] = df['order_date'].dt.week
    df['day'] = df['order_date'].dt.day
    df['dayofweek'] = df['order_date'].dt.dayofweek
    df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(np.int8)
   
    df = df.groupby('customer_id').last().reset_index()

    return df

In [14]:
def run_lgb(df):
    
    y = df['is_returning_customer']
    X = df.drop(columns=['customer_id', 'order_date', 'is_returning_customer',
                        'first_order_date'])    
    
    clf = xgboost.XGBClassifier(objective= 'binary:logistic', n_jobs= -1)
    clf.fit(X, y)
    
    y_pred = cross_val_predict(clf, X, y, cv=kfold)
    
    print('Accuracy Score:  ',round(metrics.accuracy_score(y, y_pred), 2))
    print('Roc Auc Score:  ',round(roc_auc_score(y, y_pred), 2))
    print('Classification Report: \n', classification_report(y, y_pred, target_names=['0', '1']))
    return df

In [16]:
def transform_train_and_eval():
    
    df = read_data()
    df = reduce_mem_usage(df, True)
    df = transform_data(df)
    df = feature_engineering(df, break_point)
    run_lgb(df)
    
print(transform_train_and_eval())

Reading files...
Order data has 786600 rows and 13 columns
Label data has 245455 rows and 2 columns
Mem. usage decreased to 42.76 Mb (52.5% reduction)
Accuracy Score:   0.84
Roc Auc Score:   0.69
Classification Report: 
               precision    recall  f1-score   support

           0       0.85      0.96      0.90    189948
           1       0.74      0.43      0.54     55507

   micro avg       0.84      0.84      0.84    245455
   macro avg       0.80      0.69      0.72    245455
weighted avg       0.83      0.84      0.82    245455

None
