In [15]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import seaborn as sns

import xgboost
from sklearn import metrics
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV

import warnings
warnings.filterwarnings("ignore") 

kfold = KFold(n_splits=5, random_state=42)
break_point = datetime(2017, 2, 28)

In [2]:
def read_data():
    
    print('Reading files...')    
    order_df = pd.read_csv('../input/machine_learning_challenge_order_data.csv')
    print('Order data has {} rows and {} columns'.format(order_df.shape[0], order_df.shape[1]))
    label_df = pd.read_csv('../input/machine_learning_challenge_labeled_data.csv')
    print('Label data has {} rows and {} columns'.format(label_df.shape[0], label_df.shape[1]))
    df = order_df.merge(label_df, on='customer_id')
    print('The final data has {} rows and {} columns'.format(df.shape[0], df.shape[1]))
    return df

In [3]:
def reduce_mem_usage(df, verbose=False):
    
    start_mem = df.memory_usage().sum() / 1024 ** 2
    int_columns = df.select_dtypes(include=["int"]).columns
    float_columns = df.select_dtypes(include=["float"]).columns

    for col in int_columns:
        df[col] = pd.to_numeric(df[col], downcast="integer")

    for col in float_columns:
        df[col] = pd.to_numeric(df[col], downcast="float")

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

In [4]:
def getWeeklyDates(df, break_point):

    df['order_date'] = pd.to_datetime(df['order_date'])
    three_day = df[df['order_date'] >= break_point - timedelta(days=3)]
    one_week = df[df['order_date'] >= break_point - timedelta(days=7)]
    two_week = df[df['order_date'] >= break_point - timedelta(days=14)]
    four_week = df[df['order_date'] >= break_point - timedelta(days=28)]
    twelve_week = df[df['order_date'] >= break_point - timedelta(days=84)]
    twenty_four_week = df[df['order_date'] >= break_point - timedelta(days=168)]
    all_week = df
    return three_day, one_week, two_week, four_week, twelve_week, twenty_four_week,all_week

In [5]:
def transform_data(df):

    labelencoder = LabelEncoder()

    for i in ['restaurant_id', 'city_id', 'payment_id', 'platform_id', 'transmission_id']:
        df[i] = labelencoder.fit_transform(df[i])

    return df

In [6]:
def feature_engineering(df, break_point):
    
    df['customer_order_rank'] = df['customer_order_rank'].fillna(method='ffill')

    df['date'] = pd.to_datetime(df['order_date']) 
    df['recency'] = (break_point - df['date']) / np.timedelta64(1, 'D')
    df['first_order_date'] = df.groupby(['customer_id'])['date'].transform('first')
    df['age_of_user'] = (break_point - df['first_order_date']) / np.timedelta64(1, 'D')

    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['week'] = df['date'].dt.week
    df['day'] = df['date'].dt.day
    df['dayofweek'] = df['date'].dt.dayofweek
    df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(np.int8)
    
    df['demand'] = 1
    
    df['order_date_shift'] = df.groupby('customer_id')['date'].shift()
    df['date_diff'] = (df['date'] - df['order_date_shift']) / np.timedelta64(1, 'D')

    three_day, one_week, two_week, four_week, twelve_week, twenty_four_week,all_week = getWeeklyDates(df, break_point)
    
    col = ['demand', 'is_failed','voucher_amount','delivery_fee', 'amount_paid', 'date_diff']
    three_day, one_week, two_week, four_week, twelve_week, twenty_four_week,all_week = getWeeklyDates(df, break_point)
    three_day = three_day.groupby('customer_id')[col].mean().add_prefix('three_day_').reset_index()
    one_week = one_week.groupby('customer_id')[col].mean().add_prefix('one_week_').reset_index()
    two_week = two_week.groupby('customer_id')[col].mean().add_prefix('two_week_').reset_index()
    four_week = four_week.groupby('customer_id')[col].mean().add_prefix('four_week_').reset_index()
    twelve_week = twelve_week.groupby('customer_id')[col].mean().add_prefix('twelve_week_').reset_index()
    twenty_four_week = twenty_four_week.groupby('customer_id')[col].mean().add_prefix('twenty_four_week_').reset_index()
    all_week = all_week.groupby('customer_id')[col].mean().add_prefix('all_week_').reset_index()
    
    df = df.groupby('customer_id').last().reset_index()
    df = df.merge(three_day, how='left').merge(one_week, how='left').merge(two_week, how='left').merge(four_week,
    'left').merge(twelve_week,'left').merge(twenty_four_week,'left').merge(all_week,'left').reset_index()

    return df

In [60]:
def run_xgb(df):
    
    y = df['is_returning_customer']
    X = df.drop(columns=['customer_id', 'order_date', 'date', 'is_returning_customer',
                        'first_order_date', 'index', 'order_date_shift'])    
    
    clf_xgb = xgboost.XGBClassifier(objective = 'binary:logistic', scale_pos_weight=2)
    param_dist = {
              'max_depth': np.arange(3,15,3) ,
              'min_child_weight': np.arange(1,8,1),
                      'colsample_bytree': np.arange(0.3,0.9,0.1),
              'n_estimators': np.arange(100,1000,100),
        'learning_rate'    : np.arange(0.05,0.3,0.05)
        
             }
    
    fit_params={"early_stopping_rounds":10,
               "eval_metric" : "auc", 
            "eval_set" : [[X, y]]}

    grid_search = RandomizedSearchCV(clf_xgb, 
                         param_distributions = param_dist,
                         #cv = kfold,  
                         n_iter = 100,
                         error_score = 0, 
                         verbose = 3, 
                         n_jobs = -1,
                         fit_params=fit_params,)
    
    grid_result = grid_search.fit(X,y)

    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_[ 'mean_test_score' ]
    stds = grid_result.cv_results_[ 'std_test_score' ]
    params = grid_result.cv_results_[ 'params' ]
    print(means, stds, params)
    
    print('Accuracy Score:  ', round(metrics.accuracy_score(y, y_pred), 2))
    print('Roc Auc Score:  ', round(roc_auc_score(y, y_pred), 2))
    print('Classification Report: \n', classification_report(y, y_pred, target_names=['0', '1']))
    return predicted

In [61]:
def transform_train_and_eval():
    
    df = read_data()
    df = reduce_mem_usage(df, True)
    df = transform_data(df)
    df = feature_engineering(df, break_point)
    predicted = run_xgb(df)
    return predicted

predicted = transform_train_and_eval()

Reading files...
Order data has 786600 rows and 13 columns
Label data has 245455 rows and 2 columns
Mem. usage decreased to 42.76 Mb (52.5% reduction)
Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed: 11.5min
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed: 114.1min
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed: 308.6min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 365.1min finished


[0]	validation_0-auc:0.815414
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.834612
[2]	validation_0-auc:0.838115
[3]	validation_0-auc:0.837227
[4]	validation_0-auc:0.841533
[5]	validation_0-auc:0.843863
[6]	validation_0-auc:0.844772
[7]	validation_0-auc:0.846449
[8]	validation_0-auc:0.847587
[9]	validation_0-auc:0.84879
[10]	validation_0-auc:0.849284
[11]	validation_0-auc:0.849706
[12]	validation_0-auc:0.850416
[13]	validation_0-auc:0.850496
[14]	validation_0-auc:0.851395
[15]	validation_0-auc:0.852587
[16]	validation_0-auc:0.853205
[17]	validation_0-auc:0.854107
[18]	validation_0-auc:0.854934
[19]	validation_0-auc:0.855826
[20]	validation_0-auc:0.856726
[21]	validation_0-auc:0.857221
[22]	validation_0-auc:0.857558
[23]	validation_0-auc:0.857869
[24]	validation_0-auc:0.858275
[25]	validation_0-auc:0.858979
[26]	validation_0-auc:0.859724
[27]	validation_0-auc:0.860447
[28]	validation_0-auc:0.861055
[29]	validation_0-auc:0.86141
[30]	validation_0-

[259]	validation_0-auc:0.915817
[260]	validation_0-auc:0.915852
[261]	validation_0-auc:0.915859
[262]	validation_0-auc:0.916141
[263]	validation_0-auc:0.91616
[264]	validation_0-auc:0.916218
[265]	validation_0-auc:0.916238
[266]	validation_0-auc:0.916359
[267]	validation_0-auc:0.916402
[268]	validation_0-auc:0.916411
[269]	validation_0-auc:0.916472
[270]	validation_0-auc:0.916666
[271]	validation_0-auc:0.916828
[272]	validation_0-auc:0.916832
[273]	validation_0-auc:0.916969
[274]	validation_0-auc:0.917024
[275]	validation_0-auc:0.917136
[276]	validation_0-auc:0.917149
[277]	validation_0-auc:0.917164
[278]	validation_0-auc:0.917169
[279]	validation_0-auc:0.917467
[280]	validation_0-auc:0.917477
[281]	validation_0-auc:0.917485
[282]	validation_0-auc:0.917497
[283]	validation_0-auc:0.917559
[284]	validation_0-auc:0.917637
[285]	validation_0-auc:0.917874
[286]	validation_0-auc:0.917877
[287]	validation_0-auc:0.918084
[288]	validation_0-auc:0.918564
[289]	validation_0-auc:0.918673
[290]	val

[516]	validation_0-auc:0.940172
[517]	validation_0-auc:0.940224
[518]	validation_0-auc:0.940241
[519]	validation_0-auc:0.940252
[520]	validation_0-auc:0.940273
[521]	validation_0-auc:0.940413
[522]	validation_0-auc:0.940426
[523]	validation_0-auc:0.940442
[524]	validation_0-auc:0.940466
[525]	validation_0-auc:0.940473
[526]	validation_0-auc:0.94049
[527]	validation_0-auc:0.940509
[528]	validation_0-auc:0.940511
[529]	validation_0-auc:0.940592
[530]	validation_0-auc:0.940717
[531]	validation_0-auc:0.940725
[532]	validation_0-auc:0.940826
[533]	validation_0-auc:0.940979
[534]	validation_0-auc:0.94102
[535]	validation_0-auc:0.941088
[536]	validation_0-auc:0.941377
[537]	validation_0-auc:0.94151
[538]	validation_0-auc:0.941622
[539]	validation_0-auc:0.941656
[540]	validation_0-auc:0.941742
[541]	validation_0-auc:0.941813
[542]	validation_0-auc:0.941906
[543]	validation_0-auc:0.942047
[544]	validation_0-auc:0.942135
[545]	validation_0-auc:0.942281
[546]	validation_0-auc:0.942289
[547]	valid

NameError: name 'y_pred' is not defined