##### Note to CnA: Do check the data tab 

# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
import optuna
from datetime import datetime
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn import ensemble
import seaborn as sns
import lightgbm as lgb
from sklearn import preprocessing
import matplotlib.pyplot as plt
import catboost as cb
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import RandomUnderSampler
import warnings
warnings.filterwarnings("ignore")


# Loading Datasets

In [2]:
# Two copies of the training data are loaded. Each carries its own 5-fold
# assignment in a `kfold` column: one split per stacking level.
# NOTE(review): parse_dates=True is passed without index_col; per the pandas
# docs that form only tries to parse the index, so the timestamp columns are
# presumably still read as strings (later cells call .str.split on them).
df_stacking = pd.read_csv('../input/stacking-folds/train_folds (6).csv', parse_dates = True)
df = pd.read_csv('../input/new-kflods-round2/train_folds (4).csv', parse_dates = True)
df_test = pd.read_csv('../input/cascade-cup-22/test.csv', parse_dates = True)
sample_submission = pd.read_csv('../input/cascade-cup-22/sample_submission.csv')

# Working with the Data

In [3]:
# Move the second fold assignment out of df_stacking and append it to the main
# training frame under the name 'stacking_kfold' (rows are matched by index).
df_stacking_new = df_stacking.pop('kfold').rename('stacking_kfold')
df = pd.concat([df, df_stacking_new], axis=1)

In [4]:
# Split every raw timestamp column on the space into a new '<stage>_date'
# column and overwrite '<stage>_time' with the second (time-of-day) part.
# The training frame has five stages, the test frame only the first three.
for stage in ('order', 'allot', 'accept', 'pickup', 'delivered'):
    df[[stage + '_date', stage + '_time']] = df[stage + '_time'].str.split(' ', expand=True)
for stage in ('order', 'allot', 'accept'):
    df_test[[stage + '_date', stage + '_time']] = df_test[stage + '_time'].str.split(' ', expand=True)

In [5]:
# Remove columns that leak the outcome (pickup/delivery/cancellation times)
# together with the date parts that are not used as features.
df = df.drop(columns=[
    'delivered_time', 'pickup_time',
    'accept_date', 'allot_date', 'pickup_date', 'delivered_date', 'cancelled_time',
])
df_test = df_test.drop(columns=['accept_date', 'allot_date'])

In [6]:
# A missing reassigned_order means the order was never reassigned, so use 0.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated and leaves the
# frame untouched when pandas Copy-on-Write is enabled.
for frame in (df, df_test):
    frame['reassigned_order'] = frame['reassigned_order'].fillna(value=0)

In [7]:
# A missing reassignment_method means the order was never reassigned; mark it
# with the literal string 'null' so it can be label-encoded later.
# Assigned back rather than filled in place on the column selection, which is
# deprecated and has no effect under pandas Copy-on-Write.
for frame in (df, df_test):
    frame['reassignment_method'] = frame['reassignment_method'].fillna(value='null')

In [8]:
# A missing reassignment_reason means the order was never reassigned; mark it
# with the literal string 'null' so the mapping in the next cell can encode it.
# Assigned back rather than filled in place on the column selection, which is
# deprecated and has no effect under pandas Copy-on-Write.
for frame in (df, df_test):
    frame['reassignment_reason'] = frame['reassignment_reason'].fillna(value='null')

In [9]:
# Integer codes for reassignment_reason; any value not listed becomes NaN.
reason_codes = {
    'null': 0,
    'Auto Reassignment basis Inaction. coreengine.tasks.repush_order_to_aa_bucket': 1,
    'Reassignment Request from SE portal.': 2,
    'Reassign': 3,
}
df['reassignment_reason'] = df['reassignment_reason'].map(reason_codes)
df_test['reassignment_reason'] = df_test['reassignment_reason'].map(reason_codes)

In [10]:
# Indicator: 1 where accept_time is missing, 0 otherwise
# (the author found this feature to be of little importance).
for frame in (df, df_test):
    frame['not_accepted'] = np.where(frame['accept_time'].isna(), 1, 0)

In [11]:
# Convert the date/time columns to datetime so the .dt accessors used by later
# cells (day name, hour, month, time differences) are available.
# NOTE(review): the *_time columns presumably hold only the time of day after
# the earlier split, so pandas will attach a default date to them.
for frame in (df, df_test):
    for col in ('order_time', 'order_date', 'allot_time', 'accept_time'):
        frame[col] = pd.to_datetime(frame[col])

In [12]:
# Weekday name (e.g. 'Monday') of the order date.
for frame in (df, df_test):
    frame['day'] = frame['order_date'].dt.day_name()

In [13]:
# Indicator: 1 where lifetime_order_count is missing, 0 otherwise.
# It must be computed before the sentinel fills below overwrite the NaNs.
for frame in (df, df_test):
    frame['lifetime_order_count_null'] = np.where(frame['lifetime_order_count'].isna(), 1, 0)

## Dealing with Null values

In [14]:
# Hand-picked imputations found while exploring the data: for a few specific
# riders/orders a missing count is known to be 0.
def _zero_fill_missing(frame, key_col, key, target_col):
    """Set missing ``target_col`` values to 0 on the rows where ``key_col == key``."""
    rows = frame[key_col] == key
    frame.loc[rows, target_col] = frame.loc[rows, target_col].fillna(value=0)


_zero_fill_missing(df, 'rider_id', 12549, 'delivered_orders')
_zero_fill_missing(df_test, 'rider_id', 12549, 'delivered_orders')
_zero_fill_missing(df, 'rider_id', 14345, 'delivered_orders')
_zero_fill_missing(df, 'rider_id', 16749, 'delivered_orders')
_zero_fill_missing(df, 'order_id', 343888, 'alloted_orders')

In [15]:
# Null-pattern group 1: rows where lifetime_order_count, alloted_orders,
# delivered_orders AND session_time are all missing. Their remaining NaNs in
# the four count columns get the sentinel -1000 so tree models can isolate them.
sentinel_cols = ['lifetime_order_count', 'alloted_orders', 'delivered_orders', 'undelivered_orders']
for frame in (df, df_test):
    all_missing = (
        frame['lifetime_order_count'].isna()
        & frame['alloted_orders'].isna()
        & frame['delivered_orders'].isna()
        & frame['session_time'].isna()
    )
    frame.loc[all_missing, sentinel_cols] = frame.loc[all_missing, sentinel_cols].fillna(value=-1000)

In [16]:
# Null-pattern group 2: rows where lifetime_order_count, alloted_orders and
# delivered_orders are all still missing (session_time is not checked here).
# Their remaining NaNs in the four count columns get the sentinel -999.
sentinel_cols = ['lifetime_order_count', 'alloted_orders', 'delivered_orders', 'undelivered_orders']
for frame in (df, df_test):
    counts_missing = (
        frame['lifetime_order_count'].isna()
        & frame['alloted_orders'].isna()
        & frame['delivered_orders'].isna()
    )
    frame.loc[counts_missing, sentinel_cols] = frame.loc[counts_missing, sentinel_cols].fillna(value=-999)

In [17]:
# Null-pattern group 3: rows where lifetime_order_count is still missing.
# A missing delivered_orders on those rows becomes 0 and the missing
# lifetime_order_count itself gets the sentinel -998.
for frame in (df, df_test):
    no_lifetime = frame['lifetime_order_count'].isna()
    frame.loc[no_lifetime, 'delivered_orders'] = frame.loc[no_lifetime, 'delivered_orders'].fillna(value=0)
    frame.loc[no_lifetime, 'lifetime_order_count'] = frame.loc[no_lifetime, 'lifetime_order_count'].fillna(value=-998)

In [18]:
# Fill remaining NaNs with the median of the same rider's other rows
# (groupby + transform keeps the result aligned to the original rows).
# Train and test are each imputed from their own rows only. Riders whose
# values are all missing stay NaN here and are handled by the next cell.
for val in ['alloted_orders','delivered_orders','session_time']:
    df[val] = df[val].fillna(df.groupby('rider_id')[val].transform('median'))
    df_test[val] = df_test[val].fillna(df_test.groupby('rider_id')[val].transform('median'))

In [19]:
# Global median imputation for whatever is still missing. The imputer is fitted
# on the training frame only and then applied to the test frame.
# The imputed columns are re-attached at the END of each frame; the rebuilt
# frames get a fresh 0..n-1 index, so df/df_test are assumed to use one too.
cols_log = ['alloted_orders', 'delivered_orders', 'session_time']
imputer_log = SimpleImputer(strategy='median')
train_iterimp = pd.DataFrame(imputer_log.fit_transform(df[cols_log]), columns=cols_log)
test_iterimp = pd.DataFrame(imputer_log.transform(df_test[cols_log]), columns=cols_log)
df = pd.concat([df.drop(columns=cols_log), train_iterimp], axis=1)
df_test = pd.concat([df_test.drop(columns=cols_log), test_iterimp], axis=1)

In [20]:
# undelivered_orders = alloted_orders - delivered_orders, recomputed for every
# row that is not in one of the sentinel groups (delivered_orders of -999/-1000).
for frame in (df, df_test):
    regular_rows = (frame['delivered_orders'] != -999) & (frame['delivered_orders'] != -1000)
    frame.loc[regular_rows, 'undelivered_orders'] = (
        frame.loc[regular_rows, 'alloted_orders'] - frame.loc[regular_rows, 'delivered_orders']
    )

In [21]:
# Total trip distance = first mile + last mile (of limited value per the author).
for frame in (df, df_test):
    frame['total_distance'] = frame['first_mile_distance'].add(frame['last_mile_distance'])

In [22]:
# Hour of day at which the order was placed. `hours` / `hours_test` are kept
# as separate names because the day-part cell further down reuses them.
hours, hours_test = df['order_time'].dt.hour, df_test['order_time'].dt.hour
df['hours'] = hours
df_test['hours'] = hours_test

In [23]:
# Calendar month (1-12) of the order date.
for frame in (df, df_test):
    frame['month'] = frame['order_date'].dt.month

In [24]:
# Weekend indicator: 1 for Saturday/Sunday orders, 0 for every other day.
for frame in (df, df_test):
    frame['is_weekend'] = frame['day'].isin(['Saturday', 'Sunday']).astype('int64')

In [25]:
# Delay features for the training frame, in seconds:
#   allotment_delay  = allot_time  - order_time
#   acceptance_delay = accept_time - allot_time   (NaN when accept_time is missing)
# Computed as whole-column timedelta arithmetic rather than a Python loop over
# df.loc[i, ...]: same values, far faster, and independent of the index.
# NOTE(review): the *_time columns were reduced to the time-of-day part by the
# earlier split, so these differences ignore the calendar date. The negative
# delays flagged later may be orders crossing midnight; confirm.
allotment_delay = (df['allot_time'] - df['order_time']).dt.total_seconds()
acceptance_delay = (df['accept_time'] - df['allot_time']).dt.total_seconds()
df['allotment_delay'] = allotment_delay
df['acceptance_delay'] = acceptance_delay

In [26]:
# Delay features for the test frame, in seconds:
#   allotment_delay  = allot_time  - order_time
#   acceptance_delay = accept_time - allot_time   (NaN when accept_time is missing)
# Computed as whole-column timedelta arithmetic rather than a Python loop over
# df_test.loc[i, ...]: same values, far faster, and independent of the index.
# NOTE(review): the *_time columns were reduced to the time-of-day part by the
# earlier split, so these differences ignore the calendar date. The negative
# delays flagged later may be orders crossing midnight; confirm.
allotment_delay = (df_test['allot_time'] - df_test['order_time']).dt.total_seconds()
acceptance_delay = (df_test['accept_time'] - df_test['allot_time']).dt.total_seconds()
df_test['allotment_delay'] = allotment_delay
df_test['acceptance_delay'] = acceptance_delay

In [27]:
# Indicator: 1 where acceptance_delay is negative, 0 otherwise (NaN counts as 0).
for frame in (df, df_test):
    frame['ad_neg'] = np.where(frame['acceptance_delay'].lt(0), 1, 0)

In [28]:
# Median-impute the missing acceptance_delay values. The imputer is fitted on
# the training frame only and then applied to the test frame. The imputed
# column is re-attached at the END of each frame; the rebuilt frames get a
# fresh 0..n-1 index, so df/df_test are assumed to use one too.
cols_log = ['acceptance_delay']
imputer_log = SimpleImputer(strategy='median')
train_iterimp = pd.DataFrame(imputer_log.fit_transform(df[cols_log]), columns=cols_log)
test_iterimp = pd.DataFrame(imputer_log.transform(df_test[cols_log]), columns=cols_log)
df = pd.concat([df.drop(columns=cols_log), train_iterimp], axis=1)
df_test = pd.concat([df_test.drop(columns=cols_log), test_iterimp], axis=1)

In [29]:
# Function that maps an hour of the day to the part of the day in which the event took place
def daypart(hour):
    """Return the part of the day that ``hour`` falls into.

    6-10 is "morning", 11-14 is "noon", 15-18 is "afternoon"; every other
    value (including NaN and non-integer hours) is "night".
    """
    buckets = (
        ("morning", (6, 7, 8, 9, 10)),
        ("noon", (11, 12, 13, 14)),
        ("afternoon", (15, 16, 17, 18)),
    )
    for label, members in buckets:
        if hour in members:
            return label
    return "night"
# Label every order with its part of the day, derived from the order hour.
df['dayparts'] = dayparts = hours.apply(daypart)
df_test['dayparts'] = dayparts_test = hours_test.apply(daypart)

In [30]:
# The raw date/time columns have been turned into features; drop them.
raw_time_cols = ['order_time', 'order_date', 'allot_time', 'accept_time']
df = df.drop(columns=raw_time_cols)
df_test = df_test.drop(columns=raw_time_cols)

In [31]:
# Bookkeeping lists of column groups (the author notes this cell is not really
# needed; the modelling cells rebuild their own feature lists).
# NOTE(review): object_cols names some columns ('first_ismore', 'last_ismore',
# 'date', 'combined_reassignment') that are not created in the visible cells.
useless_features = ['order_id', 'cancelled', 'kfold', 'stacking_kfold']
object_cols = ['reassignment_method','reassigned_order','first_ismore','last_ismore','not_accepted','undelivered_orders','month','day','is_weekend','hours','dayparts','date','combined_reassignment']
non_numerical_cols = object_cols + useless_features

_ignored = set(useless_features)
_non_numeric = set(non_numerical_cols)
useful_features = [name for name in df.columns if name not in _ignored]
numerical_cols = [name for name in df.columns if name not in _non_numeric]

In [32]:
# Integer codes for the three string-valued columns; a value missing from its
# mapping becomes NaN.
label_codes = {
    'dayparts': {'night': 0, 'morning': 1, 'noon': 2, 'afternoon': 3},
    'reassignment_method': {'null': 0, 'auto': 1, 'manual': 2},
    'day': {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3,
            'Friday': 4, 'Saturday': 5, 'Sunday': 6},
}
for column, codes in label_codes.items():
    df[column] = df[column].map(codes)
    df_test[column] = df_test[column].map(codes)


In [33]:
# Cast the encoded / count-like columns to integers in both frames
# (raises if any of them still contains NaN).
int_columns = ['undelivered_orders', 'reassigned_order', 'day', 'dayparts', 'reassignment_method']
for frame in (df, df_test):
    for column in int_columns:
        frame[column] = frame[column].astype('int')

In [34]:
# Binning the Last Mile Distance feature (idk how useful this was) 
def bin_last(x):
    """Return the training-set bin label for a last-mile distance ``x``.

    Bins are half-open ``[lower, upper)`` intervals starting just above 0.
    Values that are not greater than 0 (including NaN) and values of 6.27 or
    more get the catch-all label 14.

    The original if/elif chain had a branch ``x >= 2.69 and x < 2.52`` that
    could never be true, so label 6 was never produced and [2.69, 3.14) was
    labelled 7. That unreachable branch is removed here; the labels actually
    returned are unchanged so existing features keep their values.
    NOTE(review): decide whether label 6 was meant to cover part of
    [2.69, 3.14) and adjust the table if so.
    """
    # (exclusive upper bound, label) in ascending order; the lower bound of a
    # bin is the upper bound of the previous one.
    upper_bounds = (
        (0.45, 0), (0.9, 1), (1.34, 2), (1.79, 3), (2.24, 4), (2.69, 5),
        (3.14, 7), (3.59, 8), (4.03, 9), (4.48, 10), (4.93, 11), (5.38, 12),
        (6.27, 13),
    )
    if not x > 0:  # also true for NaN
        return 14
    for upper, label in upper_bounds:
        if x < upper:
            return label
    return 14
# Bin the training last-mile distances with the training bin edges.
df['bin_last'] = df['last_mile_distance'].apply(bin_last)
def bin_last_test(x):
    """Return the test-set bin label for a last-mile distance ``x``.

    Bins are half-open ``[lower, upper)`` intervals starting just above 0 and
    are labelled 0-13. Values that are not greater than 0 (including NaN) and
    values of 5.79 or more get the catch-all label 14.

    The if/elif chain this replaces wrote the lower bound of bin 12 as 4.93,
    but [4.93, 4.96) was already taken by bin 11, so bin 12 is [4.96, 5.38).
    """
    # Exclusive upper bound of each bin; the position in the tuple is the label.
    upper_bounds = (0.41, 0.83, 1.24, 1.65, 2.07, 2.48, 2.90,
                    3.31, 3.72, 4.14, 4.55, 4.96, 5.38, 5.79)
    if not x > 0:  # also true for NaN
        return 14
    for label, upper in enumerate(upper_bounds):
        if x < upper:
            return label
    return 14
# Bin the test last-mile distances with the test-specific bin edges.
# NOTE(review): these edges differ from the training ones, so the same label
# covers a different distance range in train and test; confirm this is intended.
df_test['bin_last'] = df_test['last_mile_distance'].apply(bin_last_test)

# The Magical Feature Engineering Step

 ##### Here I submitted all the features created by this method because I had just one submission left and did not want to risk dropping any. The main score-improving features were 'undelivered_orders_mean' (by far the most important), 'last_mile_distance_min', 'first_mile_distance_min', 'lifetime_order_count_null_sum' and 'lifetime_order_count_null_mean' (chosen by their correlation with the target, computed in a separate notebook).

In [35]:
# Columns to aggregate per rider. 'rider_id' leads both lists because it is
# the grouping key, not a value to aggregate.
numeric_cols = [
    'rider_id',
    'first_mile_distance', 'last_mile_distance',
    'lifetime_order_count', 'alloted_orders', 'delivered_orders',
    'session_time', 'total_distance', 'acceptance_delay', 'allotment_delay',
]
categorical_cols = [
    'rider_id',
    'undelivered_orders',
    'reassignment_method', 'reassignment_reason', 'reassigned_order',
    'day', 'hours', 'dayparts',
    'lifetime_order_count_null', 'ad_neg', 'is_weekend', 'bin_last',
]

# Train / test slices that the aggregation cells below group by rider.
numeric_df, categorical = df[numeric_cols], df[categorical_cols]
numeric_df_test, categorical_test = df_test[numeric_cols], df_test[categorical_cols]

In [36]:
# Per-rider count/mean/max/min/sum of every numeric column (training rows).
# The two-level column labels produced by agg() are flattened to
# '<column>_<stat>'; the grouping key keeps its plain name 'rider_id'.
rider_stats = ['count', 'mean', 'max', 'min', 'sum']
agg = numeric_df.groupby('rider_id').agg(rider_stats).reset_index()
agg.columns = [
    column if not stat else column + '_' + stat
    for column, stat in agg.columns
]
# Attach the rider-level statistics to every order of that rider.
df = df.merge(agg, how='left', on='rider_id')

In [37]:
# Per-rider count/mean/max/min/sum of every numeric column (test rows only).
# The two-level column labels produced by agg() are flattened to
# '<column>_<stat>'; the grouping key keeps its plain name 'rider_id'.
rider_stats = ['count', 'mean', 'max', 'min', 'sum']
agg = numeric_df_test.groupby('rider_id').agg(rider_stats).reset_index()
agg.columns = [
    column if not stat else column + '_' + stat
    for column, stat in agg.columns
]
# Attach the rider-level statistics to every order of that rider.
df_test = df_test.merge(agg, how='left', on='rider_id')

In [38]:
cat = categorical.groupby('rider_id').agg(['sum', 'mean'])
cat.columns = ['rider_id'
              'undelivered_orders_sum','undelivered_orders_mean',
               'reassignment_method_sum','reassignment_method_mean',
               'reassignment_reason_sum','reassignment_reason_mean',
              'reassigned_order_sum','reassigned_order_mean',
              'day_sum','day_mean',
              'hours_sum','hours_mean',
              'dayparts_sum','dayparts_mean',
              'lifetime_order_count_null_sum','lifetime_order_count_null_mean',
              'ad_neg_sum','ad_neg_mean','is_weekend_sum','is_weekend_mean','bin_last_sum','bin_last_mean']
df = df.merge(cat,how = 'left', on = 'rider_id')

In [39]:
cat = categorical_test.groupby('rider_id').agg(['sum', 'mean'])
cat.columns = ['rider_id'
              'undelivered_orders_sum','undelivered_orders_mean',
               'reassignment_method_sum','reassignment_method_mean',
               'reassignment_reason_sum','reassignment_reason_mean',
              'reassigned_order_sum','reassigned_order_mean',
              'day_sum','day_mean',
              'hours_sum','hours_mean',
              'dayparts_sum','dayparts_mean',
              'lifetime_order_count_null_sum','lifetime_order_count_null_mean',
              'ad_neg_sum','ad_neg_mean','is_weekend_sum','is_weekend_mean','bin_last_sum','bin_last_mean']
df_test = df_test.merge(cat,how = 'left', on = 'rider_id')

In [40]:
# agg = df[['rider_id','last_mile_distance','first_mile_distance','total_distance']].groupby('rider_id').agg('min').reset_index()
# df = df.merge(agg,on = 'rider_id',how = 'left')
# cat = df[['rider_id','undelivered_orders']].groupby('rider_id').agg('mean').reset_index()
# df = df.merge(cat,on = 'rider_id',how = 'left')
# cat_new = df[['rider_id','lifetime_order_count_null']].groupby('rider_id').agg(['sum','mean']).reset_index()
# cat_new.columns = ['rider_id','lifeime_order_count_null_sum','lifetime_order_count_null_mean']
# df = df.merge(cat_new,on = 'rider_id',how = 'left')
# agg = df_test[['rider_id','last_mile_distance','first_mile_distance','total_distance']].groupby('rider_id').agg('min').reset_index()
# df_test = df_test.merge(agg,on = 'rider_id',how = 'left')
# cat = df_test[['rider_id','undelivered_orders']].groupby('rider_id').agg('mean').reset_index()
# df_test = df_test.merge(cat,on = 'rider_id',how = 'left')
# cat_new = df_test[['rider_id','lifetime_order_count_null']].groupby('rider_id').agg(['sum','mean']).reset_index()
# cat_new.columns = ['rider_id','lifeime_order_count_null_sum','lifetime_order_count_null_mean']
# df_test = df_test.merge(cat_new,on = 'rider_id',how = 'left')

In [41]:
# Undersampling because the training data is large and imbalanced (plain
# training and oversampling were also tried).
# With a float sampling_strategy, imbalanced-learn resamples the majority class
# until minority/majority equals that ratio (0.5 here); random_state fixes the
# random choice of rows so runs are repeatable.
undersample = RandomUnderSampler(sampling_strategy = 0.5,random_state = 42)

In [42]:
# Used this cell to test various models


# final_predictions = []
# for fold in range(5):
#     xtrain =  df[df.kfold != fold].reset_index(drop=True)
#     xvalid = df[df.kfold == fold].reset_index(drop=True)
#     xtest = df_test.copy()
#     useless_features = ['order_id','cancelled','kfold','stacking_kfold']
#     useful_features = [f for f in df.columns if f not in useless_features]
    


#     ytrain = xtrain['cancelled']
#     yvalid = xvalid['cancelled']
    
#     useless_features = ['order_id','cancelled','kfold','stacking_kfold']
#     useful_features = [f for f in xtrain.columns if f not in useless_features]
#     useful_features_valid = [f for f in xvalid.columns if f not in useless_features]
#     xtrain = xtrain[useful_features]
#     xvalid = xvalid[useful_features_valid]
    
#     useless_features = ['order_id','cancelled','kfold','stacking_kfold']
#     useful_features = [f for f in df_test.columns if f not in useless_features]
    
#     xtest = xtest[useful_features]
        
#     xtrain,ytrain = undersample.fit_resample(xtrain,ytrain)
    
#     model = lgb.LGBMClassifier()

#     model.fit(xtrain, ytrain)
#     #     model = LogisticRegression()
#     #     model.fit(xtrain,ytrain)
#     preds_valid = model.predict_proba(xvalid)
#     test_preds = model.predict_proba(xtest)[:,1]
#     final_predictions.append(test_preds)
#     #     perm_importance = permutation_importance(model,xvalid,yvalid)
#     #     pd.DataFrame(model.get_fscore().items(), columns=['feature','importance']).sort_values('importance', ascending=False)
#     print(model.feature_importances_)
#     plt.figure(figsize = (10,10))
#     plt.barh(useful_features,model.feature_importances_)
#     #     print(model.get_booster().get_fscore(importance_type="gain"))
#     print(fold, roc_auc_score(yvalid, preds_valid[:,1]))
# # import eli5
# # from eli5.sklearn import PermutationImportance
# # perm = PermutationImportance(model, random_state=1).fit(xvalid,yvalid)
# # eli5.show_weights(perm, feature_names = xtrain.columns.tolist())

# Model Stacking

In [43]:
# Level-1 models of the stack. For each of the 5 `kfold` splits every base
# model is trained on the other four folds (after undersampling) and produces
#   * out-of-fold predictions for the held-out fold, stored per order_id, and
#   * predictions for the whole test set, stored once per fold.
# Slots 5 and 6 belong to disabled experiments; their containers stay empty.
final_test_predictions_1 = []
final_valid_predictions_1 = {}
final_test_predictions_2 = []
final_valid_predictions_2 = {}
final_test_predictions_3 = []
final_valid_predictions_3 = {}
final_test_predictions_4 = []
final_valid_predictions_4 = {}
final_test_predictions_5 = []
final_valid_predictions_5 = {}
final_test_predictions_6 = []
final_valid_predictions_6 = {}

# (model class, extra fit() keyword arguments, test-prediction list, OOF dict)
base_models = [
    (ensemble.RandomForestClassifier, {}, final_test_predictions_1, final_valid_predictions_1),
    (lgb.LGBMClassifier, {}, final_test_predictions_2, final_valid_predictions_2),
    (cb.CatBoostClassifier, {'verbose': False}, final_test_predictions_3, final_valid_predictions_3),
    (ensemble.GradientBoostingClassifier, {}, final_test_predictions_4, final_valid_predictions_4),
    # Disabled experiments:
    # (ensemble.AdaBoostClassifier, {}, final_test_predictions_5, final_valid_predictions_5),
    # (XGBClassifier, {}, final_test_predictions_6, final_valid_predictions_6),
]

# One feature list shared by train, validation and test. Previously the test
# features were derived separately from df_test.columns, which did not
# guarantee the same columns in the same order as the training matrix.
# A column missing from df_test now fails loudly with a KeyError.
useless_features = ['order_id','cancelled','kfold','stacking_kfold']
useful_features = [f for f in df.columns if f not in useless_features]
xtest = df_test[useful_features]  # identical for every fold

for fold in range(5):
    train_part = df[df.kfold != fold].reset_index(drop=True)
    valid_part = df[df.kfold == fold].reset_index(drop=True)

    # order_ids of the held-out rows: keys for the out-of-fold predictions.
    valid_ids = valid_part['order_id'].values.tolist()

    ytrain = train_part['cancelled']
    yvalid = valid_part['cancelled']
    xtrain = train_part[useful_features]
    xvalid = valid_part[useful_features]

    # Rebalance the training fold only; validation and test stay untouched.
    xtrain,ytrain = undersample.fit_resample(xtrain,ytrain)

    for model_class, fit_kwargs, test_store, valid_store in base_models:
        model = model_class()
        model.fit(xtrain, ytrain, **fit_kwargs)
        # Column 1 of predict_proba is the probability of class 1.
        valid_store.update(zip(valid_ids, model.predict_proba(xvalid)[:,1]))
        test_store.append(model.predict_proba(xtest)[:,1])


In [44]:
# Turn the level-1 outputs into level-2 features 'pred_1' .. 'pred_4':
#   * df gets each model's out-of-fold prediction, joined on order_id;
#   * df_test gets each model's test prediction averaged over the 5 folds.
# The four copy-pasted stanzas are collapsed into one loop and the submission
# template is read once instead of once per model.
level1_outputs = [
    (1, final_valid_predictions_1, final_test_predictions_1),
    (2, final_valid_predictions_2, final_test_predictions_2),
    (3, final_valid_predictions_3, final_test_predictions_3),
    (4, final_valid_predictions_4, final_test_predictions_4),
    # Disabled experiments:
    # (5, final_valid_predictions_5, final_test_predictions_5),
    # (6, final_valid_predictions_6, final_test_predictions_6),
]

# Supplies the order_id column for the test predictions.
# NOTE(review): predictions are attached to it by position, which assumes its
# rows are in the same order as test.csv; confirm.
submission_template = pd.read_csv('../input/cascade-cup-22/sample_submission.csv')

for model_no, oof_by_order, test_fold_preds in level1_outputs:
    pred_col = 'pred_' + str(model_no)

    # {order_id: prediction} -> two-column frame -> left-join onto df.
    oof_frame = pd.DataFrame.from_dict(oof_by_order, orient="index").reset_index()
    oof_frame.columns = ["order_id", pred_col]
    df = df.merge(oof_frame, on = 'order_id', how = 'left')

    # Average the per-fold test predictions, then left-join onto df_test.
    test_frame = submission_template.copy()
    test_frame['cancelled'] = np.mean(np.column_stack(test_fold_preds), axis=1)
    test_frame.columns = ["order_id", pred_col]
    df_test = df_test.merge(test_frame, on = 'order_id', how = 'left')

In [45]:
# Level-2 model: logistic regression on the four level-1 predictions, validated
# on the separate `stacking_kfold` split. From here on df_test holds only the
# level-1 prediction columns.
useful_features = ["pred_1", "pred_2",'pred_3','pred_4']
df_test = df_test[useful_features]

final_predictions = []
scores = []
for fold in range(5):
    held_out = df['stacking_kfold'] == fold
    xvalid = df[held_out].reset_index(drop=True)
    xtrain = df[~held_out].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain, yvalid = xtrain['cancelled'], xvalid['cancelled']
    xtrain, xvalid = xtrain[useful_features], xvalid[useful_features]

    model = LogisticRegression()
    model.fit(xtrain, ytrain)

    # Column 1 of predict_proba is the probability of class 1.
    preds_valid = model.predict_proba(xvalid)[:,1]
    test_preds = model.predict_proba(xtest)[:,1]
    final_predictions.append(test_preds)
    # Per-fold validation ROC AUC of the stacked model.
    print(fold, roc_auc_score(yvalid, preds_valid))

0 0.9277548760429148
1 0.9224500410644716
2 0.9179198799733788
3 0.9288892812650587
4 0.9241460628260147


# Submission

In [46]:
# Final prediction per test row: mean of the five level-2 fold models.
preds = np.column_stack(final_predictions).mean(axis=1)

In [47]:
# Reload a clean submission template; the earlier merge cell rebound and
# renamed the columns of `sample_submission`.
sample_submission = pd.read_csv("../input/cascade-cup-22/sample_submission.csv")

In [48]:
# Display the averaged test predictions as a quick sanity check.
preds

array([0.03036322, 0.00168552, 0.00122377, ..., 0.00132761, 0.00252757,
       0.00177428])

In [49]:
# Write the predictions into the template and save the submission file.
# NOTE(review): the assignment is positional, so it assumes the template rows
# are in the same order as the test rows the predictions came from; confirm.
sample_submission['cancelled'] = preds
sample_submission.to_csv("submission.csv", index=False)

In [50]:
# Display the final submission frame.
sample_submission

Unnamed: 0,order_id,cancelled
0,130231,0.030363
1,130232,0.001686
2,130233,0.001224
3,130234,0.002594
4,130235,0.000962
...,...,...
144839,41184,0.001171
144840,41185,0.002468
144841,41186,0.001328
144842,41187,0.002528
