In [44]:
#IMPORT MODULES
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import roc_auc_score
from tsfresh.feature_extraction import feature_calculators
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from joblib import Parallel, delayed
from matplotlib.pyplot import figure
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import shap
%env JOBLIB_TEMP_FOLDER=/tmp

env: JOBLIB_TEMP_FOLDER=/tmp


In [3]:
#DATA PRE-PROCESSING
#Read the 10 training folders 
feat_1 = pd.read_csv('features_1')
feat_2 = pd.read_csv('features_2')
feat_3 = pd.read_csv('features_3')
feat_4 = pd.read_csv('features_4')
feat_5 = pd.read_csv('features_5')
feat_6 = pd.read_csv('features_6')
feat_7 = pd.read_csv('features_7')
feat_8 = pd.read_csv('features_8')
feat_9 = pd.read_csv('features_9')
feat_10 = pd.read_csv('features_10')

In [4]:
#DATA PRE-PROCESSING
#combine the 10 training folders into 1 dataframe 
train_X = feat_1.append(feat_2)
train_X = train_X.append(feat_3)
train_X = train_X.append(feat_4)
train_X = train_X.append(feat_5)
train_X = train_X.append(feat_6)
train_X = train_X.append(feat_7)
train_X = train_X.append(feat_8)
train_X = train_X.append(feat_9)
train_X = train_X.append(feat_10)
train_X = train_X.reset_index(drop=True)
train_X.head()

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
0,1202590843006,3.0,353.0,1.228867,8.9001,3.986968,0.008221,0.002269,-0.009966,1362.0,0.0
1,274877907034,9.293,17.0,0.032775,8.659933,4.7373,0.024629,0.004028,-0.010858,257.0,0.19
2,884763263056,3.0,189.0,1.139675,9.545974,1.951334,-0.006899,-0.01508,0.001122,973.0,0.667059
3,1073741824054,3.9,126.0,3.871543,10.386364,-0.136474,0.001344,-0.339601,-0.017956,902.0,7.913285
4,1056561954943,3.9,50.0,-0.112882,10.55096,-1.56011,0.130568,-0.061697,0.16153,820.0,20.419409


In [5]:
#DATA PRE-PROCESSING(FOR TEST SET)
#REPLACE xxxx with file name
#This is for reading the hold-out test file 
test_X = pd.read_csv(xxx)
test_X.head()

In [None]:
#DATA PRE-PROCESSING
#Checking rows of data
len(train_X)

In [None]:
#DATA PRE-PROCESSING
#Checking number of unique rides
train_X.bookingID.nunique()

In [None]:
#DATA PRE-PROCESSING
#Sort by unique rides, from start to end of trip
train_X = train_X.sort_values(['bookingID','second'])
train_X.head()

In [6]:
#DATA PRE-PROCESSING(FOR TEST SET)
#Sort by unique rides, from start to end of trip 
#Note that unlike train_X, test_X index is jumbled up
test_X = test_X.sort_values(['bookingID','second'])
test_X.head()

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
10835302,0,12.0,143.298294,0.818112,-9.941461,-2.014999,-0.016245,-0.09404,0.070732,0.0,3.442991
12007854,0,8.0,143.298294,0.546405,-9.83559,-2.038925,-0.047092,-0.078874,0.043187,1.0,0.228454
3394723,0,8.0,143.298294,-1.706207,-9.270792,-1.209448,-0.028965,-0.032652,0.01539,2.0,0.228454
436147,0,8.0,143.298294,-1.416705,-9.548032,-1.860977,-0.022413,0.005049,-0.025753,3.0,0.228454
9490986,0,8.0,143.298294,-0.598145,-9.853534,-1.378574,-0.014297,-0.046206,0.021902,4.0,0.228454


In [61]:
#DATA PRE-PROCESSING
#Read file containing labels for each ride 
train_y = pd.read_csv('labels')
train_y.head()

Unnamed: 0,bookingID,label
0,111669149733,0
1,335007449205,1
2,171798691856,0
3,1520418422900,0
4,798863917116,0


In [None]:
#DATA PRE-PROCESSING
#Merge labels with rides so that we can look at the variable distributions for each driver type later
train_X = train_X.merge(train_y, on='bookingID', how='left')
train_X.head()

In [None]:
#DATA PRE-PROCESSING
#Separate training set into dangerous and safe drivers[0=safe,1=dangerous]
train_X_0 = train_X[train_X['label'] == 0]
train_X_1 = train_X[train_X['label'] == 1]

In [None]:
#DATA PRE-PROCESSING
#Checking first 5 rows of safe drivers set
train_X_0.head()

In [None]:
#DATA PRE-PROCESSING
#Checking first 5 rows of dangerous drivers set
train_X_1.head()

In [None]:
#EXPLORATORY DATA ANALYSIS
#Speed distribution for safe & dangerous drivers
figure(num=None, figsize=(16, 12), dpi=80, facecolor='w', edgecolor='k')

plt.subplot(2,1,2)
sns.distplot(train_X_0['Speed'],hist = False,label='Safe')
sns.distplot(train_X_1['Speed'],hist = False,label='Dangerous')
plt.xlabel('Speed',fontsize=9)
locs,labels = plt.xticks()
plt.tick_params(axis='x',which='major',labelsize=6,pad=-6)
plt.tick_params(axis='y',which='major',labelsize=6)
plt.show() 
#We see more outlier on the right side for dangerous drivers
#Speed for dangerous drivers is more erratic

In [None]:
#EXPLORATORY DATA ANALYSIS
#Check speed outliers for both driver types
train_X.boxplot(by='label', column=['Speed'], grid=False)
#As seen in the boxplot, dangerous drivers have more outliers

In [None]:
#EXPLORATORY DATA ANALYSIS
#Check summary speed stats for safe drivers 
train_X_0['Speed'].describe()
#Notice that negative speed values exist
#This may be measurement errors due to GPS inaccuracy

In [None]:
#EXPLORATORY DATA ANALYSIS
#Check summary speed stats for dangerous drivers 
train_X_1['Speed'].describe()
#Negative speed values exist for dangerous drivers as well

In [None]:
#EXPLORATORY DATA ANALYSIS
#Let's plot distribution of negative speed values against GPS accuracy for negative speed occurences only
figure(num=None, figsize=(16, 12), dpi=80, facecolor='w', edgecolor='k')
sns.distplot(train_X[train_X['Speed'] < 0]['Accuracy'],hist = False,label='Accuracy for negative speed')
plt.xlabel('Accuracy',fontsize=9)
locs,labels = plt.xticks()
plt.tick_params(axis='x',which='major',labelsize=6,pad=-6)
plt.tick_params(axis='y',which='major',labelsize=6)
plt.show() 
#Notice the spike of high accuracy values 
#Seems that negative speed values are the result of GPS inaccuracy (denoted by the large accuracy values) 
#Need to verify for positive speed values

In [None]:
#EXPLORATORY DATA ANALYSIS
#Let's plot distribution of GPS accuracy for positive speed occurences only
train_X_pos = train_X[train_X['Speed'] > 0]
figure(num=None, figsize=(16, 12), dpi=80, facecolor='w', edgecolor='k')
sns.distplot(train_X_pos['Accuracy'],hist = False,label='Accuracy')
plt.xlabel('Accuracy',fontsize=9)
locs,labels = plt.xticks()
plt.tick_params(axis='x',which='major',labelsize=6,pad=-6)
plt.tick_params(axis='y',which='major',labelsize=6)
plt.show() 
#As expected, most of them are concentrated in the region where accuracy values are low

In [None]:
#EXPLORATORY DATA ANALYSIS
#Let's plot distribution of accuracy values for all speed values
#Note that this is a combination of the above 2 plots
figure(num=None, figsize=(16, 12), dpi=80, facecolor='w', edgecolor='k')
sns.distplot(train_X['Accuracy'],hist = False,label='Accuracy')
plt.xlabel('Accuracy',fontsize=9)
locs,labels = plt.xticks()
plt.tick_params(axis='x',which='major',labelsize=6,pad=-6)
plt.tick_params(axis='y',which='major',labelsize=6)
plt.show() 

In [None]:
#EXPLORATORY DATA ANALYSIS
#Check summary accuracy stats for all drivers 
train_X['Accuracy'].describe()

In [None]:
#EXPLORATORY DATA ANALYSIS
#Plot gyro_z distribution for safe & dangerous drivers
figure(num=None, figsize=(16, 12), dpi=80, facecolor='w', edgecolor='k')
plt.subplot(2,1,2)
sns.distplot(train_X_0['gyro_z'],hist = False,label='gyro_z for safe drivers')
sns.distplot(train_X_1['gyro_z'],hist = False,label='gyro_z for dangerous drivers')
plt.xlabel('gyro_z',fontsize=9)
locs,labels = plt.xticks()
plt.tick_params(axis='x',which='major',labelsize=6,pad=-6)
plt.tick_params(axis='y',which='major',labelsize=6)
plt.show() 
#distribution seems to be the same for both driver types

In [None]:
#EXPLORATORY DATA ANALYSIS
#Check summary gyro_z stats for safe drivers 
train_X_0['gyro_z'].describe()

In [None]:
#EXPLORATORY DATA ANALYSIS
#Check summary accuracy stats for dangerous drivers 
train_X_1['gyro_z'].describe()

In [None]:
#EXPLORATORY DATA ANALYSIS
#Plot gyro_y distribution for safe & dangerous drivers
figure(num=None, figsize=(16, 12), dpi=80, facecolor='w', edgecolor='k')
plt.subplot(2,1,2)
sns.distplot(train_X_0['gyro_y'],hist = False,label='gyro_y for safe drivers')
sns.distplot(train_X_1['gyro_y'],hist = False,label='gyro_y for dangerous drivers')
plt.xlabel('gyro_y',fontsize=9)
locs,labels = plt.xticks()
plt.tick_params(axis='x',which='major',labelsize=6,pad=-6)
plt.tick_params(axis='y',which='major',labelsize=6)
plt.show() 
#distribution seems to be the same for both driver types

In [None]:
#EXPLORATORY DATA ANALYSIS
#Check summary gyro_y stats for safe drivers 
train_X_0['gyro_y'].describe()

In [None]:
#EXPLORATORY DATA ANALYSIS
#Check summary gyro_y stats for dangerous drivers 
train_X_1['gyro_y'].describe()

In [None]:
#EXPLORATORY DATA ANALYSIS
#Plot gyro_x distribution for safe & dangerous drivers
figure(num=None, figsize=(16, 12), dpi=80, facecolor='w', edgecolor='k')
plt.subplot(2,1,2)
sns.distplot(train_X_0['gyro_x'],hist = False,label='gyro_x for safe drivers')
sns.distplot(train_X_1['gyro_x'],hist = False,label='gyro_x for dangerous drivers')
plt.xlabel('gyro_x',fontsize=9)
locs,labels = plt.xticks()
plt.tick_params(axis='x',which='major',labelsize=6,pad=-6)
plt.tick_params(axis='y',which='major',labelsize=6)
plt.show() 
#distribution seems to be the same for both driver types
#however, values for safe drivers seem to be slightly-higher

In [None]:
#EXPLORATORY DATA ANALYSIS
#Check summary gyro_x stats for safe drivers 
train_X_0['gyro_x'].describe()

In [None]:
#EXPLORATORY DATA ANALYSIS
#Check summary gyro_x stats for dangerous drivers 
train_X_1['gyro_x'].describe()

In [None]:
#EXPLORATORY DATA ANALYSIS
#Plot acc_z distribution for safe & dangerous drivers
figure(num=None, figsize=(16, 12), dpi=80, facecolor='w', edgecolor='k')
plt.subplot(2,1,2)
sns.distplot(train_X_0['acceleration_z'],hist = False,label='acc_z for safe drivers')
sns.distplot(train_X_1['acceleration_z'],hist = False,label='acc_z for dangerous drivers')
plt.xlabel('acc_z',fontsize=9)
locs,labels = plt.xticks()
plt.tick_params(axis='x',which='major',labelsize=6,pad=-6)
plt.tick_params(axis='y',which='major',labelsize=6)
plt.show() 
#distribution seems to be the same for both driver types
#however, values for safe drivers seem to be slightly-higher
#dangerous drivers seem to have slightly-lower peak but fatter tails

In [None]:
#EXPLORATORY DATA ANALYSIS
#Check summary acceleration_z stats for safe drivers 
train_X_0['acceleration_z'].describe()

In [None]:
#EXPLORATORY DATA ANALYSIS
#Check summary acceleration_z stats for dangerous drivers 
train_X_1['acceleration_z'].describe()

In [None]:
#EXPLORATORY DATA ANALYSIS
#Plot acc_y distribution for safe & dangerous drivers
figure(num=None, figsize=(16, 12), dpi=80, facecolor='w', edgecolor='k')
plt.subplot(2,1,2)
sns.distplot(train_X_0['acceleration_y'],hist = False,label='acc_y for safe drivers')
sns.distplot(train_X_1['acceleration_y'],hist = False,label='acc_y for dangerous drivers')
plt.xlabel('acc_y',fontsize=9)
locs,labels = plt.xticks()
plt.tick_params(axis='x',which='major',labelsize=6,pad=-6)
plt.tick_params(axis='y',which='major',labelsize=6)
plt.show() 
#distribution seems to be the same for both driver types
#interesting 2 peaks observed
#however, values for safe drivers seem to be slightly-higher
#dangerous drivers seem to have slightly-lower peaks, and have values shifted slightly to the right

In [None]:
#EXPLORATORY DATA ANALYSIS
#Check summary acceleration_y stats for safe drivers 
train_X_0['acceleration_y'].describe()

In [None]:
#EXPLORATORY DATA ANALYSIS
#Check summary acceleration_y stats for dangerous drivers 
train_X_1['acceleration_y'].describe()
#dangerous drivers have higher max,lower min
#implies abrupt acc & jam-brakes may be large contributor to driver differentiation

In [None]:
#EXPLORATORY DATA ANALYSIS
#Plot acc_x distribution for safe & dangerous drivers
figure(num=None, figsize=(16, 12), dpi=80, facecolor='w', edgecolor='k')
plt.subplot(2,1,2)
sns.distplot(train_X_0['acceleration_x'],hist = False,label='acc_x for safe drivers')
sns.distplot(train_X_1['acceleration_x'],hist = False,label='acc_x for dangerous drivers')
plt.xlabel('acc_x',fontsize=9)
locs,labels = plt.xticks()
plt.tick_params(axis='x',which='major',labelsize=6,pad=-6)
plt.tick_params(axis='y',which='major',labelsize=6)
plt.show() 
#distribution seems to be the same for both driver types
#however, values for safe drivers seem to be slightly-higher
#dangerous drivers seem to have slightly-lower peak but fatter tails

In [None]:
#EXPLORATORY DATA ANALYSIS
#Check summary acceleration_x stats for safe drivers 
train_X_0['acceleration_x'].describe()

In [None]:
#EXPLORATORY DATA ANALYSIS
#Check summary acceleration_x stats for dangerous drivers 
train_X_1['acceleration_x'].describe()
#dangerous drivers have higher acc_x std
#much bigger max 
#much lower min
#implying abrupt acc & dcc during turns may be large contributor to driver type differentiation

In [None]:
#EXPLORATORY DATA ANALYSIS
#Plot bearing distribution for safe & dangerous drivers
figure(num=None, figsize=(16, 12), dpi=80, facecolor='w', edgecolor='k')
plt.subplot(2,1,2)
sns.distplot(train_X_0['Bearing'],hist = False,label='bearing for safe drivers')
sns.distplot(train_X_1['Bearing'],hist = False,label='bearing for dangerous drivers')
plt.xlabel('bearing',fontsize=9)
locs,labels = plt.xticks()
plt.tick_params(axis='x',which='major',labelsize=6,pad=-6)
plt.tick_params(axis='y',which='major',labelsize=6)
plt.show() 
#distribution is the same for both driver types
#result is expected since we don't expect dangerous drivers to prefer certain routes 

In [None]:
#EXPLORATORY DATA ANALYSIS
#Correlation matrix for safe drivers
corr = train_X_0.corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
#EXPLORATORY DATA ANALYSIS
#Correlation matrix for dangerous drivers
corr = train_X_1.corr()
corr.style.background_gradient(cmap='coolwarm')

In [10]:
#DATA PROCESSING
#There exist some bookings with both labels
#Find such bookings & add to to_drop
#we are going to drop these bookings from our training set 
to_drop = []
for i in train_X.bookingID.unique():
    if len(train_y[train_y['bookingID'] == i]) > 1:
        to_drop.append(i)
#check number of bookings with dubious labels        
len(to_drop)        

18

In [11]:
#DATA PROCESSING
#Drop rides with dubious labels from both train_X & train_y
train_y = train_y[~train_y['bookingID'].isin(to_drop)]
train_X = train_X[~train_X['bookingID'].isin(to_drop)]

In [None]:
#DATA PROCESSING
#Check for trips with only negative speeds & add to to_drop
to_drop = []
for i in tqdm(train_X.bookingID.unique()):
    if len(train_X[(train_X.bookingID == i) & (train_X.Speed < 0)]) == len(train_X[train_X.bookingID == i]):
        to_drop.append(i)
#check number of bookings with dubious labels        
len(to_drop)            

In [None]:
#DATA PROCESSING
#Drop rides with only negative speeds from both train_X & train_y
train_y = train_y[~train_y['bookingID'].isin(to_drop)]
train_X = train_X[~train_X['bookingID'].isin(to_drop)]

In [None]:
#DATA PROCESSING
#Check minimum accuracy for these rides with only negative speed
#May use this value as threshold
minimum = 100
for i in to_drop:
    if train_X[train_X.bookingID == i]['Accuracy'].min() < minimum:
        minimum = train_X[train_X.bookingID == i]['Accuracy'].min()
minimum        

In [14]:
#DATA PROCESSING(FOR TEST SET)
#Check for trips with only negative speeds & add to to_drop 
to_drop = []
for i in tqdm(test_X.bookingID.unique()):
    if len(test_X[(test_X.bookingID == i) & (test_X.Speed < 0)]) == len(test_X[test_X.bookingID == i]):
        to_drop.append(i)
#check number of bookings with negative speed       
len(to_drop) 

HBox(children=(IntProgress(value=0, max=19982), HTML(value='')))




27

In [23]:
#DATA PROCESSING(FOR TEST SET)
#Drop rides with only negative speeds from test_X
test_X = test_X[~test_X['bookingID'].isin(to_drop)]

In [47]:
#DATA PROCESSING(FOR TEST SET)
#Reset index for test_X, for applying smooth_feature & negative functions later
#Note that train_X index has already been reset via merge function
test_X.reset_index(drop=True,inplace=True)
test_X.head()

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
0,0,12.0,143.298294,0.818112,-9.941461,-2.014999,-0.016245,-0.09404,0.070732,0.0,3.442991
1,0,8.0,143.298294,0.546405,-9.83559,-2.038925,-0.047092,-0.078874,0.043187,1.0,0.228454
2,0,8.0,143.298294,-1.706207,-9.270792,-1.209448,-0.028965,-0.032652,0.01539,2.0,0.228454
3,0,8.0,143.298294,-1.416705,-9.548032,-1.860977,-0.022413,0.005049,-0.025753,3.0,0.228454
4,0,8.0,143.298294,-0.598145,-9.853534,-1.378574,-0.014297,-0.046206,0.021902,4.0,0.228454


In [49]:
#DATA PROCESSING
#This is a function used to fix inaccurate values due to GPS inaccuracy
#1) For each ride, get dataframe for that ride(df) & dataframe for that ride with rows accuracy > 10(df_filtered)
#2) Pass indices of both dataframes to 2 lists(idx & idx_filtered)
#3) For each index in idx_filtered(inaccurate values), search for nearest 2 indices that are in idx(accurate values), subjected to idx range
#4) Using the index in idx_filtered ,range between nearest 2 indices in idx & range between idx_filtered index & 1st nearest idx, use formula to set new value for inaccurate value 
#5) If either of nearest 2 indices is not found, entire row containing the inaccurate value is dropped
#6) The formula is designed such that inaccurate values will be replaced with accurate values that are gradually increasing/decreasing between nearest 2 accurate values 
def smooth_features(input_X,bookingID):
    df = input_X[input_X.bookingID == bookingID]
    df_filtered = input_X[(input_X.bookingID == bookingID) & (input_X.Accuracy > 10)]
    idx_filtered = df_filtered.index.values.tolist()
    idx = df.index.values.tolist()
        
    for i in idx_filtered:
        increment_start = 1
        increment_end = 1
        start = i
        end = i
        while (idx[0] <= start-increment_start < idx[-1]) & (start-increment_start in idx_filtered):
            increment_start += 1   
        start -= increment_start     
        if start >= idx[0]:
            while (idx[0] < end+increment_end <= idx[-1]) & (end+increment_end in idx_filtered):
                increment_end += 1
            end += increment_end    
            if end <= idx[-1]:
                df.loc[i,'Speed'] = ((df['Speed'][end]-df['Speed'][start])/(end-start))*(i-start)+df['Speed'][start]  
                df.loc[i,'acceleration_x'] = ((df['acceleration_x'][end]-df['acceleration_x'][start])/(end-start))*(i-start)+df['acceleration_x'][start] 
                df.loc[i,'acceleration_y'] = ((df['acceleration_y'][end]-df['acceleration_y'][start])/(end-start))*(i-start)+df['acceleration_y'][start] 
                df.loc[i,'acceleration_z'] = ((df['acceleration_z'][end]-df['acceleration_z'][start])/(end-start))*(i-start)+df['acceleration_z'][start] 
                df.loc[i,'gyro_x'] = ((df['gyro_x'][end]-df['gyro_x'][start])/(end-start))*(i-start)+df['gyro_x'][start] 
                df.loc[i,'gyro_y'] = ((df['gyro_y'][end]-df['gyro_y'][start])/(end-start))*(i-start)+df['gyro_y'][start] 
                df.loc[i,'gyro_z'] = ((df['gyro_z'][end]-df['gyro_z'][start])/(end-start))*(i-start)+df['gyro_z'][start]            
            else:
                df.drop([i],inplace=True)
        else:
            df.drop([i],inplace=True)
    return df

#Aply smoothing to train set
#Create a dataframe for new training set values
#For each ride, smooth variables & append to new dataframe
#I used parallel processing since it takes a while to finish running
train_X_new = pd.DataFrame(columns=['bookingID','Accuracy','Bearing','acceleration_x','acceleration_y','acceleration_z','gyro_x','gyro_y','gyro_z','second','Speed','label'],dtype=np.int64)
features = Parallel(n_jobs=-1, verbose=2)(delayed(smooth_features)(train_X,i) for i in train_y.bookingID.unique()) 
for i, feat in tqdm(enumerate(features)):
    train_X_new = train_X_new.append(feat)
    
del features
gc.collect()
      

In [50]:
#Apply smoothing to test set(FOR TEST SET)
#Create a dataframe for new test set values 
#For each ride, smooth variables & append to new dataframe
#I used parallel processing since it takes a while to finish running
test_X_new = pd.DataFrame(columns=['bookingID','Accuracy','Bearing','acceleration_x','acceleration_y','acceleration_z','gyro_x','gyro_y','gyro_z','second','Speed','label'],dtype=np.int64)
features = Parallel(n_jobs=-1, verbose=2)(delayed(smooth_features)(test_X,i) for i in test_X.bookingID.unique()) 
for i, feat in tqdm(enumerate(features)):
    test_X_new = test_X_new.append(feat)
    
del features
gc.collect()
      

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:  4.5min


KeyboardInterrupt: 

In [None]:
#DATA PROCESSING
#Check number of rides left in new training set
train_X_new.bookingID.unique()

In [None]:
#DATA PROCESSING
#Check first 5 rows of new training set
train_X_new.head()

In [None]:
#DATA PROCESSING
#Save new training set to csv file, in case kernel is accidently-disconnected
train_X_new.to_csv('train_X_new.csv')

In [None]:
#DATA PROCESSING(FOR TEST SET)
#Save new test set to csv file, in case kernel is accidently-disconnected
test_X_new.to_csv('test_X_new.csv')

In [None]:
#DATA PROCESSING
#Reading new training set file
train_X_new = pd.read_csv('train_X_new.csv')
train_X_new.set_index('Unnamed: 0',inplace=True)
train_X_new.head()

In [None]:
#DATA PROCESSING(FOR TEST SET)
#Reading new test set file
test_X_new = pd.read_csv('test_X_new.csv')
test_X_new.set_index('Unnamed: 0',inplace=True)
test_X_new.head()

In [None]:
#DATA PROCESSING
#This is a function similar to the above, used to fix negative Speed values 
#I can't do both together since some negative speed values have low accuracy values
#1) For each ride, get dataframe for that ride(df) & dataframe for that ride with rows speed < 0(df_filtered)
#2) Pass indices of both dataframes to 2 lists(idx & idx_filtered)
#3) For each index in idx_filtered(negative speed values), search for nearest 2 indices that are in idx(positive speed values), subjected to idx range
#4) Using the index in idx_filtered ,range between nearest 2 indices in idx & range between idx_filtered index & 1st nearest idx, use formula to set new value for negative speed value 
#5) If either of nearest 2 indices is not found, entire row containing the negative speed value is dropped
#6) The formula is designed such that negative speed values will be replaced with speed values that are gradually increasing/decreasing between nearest 2 positive speed values 
def negative_features(input_X,bookingID):
    df = input_X[input_X.bookingID == bookingID]
    df_filtered = input_X[(input_X.bookingID == bookingID) & (input_X.Speed < 0)]
    idx_filtered = df_filtered.index.values.tolist()
    idx = df.index.values.tolist()
        
    for i in idx_filtered:
        increment_start = 1
        increment_end = 1
        start = i
        end = i
        while (idx[0] <= start-increment_start < idx[-1]) & (start-increment_start in idx_filtered):
            increment_start += 1   
        start -= increment_start     
        if start >= idx[0]:
            while (idx[0] < end+increment_end <= idx[-1]) & (end+increment_end in idx_filtered):
                increment_end += 1
            end += increment_end    
            if end <= idx[-1]:
                df.loc[i,'Speed'] = ((df['Speed'][end]-df['Speed'][start])/(end-start))*(i-start)+df['Speed'][start]  
                df.loc[i,'acceleration_x'] = ((df['acceleration_x'][end]-df['acceleration_x'][start])/(end-start))*(i-start)+df['acceleration_x'][start] 
                df.loc[i,'acceleration_y'] = ((df['acceleration_y'][end]-df['acceleration_y'][start])/(end-start))*(i-start)+df['acceleration_y'][start] 
                df.loc[i,'acceleration_z'] = ((df['acceleration_z'][end]-df['acceleration_z'][start])/(end-start))*(i-start)+df['acceleration_z'][start] 
                df.loc[i,'gyro_x'] = ((df['gyro_x'][end]-df['gyro_x'][start])/(end-start))*(i-start)+df['gyro_x'][start] 
                df.loc[i,'gyro_y'] = ((df['gyro_y'][end]-df['gyro_y'][start])/(end-start))*(i-start)+df['gyro_y'][start] 
                df.loc[i,'gyro_z'] = ((df['gyro_z'][end]-df['gyro_z'][start])/(end-start))*(i-start)+df['gyro_z'][start]            
            else:
                df.drop([i],inplace=True)
        else:
            df.drop([i],inplace=True)
    return df
#Create a dataframe for new training set values(coontaining positive speed values only)
#For each ride, smooth speed values & append to new dataframe
train_X_final = pd.DataFrame(columns=['bookingID','Accuracy','Bearing','acceleration_x','acceleration_y','acceleration_z','gyro_x','gyro_y','gyro_z','second','Speed','label'])
features = Parallel(n_jobs=-1, verbose=2)(delayed(negative_features)(train_X_new,i) for i in train_X_new.bookingID.unique()) 
for i, feat in tqdm(enumerate(features)):
    train_X_final = train_X_final.append(feat)
    
del features
gc.collect()


In [None]:
#DATA PROCESSING(FOR TEST SET)
#Apply negative_features to test set 
test_X_final = pd.DataFrame(columns=['bookingID','Accuracy','Bearing','acceleration_x','acceleration_y','acceleration_z','gyro_x','gyro_y','gyro_z','second','Speed','label'])
features = Parallel(n_jobs=-1, verbose=2)(delayed(negative_features)(test_X_new,i) for i in test_X_new.bookingID.unique()) 
for i, feat in tqdm(enumerate(features)):
    test_X_final = test_X_final.append(feat)
    
del features
gc.collect()

In [None]:
#DATA PROCESSING
#Check first 5 rows of new training set
train_X_final.head()

In [None]:
#DATA PROCESSING
#Again, save new training set, in case kernel disconnects
train_X_final.to_csv('train_X_final.csv')

In [None]:
#DATA PROCESSING(FOR TEST SET)
#Again, save new training set, in case kernel disconnects 
test_X_final.to_csv('test_X_final.csv')

In [51]:
#DATA PROCESSING
#Read new training set file
train_X_final = pd.read_csv('train_X_final.csv')
train_X_final.set_index('Unnamed: 0',inplace=True)
train_X_final.head()

Unnamed: 0_level_0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed,label
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1181233,111669100000.0,5.736,0.0,0.881093,10.127783,1.503605,-0.024443,-0.055418,-0.008494,0.0,3.156902,0.0
1181234,111669100000.0,5.961,0.0,0.392661,10.185245,1.680781,0.103839,0.041098,0.049538,1.0,3.232407,0.0
1181235,111669100000.0,5.302,2.0,-0.462095,10.474953,-0.668003,0.042141,0.03499,0.089855,2.0,3.318302,0.0
1181236,111669100000.0,4.99,2.0,-0.026337,9.852441,-1.240234,-0.059873,-0.010825,-0.010327,3.0,2.916334,0.0
1181237,111669100000.0,6.0,3.0,-0.390267,9.763853,-0.385478,-0.008561,-0.005327,-0.010327,4.0,4.250799,0.0


In [None]:
#DATA PROCESSING(FOR TEST SET)
#Read new test set file 
test_X_final = pd.read_csv('test_X_final.csv')
test_X_final.set_index('Unnamed: 0',inplace=True)
test_X_final.head()

In [None]:
#DATA PROCESSING
#Check minimum & maximum ride durations
minimum = 1000
maximum = 0
for i in tqdm(train_X_final.bookingID.unique()):
    if len(train_X_final[train_X_final['bookingID'] == i]) < minimum:
        minimum = len(train_X_final[train_X_final['bookingID'] == i])
    elif len(train_X_final[train_X_final['bookingID'] == i]) > maximum:
        maximum = len(train_X_final[train_X_final['bookingID'] == i])
minimum,maximum        

In [52]:
#DATA PROCESSING
#Find rides lasting less than 15 seconds & add their bookingID to to_drop
#Of course, feel free to play with different thresholds
#Due to time constraint, I decided to stick with 15 seconds
to_drop = []
for i in tqdm(train_X_final.bookingID.unique()):
    if len(train_X_final[train_X_final.bookingID == i]) < 15:
        to_drop.append(i)

HBox(children=(IntProgress(value=0, max=19826), HTML(value='')))




In [53]:
#DATA PROCESSING
#Drop rides less than 15 seconds, whose bookingIDs are in to_drop
train_X_final = train_X_final[~train_X_final['bookingID'].isin(to_drop)]

In [None]:
#DATA PROCESSING(FOR TEST SET) 
#Find rides lasting less than 15 seconds & add their bookingID to to_drop 
to_drop = []
for i in tqdm(test_X_final.bookingID.unique()):
    if len(test_X_final[test_X_final.bookingID == i]) < 15:
        to_drop.append(i)

In [None]:
#DATA PROCESSING(FOR TEST SET)
#Drop rides less than 15 seconds, whose bookingIDs are in to_drop 
test_X_final = test_X_final[~test_X_final['bookingID'].isin(to_drop)]

In [62]:
#DATA PROCESSING
#Similarly for label set, drop rides that are not in new training set
train_y = train_y[train_y.bookingID.isin(train_X_final.bookingID.unique())]   

In [55]:
#DATA PROCESSING
#Final look at number of rides remaining in training set
train_X_final.bookingID.nunique()

19788

In [None]:
#DATA PROCESSING(FOR TEST SET)
#Final look at number of rides remaining in test set 
test_X_final.bookingID.nunique()

In [63]:
#DATA PROCESSING
#Sanity check to ensure number of unique rides in both label set & training set are the same
assert len(train_y) == train_X_final.bookingID.nunique()

In [57]:
#FEATURE ENGINEERING
#Create a function to calculate rate of change for creating rate-related features
def calc_change_rate(x):
    change = (np.diff(x) / x[:-1])
    #change = change[np.nonzero(change)[0]]
    change = change[~np.isnan(change)]
    change = change[change != -np.inf]
    change = change[change != np.inf]
    return change.mean()

In [58]:
#FEATURE ENGINEERING
#seg is the data segment for each unique ride
#seg_id is the bookingID for each unique ride
#For each segment, calculate summary stats & store it in X
def create_features(seg_id,seg,X):
    xs = pd.Series(seg['Speed'].values)    
    X.loc[seg_id, 'mean_Speed'] = np.mean(xs)
    rolling_speed_2_std = xs.rolling(2).std()
    X.loc[seg_id, 'rolling_speed_2_std'] = rolling_speed_2_std.mean()
    speed_diff = np.diff(xs)
    X.loc[seg_id, 'hard_braking_speed_min'] = speed_diff[speed_diff < 0].min() if len(speed_diff[speed_diff < 0]) > 0 else 0
    X.loc[seg_id, 'hard_braking_speed_std'] = speed_diff[speed_diff < 0].std() if len(speed_diff[speed_diff < 0]) > 0 else 0
    X.loc[seg_id, 'hard_braking_speed_ptp'] = np.ptp(speed_diff[speed_diff < 0]) if len(speed_diff[speed_diff < 0]) > 0 else 0
    X.loc[seg_id, 'hard_braking_speed_num_peaks_10'] = feature_calculators.number_peaks(speed_diff[speed_diff < 0],10) if len(speed_diff[speed_diff < 0]) > 0 else 0
    X.loc[seg_id, 'hard_braking_speed_be_5'] = feature_calculators.binned_entropy(speed_diff[speed_diff < 0],5) if len(speed_diff[speed_diff < 0]) > 0 else 0
    
    X.loc[seg_id, 'hard_acc_speed_max'] = speed_diff[speed_diff > 0].max() if len(speed_diff[speed_diff > 0]) > 0 else 0
    X.loc[seg_id, 'hard_acc_speed_std'] = speed_diff[speed_diff > 0].std() if len(speed_diff[speed_diff > 0]) > 0 else 0
    X.loc[seg_id, 'hard_acc_speed_ptp'] = np.ptp(speed_diff[speed_diff > 0]) if len(speed_diff[speed_diff > 0]) > 0 else 0
    X.loc[seg_id, 'hard_acc_speed_num_peaks_10'] = feature_calculators.number_peaks(speed_diff[speed_diff > 0],10) if len(speed_diff[speed_diff > 0]) > 0 else 0
    X.loc[seg_id, 'hard_acc_speed_be_5'] = feature_calculators.binned_entropy(speed_diff[speed_diff < 0],5) if len(speed_diff[speed_diff < 0]) > 0 else 0
    rolling_pos_speed_2_std = xs[xs>0].rolling(2).std()
    X.loc[seg_id, 'rolling_pos_speed_2_std'] = rolling_pos_speed_2_std.mean()
    
    xz = pd.Series(seg['acceleration_z'].values)
    rolling_acc_z_2_diff = xz.rolling(2).apply(lambda x:np.diff(x))
    X.loc[seg_id, 'rolling_acc_z_diff_mean'] = rolling_acc_z_2_diff.mean()
    X.loc[seg_id, 'max_acc_z'] = xz.max()
    X.loc[seg_id, 'min_acc_z'] = xz.min()
    X.loc[seg_id, 'med_acc_z'] = np.median(xz)
    X.loc[seg_id, 'p90_acc_z'] = np.percentile(xz,90)
    X.loc[seg_id, 'mean_acc_z_pos'] = xz[xz>0].mean()
    X.loc[seg_id, 'mean_acc_z_neg'] = xz[xz<0].mean()
    X.loc[seg_id, 'num_peaks_pos_acc_z_10'] = feature_calculators.number_peaks(xz[xz > 0],10) if len(xz[xz > 0]) > 0 else 0
    X.loc[seg_id, 'num_peaks_neg_acc_z_10'] = feature_calculators.number_peaks(xz[xz < 0],10) if len(xz[xz < 0]) > 0 else 0
    
    xy = pd.Series(seg['acceleration_y'].values)
    rolling_acc_y_2_diff = xy.rolling(2).apply(lambda x:np.diff(x))
    X.loc[seg_id, 'rolling_acc_y_diff_mean'] = rolling_acc_y_2_diff.mean()
    X.loc[seg_id, 'max_acc_y'] = xy.max()
    X.loc[seg_id, 'min_acc_y'] = xy.min()
    X.loc[seg_id, 'med_acc_y'] = np.median(xy)
    X.loc[seg_id, 'p90_acc_y'] = np.percentile(xy,90)
    X.loc[seg_id, 'mean_acc_y_pos'] = xy[xy>0].mean()
    X.loc[seg_id, 'mean_acc_y_neg'] = xy[xy<0].mean()
    X.loc[seg_id, 'num_peaks_pos_acc_y_10'] = feature_calculators.number_peaks(xy[xy > 0],10) if len(xy[xy > 0]) > 0 else 0
    X.loc[seg_id, 'num_peaks_neg_acc_y_10'] = feature_calculators.number_peaks(xy[xy < 0],10) if len(xy[xy < 0]) > 0 else 0
    
    xx = pd.Series(seg['acceleration_x'].values)
    rolling_acc_x_2_diff = xx.rolling(2).apply(lambda x:np.diff(x))
    X.loc[seg_id, 'rolling_acc_x_diff_mean'] = rolling_acc_x_2_diff.mean()
    X.loc[seg_id, 'max_acc_x'] = xx.max()
    X.loc[seg_id, 'min_acc_x'] = xx.min()
    X.loc[seg_id, 'med_acc_x'] = np.median(xx)
    X.loc[seg_id, 'p90_acc_x'] = np.percentile(xx,90)
    X.loc[seg_id, 'mean_acc_x_pos'] = xx[xx>0].mean()
    X.loc[seg_id, 'mean_acc_x_neg'] = xx[xx<0].mean()
    X.loc[seg_id, 'num_peaks_pos_acc_x_10'] = feature_calculators.number_peaks(xx[xx > 0],10) if len(xx[xx > 0]) > 0 else 0
    X.loc[seg_id, 'num_peaks_neg_acc_x_10'] = feature_calculators.number_peaks(xx[xx < 0],10) if len(xx[xx < 0]) > 0 else 0
    
    X.loc[seg_id, 'trip_time'] = len(seg)
     
    ##hardbrake combos
    temp = seg[(seg['gyro_x'] > 0) &(seg['acceleration_y'] < 0)] 
    gx_temp = pd.Series(temp['gyro_x'].values)
    ay_temp = pd.Series(temp['acceleration_y'].values)                                 
    s_temp = pd.Series(temp['Speed'].values)
    X.loc[seg_id, 'gx_ay_hardbrake_min'] = np.multiply(gx_temp,ay_temp).min()
    X.loc[seg_id, 'gx_ay_hardbrake_std'] = np.multiply(gx_temp,ay_temp).std()
    X.loc[seg_id, 'gx_ay_hardbrake_mean'] = np.multiply(gx_temp,ay_temp).mean()
    X.loc[seg_id, 'gx_ay_hardbrake_diff_mean'] = np.diff(np.multiply(gx_temp,ay_temp)).mean()
    
    X.loc[seg_id, 'gx_s_hardbrake_max'] = np.multiply(gx_temp,s_temp).max()
    X.loc[seg_id, 'gx_s_hardbrake_std'] = np.multiply(gx_temp,s_temp).std()
    X.loc[seg_id, 'gx_s_hardbrake_mean'] = np.multiply(gx_temp,s_temp).mean()
    #to be del
    X.loc[seg_id, 'gx_s_hardbrake_diff_mean'] = np.diff(np.multiply(gx_temp,s_temp)).mean()
    
    X.loc[seg_id, 'ay_s_hardbrake_min'] = np.multiply(ay_temp,s_temp).min()
    X.loc[seg_id, 'ay_s_hardbrake_std'] = np.multiply(ay_temp,s_temp).std()
    X.loc[seg_id, 'ay_s_hardbrake_mean'] = np.multiply(ay_temp,s_temp).mean()
    X.loc[seg_id, 'ay_s_hardbrake_diff_mean'] = np.diff(np.multiply(ay_temp,s_temp)).mean()
    
    X.loc[seg_id, 'gx_hardbrake_max'] = gx_temp.max()
    X.loc[seg_id, 'gx_hardbrake_mean'] = gx_temp.mean()
    X.loc[seg_id, 'gx_hardbrake_med'] = np.median(gx_temp)
    X.loc[seg_id, 'gx_hardbrake_std'] = gx_temp.std()
    X.loc[seg_id, 'gx_hardbrake_90p'] = np.percentile(gx_temp,90) if len(temp) > 0 else 0
    X.loc[seg_id, 'gx_hardbrake_diff_mean'] = np.diff(gx_temp).mean()
    
    X.loc[seg_id, 'ay_hardbrake_min'] = ay_temp.min()
    X.loc[seg_id, 'ay_hardbrake_mean'] = ay_temp.mean()
    X.loc[seg_id, 'ay_hardbrake_med'] = np.median(ay_temp)
    #X.loc[seg_id, 'ay_hardbrake_std'] = ay_temp.std()
    X.loc[seg_id, 'ay_hardbrake_10p'] = np.percentile(ay_temp,10) if len(temp) > 0 else 0
    X.loc[seg_id, 'ay_hardbrake_diff_mean'] = np.diff(ay_temp).mean()
    
    X.loc[seg_id, 's_hardbrake_min'] = s_temp.min()
    X.loc[seg_id, 's_hardbrake_mean'] = s_temp.mean()
    X.loc[seg_id, 's_hardbrake_med'] = np.median(s_temp)
    X.loc[seg_id, 's_hardbrake_std'] = s_temp.std()
    #X.loc[seg_id, 's_hardbrake_10p'] = np.percentile(s_temp,10) if len(temp) > 0 else 0
    #X.loc[seg_id, 's_hardbrake_diff_mean'] = np.diff(s_temp).mean()
    ##hardbrake combos
    
    ##hardacc combos
    temp = seg[(seg['gyro_x'] > 0) &(seg['acceleration_y'] > 0)] 
    gx_temp = pd.Series(temp['gyro_x'].values)
    ay_temp = pd.Series(temp['acceleration_y'].values)
    s_temp = pd.Series(temp['Speed'].values)
    X.loc[seg_id, 'gx_ay_hardacc_max'] = np.multiply(gx_temp,ay_temp).max()
    X.loc[seg_id, 'gx_ay_hardacc_std'] = np.multiply(gx_temp,ay_temp).std()
    X.loc[seg_id, 'gx_ay_hardacc_mean'] = np.multiply(gx_temp,ay_temp).mean()
    X.loc[seg_id, 'gx_ay_hardacc_diff_mean'] = np.diff(np.multiply(gx_temp,ay_temp)).mean()
    
    X.loc[seg_id, 'gx_s_hardacc_max'] = np.multiply(gx_temp,s_temp).max()
    X.loc[seg_id, 'gx_s_hardacc_std'] = np.multiply(gx_temp,s_temp).std()
    X.loc[seg_id, 'gx_s_hardacc_mean'] = np.multiply(gx_temp,s_temp).mean()
    X.loc[seg_id, 'gx_s_hardacc_diff_mean'] = np.diff(np.multiply(gx_temp,s_temp)).mean()
    
    X.loc[seg_id, 'ay_s_hardacc_max'] = np.multiply(ay_temp,s_temp).max()
    X.loc[seg_id, 'ay_s_hardacc_std'] = np.multiply(ay_temp,s_temp).std()
    X.loc[seg_id, 'ay_s_hardacc_mean'] = np.multiply(ay_temp,s_temp).mean()
    X.loc[seg_id, 'ay_s_hardacc_diff_mean'] = np.diff(np.multiply(ay_temp,s_temp)).mean()
    
    X.loc[seg_id, 'gx_hardacc_max'] = gx_temp.max()
    X.loc[seg_id, 'gx_hardacc_mean'] = gx_temp.mean()
    X.loc[seg_id, 'gx_hardacc_med'] = np.median(gx_temp)
    X.loc[seg_id, 'gx_hardacc_std'] = gx_temp.std()
    X.loc[seg_id, 'gx_hardacc_90p'] = np.percentile(gx_temp,90) if len(temp) > 1 else 0
    X.loc[seg_id, 'gx_hardacc_diff_mean'] = np.diff(gx_temp).mean()
    
    X.loc[seg_id, 'ay_hardacc_max'] = ay_temp.max()
    X.loc[seg_id, 'ay_hardacc_mean'] = ay_temp.mean()
    X.loc[seg_id, 'ay_hardacc_med'] = np.median(ay_temp)
    X.loc[seg_id, 'ay_hardacc_std'] = ay_temp.std()
    X.loc[seg_id, 'ay_hardacc_90p'] = np.percentile(ay_temp,90) if len(temp) > 1 else 0
    X.loc[seg_id, 'ay_hardacc_diff_mean'] = np.diff(ay_temp).mean()
    
    X.loc[seg_id, 's_hardacc_max'] = s_temp.max()
    X.loc[seg_id, 's_hardacc_mean'] = s_temp.mean()
    X.loc[seg_id, 's_hardacc_med'] = np.median(s_temp)
    X.loc[seg_id, 's_hardacc_std'] = s_temp.std()
    X.loc[seg_id, 's_hardacc_90p'] = np.percentile(s_temp,90) if len(temp) > 1 else 0
    X.loc[seg_id, 's_hardacc_diff_mean'] = np.diff(s_temp).mean()
    ##hardacc combos
    
    ##hardright combos
    temp = seg[(seg['gyro_z'] < 0) &(seg['acceleration_x'] > 0)]
    ax_temp = pd.Series(temp['acceleration_x'].values)
    gz_temp = pd.Series(temp['gyro_z'].values)
    s_temp = pd.Series(temp['Speed'].values)
    X.loc[seg_id, 'ax_gz_hardright_min'] = np.multiply(ax_temp,gz_temp).min()
    X.loc[seg_id, 'ax_gz_hardright_std'] = np.multiply(ax_temp,gz_temp).std()
    X.loc[seg_id, 'ax_gz_hardright_mean'] = np.multiply(ax_temp,gz_temp).mean()
    X.loc[seg_id, 'ax_gz_hardright_diff_mean'] = np.diff(np.multiply(ax_temp,gz_temp)).mean()
    
    X.loc[seg_id, 's_gz_hardright_min'] = np.multiply(s_temp,gz_temp).min()
    X.loc[seg_id, 's_gz_hardright_std'] = np.multiply(s_temp,gz_temp).std()
    X.loc[seg_id, 's_gz_hardright_mean'] = np.multiply(s_temp,gz_temp).mean()
    X.loc[seg_id, 's_gz_hardright_diff_mean'] = np.diff(np.multiply(s_temp,gz_temp)).mean()
    
    X.loc[seg_id, 'ax_s_hardright_max'] = np.multiply(ax_temp,s_temp).max()
    X.loc[seg_id, 'ax_s_hardright_std'] = np.multiply(ax_temp,s_temp).std()
    X.loc[seg_id, 'ax_s_hardright_mean'] = np.multiply(ax_temp,s_temp).mean()
    X.loc[seg_id, 'ax_s_hardright_diff_mean'] = np.diff(np.multiply(ax_temp,s_temp)).mean()
    
    X.loc[seg_id, 'gz_hardright_min'] = gz_temp.min()
    X.loc[seg_id, 'gz_hardright_mean'] = gz_temp.mean()
    X.loc[seg_id, 'gz_hardright_med'] = np.median(gz_temp)
    X.loc[seg_id, 'gz_hardright_std'] = gz_temp.std()
    X.loc[seg_id, 'gz_hardright_10p'] = np.percentile(gz_temp,10) if len(temp) > 0 else 0
    X.loc[seg_id, 'gz_hardright_diff_mean'] = np.diff(gz_temp).mean()
    
    X.loc[seg_id, 'ax_hardright_max'] = ax_temp.max()
    X.loc[seg_id, 'ax_hardright_mean'] = ax_temp.mean()
    X.loc[seg_id, 'ax_hardright_med'] = np.median(ax_temp)
    X.loc[seg_id, 'ax_hardright_std'] = ax_temp.std()
    X.loc[seg_id, 'ax_hardright_90p'] = np.percentile(ax_temp,90) if len(temp) > 0 else 0
    X.loc[seg_id, 'ax_hardright_diff_mean'] = np.diff(ax_temp).mean()
    
    X.loc[seg_id, 's_hardright_max'] = s_temp.max()
    X.loc[seg_id, 's_hardright_mean'] = s_temp.mean()
    X.loc[seg_id, 's_hardright_med'] = np.median(s_temp)
    X.loc[seg_id, 's_hardright_std'] = s_temp.std()
    X.loc[seg_id, 's_hardright_90p'] = np.percentile(s_temp,90) if len(temp) > 0 else 0
    X.loc[seg_id, 's_hardright_diff_mean'] = np.diff(s_temp).mean()
    ##hardright combos
    
    ##hardleft combos
    temp = seg[(seg['gyro_z'] > 0) &(seg['acceleration_x'] < 0)]
    ax_temp = pd.Series(temp['acceleration_x'].values)
    gz_temp = pd.Series(temp['gyro_z'].values)
    s_temp = pd.Series(temp['Speed'].values)
    X.loc[seg_id, 'ax_gz_hardleft_min'] = np.multiply(ax_temp,gz_temp).min()
    X.loc[seg_id, 'ax_gz_hardleft_std'] = np.multiply(ax_temp,gz_temp).std()
    X.loc[seg_id, 'ax_gz_hardleft_mean'] = np.multiply(ax_temp,gz_temp).mean()
    X.loc[seg_id, 'ax_gz_hardleft_diff_mean'] = np.diff(np.multiply(ax_temp,gz_temp)).mean()
    
    X.loc[seg_id, 's_gz_hardleft_max'] = np.multiply(s_temp,gz_temp).max()
    X.loc[seg_id, 's_gz_hardleft_std'] = np.multiply(s_temp,gz_temp).std()
    X.loc[seg_id, 's_gz_hardleft_mean'] = np.multiply(s_temp,gz_temp).mean()
    X.loc[seg_id, 's_gz_hardleft_diff_mean'] = np.diff(np.multiply(s_temp,gz_temp)).mean()
    
    X.loc[seg_id, 'ax_s_hardleft_min'] = np.multiply(ax_temp,s_temp).min()
    X.loc[seg_id, 'ax_s_hardleft_std'] = np.multiply(ax_temp,s_temp).std()
    X.loc[seg_id, 'ax_s_hardleft_mean'] = np.multiply(ax_temp,s_temp).mean()
    X.loc[seg_id, 'ax_s_hardleft_diff_mean'] = np.diff(np.multiply(ax_temp,s_temp)).mean()
    
    X.loc[seg_id, 'gz_hardleft_max'] = gz_temp.max()
    X.loc[seg_id, 'gz_hardleft_mean'] = gz_temp.mean()
    X.loc[seg_id, 'gz_hardleft_med'] = np.median(gz_temp)
    X.loc[seg_id, 'gz_hardleft_std'] = gz_temp.std()
    X.loc[seg_id, 'gz_hardleft_90p'] = np.percentile(gz_temp,90) if len(temp) > 0 else 0
    X.loc[seg_id, 'gz_hardleft_diff_mean'] = np.diff(gz_temp).mean()
    
    X.loc[seg_id, 'ax_hardleft_min'] = ax_temp.min()
    X.loc[seg_id, 'ax_hardleft_mean'] = ax_temp.mean()
    X.loc[seg_id, 'ax_hardleft_med'] = np.median(ax_temp)
    X.loc[seg_id, 'ax_hardleft_std'] = ax_temp.std()
    X.loc[seg_id, 'ax_hardleft_10p'] = np.percentile(ax_temp,10) if len(temp) > 0 else 0
    X.loc[seg_id, 'ax_hardleft_diff_mean'] = np.diff(ax_temp).mean()
    
    X.loc[seg_id, 's_hardleft_max'] = s_temp.max()
    X.loc[seg_id, 's_hardleft_mean'] = s_temp.mean()
    X.loc[seg_id, 's_hardleft_med'] = np.median(s_temp)
    X.loc[seg_id, 's_hardleft_std'] = s_temp.std()
    X.loc[seg_id, 's_hardleft_90p'] = np.percentile(s_temp,90) if len(temp) > 0 else 0
    X.loc[seg_id, 's_hardleft_diff_mean'] = np.diff(s_temp).mean()
    ##hardleft combos
    
    ##hardswerveright combos
    temp = seg[(seg['gyro_y'] > 0) &(seg['acceleration_x'] > 0)] 
    ax_temp = pd.Series(temp['acceleration_x'].values)
    gy_temp = pd.Series(temp['gyro_y'].values)
    s_temp = pd.Series(temp['Speed'].values)
    X.loc[seg_id, 'gy_ax_hardSright_max'] = np.multiply(gy_temp,ax_temp).max()
    X.loc[seg_id, 'gy_ax_hardSright_std'] = np.multiply(gy_temp,ax_temp).std()
    X.loc[seg_id, 'gy_ax_hardSright_mean'] = np.multiply(gy_temp,ax_temp).mean()
    X.loc[seg_id, 'gy_ax_hardSright_diff_mean'] = np.diff(np.multiply(gy_temp,ax_temp)).mean()
    
    X.loc[seg_id, 'gy_s_hardSright_max'] = np.multiply(gy_temp,s_temp).max()
    X.loc[seg_id, 'gy_s_hardSright_std'] = np.multiply(gy_temp,s_temp).std()
    X.loc[seg_id, 'gy_s_hardSright_mean'] = np.multiply(gy_temp,s_temp).mean()
    X.loc[seg_id, 'gy_s_hardSright_diff_mean'] = np.diff(np.multiply(gy_temp,s_temp)).mean()
    
    X.loc[seg_id, 'ax_s_hardSright_max'] = np.multiply(ax_temp,s_temp).max()
    X.loc[seg_id, 'ax_s_hardSright_std'] = np.multiply(ax_temp,s_temp).std()
    X.loc[seg_id, 'ax_s_hardSright_mean'] = np.multiply(ax_temp,s_temp).mean()
    X.loc[seg_id, 'ax_s_hardSright_diff_mean'] = np.diff(np.multiply(ax_temp,s_temp)).mean()
    
    X.loc[seg_id, 'gy_hardSright_max'] = gy_temp.max()
    X.loc[seg_id, 'gy_hardSright_mean'] = gy_temp.mean()
    X.loc[seg_id, 'gy_hardSright_med'] = np.median(gy_temp)
    X.loc[seg_id, 'gy_hardSright_std'] = gy_temp.std()
    X.loc[seg_id, 'gy_hardSright_90p'] = np.percentile(gy_temp,90) if len(temp) > 0 else 0
    X.loc[seg_id, 'gy_hardSright_diff_mean'] = np.diff(gy_temp).mean()
    
    X.loc[seg_id, 'ax_hardSright_max'] = ax_temp.max()
    X.loc[seg_id, 'ax_hardSright_mean'] = ax_temp.mean()
    X.loc[seg_id, 'ax_hardSright_med'] = np.median(ax_temp)
    X.loc[seg_id, 'ax_hardSright_std'] = ax_temp.std()
    X.loc[seg_id, 'ax_hardSright_90p'] = np.percentile(ax_temp,90) if len(temp) > 0 else 0
    X.loc[seg_id, 'ax_hardSright_diff_mean'] = np.diff(ax_temp).mean()
    
    X.loc[seg_id, 's_hardSright_max'] = s_temp.max()
    X.loc[seg_id, 's_hardSright_mean'] = s_temp.mean()
    X.loc[seg_id, 's_hardSright_med'] = np.median(s_temp)
    X.loc[seg_id, 's_hardSright_std'] = s_temp.std()
    X.loc[seg_id, 's_hardSright_90p'] = np.percentile(s_temp,90) if len(temp) > 0 else 0
    X.loc[seg_id, 's_hardSright_diff_mean'] = np.diff(s_temp).mean()
    ##hardswerveright combos
    
    ##hardswerveleft combos
    temp = seg[(seg['gyro_y'] < 0) &(seg['acceleration_x'] < 0)] 
    ax_temp = pd.Series(temp['acceleration_x'].values)
    gy_temp = pd.Series(temp['gyro_y'].values)
    s_temp = pd.Series(temp['Speed'].values)
    X.loc[seg_id, 'gy_ax_hardSleft_max'] = np.multiply(gy_temp,ax_temp).max()
    X.loc[seg_id, 'gy_ax_hardSleft_std'] = np.multiply(gy_temp,ax_temp).std()
    X.loc[seg_id, 'gy_ax_hardSleft_mean'] = np.multiply(gy_temp,ax_temp).mean()
    X.loc[seg_id, 'gy_ax_hardSleft_diff_mean'] = np.diff(np.multiply(gy_temp,ax_temp)).mean()
    
    X.loc[seg_id, 'ax_s_hardSleft_min'] = np.multiply(ax_temp,s_temp).min()
    X.loc[seg_id, 'ax_s_hardSleft_std'] = np.multiply(ax_temp,s_temp).std()
    X.loc[seg_id, 'ax_s_hardSleft_mean'] = np.multiply(ax_temp,s_temp).mean()
    X.loc[seg_id, 'ax_s_hardSleft_diff_mean'] = np.diff(np.multiply(ax_temp,s_temp)).mean()
    
    X.loc[seg_id, 'gy_s_hardSleft_min'] = np.multiply(gy_temp,s_temp).min()
    X.loc[seg_id, 'gy_s_hardSleft_std'] = np.multiply(gy_temp,s_temp).std()
    X.loc[seg_id, 'gy_s_hardSleft_mean'] = np.multiply(gy_temp,s_temp).mean()
    X.loc[seg_id, 'gy_s_hardSleft_diff_mean'] = np.diff(np.multiply(gy_temp,s_temp)).mean()
    
    X.loc[seg_id, 'gy_hardSleft_min'] = gy_temp.min()
    X.loc[seg_id, 'gy_hardSleft_mean'] = gy_temp.mean()
    X.loc[seg_id, 'gy_hardSleft_med'] = np.median(gy_temp)
    X.loc[seg_id, 'gy_hardSleft_std'] = gy_temp.std()
    X.loc[seg_id, 'gy_hardSleft_10p'] = np.percentile(gy_temp,10) if len(temp) > 0 else 0
    X.loc[seg_id, 'gy_hardSleft_diff_mean'] = np.diff(gy_temp).mean()
    
    X.loc[seg_id, 'ax_hardSleft_min'] = ax_temp.min()
    X.loc[seg_id, 'ax_hardSleft_mean'] = ax_temp.mean()
    X.loc[seg_id, 'ax_hardSleft_med'] = np.median(ax_temp)
    X.loc[seg_id, 'ax_hardSleft_std'] = ax_temp.std()
    X.loc[seg_id, 'ax_hardSleft_10p'] = np.percentile(ax_temp,10) if len(temp) > 0 else 0
    X.loc[seg_id, 'ax_hardSleft_diff_mean'] = np.diff(ax_temp).mean()
    
    X.loc[seg_id, 's_hardSleft_max'] = s_temp.max()
    X.loc[seg_id, 's_hardSleft_mean'] = s_temp.mean()
    X.loc[seg_id, 's_hardSleft_med'] = np.median(s_temp)
    X.loc[seg_id, 's_hardSleft_std'] = s_temp.std()
    X.loc[seg_id, 's_hardSleft_90p'] = np.percentile(s_temp,90) if len(temp) > 0 else 0
    X.loc[seg_id, 's_hardSleft_diff_mean'] = np.diff(s_temp).mean()
    ##hardswerveleft combos
    
    ##hardbump combos
    temp = seg[(seg['gyro_x'] > 0) &(seg['acceleration_z'] > 0)] 
    az_temp = pd.Series(temp['acceleration_z'].values)
    gx_temp = pd.Series(temp['gyro_x'].values)
    s_temp = pd.Series(temp['Speed'].values)
    X.loc[seg_id, 'az_s_hardbump_max'] = np.multiply(az_temp,s_temp).max()
    X.loc[seg_id, 'az_s_hardbump_std'] = np.multiply(az_temp,s_temp).std()
    X.loc[seg_id, 'az_s_hardbump_mean'] = np.multiply(az_temp,s_temp).mean()
    X.loc[seg_id, 'az_s_hardbump_diff_mean'] = np.diff(np.multiply(az_temp,s_temp)).mean()
    
    X.loc[seg_id, 'az_gx_hardbump_max'] = np.multiply(az_temp,gx_temp).max()
    X.loc[seg_id, 'az_gx_hardbump_std'] = np.multiply(az_temp,gx_temp).std()
    X.loc[seg_id, 'az_gx_hardbump_mean'] = np.multiply(az_temp,gx_temp).mean()
    X.loc[seg_id, 'az_gx_hardbump_diff_mean'] = np.diff(np.multiply(az_temp,gx_temp)).mean()
    
    X.loc[seg_id, 's_gx_hardbump_max'] = np.multiply(s_temp,gx_temp).max()
    X.loc[seg_id, 's_gx_hardbump_std'] = np.multiply(s_temp,gx_temp).std()
    X.loc[seg_id, 's_gx_hardbump_mean'] = np.multiply(s_temp,gx_temp).mean()
    X.loc[seg_id, 's_gx_hardbump_diff_mean'] = np.diff(np.multiply(s_temp,gx_temp)).mean()
    
    X.loc[seg_id, 'gx_hardbump_max'] = gx_temp.max()
    X.loc[seg_id, 'gx_hardbump_mean'] = gx_temp.mean()
    X.loc[seg_id, 'gx_hardbump_med'] = np.median(gx_temp)
    X.loc[seg_id, 'gx_hardbump_std'] = gx_temp.std()
    X.loc[seg_id, 'gx_hardbump_90p'] = np.percentile(gx_temp,90) if len(temp) > 0 else 0
    X.loc[seg_id, 'gx_hardbump_diff_mean'] = np.diff(gx_temp).mean()
    
    X.loc[seg_id, 'az_hardbump_max'] = az_temp.max()
    X.loc[seg_id, 'az_hardbump_mean'] = az_temp.mean()
    X.loc[seg_id, 'az_hardbump_med'] = np.median(az_temp)
    X.loc[seg_id, 'az_hardbump_std'] = az_temp.std()
    X.loc[seg_id, 'az_hardbump_90p'] = np.percentile(az_temp,90) if len(temp) > 0 else 0
    X.loc[seg_id, 'az_hardbump_diff_mean'] = np.diff(az_temp).mean()
    
    X.loc[seg_id, 's_hardbump_max'] = s_temp.max()
    X.loc[seg_id, 's_hardbump_mean'] = s_temp.mean()
    X.loc[seg_id, 's_hardbump_med'] = np.median(s_temp)
    X.loc[seg_id, 's_hardbump_std'] = s_temp.std()
    X.loc[seg_id, 's_hardbump_90p'] = np.percentile(s_temp,90) if len(temp) > 0 else 0
    X.loc[seg_id, 's_hardbump_diff_mean'] = np.diff(s_temp).mean()
    ##hardbump combos
    
    #specials
    power = np.multiply(xs,xy)
    X.loc[seg_id, 'power_mean'] = power.mean()
    X.loc[seg_id, 'power_std'] = power.std()
    X.loc[seg_id, 'power_max'] = power.max()
    X.loc[seg_id, 'power_min'] = power.min()
    X.loc[seg_id, 'power_med'] = np.median(power)
    X.loc[seg_id, 'power_pos_mean'] = power[power>0].mean()
    X.loc[seg_id, 'power_pos_max'] = power[power>0].max()
    X.loc[seg_id, 'power_pos_std'] = power[power>0].std()
    X.loc[seg_id, 'power_neg_mean'] = power[power<0].mean()
    X.loc[seg_id, 'power_neg_min'] = power[power<0].min()
    X.loc[seg_id, 'power_neg_std'] = power[power<0].std()

    bb = pd.Series(seg['Bearing'].values)
    diff = np.diff(bb)
    turn_power = np.multiply(diff,power[:-1]).values
    X.loc[seg_id, 'turn_power_mean'] = turn_power.mean()
     
    return X
           

In [67]:
#FEATURE ENGINEERING
#Create a new training dataframe for storing newly-created features for each ride
#Create a new label dataframe so that bookingID becomes index
train_X_final_2 = pd.DataFrame(index=pd.Series(train_y.bookingID), dtype=np.float64)
train_y_final = pd.DataFrame(index=pd.Series(train_y.bookingID),dtype=np.float64)
train_y_final['label'] = train_y['label'].values
train_X_final_2.head()

111669149733
335007449205
171798691856
1520418422900
798863917116


In [None]:
#FEATURE ENGINEERING(FOR TEST SET) 
#Create another new test set for test set summary stat features 
test_X_final_2 = pd.DataFrame(index=pd.Series(test_X_final.bookingID), dtype=np.float64)

In [69]:
#FEATURE ENGINEERING
#Sanity check to ensure no. of unique rides are the same in train set & label set
assert len(train_X_final_2) == len(train_y)

In [70]:
#FEATURE ENGINEERING
#Create features & store them to train_X_final_2
#I didn't use parallel processing here since feature creation is quite fast(~1hour)
for seg_id in tqdm(train_X_final.bookingID.unique()):
    seg = train_X_final[train_X_final['bookingID'] == seg_id]
    train_X_final_2 = create_features(seg_id, seg, train_X_final_2)

HBox(children=(IntProgress(value=0, max=19788), HTML(value='')))

  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)





In [None]:
#FEATURE ENGINEERING(FOR TEST SET)
#Create features & store them to test_X_final_2
#I didn't use parallel processing here since feature creation is quite fast(~1hour)
for seg_id in tqdm(test_X_final.bookingID.unique()):
    seg = test_X_final[test_X_final['bookingID'] == seg_id]
    test_X_final_2 = create_features(seg_id, seg, test_X_final_2)

In [71]:
#FEATURE ENGINEERING
#Sanity check for inf values, which shouldn't exist
for i in train_X_final_2.columns.unique():
    if np.isinf(train_X_final_2[i]).any() == True:
        print(i)

In [None]:
#FEATURE ENGINEERING(FOR TEST SET)
#Sanity check for inf values, which shouldn't exist
for i in test_X_final_2.columns.unique():
    if np.isinf(test_X_final_2[i]).any() == True:
        print(i)

In [77]:
#FEATURE ENGINEERING
#Check first 5 rows of final training set
train_X_final_2.head()

Unnamed: 0,mean_Speed,rolling_speed_2_std,hard_braking_speed_min,hard_braking_speed_std,hard_braking_speed_ptp,hard_braking_speed_num_peaks_10,hard_braking_speed_be_5,hard_acc_speed_max,hard_acc_speed_std,hard_acc_speed_ptp,...,power_max,power_min,power_med,power_pos_mean,power_pos_max,power_pos_std,power_neg_mean,power_neg_min,power_neg_std,turn_power_mean
111669100000.0,5.221561,0.303947,-2.966065,0.587765,2.963853,9.0,1.184315,2.610508,0.545873,2.59774,...,203.448537,0.0,34.440804,87.934016,203.448537,45.742945,,,,-24.572235
335007400000.0,6.029151,0.291928,-4.829653,0.655941,4.828535,17.0,0.597932,2.952679,0.53818,2.950594,...,204.180021,0.0,35.796707,85.296985,204.180021,52.018306,,,,14.206442
171798700000.0,16.362283,0.127965,-2.75,0.272192,2.7,0.0,0.212654,1.0,0.169775,0.95,...,254.795443,0.0,187.251697,167.657437,254.795443,65.724623,,,,-1.331214
1520418000000.0,13.62855,0.282835,-2.63,0.466653,2.62,17.0,0.862097,3.12,0.412678,3.11,...,293.526642,0.0,168.391346,149.991389,293.526642,73.986091,,,,-30.611714
798863900000.0,8.729576,0.363284,-2.564959,0.548929,2.56225,9.0,1.273726,4.337577,0.616003,4.337299,...,238.407435,0.0,95.061418,115.829375,238.407435,48.175361,,,,3.097101


In [None]:
#FEATURE ENGINEERING(FOR TEST SET)
test_X_final_2.head()

In [None]:
#FEATURE ENGINEERING
#Check first 5 rows of train y
train_y.head()

In [73]:
#FEATURE ENGINEERING
#Use StandardScaler to scale training set features
scaler = StandardScaler()
scaler.fit(train_X_final_2)
scaled_train_X = pd.DataFrame(scaler.transform(train_X_final_2), columns=train_X_final_2.columns,index=train_X_final_2.index)

In [None]:
#FEATURE ENGINEERING(FOR TEST SET)
#Use StandardScaler to scale test set features 
scaler = StandardScaler()
scaler.fit(test_X_final_2)
scaled_test_X = pd.DataFrame(scaler.transform(test_X_final_2), columns=test_X_final_2.columns,index=test_X_final_2.index)

In [74]:
#FEATURE ENGINEERING
#Assign label values to train_y_final_2 for use in Lightgbm model later
train_y_final_2 = train_y_final['label']

In [75]:
#FEATURE ENGINEERING
#Checking no. of rows & columns for scaled_train_X, the final training set containing all the scaled features
scaled_train_X.shape

(19788, 260)

In [None]:
#FEATURE ENGINEERING(FOR TEST SET)
#Checking no. of rows & columns for scaled_test_X, the final test set containing all the scaled features
scaled_test_X.shape

In [78]:
#MODELING
#Due to imbalanced class, I decided to do data augmentation within each fold by up-sampling both classes(but up-sampling more of class 1)
def augment(input_X,input_y):
    #Triples minor category
    mask = input_y>0
    x1 = input_X[mask].copy()     
    new_input_X = input_X.append(x1.sample(frac=1))
    new_input_X = new_input_X.append(x1.sample(frac=1))
    new_input_y = input_y.append(input_y[mask])
    new_input_y = new_input_y.append(input_y[mask])
    
    #Doubles major category
    mask = input_y==0
    x1 = input_X[mask].copy()
    new_input_X = new_input_X.append(x1.sample(frac=1))
    new_input_y = new_input_y.append(input_y[mask])
    
    return new_input_X,new_input_y

In [79]:
#MODELING
folds = 5
random_state = 0
skf = StratifiedKFold(n_splits=folds,shuffle=True,random_state=random_state)
#kf = KFold(n_splits=folds,shuffle=True,random_state=random_state)
predictions = np.zeros(len(scaled_test_X))
#scores = []
#train_columns = scaled_train_X.columns.values

In [80]:
#MODELING
#Model paramters are as shown
#I played around with the hyperparameters until a decent gap between train & val scores is achieved
#Of course, feel free to further-tweak the model to improve model accuracy
params = {
    "objective" : "binary",
    "metric" : "auc",
    "boosting": 'gbdt',
    "max_depth" : -1,
    "num_leaves" :13,
    "num_threads" : 8,
    "learning_rate" : 0.01,
    "bagging_freq": 5,
    "bagging_fraction" : 0.4,
    "feature_fraction" : 0.01,
    #"min_data_in_leaf": 300,
    "min_sum_hessian_in_leaf" : 400,
    "tree_learner": "serial",
    "boost_from_average": "false",
    "lambda_l1" : 5,
    #"lambda_l2" : 5,
    "bagging_seed" : random_state,
    "verbosity" : -1,
    "seed": random_state
}

In [83]:
#MODELING
#I'm using Lightgbm since I'm most-comfortable with this model(after using it for kaggle competitions & achieving decent results)
#Online sources suggested LSTM as well. Due to time constraints however, I'm not doing a blend of different models
#Within each fold, I augment 5 times. 25 times in total
feature_importance = pd.DataFrame()
yp_final = 0
for fold_, (trn_idx, val_idx) in enumerate(skf.split(scaled_train_X,train_y_final_2)):
    
    
    print("Current Fold: {}".format(fold_))
    input_train = scaled_train_X.iloc[trn_idx]
    target_train = train_y_final_2.iloc[trn_idx]
    N = 5
    yp = 0
    for i in range(N):
        auginput_train,augtarget_train = augment(input_train,target_train)
        trn_data = lgb.Dataset(auginput_train, label=augtarget_train)
        val_data = lgb.Dataset(scaled_train_X.iloc[val_idx], label=train_y_final_2.iloc[val_idx])
        evals_result = {}
        model = lgb.train(params,trn_data,100000,valid_sets = [trn_data, val_data],early_stopping_rounds=1000,verbose_eval = 1000,evals_result=evals_result)                   
        yp += model.predict(scaled_test_X)     
          
    yp_final += (yp/N)
predictions = yp_final/folds    

Current Fold: 0
Training until validation scores don't improve for 1000 rounds.
[1000]	training's auc: 0.705564	valid_1's auc: 0.683635
[2000]	training's auc: 0.724518	valid_1's auc: 0.687954
[3000]	training's auc: 0.738379	valid_1's auc: 0.688739
[4000]	training's auc: 0.750191	valid_1's auc: 0.690299
[5000]	training's auc: 0.761015	valid_1's auc: 0.690898
[6000]	training's auc: 0.770436	valid_1's auc: 0.691181
[7000]	training's auc: 0.778805	valid_1's auc: 0.691404
Early stopping, best iteration is:
[6595]	training's auc: 0.775489	valid_1's auc: 0.691598
Training until validation scores don't improve for 1000 rounds.
[1000]	training's auc: 0.705847	valid_1's auc: 0.683553
[2000]	training's auc: 0.725067	valid_1's auc: 0.687147
[3000]	training's auc: 0.739009	valid_1's auc: 0.689378
[4000]	training's auc: 0.750482	valid_1's auc: 0.69126
[5000]	training's auc: 0.761458	valid_1's auc: 0.691812
[6000]	training's auc: 0.770355	valid_1's auc: 0.692355
[7000]	training's auc: 0.778583	valid_

[8000]	training's auc: 0.786207	valid_1's auc: 0.682254
Early stopping, best iteration is:
[7844]	training's auc: 0.785002	valid_1's auc: 0.68245
Training until validation scores don't improve for 1000 rounds.
[1000]	training's auc: 0.707487	valid_1's auc: 0.662126
[2000]	training's auc: 0.726289	valid_1's auc: 0.671204
[3000]	training's auc: 0.73937	valid_1's auc: 0.674053
[4000]	training's auc: 0.751306	valid_1's auc: 0.677653
[5000]	training's auc: 0.761697	valid_1's auc: 0.679082
[6000]	training's auc: 0.770949	valid_1's auc: 0.679989
[7000]	training's auc: 0.778908	valid_1's auc: 0.68096
[8000]	training's auc: 0.786286	valid_1's auc: 0.681112
Early stopping, best iteration is:
[7835]	training's auc: 0.785133	valid_1's auc: 0.681432
Training until validation scores don't improve for 1000 rounds.
[1000]	training's auc: 0.707768	valid_1's auc: 0.664176
[2000]	training's auc: 0.726327	valid_1's auc: 0.672015
[3000]	training's auc: 0.739863	valid_1's auc: 0.674995
[4000]	training's auc

In [84]:
#MODELING
#A gauge of how our model performs overall on the training set
#Of course, this is a slightly-inaccurate gauge since there is definitely leakage between training set & 'test set'(which is also training set)
roc_auc_score(train_y_final_2, predictions)

0.7558901902021601

In [85]:
#FEATURE IMPORTANCES
#Use shap to plot bar-chart that shows feature importances in descending order
#This is better than the default feature_importances_ of lightgbm since the lightgbm version uses no. of splits to judge a feature's importance
X_importance = scaled_train_X
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_importance)
shap.summary_plot(shap_values, X_importance, plot_type='bar')

NameError: name 'shap' is not defined

In [None]:
#FEATURE IMPORTANCES
#Can drop features based on shap values
#However, doing this seems to produce erratic results so I shall skip this

#shap_sum = np.abs(shap_values).mean(axis=0)
#importance_df = pd.DataFrame([scaled_train_X.columns.tolist(), shap_sum.tolist()]).T
#importance_df.columns = ['column_name', 'shap_importance']
#importance_df = importance_df.sort_values('shap_importance', ascending=False)
#to_remove = importance_df[importance_df['shap_importance'] < 0.001]
#to_drop = to_remove['column_name'].tolist()
#scaled_train_X_filtered = scaled_train_X.drop((i for i in to_drop),axis=1)
#scaled_test_X_filtered = scaled_test_X.drop((i for i in to_drop),axis=1)

In [98]:
#SUBMISSION
#Save predictions to csv file
test_pred =pd.DataFrame(dtype=np.float64)
test_pred['bookingID'] = test_X_final_2.index
test_pred['label'] = predictions
test_pred.to_csv('predictions.csv', index=False)
test_pred.head()

Unnamed: 0,bookingID,label
0,111669100000.0,0.264637
1,335007400000.0,0.408246
2,171798700000.0,0.075052
3,1520418000000.0,0.28148
4,798863900000.0,0.282884
