### This is my first attempt at the DoorDash task I got on StrataScratch. 
### The task is to create a model that predicts, as accurately as possible, the time (in seconds) it will take a DoorDash customer to receive their order from the moment they placed it.

In [None]:
import pickle
import os
import math
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import seaborn as sns
import datetime
from datetime import timedelta

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline, make_pipeline
from category_encoders.one_hot import OneHotEncoder

# Wrangle the data

In [None]:
def day_hour_extract(datetime_string, from_datetime_object=False, to_datetime=False, day=True):
    """
    Parse an order timestamp and derive a calendar feature from it.

    Parameters
    ----------
    datetime_string : str or datetime-like
        Timestamp whose ``str()`` form matches ``'%Y-%m-%d %H:%M:%S'``.
    from_datetime_object : bool, default False
        Not used by this function; kept so existing calls keep working.
    to_datetime : bool, default False
        When true, return the parsed ``datetime.datetime`` object. This takes
        precedence over ``day``.
    day : bool, default True
        When true, return the weekday name (``'Monday'`` .. ``'Sunday'``).
        When false, return the four-hour time group of the order:
        ``'Group 1'`` = 06-09h, ``'Group 2'`` = 10-13h, ``'Group 3'`` = 14-17h,
        ``'Group 4'`` = 18-21h, ``'Group 5'`` = 22-01h, ``'Group 6'`` = 02-05h.

    Returns
    -------
    datetime.datetime or str

    Raises
    ------
    ValueError
        If the text form of ``datetime_string`` does not match the format.
    """
    days = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")

    # Convert datetime string to datetime object
    datetime_object = datetime.datetime.strptime(str(datetime_string), '%Y-%m-%d %H:%M:%S')

    if to_datetime:
        return datetime_object

    if not day:
        # Read the hour from the parsed object rather than slicing the raw
        # argument, so datetime-like inputs work as well as strings.
        # Shifting by 6 makes 06:00 the start of slot 0; each slot is 4 hours.
        group_number = ((datetime_object.hour - 6) % 24) // 4 + 1
        return f'Group {group_number}'

    # weekday() is 0 for Monday, matching the order of `days`.
    return days[datetime_object.weekday()]

In [None]:
# Sanity check: an order placed at 22:24 should land in the 22-01h bucket ('Group 5').
day_hour_extract('2015-02-06 22:24:17', day=False)

In [None]:

def wrangle(csv_file):
    """
    Load the historical orders CSV and engineer the modelling features.

    Steps:
      1. Read the file and drop every row that has a missing value.
      2. Derive ``order_day`` (weekday name) and ``order_time_group``
         (four-hour bucket, 'Group 1' = 06-09h .. 'Group 6' = 02-05h) from
         ``created_at``.
      3. Compute ``total_delivery_time`` in seconds as
         ``actual_delivery_time - created_at``.
      4. Compute ``total_available_dashers`` as on-shift minus busy dashers,
         floored at zero.
      5. Drop the raw timestamp columns plus the columns judged collinear or
         too high-cardinality.

    Parameters
    ----------
    csv_file : str, path or file-like
        Anything accepted by ``pandas.read_csv``. Timestamps must use the
        ``'%Y-%m-%d %H:%M:%S'`` format.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with the engineered columns appended.
    """
    df = pd.read_csv(csv_file)

    # Drop rows with null values
    df.dropna(inplace=True)

    # Parse each timestamp column once, vectorised, instead of calling
    # strptime row by row (and several times per row).
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    created_at = pd.to_datetime(df['created_at'], format=timestamp_format)
    delivered_at = pd.to_datetime(df['actual_delivery_time'], format=timestamp_format)

    # Create columns for day of the week and time group for the order.
    # The bucket arithmetic mirrors day_hour_extract: shifting by 6 makes
    # 06:00 the start of slot 0, and each slot is four hours wide.
    df['order_day'] = created_at.dt.day_name()
    group_number = ((created_at.dt.hour - 6) % 24) // 4 + 1
    df['order_time_group'] = 'Group ' + group_number.astype(str)

    # Create column for total delivery time (seconds)
    df['total_delivery_time'] = (delivered_at - created_at).dt.total_seconds()

    # Calculate available dashers; a negative difference is treated as zero
    available = df['total_onshift_dashers'] - df['total_busy_dashers']
    df['total_available_dashers'] = available.clip(lower=0)

    # Drop high collinearity columns and high dimensionality columns
    drop_cols = ['created_at', 'actual_delivery_time', 'store_id', 'total_busy_dashers', 'total_onshift_dashers', 'estimated_order_place_duration',
                'num_distinct_items', 'min_item_price', 'max_item_price']
    return df.drop(columns=drop_cols)

In [None]:
# Import csv file
df = wrangle('../input/doordash-data/datasets/historical_data.csv')
print(len(df))
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df.info()
df.head()

In [None]:
# Correlate the numeric columns only: the frame still holds text columns
# (store category, order day, time group), and pandas >= 2.0 raises on them
# unless numeric_only=True is passed.
corr = df.corr(numeric_only=True)
sns.heatmap(corr)

In [None]:
# Number of orders recorded for each market_id value.
df['market_id'].value_counts()

## Additional wrangling and analysis

In [None]:
# Order counts for the 35 most frequent store categories.
df['store_primary_category'].value_counts().head(35)

The store category has over 70 distinct values. To keep the number of one-hot encoded columns manageable, we will relabel every category that is not in the top 30 as 'other'.

In [None]:
# Get the top 30 categories
top_30_cat = list(df['store_primary_category'].value_counts().head(30).index)

# Flag the rows whose category is outside the top 30. A boolean mask selects
# them directly instead of dropping the top-30 rows and reading back the index.
is_other = ~df['store_primary_category'].isin(top_30_cat)
others_index = df.index[is_other]

# Change the category type to other
df.loc[is_other, 'store_primary_category'] = 'other'
df['store_primary_category'].value_counts().head(30)

In [None]:
# Preview the first rows of the frame after the category regrouping.
df.head()

In [None]:
# Inspect column dtypes and non-null counts to decide which columns need casting.
df.info()

change order protocol to object
normalise the data
change market id to object

In [None]:
# Treat the ID-like columns as categorical labels rather than numbers
categorical_ids = ['market_id', 'order_protocol']
df[categorical_ids] = df[categorical_ids].astype(str)
df['order_protocol']

In [None]:
# Summary statistics, used to compare the value ranges of the features.
df.describe()

In [None]:
# Number of orders falling in each time-of-day group.
df['order_time_group'].value_counts()

Because the different features of the dataset are on different scales, the data will need to be normalised.

# Split into Feature and Target

In [None]:
# Separate the prediction target from the explanatory features
target = 'total_delivery_time'
X = df.drop(columns=[target])
y = df[target]
X.head()

### Horizontal split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
for split in (X_train, y_train):
    print(split.shape)

### Get baseline

In [None]:
# Naive baseline: predict the training-set mean for every order
baseline_value = y_train.mean()
mean_values = [baseline_value for _ in range(len(y_train))]
# The MAE of that constant prediction is the score the model has to beat
mae_baseline = mean_absolute_error(y_true=y_train, y_pred=mean_values)
mae_baseline

Our baseline MAE is 830.74

### Make pipeline for model

In [None]:
# Build the model: one-hot encode the categorical columns, then fit a linear regression.
# The step names are the ones make_pipeline would generate, so later
# named_steps lookups keep working.
model = Pipeline(
    steps=[
        ('onehotencoder', OneHotEncoder(use_cat_names=True)),
        ('linearregression', LinearRegression()),
    ]
)
model.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the data it was trained on
y_pred_train = model.predict(X_train)
mae_train = mean_absolute_error(y_true=y_train, y_pred=y_pred_train)
mae_train

This gave an MAE of 721.38 on the training set.

We beat the baseline, that is good.

In [None]:
# Score the fitted pipeline on the held-out test rows
y_pred_test = model.predict(X_test)
mae_test = mean_absolute_error(y_true=y_test, y_pred=y_pred_test)
mae_test

MAE = 710.31

We see that the model even did better on the test set than on the training set.

### Save Model

In [None]:
# Persist the fitted pipeline so it can be reloaded without retraining
with open('linear_reg_model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)
    print('Model saved')

## Presentation phase

More evaluation on the regression equation

Present the regression graphs

In [None]:
# Intercept term of the fitted linear regression step.
intercept = model.named_steps['linearregression'].intercept_
intercept

In [None]:
# Coefficients of the fitted linear regression step
# (presumably one per encoded feature column, in encoder output order; verify).
coefficients = model.named_steps['linearregression'].coef_
coefficients

In [None]:
# Names of the columns produced by the one-hot encoder.
# Newer category_encoders releases expose the scikit-learn style
# get_feature_names_out() and deprecate get_feature_names(); prefer the new
# method and fall back to the old one on older installs.
encoder = model.named_steps['onehotencoder']
if hasattr(encoder, 'get_feature_names_out'):
    feature_names = list(encoder.get_feature_names_out())
else:
    feature_names = encoder.get_feature_names()
feature_names

In [None]:
# Pair each encoded feature with its coefficient, largest positive effect first
feature_imp_df = pd.DataFrame({'feature_names': feature_names, 'coefficients': coefficients})
feature_imp_df = feature_imp_df.set_index('feature_names')
feature_imp_df = feature_imp_df.sort_values('coefficients', ascending=False)
feature_imp_df

From the above dataframe, we can identify the features that increase the total delivery time for an order (positive coefficients) and the features that decrease it (negative coefficients).

We can plot out the 10 most influential features in increasing total delivery time

In [None]:
# Bar chart of the ten largest coefficients (frame is sorted in descending order)
ax = feature_imp_df.head(10).plot.bar()
ax.set_title('Most Influential features for increased delivery time')
ax.set_ylabel('Feature Coefficient');

In [None]:
# Bar chart of the ten smallest coefficients (the tail of the descending sort)
ax = feature_imp_df.tail(10).plot.bar()
ax.set_title('Most Influential features for decreased delivery time')
ax.set_ylabel('Feature Coefficient');

In [None]:
# Bar chart of the fifteen coefficients with the largest magnitude, either sign
by_magnitude = feature_imp_df.sort_values('coefficients', ascending=False, key=abs)
ax = by_magnitude.head(15).plot.bar()
ax.set_title('Most Influential features for delivery time')
ax.set_ylim([-350, 800])
ax.set_ylabel('Feature Coefficient');