# Part I : EDA - Exploratory Data Analysis

## Part 1 Preliminary Stage: Imports

In [48]:
# Imports
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns


# Load the raw training data (path is relative to the working directory)
delivery_df = pd.read_csv('./model_train.csv')

## Part 1 Preliminary Stage: Model Description

This project employs a logistic regression model, which is a generalized linear model (GLM). The assumptions for this type of model are:

    Source: https://data.compass.lighthouselabs.ca/p/7/days/w05d3/activities/2297

    1. Data should be independent and identically distributed, i.e. drawn at random from the same population (each random variable has the same probability distribution).

    2. The response variable y does not need to be normally distributed, but the distribution is from an exponential family (e.g. binomial, Poisson, multinomial, normal)

    3. The original response variable need not have a linear relationship with the independent variables, but the transformed response variable (through the link function) is linearly dependent on the independent variables



 ## Part 1a: Are there any missing values in the dataset?

In [49]:
# Summarise dtype and non-null count per column; a non-null count below the
# total number of entries means that column has missing values.
delivery_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45593 entries, 0 to 45592
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   ID                           45593 non-null  object 
 1   Delivery_person_ID           45593 non-null  object 
 2   Delivery_person_Age          43739 non-null  float64
 3   Delivery_person_Ratings      45539 non-null  object 
 4   Restaurant_latitude          45593 non-null  float64
 5   Restaurant_longitude         45593 non-null  float64
 6   Delivery_location_latitude   45593 non-null  float64
 7   Delivery_location_longitude  45593 non-null  float64
 8   Order_Date                   45593 non-null  object 
 9   Time_Orderd                  43862 non-null  object 
 10  Time_Order_picked            45593 non-null  object 
 11  Weatherconditions            45593 non-null  object 
 12  Road_traffic_density         44992 non-null  object 
 13  Vehicle_conditio

In [50]:
# Preview the first 10 rows of the raw frame to see how each column is formatted
delivery_df.head(10)

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weatherconditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken(min)
0,0x4607,INDORES13DEL02,37.0,4.9,22.745049,75.892471,22.765049,75.912471,19-03-2022,11:30:00,11:45:00,conditions Sunny,High,2,Snack,motorcycle,0.0,No,Urban,(min) 24
1,0xb379,BANGRES18DEL02,34.0,4.5,12.913041,77.683237,13.043041,77.813237,25-03-2022,19:45:00,19:50:00,conditions Stormy,Jam,2,Snack,scooter,1.0,No,Metropolitian,(min) 33
2,0x5d6d,BANGRES19DEL01,23.0,4.4,12.914264,77.6784,12.924264,77.6884,19-03-2022,8:30:00,8:45:00,conditions Sandstorms,Low,0,Drinks,motorcycle,1.0,No,Urban,(min) 26
3,0x7a6a,COIMBRES13DEL02,38.0,4.7,11.003669,76.976494,11.053669,77.026494,5/4/2022,18:00:00,18:10:00,conditions Sunny,Medium,0,Buffet,motorcycle,1.0,No,Metropolitian,(min) 21
4,0x70a2,CHENRES12DEL01,32.0,4.6,12.972793,80.249982,13.012793,80.289982,26-03-2022,13:30:00,13:45:00,conditions Cloudy,High,1,Snack,scooter,1.0,No,Metropolitian,(min) 30
5,0x9bb4,HYDRES09DEL03,22.0,4.8,17.431668,78.408321,17.461668,78.438321,11/3/2022,21:20:00,21:30:00,conditions Cloudy,Jam,0,Buffet,motorcycle,1.0,No,Urban,(min) 26
6,0x95b4,RANCHIRES15DEL01,33.0,4.7,23.369746,85.33982,23.479746,85.44982,4/3/2022,19:15:00,19:30:00,conditions Fog,Jam,1,Meal,scooter,1.0,No,Metropolitian,(min) 40
7,0x9eb2,MYSRES15DEL02,35.0,4.6,12.352058,76.60665,12.482058,76.73665,14-03-2022,17:25:00,17:30:00,conditions Cloudy,Medium,2,Meal,motorcycle,1.0,No,Metropolitian,(min) 32
8,0x1102,HYDRES05DEL02,22.0,4.8,17.433809,78.386744,17.563809,78.516744,20-03-2022,20:55:00,21:05:00,conditions Stormy,Jam,0,Buffet,motorcycle,1.0,No,Metropolitian,(min) 34
9,0xcdcd,DEHRES17DEL01,36.0,4.2,30.327968,78.046106,30.397968,78.116106,12/2/2022,21:55:00,22:10:00,conditions Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,(min) 46


In [51]:
# Stage the Cell: work on a copy so the raw frame stays untouched
delivery_df2 = delivery_df.copy()

# Strip the literal "(min) " prefix from time taken and convert to integer.
# regex=False is passed explicitly so the parentheses are matched literally
# rather than being read as a regex group, whatever default the installed
# pandas version uses for `regex`.
delivery_df2['Time_taken(min)'] = (
    delivery_df2['Time_taken(min)']
    .str.replace('(min) ', '', regex=False)
    .str.strip()
    .astype(int)
)
delivery_df2['Time_taken(min)'].value_counts()

Time_taken(min)
26    2123
25    2050
27    1976
28    1965
29    1956
19    1824
15    1810
18    1765
16    1706
17    1696
24    1680
23    1643
20    1640
22    1626
21    1601
33    1259
30    1218
31    1213
34    1172
32    1124
38     887
36     852
39     847
35     832
37     828
11     757
10     750
12     746
14     739
13     716
43     567
42     561
40     555
41     553
44     553
47     295
49     280
48     277
46     274
45     241
53     100
51      94
54      91
52      79
50      72
Name: count, dtype: int64

In [52]:
# Stage the Cell: continue from the frame cleaned in the previous cell.
# Re-copying the raw `delivery_df` here would discard the integer conversion
# already applied to 'Time_taken(min)'.
delivery_df2 = delivery_df2.copy()

# Create Ordinal Hierarchy for city (0 = missing or blank)
city_rank = {'Metropolitian': 3, 'Urban': 2, 'Semi-Urban': 1, '': 0}

# Missing values are blanked first so they share the '' -> 0 rank, then every
# label is looked up by exact match. The result is assigned back to the column
# instead of calling replace(..., inplace=True) on a column selection, which
# does not update the parent frame when pandas Copy-on-Write is enabled.
# An unrecognised label maps to NaN, so astype('int') still raises instead of
# silently inventing a rank.
city_labels = delivery_df2['City'].str.strip().fillna('')
delivery_df2['City'] = city_labels.map(city_rank).astype('int')
delivery_df2['City'].value_counts()

City
3    34093
2    10136
0     1200
1      164
Name: count, dtype: int64

In [53]:
# Stage Type of vehicle for dummy input
delivery_df2 = delivery_df2.copy()

# Fold 'electric_scooter' into the plain 'scooter' category. Labels are
# stripped first and then matched exactly, and the result is assigned back to
# the column: replace(..., inplace=True) on a column selection does not update
# the parent frame when pandas Copy-on-Write is enabled.
vehicle_labels = delivery_df2['Type_of_vehicle'].str.strip()
delivery_df2['Type_of_vehicle'] = vehicle_labels.replace({'electric_scooter': 'scooter'})

# Shorter column name so the dummy columns created from it read 'Vehicle_*'
delivery_df2 = delivery_df2.rename(columns={'Type_of_vehicle': 'Vehicle'})
delivery_df2['Vehicle'].value_counts()

Vehicle
motorcycle    26435
scooter       19090
bicycle          68
Name: count, dtype: int64

In [54]:
# Get Dummies: build one indicator column per vehicle category, then append
# those columns to the right of the existing frame.
vehicle_dummies = pd.get_dummies(delivery_df2['Vehicle'], prefix='Vehicle')
delivery_df2 = pd.concat([delivery_df2.copy(), vehicle_dummies], axis=1)
delivery_df2.columns

Index(['ID', 'Delivery_person_ID', 'Delivery_person_Age',
       'Delivery_person_Ratings', 'Restaurant_latitude',
       'Restaurant_longitude', 'Delivery_location_latitude',
       'Delivery_location_longitude', 'Order_Date', 'Time_Orderd',
       'Time_Order_picked', 'Weatherconditions', 'Road_traffic_density',
       'Vehicle_condition', 'Type_of_order', 'Vehicle', 'multiple_deliveries',
       'Festival', 'City', 'Time_taken(min)', 'Vehicle_bicycle',
       'Vehicle_motorcycle', 'Vehicle_scooter'],
      dtype='object')

In [55]:
# Drop columns that are not used by the model
unused_columns = ['Vehicle', 'Festival', 'Vehicle_condition']
delivery_df2 = delivery_df2.copy().drop(columns=unused_columns)

In [56]:
delivery_df2 = delivery_df2.copy()

# Remove the literal 'conditions ' prefix and surrounding whitespace.
# The cleaned labels are built as a new Series and assigned back below:
# replace(..., inplace=True) on a column selection does not update the parent
# frame when pandas Copy-on-Write is enabled.
weather_labels = (
    delivery_df2['Weatherconditions']
    .str.replace('conditions ', '', regex=False)
    .str.strip()
)

# Create Ordinal Hierarchy. Codes are kept as strings (no integer cast is
# applied in this cell); '' is the label left when only the prefix was present.
weather_rank = {
    'Sandstorms': '6',
    'Stormy': '5',
    'Fog': '4',
    'Windy': '3',
    'Cloudy': '2',
    'Sunny': '1',
    '': '0',
}

# Exact-match lookup; labels not in the mapping are left unchanged.
delivery_df2['Weatherconditions'] = weather_labels.replace(weather_rank)

delivery_df2["Weatherconditions"].value_counts()


Weatherconditions
4    7654
5    7586
2    7536
6    7495
3    7422
1    7284
0     616
Name: count, dtype: int64

In [58]:
# Clean Date and Time Columns

from datetime import datetime

def combine_date_time(datecolumn, timecolumn, instance, df, dayfirst=True):
    """Combine a date column and a time-of-day column into one timestamp column.

    The frame is modified in place and also returned:
      * ``df[datecolumn]`` is replaced by its parsed datetime values,
      * ``df[timecolumn]`` is replaced by ``datetime.time`` values,
      * ``df['Timestamp_<instance>']`` is added, holding date + time of day
        (``NaT`` wherever either part is missing).

    Parameters
    ----------
    datecolumn : str
        Name of the column holding the order date.
    timecolumn : str
        Name of the column holding 'HH:MM:SS' strings.
    instance : str
        Suffix for the new column name, ``Timestamp_<instance>``.
    df : pandas.DataFrame
        Frame that contains both columns.
    dayfirst : bool, default True
        Read ambiguous dates as day/month/year. The dates in this data set mix
        'DD-MM-YYYY' and 'D/M/YYYY' spellings; without ``dayfirst`` a value
        such as '5/4/2022' is read month-first (4 May) while '19-03-2022' is
        read day-first, giving inconsistent dates within one column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # format='mixed' infers the format per element, so dayfirst is what keeps
    # the inference consistent across the two date spellings.
    order_dates = pd.to_datetime(df[datecolumn], dayfirst=dayfirst, format='mixed')
    clock = pd.to_datetime(df[timecolumn], format='%H:%M:%S')

    df[datecolumn] = order_dates
    df[timecolumn] = clock.dt.time

    # Time of day as an offset from midnight; added to the date at midnight.
    # NaT in either part propagates to NaT in the result.
    time_of_day = clock - clock.dt.normalize()
    df[f'Timestamp_{instance}'] = order_dates.dt.normalize() + time_of_day

    return df

# Build Timestamp_placed and Timestamp_picked from the shared order date and
# the respective time-of-day column.
for time_column, label in (('Time_Orderd', 'placed'), ('Time_Order_picked', 'picked')):
    delivery_df2 = combine_date_time('Order_Date', time_column, label, delivery_df2)


In [59]:
# Define a function to add a day if 'Timestamp_picked' is earlier than 'Timestamp_placed'

def add_day(row):
    """Return the row's pickup timestamp, moved one day later if it precedes the order.

    ``row`` must provide 'Timestamp_picked' and 'Timestamp_placed'. When the
    pickup is strictly earlier than the placement, the pickup plus one day is
    returned; otherwise the pickup is returned unchanged.
    """
    picked = row['Timestamp_picked']
    placed = row['Timestamp_placed']
    return picked + pd.DateOffset(days=1) if picked < placed else picked

# Replace each pickup timestamp with the value add_day returns for its row
picked_adjusted = delivery_df2.apply(add_day, axis=1)
delivery_df2['Timestamp_picked'] = picked_adjusted

# The raw date/time columns are now represented by the two Timestamp_* columns
raw_datetime_columns = ["Order_Date", 'Time_Order_picked', 'Time_Orderd']
delivery_df2 = delivery_df2.drop(columns=raw_datetime_columns)

Index(['ID', 'Delivery_person_ID', 'Delivery_person_Age',
       'Delivery_person_Ratings', 'Restaurant_latitude',
       'Restaurant_longitude', 'Delivery_location_latitude',
       'Delivery_location_longitude', 'Weatherconditions',
       'Road_traffic_density', 'Type_of_order', 'multiple_deliveries', 'City',
       'Time_taken(min)', 'Vehicle_bicycle', 'Vehicle_motorcycle',
       'Vehicle_scooter', 'Timestamp_placed', 'Timestamp_picked'],
      dtype='object')