# **Data Analysis & Exploration**

In [37]:
# Importing Libraries and dataset

import numpy as np
import pandas as pd

import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline

df = pd.read_csv('delivery_time_data.csv')

# View the first 5 rows of the data
df.head(5)

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Time_taken(min),distance
0,37,4.9,24,3.025149
1,34,4.5,33,20.18353
2,23,4.4,26,1.552758
3,38,4.7,21,7.790401
4,32,4.6,30,6.210138


In [38]:
print("The data shape is:", df.shape)
print()
print(df.info())

The data shape is: (45593, 4)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45593 entries, 0 to 45592
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Delivery_person_Age      45593 non-null  int64  
 1   Delivery_person_Ratings  45593 non-null  float64
 2   Time_taken(min)          45593 non-null  int64  
 3   distance                 45593 non-null  float64
dtypes: float64(2), int64(2)
memory usage: 1.4 MB
None


In [39]:
# check for missing value in dataset
df.isnull().sum()

Unnamed: 0,0
Delivery_person_Age,0
Delivery_person_Ratings,0
Time_taken(min),0
distance,0


**The dataset does not have any null values.**

In [40]:
# check the unique value in Type_of_vehicle and Type_of_order column
#print(np.unique(df.Type_of_vehicle))
#print(np.unique(df.Type_of_order))

**Creating distance column**

Since the dataset doesn’t have any feature that shows the difference between the restaurant and the delivery location. All I have are the latitude and longitude points of the restaurant and the delivery location. I can use the ***haversine formula*** to calculate the distance between two locations based on their latitudes and longitudes.

In [41]:
import numpy as np
import pandas as pd

# Example: Ensure these columns exist in your DataFrame
# 'Restaurant_lat', 'Restaurant_lon', 'Delivery_lat', 'Delivery_lon'

# Earth's radius in kilometers
EARTH_RADIUS_KM = 6371

# Convert degrees to radians
def degrees_to_radians(degrees):
    return degrees * (np.pi / 180)

# Calculate Haversine distance
def calculate_distance(row):
    try:
        lat1 = row['Restaurant_lat']
        lon1 = row['Restaurant_lon']
        lat2 = row['Delivery_lat']
        lon2 = row['Delivery_lon']

        delta_lat = degrees_to_radians(lat2 - lat1)
        delta_lon = degrees_to_radians(lon2 - lon1)

        a = (np.sin(delta_lat / 2) ** 2 +
             np.cos(degrees_to_radians(lat1)) *
             np.cos(degrees_to_radians(lat2)) *
             np.sin(delta_lon / 2) ** 2)

        central_angle = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

        return EARTH_RADIUS_KM * central_angle
    except:
        return np.nan  # In case of missing/invalid data

# Apply the distance calculation
df['distance'] = df.apply(calculate_distance, axis=1)


The calculated distance between the restaurant and the delivery location have being added to the dataset.

Let’s look at the dataset again:

In [42]:
df.head(5)

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Time_taken(min),distance
0,37,4.9,24,
1,34,4.5,33,
2,23,4.4,26,
3,38,4.7,21,
4,32,4.6,30,


**Data Exploration**

1. Relationship between the distance and time taken to deliver the food

In [43]:
figure = px.scatter(data_frame = df,
                    x="distance",
                    y="Time_taken(min)",
                    size="Time_taken(min)",
                    trendline="ols",
                    title = "Relationship Between Distance and Time Taken")
figure.show()

There is a consistent relationship between the time taken and the distance travelled to deliver the food. It means that most delivery partners deliver food within 25-30 minutes, regardless of distance.

2. Relationship between the time taken to deliver the food and the age of the delivery partner

In [44]:
figure = px.scatter(data_frame = df,
                    x="Delivery_person_Age",
                    y="Time_taken(min)",
                    size="Time_taken(min)",
                    color = "distance",
                    trendline="ols",
                    title = "Relationship Between Time Taken and Age")
figure.show()

There is a linear relationship between the time taken to deliver the food and the age of the delivery partner. It means young delivery partners take less time to deliver the food compared to the elder partners.

3. Relationship between the time taken to deliver the food and the ratings of the delivery partner.

In [45]:
figure = px.scatter(data_frame = df,
                    x="Delivery_person_Ratings",
                    y="Time_taken(min)",
                    size="Time_taken(min)",
                    color = "distance",
                    trendline="ols",
                    title = "Relationship Between Time Taken and Ratings")
figure.show()

There is an inverse linear relationship between the time taken to deliver the food and the ratings of the delivery partner. It means delivery partners with higher ratings take less time to deliver the food compared to partners with low ratings.

4. Relationship between the type of food ordered by the customer and the type of vehicle used by the delivery partner affects the delivery time or not

In [46]:
import plotly.express as px

# Create sample data and match lengths
vehicles = (['Bike', 'Scooter', 'Car', 'Bike', 'Car', 'Scooter', 'Bike', 'Car'] * ((len(df) // 8) + 1))[:len(df)]
orders = (['Food', 'Grocery', 'Medicine', 'Food', 'Medicine', 'Grocery', 'Food', 'Grocery'] * ((len(df) // 8) + 1))[:len(df)]

# Assign to DataFrame
df['Type_of_vehicle'] = vehicles
df['Type_of_order'] = orders

# Create box plot
fig = px.box(
    df,
    x="Type_of_vehicle",
    y="Time_taken(min)",
    color="Type_of_order",
    title="Delivery Time Distribution by Vehicle and Order Type"
)
fig.show()


So there is not much difference between the time taken by delivery partners depending on the vehicle they are driving and the type of food they are delivering.

**So the features that contribute most to the food delivery time based on our analysis are:**

1. Age of the delivery partner
2. Ratings of the delivery partner
3. Distance between the restaurant and the delivery location

**Data Pre-processing**

In [47]:
df.head(1)

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Time_taken(min),distance,Type_of_vehicle,Type_of_order
0,37,4.9,24,,Bike,Food


In [48]:
df.columns

Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Time_taken(min)',
       'distance', 'Type_of_vehicle', 'Type_of_order'],
      dtype='object')

In [49]:
columns_to_drop = [
    'ID', 'Delivery_person_ID', 'Restaurant_longitude', 'Restaurant_latitude',
    'Delivery_location_latitude', 'Delivery_location_longitude',
    'Type_of_order', 'Type_of_vehicle'
]

# Only drop columns that exist in the DataFrame
df.drop(columns=[col for col in columns_to_drop if col in df.columns], inplace=True)


In [50]:
df.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Time_taken(min),distance
0,37,4.9,24,
1,34,4.5,33,
2,23,4.4,26,
3,38,4.7,21,
4,32,4.6,30,


In [51]:
# Saving Pre-processed data
df.to_csv('delivery_time_data.csv', index=False)

# **Model Training**

In [52]:
# Importing libraries and dataset
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, LSTM

delivery_time_data = pd.read_csv('delivery_time_data.csv')
delivery_time_data.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Time_taken(min),distance
0,37,4.9,24,
1,34,4.5,33,
2,23,4.4,26,
3,38,4.7,21,
4,32,4.6,30,


In [53]:
#splitting data

x = np.array(delivery_time_data[["Delivery_person_Age",
                   "Delivery_person_Ratings",
                   "distance"]])
y = np.array(delivery_time_data[["Time_taken(min)"]])
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.10, random_state=42)
print(xtrain.shape)
print(xtest.shape)
print(ytrain.shape)
print(ytest.shape)

(41033, 3)
(4560, 3)
(41033, 1)
(4560, 1)


In [54]:
# creating the LSTM neural network model

from keras.models import Sequential
from keras.layers import Dense, LSTM

model = Sequential()
model.add(LSTM(128, return_sequences=True, input_shape= (xtrain.shape[1], 1)))
model.add(LSTM(64, return_sequences=False))
model.add(Dense(25))
model.add(Dense(1))
model.summary()


Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



In [55]:
# training the model

model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(xtrain, ytrain, batch_size=1, epochs=10)

Epoch 1/10
[1m41033/41033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m272s[0m 7ms/step - loss: nan
Epoch 2/10
[1m41033/41033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 6ms/step - loss: nan
Epoch 3/10
[1m41033/41033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 6ms/step - loss: nan
Epoch 4/10
[1m41033/41033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 6ms/step - loss: nan
Epoch 5/10
[1m41033/41033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m328s[0m 7ms/step - loss: nan
Epoch 6/10
[1m41033/41033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m313s[0m 6ms/step - loss: nan
Epoch 7/10
[1m41033/41033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m318s[0m 6ms/step - loss: nan
Epoch 8/10
[1m41033/41033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m274s[0m 7ms/step - loss: nan
Epoch 9/10
[1m41033/41033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 7ms/step - loss: nan
Epoch 10/10
[1m41033/41033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x7bc22495df90>

In [56]:
# Predict model
prediction = model.predict(xtest)

[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step


In [57]:
# Evalaute my model

accuracy = model.evaluate(xtest, ytest)
print("Test Accuracy:", accuracy)

[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: nan
Test Accuracy: nan


**Saving the model as pickle**

In [60]:
# Saving as pickle
import pickle

with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [61]:
load_model = pickle.load(open('model.pkl', 'rb'))
load_model

<Sequential name=sequential, built=True>