In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
data = pd.read_csv("Data/amazon_delivery.csv")
data.head()

Unnamed: 0,Order_ID,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Order_Date,Order_Time,Pickup_Time,Weather,Traffic,Vehicle,Area,Delivery_Time,Category
0,ialx566343618,37,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,11:30:00,11:45:00,Sunny,High,motorcycle,Urban,120,Clothing
1,akqg208421122,34,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,19:45:00,19:50:00,Stormy,Jam,scooter,Metropolitian,165,Electronics
2,njpu434582536,23,4.4,12.914264,77.6784,12.924264,77.6884,2022-03-19,08:30:00,08:45:00,Sandstorms,Low,motorcycle,Urban,130,Sports
3,rjto796129700,38,4.7,11.003669,76.976494,11.053669,77.026494,2022-04-05,18:00:00,18:10:00,Sunny,Medium,motorcycle,Metropolitian,105,Cosmetics
4,zguw716275638,32,4.6,12.972793,80.249982,13.012793,80.289982,2022-03-26,13:30:00,13:45:00,Cloudy,High,scooter,Metropolitian,150,Toys


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43739 entries, 0 to 43738
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Order_ID         43739 non-null  object 
 1   Agent_Age        43739 non-null  int64  
 2   Agent_Rating     43685 non-null  float64
 3   Store_Latitude   43739 non-null  float64
 4   Store_Longitude  43739 non-null  float64
 5   Drop_Latitude    43739 non-null  float64
 6   Drop_Longitude   43739 non-null  float64
 7   Order_Date       43739 non-null  object 
 8   Order_Time       43739 non-null  object 
 9   Pickup_Time      43739 non-null  object 
 10  Weather          43648 non-null  object 
 11  Traffic          43739 non-null  object 
 12  Vehicle          43739 non-null  object 
 13  Area             43739 non-null  object 
 14  Delivery_Time    43739 non-null  int64  
 15  Category         43739 non-null  object 
dtypes: float64(5), int64(2), object(9)
memory usage: 5.3+ MB


In [4]:
data.describe()

Unnamed: 0,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Delivery_Time
count,43739.0,43685.0,43739.0,43739.0,43739.0,43739.0,43739.0
mean,29.567137,4.63378,17.21096,70.661177,17.459031,70.821842,124.905645
std,5.815155,0.334716,7.764225,21.475005,7.34295,21.153148,51.915451
min,15.0,1.0,-30.902872,-88.366217,0.01,0.01,10.0
25%,25.0,4.5,12.933298,73.170283,12.985996,73.28,90.0
50%,30.0,4.7,18.55144,75.898497,18.633626,76.002574,125.0
75%,35.0,4.9,22.732225,78.045359,22.785049,78.104095,160.0
max,50.0,6.0,30.914057,88.433452,31.054057,88.563452,270.0


In [5]:
miss_values = data.isnull().sum()
miss_values

Order_ID            0
Agent_Age           0
Agent_Rating       54
Store_Latitude      0
Store_Longitude     0
Drop_Latitude       0
Drop_Longitude      0
Order_Date          0
Order_Time          0
Pickup_Time         0
Weather            91
Traffic             0
Vehicle             0
Area                0
Delivery_Time       0
Category            0
dtype: int64

In [6]:
data.shape

(43739, 16)

In [7]:
missing_percent = (miss_values[miss_values > 0] / data.shape[0]) * 100
missing_percent

Agent_Rating    0.123460
Weather         0.208052
dtype: float64

In [8]:
data.columns = data.columns.str.strip() # remove any extra space

In [9]:
data.columns

Index(['Order_ID', 'Agent_Age', 'Agent_Rating', 'Store_Latitude',
       'Store_Longitude', 'Drop_Latitude', 'Drop_Longitude', 'Order_Date',
       'Order_Time', 'Pickup_Time', 'Weather', 'Traffic', 'Vehicle', 'Area',
       'Delivery_Time', 'Category'],
      dtype='object')

In [10]:
data.duplicated().any()

False

In [11]:
data.dropna(inplace=True)

In [12]:
# Store_Latitude columns have negative value 
Latitude_neg_value = data[data['Store_Latitude'] < 0]
print(len(Latitude_neg_value))

151


In [13]:
data['Store_Latitude'] = data['Store_Latitude'].abs()
data['Store_Longitude'] = data['Store_Longitude'].abs()

In [14]:
# Convert date/time columns
data['Order_DateTime'] = pd.to_datetime(data['Order_Date'] + ' ' + data['Order_Time'])
data['Pickup_DateTime'] = pd.to_datetime(data['Order_Date'] + ' ' + data['Pickup_Time'])

In [None]:
# Calculate time to pickup
data['Time_to_Pickup'] = (data['Pickup_DateTime'] - data['Order_DateTime']).dt.total_seconds() / 60 


In [16]:
# Drop original date/time columns
data.drop(['Order_Date', 'Order_Time', 'Pickup_Time'], axis=1, inplace=True)

In [17]:
data.head()

Unnamed: 0,Order_ID,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Weather,Traffic,Vehicle,Area,Delivery_Time,Category,Order_DateTime,Pickup_DateTime,Time_to_Pickup
0,ialx566343618,37,4.9,22.745049,75.892471,22.765049,75.912471,Sunny,High,motorcycle,Urban,120,Clothing,2022-03-19 11:30:00,2022-03-19 11:45:00,15.0
1,akqg208421122,34,4.5,12.913041,77.683237,13.043041,77.813237,Stormy,Jam,scooter,Metropolitian,165,Electronics,2022-03-25 19:45:00,2022-03-25 19:50:00,5.0
2,njpu434582536,23,4.4,12.914264,77.6784,12.924264,77.6884,Sandstorms,Low,motorcycle,Urban,130,Sports,2022-03-19 08:30:00,2022-03-19 08:45:00,15.0
3,rjto796129700,38,4.7,11.003669,76.976494,11.053669,77.026494,Sunny,Medium,motorcycle,Metropolitian,105,Cosmetics,2022-04-05 18:00:00,2022-04-05 18:10:00,10.0
4,zguw716275638,32,4.6,12.972793,80.249982,13.012793,80.289982,Cloudy,High,scooter,Metropolitian,150,Toys,2022-03-26 13:30:00,2022-03-26 13:45:00,15.0


In [18]:
data.head()

Unnamed: 0,Order_ID,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Weather,Traffic,Vehicle,Area,Delivery_Time,Category,Order_DateTime,Pickup_DateTime,Time_to_Pickup
0,ialx566343618,37,4.9,22.745049,75.892471,22.765049,75.912471,Sunny,High,motorcycle,Urban,120,Clothing,2022-03-19 11:30:00,2022-03-19 11:45:00,15.0
1,akqg208421122,34,4.5,12.913041,77.683237,13.043041,77.813237,Stormy,Jam,scooter,Metropolitian,165,Electronics,2022-03-25 19:45:00,2022-03-25 19:50:00,5.0
2,njpu434582536,23,4.4,12.914264,77.6784,12.924264,77.6884,Sandstorms,Low,motorcycle,Urban,130,Sports,2022-03-19 08:30:00,2022-03-19 08:45:00,15.0
3,rjto796129700,38,4.7,11.003669,76.976494,11.053669,77.026494,Sunny,Medium,motorcycle,Metropolitian,105,Cosmetics,2022-04-05 18:00:00,2022-04-05 18:10:00,10.0
4,zguw716275638,32,4.6,12.972793,80.249982,13.012793,80.289982,Cloudy,High,scooter,Metropolitian,150,Toys,2022-03-26 13:30:00,2022-03-26 13:45:00,15.0


In [19]:
data['Order_Hour'] = data['Order_DateTime'].dt.hour
data['Order_Day'] = data['Order_DateTime'].dt.day
data['Order_Month'] = data['Order_DateTime'].dt.month
data['Order_Weekday'] = data['Order_DateTime'].dt.weekday

In [20]:
data.head()

Unnamed: 0,Order_ID,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Weather,Traffic,Vehicle,Area,Delivery_Time,Category,Order_DateTime,Pickup_DateTime,Time_to_Pickup,Order_Hour,Order_Day,Order_Month,Order_Weekday
0,ialx566343618,37,4.9,22.745049,75.892471,22.765049,75.912471,Sunny,High,motorcycle,Urban,120,Clothing,2022-03-19 11:30:00,2022-03-19 11:45:00,15.0,11,19,3,5
1,akqg208421122,34,4.5,12.913041,77.683237,13.043041,77.813237,Stormy,Jam,scooter,Metropolitian,165,Electronics,2022-03-25 19:45:00,2022-03-25 19:50:00,5.0,19,25,3,4
2,njpu434582536,23,4.4,12.914264,77.6784,12.924264,77.6884,Sandstorms,Low,motorcycle,Urban,130,Sports,2022-03-19 08:30:00,2022-03-19 08:45:00,15.0,8,19,3,5
3,rjto796129700,38,4.7,11.003669,76.976494,11.053669,77.026494,Sunny,Medium,motorcycle,Metropolitian,105,Cosmetics,2022-04-05 18:00:00,2022-04-05 18:10:00,10.0,18,5,4,1
4,zguw716275638,32,4.6,12.972793,80.249982,13.012793,80.289982,Cloudy,High,scooter,Metropolitian,150,Toys,2022-03-26 13:30:00,2022-03-26 13:45:00,15.0,13,26,3,5


In [21]:
data.drop(columns=['Order_ID', 'Order_DateTime', 'Pickup_DateTime'],axis=1,inplace=True)

In [22]:
X = data.drop(['Delivery_Time'], axis=1)
y = data['Delivery_Time']

In [23]:
categorical_features = [feature for feature in data.columns if data[feature].dtype == "object"]
categorical_features

['Weather', 'Traffic', 'Vehicle', 'Area', 'Category']

In [24]:
numerical_features  = [feature for feature in data.columns if data[feature].dtype != "object"]
numerical_features

['Agent_Age',
 'Agent_Rating',
 'Store_Latitude',
 'Store_Longitude',
 'Drop_Latitude',
 'Drop_Longitude',
 'Delivery_Time',
 'Time_to_Pickup',
 'Order_Hour',
 'Order_Day',
 'Order_Month',
 'Order_Weekday']

In [25]:
numerical_features =['Agent_Age',
 'Agent_Rating',
 'Store_Latitude',
 'Store_Longitude',
 'Drop_Latitude',
 'Drop_Longitude',
 'Time_to_Pickup',
 'Order_Hour',
 'Order_Day',
 'Order_Month',
 'Order_Weekday']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor 
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [27]:
preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])

In [28]:
import joblib
joblib.dump(preprocessor,"preprocessor.pkl")

['preprocessor.pkl']

In [29]:
models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=200, random_state=42),
        'Gradient Boosting': GradientBoostingRegressor(n_estimators=200, random_state=42),
        'XGBoost': XGBRegressor(n_estimators=200, random_state=42)
    }

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

In [31]:
X_train_pre = preprocessor.fit_transform(X_train)
X_test_pre = preprocessor.transform(X_test)

In [32]:
rf = RandomForestRegressor()
rf.fit(X_train_pre,y_train)
print(rf.score(X_train_pre,y_train))
prd= rf.predict(X_test_pre)
r2_score(y_test,prd)

0.9657386438032822


0.7505925654309935

In [33]:
xgb = XGBRegressor()
xgb.fit(X_train_pre,y_train)
print(xgb.score(X_train_pre,y_train))
prd= xgb.predict(X_test_pre)
r2_score(y_test,prd)

0.8514423370361328


0.7905992865562439

In [34]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import mlflow
import mlflow.sklearn
import streamlit as st
import matplotlib.pyplot as plt
import seaborn as sns

In [35]:
best_model = None
best_r2 = -np.inf
mlflow.set_experiment("Amazon_Delivery_Time_Prediction")


for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', model)
        ])
        
        
        pipeline.fit(X_train, y_train)
        
        
        y_pred = pipeline.predict(X_test)
        
        
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
       
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("R2", r2)
        
        
        mlflow.sklearn.log_model(pipeline, model_name)
        
        
        if r2 > best_r2:
            best_r2 = r2
            best_model = pipeline
            
        print(f"{model_name}:")
        print(f"  RMSE: {rmse:.2f}")
        print(f"  MAE: {mae:.2f}")
        print(f"  R2: {r2:.4f}")



Linear Regression:
  RMSE: 32.18
  MAE: 25.51
  R2: 0.6063
Random Forest:
  RMSE: 25.57
  MAE: 19.55
  R2: 0.7513
Gradient Boosting:
  RMSE: 26.26
  MAE: 20.41
  R2: 0.7379
XGBoost:
  RMSE: 23.87
  MAE: 18.66
  R2: 0.7834


In [36]:

if best_model:
    best_model_name = best_model.named_steps['regressor'].__class__.__name__
    print(f"Best model based on R2: {best_model_name}")
    print(f"Best R2 score: {best_r2:.4f}")
else:
    print("No model was evaluated.")


Best model based on R2: XGBRegressor
Best R2 score: 0.7834


In [37]:
categorical_features

['Weather', 'Traffic', 'Vehicle', 'Area', 'Category']