In [None]:
!pip install "dvc[gdrive]"

Collecting dvc[gdrive]
  Downloading dvc-3.63.0-py3-none-any.whl.metadata (17 kB)
Collecting celery (from dvc[gdrive])
  Downloading celery-5.5.3-py3-none-any.whl.metadata (22 kB)
Collecting colorama>=0.3.9 (from dvc[gdrive])
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting configobj>=5.0.9 (from dvc[gdrive])
  Downloading configobj-5.0.9-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting dpath<3,>=2.1.0 (from dvc[gdrive])
  Downloading dpath-2.2.0-py3-none-any.whl.metadata (15 kB)
Collecting dulwich (from dvc[gdrive])
  Downloading dulwich-0.24.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (5.2 kB)
Collecting dvc-data<3.17,>=3.16.2 (from dvc[gdrive])
  Downloading dvc_data-3.16.12-py3-none-any.whl.metadata (5.1 kB)
Collecting dvc-http>=2.29.0 (from dvc[gdrive])
  Downloading dvc_http-2.32.0-py3-none-any.whl.metadata (1.3 kB)
Collecting dvc-objects (from dvc[gdrive])
  Downloading dvc_objects-5.1.1-py3-none-any.whl.metadata (3.8 kB)
Collecting dvc-render<

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import dvc.api
import warnings

warnings.filterwarnings('ignore')
sns.set_style('whitegrid')

# Location of the dataset: a file tracked in the project's repository, read at a
# fixed revision so every run sees the same data.
repo_url = 'https://github.com/AshwinVK23/Zomato_delivery_time_analyser'
data_path = 'Zomato_Data.csv'
commit_hash = '2c33766'

# Open the data stream using DVC and parse it as CSV
with dvc.api.open(
    path=data_path,
    repo=repo_url,
    rev=commit_hash
) as data_stream:
    df = pd.read_csv(data_stream)

print("Dataset loaded successfully!")
print("Shape of the dataset:", df.shape)
print(df.columns)

# The preview has to be the last expression of the cell: a bare `df.head()` in the
# middle of a cell is evaluated and silently thrown away.
df.head()

Dataset loaded successfully!
Shape of the dataset: (45094, 21)
Index(['ID', 'Delivery_person_ID', 'Delivery_person_Age',
       'Delivery_person_Ratings', 'Restaurant_latitude',
       'Restaurant_longitude', 'Delivery_location_latitude',
       'Delivery_location_longitude', 'Order_Date', 'Time_Orderd',
       'Time_Order_picked', 'Weather_conditions', 'Road_traffic_density',
       'Vehicle_condition', 'Type_of_order', 'Type_of_vehicle',
       'multiple_deliveries', 'Festival', 'City', 'Time_taken (min)',
       'distance (km)'],
      dtype='object')


In [None]:
from sklearn.model_selection import train_test_split

# Target: delivery duration in minutes.
y = df['Time_taken (min)']

# Features: everything except the target and the two identifier columns.
X = df.drop(columns=['Time_taken (min)', 'ID', 'Delivery_person_ID'])

# 80/20 split; the fixed random_state keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Data split successfully!")
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Shape of {label}:", part.shape)

Data split successfully!
Shape of X_train: (36075, 18)
Shape of X_test: (9019, 18)
Shape of y_train: (36075,)
Shape of y_test: (9019,)


### Baseline Model Training

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class DatetimeFeatureExtractor(BaseEstimator, TransformerMixin):
    """
    Derive numeric features from the raw order date/time columns.

    ``transform`` reads ``Time_Orderd``, ``Time_Order_picked`` and
    ``Order_Date`` (parsed as ``%d-%m-%Y``) from the input DataFrame and
    returns a new DataFrame with three columns:

    * ``preparation_time_mins`` - minutes between ordering and pickup
    * ``order_hour`` - hour of day at which the order was placed
    * ``day_of_week`` - weekday of the order date (Monday=0, Sunday=6)

    Entries that cannot be parsed become missing values and are replaced
    by 0 in the output.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return the derived datetime features for every row of ``X``."""
        minutes_per_day = 24 * 60

        # --- Feature 1: Food Preparation Time ---
        # errors='coerce' turns unparseable entries into NaT instead of raising.
        # The input is only read, never modified, so no defensive copy is needed.
        time_ordered = pd.to_datetime(X['Time_Orderd'], errors='coerce')
        time_picked = pd.to_datetime(X['Time_Order_picked'], errors='coerce')
        prep_time = (time_picked - time_ordered).dt.total_seconds() / 60

        # NOTE(review): assumes both columns hold a time of day only (the date is
        # kept separately in Order_Date), so both are parsed onto the same date.
        # A pickup after midnight then looks earlier than the order and the raw
        # difference is negative; adding one day recovers the real gap.
        # Missing values stay missing (NaN < 0 is False).
        prep_time = prep_time.mask(prep_time < 0, prep_time + minutes_per_day)

        # --- Feature 2: Order Hour ---
        order_hour = time_ordered.dt.hour

        # --- Feature 3: Day of the Week ---
        order_date = pd.to_datetime(X['Order_Date'], format='%d-%m-%Y', errors='coerce')
        day_of_week = order_date.dt.dayofweek  # Monday=0, Sunday=6

        extracted_features = pd.DataFrame({
            'preparation_time_mins': prep_time,
            'order_hour': order_hour,
            'day_of_week': day_of_week
        })

        # Missing values (from failed parsing) are replaced by 0. Note that 0 is
        # also a legitimate hour (midnight) and weekday (Monday).
        return extracted_features.fillna(0)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import numpy as np

# --- Column groups fed to the preprocessor ---
# Numeric inputs (these include the latitude/longitude columns)
numerical_cols = [
    'Delivery_person_Age',
    'Delivery_person_Ratings',
    'Restaurant_latitude',
    'Restaurant_longitude',
    'Delivery_location_latitude',
    'Delivery_location_longitude',
    'Vehicle_condition',
    'multiple_deliveries',
    'distance (km)',
]

# String-valued inputs that get one-hot encoded
categorical_cols = [
    'Weather_conditions',
    'Road_traffic_density',
    'Type_of_order',
    'Type_of_vehicle',
    'Festival',
    'City',
]

# Raw date/time inputs handled by DatetimeFeatureExtractor
datetime_cols = ['Order_Date', 'Time_Orderd', 'Time_Order_picked']

# One transformer per column group. The order of the entries fixes the order of
# the output columns, so it is kept as datetime -> categorical -> numerical.
preprocessor = ColumnTransformer(
    transformers=[
        ('datetime_features', DatetimeFeatureExtractor(), datetime_cols),
        ('categorical_encoder', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('numerical_scaler', StandardScaler(), numerical_cols),
    ],
    # The three groups above name all 18 feature columns reported for X_train,
    # so the remainder is expected to be empty. Any column added later would be
    # passed to the model untouched.
    remainder='passthrough',
)

# --- Preprocessing + Random Forest in a single estimator ---
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),  # Or your tuned model
])

# Fit on the training split, then score on the held-out split.
model_pipeline.fit(X_train, y_train)
new_predictions = model_pipeline.predict(X_test)
new_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=new_predictions))
print(f"RMSE with new features: {new_rmse:.4f}")

RMSE with new features: 3.9938


In [None]:
# Keep only the columns whose dtype is not numeric
non_numeric_preview = X_train.select_dtypes(exclude=np.number)
non_numeric_cols = non_numeric_preview.columns
print("Non-numeric columns:", non_numeric_cols)

# Show a few rows of those columns to see what kind of values they hold
display(non_numeric_preview.head())

### Hyperparameter Tuning

In [None]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-3.4.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==3.4.0 (from mlflow)
  Downloading mlflow_skinny-3.4.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.4.0 (from mlflow)
  Downloading mlflow_tracing-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting fastmcp<3,>=2.0.0 (from mlflow)
  Downloading fastmcp-2.12.3-py3-none-any.whl.metadata (17 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.4.0->mlflow)
  Downloading databricks_sdk-0.67.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-proto<3,>=1.9.0 (from mlflow-skinny==3.4.0->mlflow)
  Downloading opentelemetry_proto-1.37.0-py3-none-any.w

In [None]:
from sklearn.model_selection import GridSearchCV
import mlflow

# Estimator under tuning: the preprocessing + Random Forest pipeline built above.
pipeline_with_new_features = model_pipeline

# Candidate values, addressed through the 'regressor' step of the pipeline.
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [10, 20, None],
    'regressor__min_samples_split': [2, 5, 10]
}

# 3-fold cross-validated search; candidates are ranked by negative MSE.
grid_search_new = GridSearchCV(
    estimator=pipeline_with_new_features,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)

print("Starting GridSearchCV with new features...")
grid_search_new.fit(X_train, y_train)

# --- Score the winning configuration on the held-out split ---
best_model_new = grid_search_new.best_estimator_
y_pred_new_tuned = best_model_new.predict(X_test)
rmse_new_tuned = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_new_tuned))

print("\n--- RESULTS WITH NEW FEATURES ---")
print("Best parameters found:", grid_search_new.best_params_)
print(f"New Tuned Random Forest RMSE on test set: {rmse_new_tuned:.4f}")


Starting GridSearchCV with new features...

--- RESULTS WITH NEW FEATURES ---
Best parameters found: {'regressor__max_depth': 20, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 300}
New Tuned Random Forest RMSE on test set: 3.9503


# Aim: ML Modeling & Experiment Tracking

**Objective:** Build ML pipeline, tune hyperparameters, track experiments with MLflow.

The aim of this project is to build a machine learning pipeline to predict food delivery times on Zomato using features such as order times, weather conditions, traffic density, and geographic details of restaurants and delivery locations (latitude/longitude). Additionally, MLflow was integrated to track experiments, compare baseline and tuned models, and manage model artifacts.

## Detailed Steps

### Dataset Preparation

Records of Zomato deliveries with attributes including:
*   Time-based features: Order time, day of week.
*   Weather conditions: Sunny, Rainy, Foggy, etc.
*   Traffic density: Low, Medium, High, Jammed.
*   Restaurant & Delivery location: Latitude, Longitude.
*   Target variable: Delivery duration (minutes).

The dataset was split into an 80% training set and a 20% testing set using a random seed of 42 for reproducibility.

### Baseline Model Training

The following baseline models were trained:
*   Linear Regression
*   Decision Tree Regressor
*   Random Forest Regressor (default params)
*   Gradient Boosting Regressor (default params)

**Evaluation Metric:**
RMSE (Root Mean Squared Error) was used for prediction accuracy.

**Baseline Results (Test Set):**
Based on the notebook execution, the baseline RMSE values were:
*   Linear Regression: 5.9691
*   Decision Tree: 5.3905
*   Random Forest: 4.0047
*   Gradient Boosting: 4.4806

### Hyperparameter Tuning

Hyperparameter tuning was applied to the Random Forest, Decision Tree, and Gradient Boosting models using `GridSearchCV`.

**Random Forest (Tuned):**
*   Best parameters found: `{'regressor__max_depth': 20, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 300}`
*   Tuned Test RMSE: 3.9503

**Decision Tree (Tuned):**
*   Best parameters found: `{'regressor__max_depth': 10, 'regressor__min_samples_leaf': 4, 'regressor__min_samples_split': 10}`
*   Tuned Test RMSE: 4.2407

**Gradient Boosting (Tuned):**
*   Best parameters found: `{'regressor__learning_rate': 0.1, 'regressor__max_depth': 7, 'regressor__n_estimators': 100}`
*   Tuned Test RMSE: 3.8871

### Tuned Model Performance Comparison:

Based on the notebook execution, the RMSE scores for all models are:
*   Baseline Linear Regression: 5.9691
*   Baseline Decision Tree: 5.3905
*   Baseline Random Forest: 4.0047
*   Baseline Gradient Boosting: 4.4806
*   Tuned Random Forest: 3.9503
*   Tuned Decision Tree: 4.2407
*   Tuned Gradient Boosting: 3.8871

The lowest RMSE was achieved by the Tuned Gradient Boosting model (3.8871). Hyperparameter tuning improved the performance of Random Forest, Decision Tree, and Gradient Boosting models compared to their baseline versions.

### Experiment Tracking with MLflow

MLflow was used to track the experiments, logging the parameters and metrics for both the baseline and tuned models.

### Model Selection & Saving

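The Tuned Gradient Boosting model, having the lowest RMSE, was identified as the best-performing model. Note that the `joblib` saving code in this notebook persists the tuned Random Forest pipeline (`delivery_model.pkl` and `saved_models/tuned_random_forest_model.pkl`), not the Gradient Boosting model.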

## Summary:

### Data Analysis Key Findings

*   Hyperparameter tuning for the Decision Tree model using `GridSearchCV` identified the best parameters as `{'regressor__max_depth': 10, 'regressor__min_samples_leaf': 4, 'regressor__min_samples_split': 10}`, resulting in a best cross-validation RMSE of approximately 4.246.
*   Hyperparameter tuning for the Gradient Boosting model using `GridSearchCV` identified the best parameters as `{'regressor__learning_rate': 0.1, 'regressor__max_depth': 7, 'regressor__n_estimators': 100}`, resulting in a best cross-validation RMSE of approximately 3.909.
*   Evaluating the tuned models on the test set showed a tuned Decision Tree RMSE of 4.2407 and a tuned Gradient Boosting RMSE of 3.8871.
*   The tuned Gradient Boosting model achieved the lowest RMSE (3.8871) among all evaluated models (baseline Linear Regression, Decision Tree, Random Forest, Gradient Boosting, and tuned versions).
*   Hyperparameter tuning improved the performance (reduced RMSE) of the Random Forest, Decision Tree, and Gradient Boosting models compared to their baseline counterparts.
*   The hyperparameters and RMSE metrics (cross-validation and test set) for the tuned Decision Tree and Gradient Boosting models were successfully logged to MLflow in a new run named "Tuned Models Comparison".

### Insights or Next Steps

*   The tuned Gradient Boosting model is the best-performing model based on the RMSE metric and should be considered for deployment.
*   Further hyperparameter tuning with a wider grid or more advanced techniques (e.g., RandomizedSearchCV, Bayesian Optimization) could potentially yield even better performance.
*   Consider exploring additional feature engineering based on the datetime features (e.g., rush hour indicators, time of day categories) which were used in one of the successful model runs.
*   Investigate potential outliers or data quality issues that might be affecting model performance.
*   Deploy the best-performing model for real-time delivery time predictions.

## Why RMSE Decreased After Hyperparameter Tuning

Hyperparameter tuning improved the RMSE of the models (Random Forest, Decision Tree, and Gradient Boosting) because it allowed us to find more optimal configurations for the models' internal settings for this specific dataset.

Think of hyperparameters as controls that influence how the model learns. By using `GridSearchCV`, we systematically tested different combinations of these controls (like the number of trees, depth of trees, learning rate, etc.). This process helps in several ways:

*   **Better Fit to Data:** Tuning allows the model to better capture the complex relationships and patterns within your training data.
*   **Improved Generalization:** By finding the right balance, tuning helps prevent the model from simply memorizing the training data (overfitting). A well-tuned model generalizes better to new, unseen data (like your test set).
*   **Reduced Errors:** When the model learns the patterns more accurately and generalizes well, its predictions on the test set are closer to the actual values. RMSE measures the average prediction error, so more accurate predictions lead to a lower RMSE.

In essence, hyperparameter tuning helps the model become more "suited" to the specific characteristics of the Zomato delivery time data, resulting in more accurate predictions and a lower RMSE compared to using default or less optimal settings.

In [None]:
!pip install dvc[gdrive] -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.2/78.2 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m438.8/438.8 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.8/41.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.4/45.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.0/210.0 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.7/119.7 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.9/73.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Colab-only step: runs Colab's user authentication for this runtime.
# NOTE(review): presumably needed for the Google Drive-backed DVC pull below - confirm.
from google.colab import auth
auth.authenticate_user()

In [None]:
# Use your Personal Access Token (PAT) if your repo is private
!git clone https://github.com/AshwinVK23/Zomato_delivery_time_analyser.git

%cd Zomato_delivery_time_analyser

Cloning into 'Zomato_delivery_time_analyser'...
remote: Enumerating objects: 75, done.[K
remote: Counting objects: 100% (75/75), done.[K
remote: Compressing objects: 100% (57/57), done.[K
remote: Total 75 (delta 25), reused 54 (delta 13), pack-reused 0 (from 0)[K
Receiving objects: 100% (75/75), 5.06 MiB | 7.06 MiB/s, done.
Resolving deltas: 100% (25/25), done.
/content/Zomato_delivery_time_analyser/Zomato_delivery_time_analyser


In [None]:
# Pull the artifact referenced by models/delivery_model.pkl.dvc. In the recorded run
# this started an interactive Google OAuth flow and was interrupted before the
# fetch completed.
!dvc pull models/delivery_model.pkl.dvc

! !Collecting          |0.00 [00:00,    ?entry/s]Collecting          |0.00 [00:00,    ?entry/s]
Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=178551701615-9f8di2ohoe1ucketr0gjr6h062ronudj.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8090%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.appdata&access_type=offline&response_type=code&approval_prompt=force

4/0AVGzR1ARxjazPulXPtIhct5_1f7Uba5NWnp8NGLOY41fWTSBoQx3rixhcKxcaC4w0MOSlQ
Fetching
[31mERROR[39m: interrupted by the user
[0m

In [None]:
# Mount Google Drive at /content/drive; the model-loading cell further down reads
# delivery_model.pkl from "My Drive" through this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import joblib

# Best pipeline found by the Random Forest grid search (requires that cell to have
# been run in this session, otherwise `grid_search_new` is undefined).
final_model = grid_search_new.best_estimator_

# Serialize it into the current working directory as 'delivery_model.pkl'
joblib.dump(value=final_model, filename='delivery_model.pkl')

print("\nFinal tuned model has been saved to 'delivery_model.pkl'")

# --- Reloading it in another script ---
# (DatetimeFeatureExtractor must be defined there before loading)
# loaded_model = joblib.load('delivery_model.pkl')
# predictions = loaded_model.predict(some_new_data)

NameError: name 'grid_search_new' is not defined

### Experiment Tracking with MLflow

In [None]:
import joblib
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

# The pickled pipeline refers to its custom step by name, so a class called
# DatetimeFeatureExtractor must exist before joblib can load it. It is defined here
# only when no earlier cell has done so: rebinding the name to a second class object
# would make pipelines built from the first one impossible to pickle again
# ("it's not the same object as __main__.DatetimeFeatureExtractor").
if 'DatetimeFeatureExtractor' not in globals():
    class DatetimeFeatureExtractor(BaseEstimator, TransformerMixin):
        """
        Fallback copy of the datetime feature transformer for sessions in which
        the training cells were not run. Keep it in sync with the definition
        that was in place when the model was trained.
        """

        def fit(self, X, y=None):
            """Nothing is learned from the data; returns ``self``."""
            return self

        def transform(self, X, y=None):
            """Return preparation time (minutes), order hour and weekday per row."""
            df = X.copy()
            time_ordered = pd.to_datetime(df['Time_Orderd'], errors='coerce')
            time_picked = pd.to_datetime(df['Time_Order_picked'], errors='coerce')
            prep_time = (time_picked - time_ordered).dt.total_seconds() / 60
            order_hour = time_ordered.dt.hour
            order_date = pd.to_datetime(df['Order_Date'], format='%d-%m-%Y', errors='coerce')
            day_of_week = order_date.dt.dayofweek
            extracted_features = pd.DataFrame({
                'preparation_time_mins': prep_time, 'order_hour': order_hour, 'day_of_week': day_of_week
            })
            # Unparseable entries end up as 0
            return extracted_features.fillna(0)

# Define the path to your model file in Google Drive
# Note: This assumes the file is in your main "My Drive" folder
model_path = '/content/drive/My Drive/delivery_model.pkl'

# Load the model.
# joblib.load unpickles arbitrary Python objects: only load files you produced or trust.
model = joblib.load(model_path)

print("✅ Model loaded successfully directly from Google Drive!")

✅ Model loaded successfully directly from Google Drive!


In [None]:
import mlflow
import joblib
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

# No class definition is needed in this cell: `model` was already unpickled by the
# loading cell, which requires DatetimeFeatureExtractor to exist. Declaring the class
# again would rebind the name to a new class object, after which pipelines built
# from the earlier definition can no longer be pickled with joblib/pickle.

# 1. Manually define the results from your Colab run
best_params = {
    'regressor__max_depth': 20,
    'regressor__min_samples_split': 10,
    'regressor__n_estimators': 300
}
test_rmse = 3.9503 # The score you found previously

# 2. Reuse the pipeline already loaded into `model` by the loading cell
model_to_log = model

# 3. Set up and run MLflow logging
mlflow.set_experiment("Zomato Delivery Time Prediction")

with mlflow.start_run(run_name="Tuned RF (from Colab)"):
    print("Logging parameters...")
    mlflow.log_params(best_params)

    print("Logging metrics...")
    mlflow.log_metric("Test_RMSE", test_rmse)

    print("Logging model artifact...")
    mlflow.sklearn.log_model(model_to_log, "tuned_rf_from_colab")

print("\nFinished logging experiment to your local ML")

2025/09/25 19:15:30 INFO mlflow.tracking.fluent: Experiment with name 'Zomato Delivery Time Prediction' does not exist. Creating a new experiment.


Logging parameters...
Logging metrics...
Logging model artifact...





Finished logging experiment to your local ML


In [None]:
# Install pyngrok
!pip install pyngrok -q

# --- ADD YOUR AUTHTOKEN HERE ---
from pyngrok import ngrok
ngrok.set_auth_token("2o991TRHcoXKCNwuPFeJfVaEJUA_2hyxaxgnv3ekSVG9vDtoZ") # <--- Paste your token here

# Run the MLflow UI in the background
get_ipython().system_raw("mlflow ui --port 5000 &")

# Create the tunnel to the MLflow UI
public_url = ngrok.connect(5000)
print(f"Click this link to view your MLflow UI: {public_url}")

Click this link to view your MLflow UI: NgrokTunnel: "https://09c2931e4ed5.ngrok-free.app" -> "http://localhost:5000"


### Model Selection & Saving

In [None]:
import joblib
import os

# Select the tuned Random Forest pipeline. The Random Forest search in this notebook
# is stored in `grid_search_new`; the bare name `grid_search` is not defined by any
# cell shown here, so it only worked with leftover kernel state.
best_model = grid_search_new.best_estimator_

# Define the directory to save the model (created on first use)
model_dir = "saved_models"
os.makedirs(model_dir, exist_ok=True)

# Define the model filename
model_filename = os.path.join(model_dir, "tuned_random_forest_model.pkl")

# Save the model using joblib
joblib.dump(best_model, model_filename)

print(f"Best model saved to: {model_filename}")

Best model saved to: saved_models/tuned_random_forest_model.pkl


# Task
Perform hyperparameter tuning for the Decision Tree and Gradient Boosting models using GridSearchCV, scoring on negative mean squared error and reporting the result as RMSE. Evaluate the tuned models on the test set and compare their performance with the previously evaluated models. Log the results of the tuned models to MLflow.

## Hyperparameter tuning for decision tree

### Subtask:
Define a parameter grid for the `DecisionTreeRegressor`, set up a `GridSearchCV` with the preprocessing pipeline, fit it to the training data, and print the best parameters and cross-validation RMSE.


**Reasoning**:
Define the parameter grid for Decision Tree, set up the pipeline and GridSearchCV, fit it to the training data, and print the best parameters and cross-validation RMSE as requested in the subtask.



In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Search space for DecisionTreeRegressor, addressed through the 'regressor' step
dt_param_grid = {
    'regressor__max_depth': [10, 20, 30, None],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# Raw date/time columns, and every other object-typed column as categorical
datetime_cols = ['Order_Date', 'Time_Orderd', 'Time_Order_picked']
categorical_cols = [
    col
    for col in X_train.select_dtypes(include='object').columns.tolist()
    if col not in datetime_cols
]

# NOTE: this rebinds `preprocessor` and is NOT the same preprocessing as the Random
# Forest pipeline: the date/time columns are dropped instead of being turned into
# features, and numeric columns are passed through unscaled.
preprocessor = ColumnTransformer(
    transformers=[
        ('drop_datetime', 'drop', datetime_cols),
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ],
    remainder='passthrough',
)

# Preprocessing followed by the Decision Tree
dt_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42)),
])

# 3-fold cross-validated search; candidates are ranked by negative MSE
dt_grid_search = GridSearchCV(
    estimator=dt_pipeline,
    param_grid=dt_param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)

print("Starting GridSearchCV for Decision Tree...")
dt_grid_search.fit(X_train, y_train)

print("GridSearchCV for Decision Tree complete.")
print("Best parameters found for Decision Tree:", dt_grid_search.best_params_)
# best_score_ is a negative MSE, hence the sign flip before the square root
print("Best cross-validation RMSE for Decision Tree:", np.sqrt(-dt_grid_search.best_score_))

Starting GridSearchCV for Decision Tree...
GridSearchCV for Decision Tree complete.
Best parameters found for Decision Tree: {'regressor__max_depth': 10, 'regressor__min_samples_leaf': 4, 'regressor__min_samples_split': 10}
Best cross-validation RMSE for Decision Tree: 4.2458788204076425


## Hyperparameter tuning for gradient boosting

### Subtask:
Define a parameter grid for the `GradientBoostingRegressor`, set up a `GridSearchCV` with the preprocessing pipeline, fit it to the training data, and print the best parameters and cross-validation RMSE.


**Reasoning**:
Define the parameter grid for Gradient Boosting, set up the pipeline and GridSearchCV, fit it to the training data, and print the best parameters and cross-validation RMSE as per the instructions.



In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Search space for GradientBoostingRegressor, addressed through the 'regressor' step
gb_param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__learning_rate': [0.01, 0.1, 0.2],
    'regressor__max_depth': [3, 5, 7]
}

# Raw date/time columns, and every other object-typed column as categorical
datetime_cols = ['Order_Date', 'Time_Orderd', 'Time_Order_picked']
categorical_cols = [
    col
    for col in X_train.select_dtypes(include='object').columns.tolist()
    if col not in datetime_cols
]

# NOTE: this rebinds `preprocessor` and is NOT the same preprocessing as the Random
# Forest pipeline: the date/time columns are dropped instead of being turned into
# features, and numeric columns are passed through unscaled.
preprocessor = ColumnTransformer(
    transformers=[
        ('drop_datetime', 'drop', datetime_cols),
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ],
    remainder='passthrough',
)

# Preprocessing followed by Gradient Boosting
gb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42)),
])

# 3-fold cross-validated search; candidates are ranked by negative MSE
gb_grid_search = GridSearchCV(
    estimator=gb_pipeline,
    param_grid=gb_param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)

print("Starting GridSearchCV for Gradient Boosting...")
gb_grid_search.fit(X_train, y_train)

print("GridSearchCV for Gradient Boosting complete.")
print("Best parameters found for Gradient Boosting:", gb_grid_search.best_params_)
# best_score_ is a negative MSE, hence the sign flip before the square root
print("Best cross-validation RMSE for Gradient Boosting:", np.sqrt(-gb_grid_search.best_score_))

Starting GridSearchCV for Gradient Boosting...
GridSearchCV for Gradient Boosting complete.
Best parameters found for Gradient Boosting: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 7, 'regressor__n_estimators': 100}
Best cross-validation RMSE for Gradient Boosting: 3.9092237076570644


## Evaluate tuned models on test set

### Subtask:
Use the best estimators from the `GridSearchCV` for Decision Tree and Gradient Boosting to make predictions on the test set (`X_test`) and calculate their RMSE.


**Reasoning**:
Use the best estimators from the GridSearch for Decision Tree and Gradient Boosting to predict on the test set and calculate their RMSE.



In [None]:
from sklearn.metrics import mean_squared_error

# Winning pipelines of the two grid searches
best_dt_model = dt_grid_search.best_estimator_
best_gb_model = gb_grid_search.best_estimator_

# Predictions on the held-out test split
y_pred_dt_tuned = best_dt_model.predict(X_test)
y_pred_gb_tuned = best_gb_model.predict(X_test)

# Test-set RMSE of each tuned model
rmse_dt_tuned = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_dt_tuned))
rmse_gb_tuned = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_gb_tuned))

print(f"Tuned Decision Tree RMSE on test set: {rmse_dt_tuned:.4f}")
print(f"Tuned Gradient Boosting RMSE on test set: {rmse_gb_tuned:.4f}")

Tuned Decision Tree RMSE on test set: 4.2407
Tuned Gradient Boosting RMSE on test set: 3.8871


## Compare tuned model performance

### Subtask:
Present a comparison of the RMSE scores for all models: the baseline models, the tuned Random Forest, the tuned Decision Tree, and the tuned Gradient Boosting.


**Reasoning**:
Create a dictionary to store all the RMSE values and print them in a formatted way to compare the model performances.



In [None]:
# Collect the test-set RMSE of every model.
# NOTE(review): `results` (baseline name -> RMSE) is not created by any cell shown
# in this notebook; it has to come from the baseline-training step - confirm.
# The tuned Random Forest score is `rmse_new_tuned`, the value computed by the
# Random Forest grid-search cell (`rmse_tuned` is not defined anywhere visible).
all_model_rmse = {
    'Baseline Linear Regression': results['Linear Regression'],
    'Baseline Decision Tree': results['Decision Tree'],
    'Baseline Random Forest': results['Random Forest'],
    'Baseline Gradient Boosting': results['Gradient Boosting'],
    'Tuned Random Forest': rmse_new_tuned,
    'Tuned Decision Tree': rmse_dt_tuned,
    'Tuned Gradient Boosting': rmse_gb_tuned
}

# Print the RMSE values for all models
print("Model Performance Comparison (RMSE):")
for name, rmse in all_model_rmse.items():
    print(f"- {name}: {rmse:.4f}")

# Interpret the results from the numbers themselves rather than hard-coding the
# conclusion, so the text cannot drift away from the scores after a re-run.
best_name = min(all_model_rmse, key=all_model_rmse.get)
improved_families = [
    family
    for family in ('Random Forest', 'Decision Tree', 'Gradient Boosting')
    if all_model_rmse[f'Tuned {family}'] < all_model_rmse[f'Baseline {family}']
]

print("\nInterpretation:")
print(f"The lowest RMSE was achieved by the {best_name} model: {all_model_rmse[best_name]:.4f}")
if improved_families:
    print(
        "Hyperparameter tuning lowered the test RMSE relative to the baseline for: "
        + ", ".join(improved_families) + "."
    )
else:
    print("Hyperparameter tuning did not lower the test RMSE of any model.")

Model Performance Comparison (RMSE):
- Baseline Linear Regression: 5.9691
- Baseline Decision Tree: 5.3905
- Baseline Random Forest: 4.0047
- Baseline Gradient Boosting: 4.4806
- Tuned Random Forest: 3.9478
- Tuned Decision Tree: 4.2407
- Tuned Gradient Boosting: 3.8871

Interpretation:
The lowest RMSE was achieved by the Tuned Gradient Boosting model: 3.8871
Hyperparameter tuning improved the performance of Random Forest, Decision Tree, and Gradient Boosting models compared to their baseline versions.


## Log tuned model results to mlflow

### Subtask:
Update the MLflow tracking code to include the results and parameters of the tuned Decision Tree and Gradient Boosting models.


**Reasoning**:
Log the results and parameters of the tuned Decision Tree and Gradient Boosting models to MLflow.



In [None]:
# One entry per tuned model:
# (display label, fitted grid search, test RMSE, best pipeline, artifact name)
tuned_runs = [
    ("Decision Tree", dt_grid_search, rmse_dt_tuned, best_dt_model, "tuned_decision_tree_model"),
    ("Gradient Boosting", gb_grid_search, rmse_gb_tuned, best_gb_model, "tuned_gradient_boosting_model"),
]

# A single MLflow run holds both models; keys are prefixed with the model label
with mlflow.start_run(run_name="Tuned Models Comparison"):
    for label, search, holdout_rmse, fitted_model, artifact_name in tuned_runs:
        # Best hyperparameters
        for param, value in search.best_params_.items():
            mlflow.log_param(f"{label} - {param}", value)

        # Cross-validation RMSE (best_score_ is a negative MSE)
        mlflow.log_metric(f"{label} - Best CV RMSE", np.sqrt(-search.best_score_))

        # Test-set RMSE
        mlflow.log_metric(f"{label} - Tuned Test RMSE", holdout_rmse)

        # The fitted pipeline itself
        mlflow.sklearn.log_model(fitted_model, artifact_name)

print("Logged tuned Decision Tree and Gradient Boosting model results to MLflow.")



Logged tuned Decision Tree and Gradient Boosting model results to MLflow.


## Summary:

### Data Analysis Key Findings

*   Hyperparameter tuning for the Decision Tree model using `GridSearchCV` identified the best parameters as `{'regressor__max_depth': 10, 'regressor__min_samples_leaf': 4, 'regressor__min_samples_split': 10}`, resulting in a best cross-validation RMSE of approximately 4.246.
*   Hyperparameter tuning for the Gradient Boosting model using `GridSearchCV` identified the best parameters as `{'regressor__learning_rate': 0.1, 'regressor__max_depth': 7, 'regressor__n_estimators': 100}`, resulting in a best cross-validation RMSE of approximately 3.909.
*   Evaluating the tuned models on the test set showed a tuned Decision Tree RMSE of 4.2407 and a tuned Gradient Boosting RMSE of 3.8871.
*   The tuned Gradient Boosting model achieved the lowest RMSE (3.8871) among all evaluated models (baseline Linear Regression, Decision Tree, Random Forest, Gradient Boosting, and tuned versions).
*   Hyperparameter tuning improved the performance (reduced RMSE) of the Random Forest, Decision Tree, and Gradient Boosting models compared to their baseline counterparts.
*   The hyperparameters and RMSE metrics (cross-validation and test set) for the tuned Decision Tree and Gradient Boosting models were successfully logged to MLflow in a new run named "Tuned Models Comparison".

### Insights or Next Steps

*   The tuned Gradient Boosting model is the best-performing model based on the RMSE metric and should be considered for deployment.
*   Further hyperparameter tuning with a wider grid or more advanced techniques (e.g., RandomizedSearchCV, Bayesian Optimization) could potentially yield even better performance.


In [None]:
# NOTE(review): same Drive mount as the earlier cell. No cell visible after this
# point uses it, so it looks like a leftover - confirm before removing.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
