In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Read the ride-pricing dataset; the relative path is resolved against the
# kernel's current working directory.
data = pd.read_csv(filepath_or_buffer="dynamic_pricing.csv")

# Echo (rows, columns) as a quick sanity check that the load succeeded.
print("Dataset loaded with shape:", data.shape)

Dataset loaded with shape: (1000, 10)


In [3]:
# Target: the historical ride cost column. Everything else in `data` is kept
# as a candidate feature.
y = data["Historical_Cost_of_Ride"]
X = data.drop(columns="Historical_Cost_of_Ride")
print("Features and target separated")

Features and target separated


In [4]:
# Columns routed to the one-hot encoder by the preprocessing step.
categorical_cols = [
    "Location_Category",
    "Customer_Loyalty_Status",
    "Time_of_Booking",
    "Vehicle_Type",
]

# Columns routed to the scaler by the preprocessing step.
numerical_cols = [
    "Number_of_Riders",
    "Number_of_Drivers",
    "Number_of_Past_Rides",
    "Average_Ratings",
    "Expected_Ride_Duration",
]

In [5]:
# Column-wise preprocessing: standardize the numeric columns and one-hot
# encode the categorical ones. The "num" / "cat" transformer names are looked
# up again later when feature names are recovered from the fitted pipeline.
#
# NOTE(review): drop="first" is combined with handle_unknown="ignore". In
# scikit-learn this presumably makes an unseen category indistinguishable from
# the dropped one (and older releases may reject the combination) -- confirm
# against the installed version before changing either option.
preprocessor = ColumnTransformer(
    [
        (
            "num",
            StandardScaler(),
            numerical_cols,
        ),
        (
            "cat",
            OneHotEncoder(drop="first", handle_unknown="ignore"),
            categorical_cols,
        ),
    ]
)

In [6]:
# Chain preprocessing and the model into a single estimator. The step names
# matter: "regressor" is the prefix used by the grid-search parameter keys,
# and both names are used to look the steps up after fitting.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("regressor", RandomForestRegressor(random_state=42)),
    ]
)

In [7]:
# Hold out 20% of the rows for final evaluation; the fixed random_state keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Data split: Train size =", len(X_train), "Test size =", len(X_test))

Data split: Train size = 800 Test size = 200


In [8]:
# Candidate hyper-parameters for the random forest. Each key is prefixed with
# "regressor__" so it targets the pipeline step named "regressor".
param_grid = {
    "regressor__n_estimators": [100, 200],
    "regressor__max_depth": [10, 20, None],
    "regressor__min_samples_split": [2, 5],
    "regressor__min_samples_leaf": [1, 2],
}

# Search the grid with 5-fold cross-validation on the training split only,
# scoring by negative MSE and using all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)

Best parameters: {'regressor__max_depth': 10, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 200}


In [9]:
# Keep a handle on the pipeline that the grid search selected as best under
# its configured scoring; later cells use it for prediction and inspection.
# NOTE(review): presumably refit on all of X_train via GridSearchCV's default
# refit behaviour -- confirm if the refit option is ever changed.
best_model = grid_search.best_estimator_

In [10]:
# Predict ride costs for the held-out test features; the result feeds the
# metric computation and the actual-vs-predicted scatter plot.
y_pred = best_model.predict(X_test)

In [11]:
# Score the held-out predictions. MSE is in squared cost units; R^2 is
# unitless.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse:.2f}")

r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R^2 Score: {r2:.2f}")

Mean Squared Error: 5379.08
R^2 Score: 0.85


In [23]:
# Actual vs. predicted ride cost on the held-out test set, with the y = x
# diagonal drawn as the "perfect prediction" reference.

# Compute the diagonal's end points once with the Series reductions rather
# than calling the builtin min()/max() twice each: the builtins iterate the
# Series element by element in Python, whereas Series.min()/max() are
# vectorized.
cost_min, cost_max = y_test.min(), y_test.max()

fig = go.Figure()
fig.add_trace(
    go.Scatter(x=y_test, y=y_pred, mode="markers", name="Actual vs Predicted")
)
fig.add_trace(
    go.Scatter(
        x=[cost_min, cost_max],
        y=[cost_min, cost_max],
        mode="lines",
        name="Ideal",
        line=dict(color="red", dash="dash"),
    )
)
fig.update_layout(
    title="Actual vs Predicted Ride Costs",
    xaxis_title="Actual Cost",
    yaxis_title="Predicted Cost",
)
# NOTE: the recorded output of this cell shows that rendering here requires
# nbformat>=4.2.0 to be installed in the kernel's environment.
fig.show()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [18]:
# Importances come straight from the fitted forest, one value per column of
# the preprocessed feature matrix.
importances = best_model.named_steps["regressor"].feature_importances_

# Rebuild the matching column names: numeric columns first, then the one-hot
# output names, mirroring the order in which the "num" and "cat" transformers
# were declared in the preprocessor.
feature_names = [
    *numerical_cols,
    *best_model.named_steps["preprocessor"]
    .named_transformers_["cat"]
    .get_feature_names_out(categorical_cols),
]

# Tabulate and show the features from most to least important.
feature_importance_df = pd.DataFrame(
    {"Feature": feature_names, "Importance": importances}
)
print(
    "Feature Importances:\n",
    feature_importance_df.sort_values(by="Importance", ascending=False),
)

Feature Importances:
                             Feature  Importance
4            Expected_Ride_Duration    0.890441
2              Number_of_Past_Rides    0.021373
3                   Average_Ratings    0.020642
1                 Number_of_Drivers    0.019612
0                  Number_of_Riders    0.016783
12             Vehicle_Type_Premium    0.011778
6           Location_Category_Urban    0.003496
9           Time_of_Booking_Evening    0.003024
7   Customer_Loyalty_Status_Regular    0.002742
10          Time_of_Booking_Morning    0.002677
5        Location_Category_Suburban    0.002502
8    Customer_Loyalty_Status_Silver    0.002481
11            Time_of_Booking_Night    0.002449


In [20]:
# Split rides at the mean historical cost. "Profitable" here means strictly
# above the mean and "Loss" means every other non-null ride; no revenue or
# margin data is involved in this classification.
mean_cost = y.mean()
profitable = y.gt(mean_cost).sum()
loss = y.count() - profitable

labels = ["Profitable Rides", "Loss Rides"]
values = [profitable, loss]

# Donut chart (hole=0.4) of the two groups.
fig_pie = go.Figure(
    data=[
        go.Pie(labels=labels, values=values, hole=0.4),
    ]
)
fig_pie.update_layout(title="Distribution of Profitable and Loss Rides")
fig_pie.show()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed