In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


In [18]:
# Load the trip records; the path is relative to the notebook's working directory.
csv_path = "evride_dataset_5000_records_2025-08-19.csv"
data = pd.read_csv(filepath_or_buffer=csv_path)

In [19]:
# Plain-text preview of the first five rows to sanity-check the load.
print(data.head(n=5))

    trip_id       city  distance_km  duration_minutes  traffic_level  \
0  EV000001       Pune        15.23              40.5             60   
1  EV000002     Mumbai        23.72              70.5             60   
2  EV000003      Delhi        30.65             145.8             81   
3  EV000004  Bangalore        51.44             109.9             88   
4  EV000005  Bangalore        16.72              67.7             60   

   demand_factor  battery_health_percent  energy_consumption_kwh  \
0        2.58570                      79                    4.84   
1        1.24488                      82                    5.59   
2        1.98000                      96                    8.57   
3        0.97200                      95                    7.44   
4        2.09300                      90                    6.54   

   route_difficulty vehicle_type  ... weather_condition temperature_celsius  \
0                 4        Sedan  ...             Foggy                  17   


In [20]:
# Remove the trip identifier column before modelling. errors="ignore" makes
# this a no-op when the column is absent (e.g. the cell is re-run).
data = data.drop(columns=["trip_id"], errors="ignore")


In [21]:
# Cast every float64/int64 column to float32 in a single astype call
# (the notebook's stated motivation is NumPy 2.x compatibility).
wide_numeric_cols = data.select_dtypes(include=["float64", "int64"]).columns
data = data.astype({name: np.float32 for name in wide_numeric_cols})

In [22]:
# -----------------------------
# Split features & target
# -----------------------------
# The fare column is the prediction target; every other column is a feature.
target_col = "fare_amount_inr"
y = data[target_col].astype(np.float32)
X = data.drop(columns=[target_col])

In [23]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, every other column as numeric.
object_features = X.select_dtypes(include=["object"])
non_object_features = X.select_dtypes(exclude=["object"])
cat_cols = list(object_features.columns)
num_cols = list(non_object_features.columns)

In [27]:
# -----------------------------
# Preprocessing Pipelines
# -----------------------------
# Numeric features: fill missing values with the column median.
numeric_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
])

# Categorical features: fill missing values with the most frequent level,
# then one-hot encode into a dense array. Categories not seen during fit
# are ignored instead of raising.
categorical_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Route each column group through its own sub-pipeline.
preprocess = ColumnTransformer(transformers=[
    ("num", numeric_tf, num_cols),
    ("cat", categorical_tf, cat_cols),
])

In [28]:
# -----------------------------
# Train/Test Split
# -----------------------------
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
regression_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = regression_split

<class 'ValueError'>: could not convert string to float: 'Low'

In [34]:
# -----------------------------
# GRADIENT BOOSTING
# -----------------------------
# The classification cells later in the notebook rebind y_train/y_test to
# string fare bands. Re-running this cell after them fails inside fit() with
# "could not convert string to float", so check up front and say what to do.
if not pd.api.types.is_numeric_dtype(y_train):
    raise ValueError(
        "y_train is not numeric; re-run the regression train/test split cell "
        "before fitting the gradient boosting regressor."
    )

gb_model = Pipeline([
    # clone() gives this pipeline its own unfitted copy of the preprocessor.
    # Pipeline.fit fits its steps in place, so sharing one ColumnTransformer
    # instance between pipelines would let a later fit overwrite this one's state.
    ("prep", clone(preprocess)),
    ("gb", GradientBoostingRegressor(
        n_estimators=800,
        learning_rate=0.05,
        max_depth=6,
        random_state=42
    ))
])

gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

# Hold-out regression metrics; RMSE is the square root of the MSE.
print("\n=== GRADIENT BOOSTING ===")
print("R²:", r2_score(y_test, y_pred_gb))
print("MAE:", mean_absolute_error(y_test, y_pred_gb))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_gb)))


=== GRADIENT BOOSTING ===
R²: 0.953134286964665
MAE: 249.00234103558725
RMSE: 506.2553130697868


In [35]:
# Bin the continuous fare into three quantile-based bands for classification.
fare_bands = ["Low", "Medium", "High"]
y_class = pd.qcut(y, q=len(fare_bands), labels=fare_bands)

In [36]:
# 80/20 split with a fixed seed, using the banded fare as the target.
# Note: this rebinds X_train / X_test / y_train / y_test, so any regression
# cell re-run after this point will see class labels in y_train / y_test.
classification_split = train_test_split(X, y_class, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = classification_split


In [37]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score

In [38]:
# Random Forest Classifier
clf = Pipeline([
    # clone() gives this pipeline its own unfitted copy of the preprocessor.
    # Pipeline.fit fits its steps in place, so reusing the same ColumnTransformer
    # instance here would refit the object held by any other pipeline built from it.
    ("prep", clone(preprocess)),
    ("rf", RandomForestClassifier(n_estimators=500, max_depth=20, random_state=42, n_jobs=-1))
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Hold-out classification metrics; F1 is averaged across classes weighted by support.
print("\n=== CLASSIFICATION METRICS ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred, average="weighted"))


=== CLASSIFICATION METRICS ===
Accuracy: 0.891
F1-score: 0.8893017556538021
