## Random Forest (regression part)

In [14]:
# full_rf_pipeline.py
import pandas as pd
import numpy as np
import time, joblib
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest

# ------------- Helper functions -------------
def haversine_series(lat1, lon1, lat2, lon2):
    R = 6371.0
    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

def print_metrics(y_true, y_pred, label="Test"):
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    accuracy = 1 - (mae / (y_true.mean() if y_true.mean() != 0 else 1))
    print(f"\n{label} Metrics:")
    print(f"R2:   {r2:.4f}")
    print(f"MAE:  {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"Accuracy (1 - MAE/mean(y)): {accuracy:.4f}")
    return {"r2": r2, "mae": mae, "rmse": rmse, "acc": accuracy}

# ------------- Load data -------------
df = pd.read_csv("newData.csv")
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

# ------------- Target selection (adjust if needed) -------------
# If your target column is not fare_amount_inr, set it here:
target = "fare_amount_inr"
if target not in df.columns:
    raise ValueError(f"Target column {target} not found. Change `target` variable in the script.")

# ------------- Basic EDA -------------
print(df.head())
print(df.describe(include='all').T)
print("Missing values per column:\n", df.isna().sum().sort_values(ascending=False).head(30))

# ------------- Optional: remove extreme outliers (target) -------------
# If the fare distribution has wild outliers, you may wish to clip or remove extremes.
# Example: keep rows with target within 1st-99th percentile
low, high = df[target].quantile([0.01, 0.99])
df = df[(df[target] >= low) & (df[target] <= high)]
print("After trimming target outliers shape:", df.shape)

# ------------- Feature engineering (customize per your columns) -------------
# If you have pickup/drop lat/lon columns, create distance. Modify names below to match your dataset.
lat_candidates = [c for c in df.columns if 'lat' in c.lower()]
lon_candidates = [c for c in df.columns if 'lon' in c.lower() or 'lng' in c.lower()]

if len(lat_candidates) >= 2 and len(lon_candidates) >= 2:
    p_lat, d_lat = lat_candidates[0], lat_candidates[1]
    p_lon, d_lon = lon_candidates[0], lon_candidates[1]
    try:
        df["distance_km"] = haversine_series(df[p_lat], df[p_lon], df[d_lat], df[d_lon])
        print("Created distance_km from", p_lat, p_lon, d_lat, d_lon)
    except Exception as e:
        print("Could not create distance_km:", e)

# If there are datetime columns, extract hour/day/month/weekend
for c in df.columns:
    if 'time' in c.lower() or 'date' in c.lower() or 'datetime' in c.lower():
        try:
            dt = pd.to_datetime(df[c], errors='coerce')
            if dt.notna().sum() > 0.5*len(df):
                df[c + "_dt"] = dt
                df[c + "_hour"] = dt.dt.hour
                df[c + "_weekday"] = dt.dt.weekday
                df[c + "_month"] = dt.dt.month
                print("Extracted datetime features from", c)
        except Exception:
            pass

# Example domain-specific features for ride-sharing:
if 'duration_minutes' in df.columns and 'distance_km' in df.columns:
    df['speed_kmph'] = df['distance_km'] / (df['duration_minutes'] / 60.0 + 1e-6)
    df['speed_kmph'] = df['speed_kmph'].replace([np.inf, -np.inf], np.nan)

# ------------- Prepare features and labels -------------
id_like = [c for c in df.columns if 'id' in c.lower() or 'trip' in c.lower()]
X = df.drop(columns=[target] + id_like, errors='ignore')
y = df[target].copy()
print("Feature count:", X.shape[1])

# ------------- Categorical handling strategy -------------
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
print("Numeric cols:", len(numeric_cols), "Categorical cols:", len(cat_cols))




Shape: (5000, 22)
Columns: ['trip_id', 'city', 'distance_km', 'duration_minutes', 'traffic_level', 'demand_factor', 'battery_health_percent', 'energy_consumption_kwh', 'route_difficulty', 'vehicle_type', 'time_of_day', 'day_of_week', 'weather_condition', 'temperature_celsius', 'humidity_percent', 'driver_rating', 'surge_multiplier', 'historical_pricing_factor', 'is_holiday', 'charging_stations_nearby', 'user_type', 'fare_amount_inr']
    trip_id       city  distance_km  duration_minutes  traffic_level  \
0  EV000001       Pune        15.23              40.5             60   
1  EV000002     Mumbai        23.72              70.5             60   
2  EV000003      Delhi        30.65             145.8             81   
3  EV000004  Bangalore        51.44             109.9             88   
4  EV000005  Bangalore        16.72              67.7             60   

   demand_factor  battery_health_percent  energy_consumption_kwh  \
0        2.58570                      79                    4

  dt = pd.to_datetime(df[c], errors='coerce')


In [13]:
# ================================
# üöÄ EV Price Prediction RF Model
# ================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import joblib

# ---------------------------------------------
# 1) Load Dataset
# ---------------------------------------------
df = pd.read_csv("newData.csv")

# ---------------------------------------------
# 2) Target Column
# ---------------------------------------------
target = "fare_amount_inr"

# ---------------------------------------------
# 3) Remove ID Column (not useful for ML)
# ---------------------------------------------
df = df.drop(columns=["trip_id"])

# ---------------------------------------------
# 4) Remove Outlier Fares (1% high + low)
# ---------------------------------------------
low, high = df[target].quantile([0.01, 0.99])
df = df[(df[target] >= low) & (df[target] <= high)]

# ---------------------------------------------
# 5) Split Features + Labels
# ---------------------------------------------
X = df.drop(columns=[target])
y = df[target]

# ---------------------------------------------
# 6) Identify Column Types
# ---------------------------------------------
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

# Low-cardinality ‚Üí OneHot
ohe_cols = [c for c in cat_cols if X[c].nunique() <= 10]

# High-cardinality ‚Üí LabelEncode
label_cols = [c for c in cat_cols if X[c].nunique() > 10]

# ---------------------------------------------
# 7) Encode high-cardinality categorical columns
# ---------------------------------------------
for c in label_cols:
    X[c] = LabelEncoder().fit_transform(X[c].astype(str))

# ---------------------------------------------
# 8) Convert OHE columns to string (fix sklearn error)
# ---------------------------------------------
for c in ohe_cols:
    X[c] = X[c].astype(str)

# ---------------------------------------------
# 9) Build Preprocessing Transformer
# ---------------------------------------------
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

# NOTE: sparse_output=False is required for new sklearn versions
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False), ohe_cols)
    ],
    remainder="passthrough"
)

# ---------------------------------------------
# 10) Random Forest Model
# ---------------------------------------------
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

pipe = Pipeline(steps=[("pre", preprocessor), ("model", rf)])

# ---------------------------------------------
# 11) Hyperparameter Search Space
# ---------------------------------------------
param_grid = {
    "model__n_estimators": [200, 400, 600, 800],
    "model__max_depth": [10, 15, 20, None],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__max_features": ["auto", "sqrt", 0.3],
    "model__bootstrap": [True, False],
}

# ---------------------------------------------
# 12) Train / Test Split
# ---------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# ---------------------------------------------
# 13) Randomized Search CV
# ---------------------------------------------
rs = RandomizedSearchCV(
    pipe,
    param_distributions=param_grid,
    n_iter=30,
    scoring="r2",
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42
)

# Fit model
rs.fit(X_train, y_train)

print("\n============================")
print(" BEST HYPERPARAMETERS FOUND ")
print("============================")
print(rs.best_params_)

best_model = rs.best_estimator_

# ---------------------------------------------
# 14) Predictions
# ---------------------------------------------
y_pred = best_model.predict(X_test)

# ---------------------------------------------
# 15) Metrics
# ---------------------------------------------
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
accuracy = 1 - (mae / y_test.mean())

print("\n============================")
print(" FINAL MODEL PERFORMANCE ")
print("============================")
print(f"R¬≤ Score: {r2:.4f}")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"Custom Accuracy: {accuracy:.4f}")

# ---------------------------------------------
# 16) Save Best Model
# ---------------------------------------------
joblib.dump(best_model, "best_rf_ev_fare_model.pkl")


print("\nModel saved to: /mnt/data/best_rf_ev_fare_model.pkl")


Fitting 3 folds for each of 30 candidates, totalling 90 fits


27 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", li


 BEST HYPERPARAMETERS FOUND 
{'model__n_estimators': 200, 'model__min_samples_split': 2, 'model__min_samples_leaf': 1, 'model__max_features': 0.3, 'model__max_depth': 20, 'model__bootstrap': False}

 FINAL MODEL PERFORMANCE 
R¬≤ Score: 0.9165
MAE: 328.7700
RMSE: 531.9335
Custom Accuracy: 0.8153

Model saved to: /mnt/data/best_rf_ev_fare_model.pkl


In [17]:
# =====================================
# KNN REGRESSOR (WITH PREPROCESSING)
# =====================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

# --------------------------
# Load clean data
# --------------------------
df = pd.read_csv("newData.csv")
df = df.drop(columns=["trip_id"])
target = "fare_amount_inr"

# Remove outlier fares
low, high = df[target].quantile([0.01, 0.99])
df = df[(df[target] >= low) & (df[target] <= high)]

X = df.drop(columns=[target])
y = df[target]

# --------------------------
# Identify columns
# --------------------------
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

ohe_cols = [c for c in cat_cols if X[c].nunique() <= 10]
label_cols = [c for c in cat_cols if X[c].nunique() > 10]

# Label Encode high card
for c in label_cols:
    X[c] = LabelEncoder().fit_transform(X[c].astype(str))

# Convert OHE columns to str
for c in ohe_cols:
    X[c] = X[c].astype(str)

# --------------------------
# Preprocessor
# --------------------------
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False), ohe_cols)
    ],
    remainder="passthrough"
)

# --------------------------
# STEP 1: Feature Selection using Random Forest
# --------------------------
rf = RandomForestRegressor(random_state=42)
pipe_rf = Pipeline(steps=[("pre", preprocessor), ("rf", rf)])

# Fit RF for feature importance
pipe_rf.fit(X, y)
importances = pipe_rf.named_steps["rf"].feature_importances_

# Get transformed feature names
ohe_feature_names = []
if ohe_cols:
    ohe_feature_names = pipe_rf.named_steps["pre"].named_transformers_["ohe"].get_feature_names_out(ohe_cols)

all_features = numeric_cols + list(ohe_feature_names) + label_cols

feat_df = pd.DataFrame({"feature": all_features, "importance": importances})
feat_df = feat_df.sort_values("importance", ascending=False)

top_features = feat_df.head(15)["feature"].tolist()
print("\nSelected Top Features:\n", top_features)

# --------------------------
# STEP 2: Transform X AND select features
# --------------------------
X_transformed = pipe_rf.named_steps["pre"].transform(X)
X_transformed = pd.DataFrame(X_transformed, columns=all_features)

X_selected = X_transformed[top_features]

# --------------------------
# Train/Test split
# --------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.20, random_state=42
)

# --------------------------
# STEP 3: TRAIN KNN
# --------------------------
knn = KNeighborsRegressor(n_neighbors=7)
knn.fit(X_train, y_train)

# Predict
y_pred = knn.predict(X_test)

# --------------------------
# STEP 4: Metrics
# --------------------------
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
custom_accuracy = 1 - (mae / y_test.mean())

print("\n==============================")
print("      KNN FARE PREDICTION     ")
print("==============================")
print(f"R¬≤ Score: {r2:.4f}")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"Custom Accuracy: {custom_accuracy:.4f}")



Selected Top Features:
 ['demand_factor', 'distance_km', 'vehicle_type_Premium', 'surge_multiplier', 'driver_rating', 'battery_health_percent', 'vehicle_type_Compact', 'energy_consumption_kwh', 'traffic_level', 'duration_minutes', 'vehicle_type_SUV', 'historical_pricing_factor', 'humidity_percent', 'route_difficulty', 'weather_condition_Storm']

      KNN FARE PREDICTION     
R¬≤ Score: 0.2061
MAE: 1193.4688
RMSE: 1640.4826
Custom Accuracy: 0.3297


In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.impute import SimpleImputer

# Load data
df = pd.read_csv("newData.csv")
df = df.drop(columns=["trip_id"])

target = "fare_amount_inr"

# Remove outlier fares
low, high = df[target].quantile([0.01, 0.99])
df = df[(df[target] >= low) & (df[target] <= high)]

X = df.drop(columns=[target])
y = df[target]

# Column types
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

ohe_cols = [c for c in cat_cols if X[c].nunique() <= 10]
label_cols = [c for c in cat_cols if X[c].nunique() > 10]

# Label encode high-card categorical
for c in label_cols:
    X[c] = LabelEncoder().fit_transform(X[c].astype(str))

# Convert OHE cols to string
for c in ohe_cols:
    X[c] = X[c].astype(str)

# Preprocessing pipeline
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())     # ‚≠ê IMPORTANT FOR SVM
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False), ohe_cols)
    ],
    remainder="passthrough"
)

# SVR model
svr = SVR(kernel="rbf", C=100, gamma="scale")

model = Pipeline(steps=[("pre", preprocessor), ("svr", svr)])

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Metrics
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
custom_accuracy = 1 - (mae / y_test.mean())

print("\n==============================")
print("       SVR PERFORMANCE         ")
print("==============================")
print(f"R¬≤ Score: {r2:.4f}")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"Custom Accuracy: {custom_accuracy:.4f}")



       SVR PERFORMANCE         
R¬≤ Score: 0.8352
MAE: 368.29
RMSE: 747.37
Custom Accuracy: 0.7931
