In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
import warnings

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBRegressor
import optuna

warnings.filterwarnings("ignore")
sns.set_style("whitegrid")

# Load cleaned dataset, ensuring 'Unnamed: 0' is removed if present
df = pd.read_csv("cleaned_data.csv", index_col=0)
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
df = df.drop(columns=["Region"], errors="ignore")  # Drop 'Region' if it exists

# Define categorical and numerical columns
categorical_cols = ["Vehicle Category", "Fuel Type", "Fuel Technology", "Electric Mile Range"]
numerical_cols = ["GVWR Class", "Number of Vehicles Registered at the Same Address"]

# Convert categorical columns to category type
for col in categorical_cols:
    df[col] = df[col].astype("category")

# Handle categorical replacements for ordinal columns
df["Number of Vehicles Registered at the Same Address"] = df["Number of Vehicles Registered at the Same Address"].replace(
    {'1': 1, '2': 2, '3': 3, "\u22654": 4, "Unknown": -1}
)
df["GVWR Class"] = df["GVWR Class"].replace({"Not Applicable": -1, "Unknown": -1})

# Separate features and target variable
X = df.drop(columns=["Vehicle Population"], errors="ignore")
y = df["Vehicle Population"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure 'GVWR Class' is encoded properly
X_train["GVWR Class"] = X_train["GVWR Class"].astype("category").cat.codes.astype("int64")
X_test["GVWR Class"] = X_test["GVWR Class"].astype("category").cat.codes.astype("int64")

# Encoding categorical features
encoder = TargetEncoder(cols=categorical_cols)

X_train_encoded = X_train.copy().reset_index(drop=True)
X_train_encoded[categorical_cols] = encoder.fit_transform(X_train[categorical_cols], y_train)

X_test_encoded = X_test.copy().reset_index(drop=True)
X_test_encoded[categorical_cols] = encoder.transform(X_test[categorical_cols])

# Ensure no NaNs exist after encoding
X_test_encoded[categorical_cols] = X_test_encoded[categorical_cols].fillna(X_train_encoded[categorical_cols].mean())

# Ensure numerical columns match training data types
for col in numerical_cols:
    if col in X_test_encoded.columns:
        X_test_encoded[col] = X_test_encoded[col].astype(X_train_encoded[col].dtype)

# Hyperparameter tuning using Optuna
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.02, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'subsample': trial.suggest_uniform('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 0.8),
        'colsample_bylevel': trial.suggest_uniform('colsample_bylevel', 0.5, 0.8),
        'colsample_bynode': trial.suggest_uniform('colsample_bynode', 0.5, 0.8),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 2.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.1, 5.0),
        'gamma': trial.suggest_uniform('gamma', 0.0, 5.0),
        'random_state': 42
    }

    model = XGBRegressor(**params)
    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(X_test_encoded)
    return np.sqrt(mean_squared_error(y_test, y_pred))

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)

# Save the best hyperparameters
top_trials = sorted(study.trials, key=lambda x: x.value)[:3]
top_params = [trial.params for trial in top_trials]

with open("best_models.json", "w") as f:
    json.dump(top_params, f, indent=4)

# Load best models for final prediction
with open("best_models.json", "r") as f:
    best_params = json.load(f)

# Load new dataset for final scoring
scoring_data = pd.read_csv("scoring_cleaned_data.csv", index_col=0)

# Drop 'Unnamed: 0' if it exists
scoring_data = scoring_data.drop(columns=["Unnamed: 0"], errors="ignore")

# Ensure categorical columns exist
missing_cols = [col for col in categorical_cols if col not in scoring_data.columns]
if missing_cols:
    raise ValueError(f"Missing categorical columns in scoring dataset: {missing_cols}")

# Prepare features for scoring
X_scoring = scoring_data.drop(columns=["Vehicle Population"], errors="ignore")
y_scoring = scoring_data["Vehicle Population"] if "Vehicle Population" in scoring_data else None

# Apply encoding using the same trained encoder
X_scoring = X_scoring.reset_index(drop=True)  # Reset index before encoding
scoring_encoded = X_scoring.copy()

# Encode categorical columns safely
encoded_cats = encoder.transform(X_scoring[categorical_cols]).reset_index(drop=True)

# Assign transformed categorical features back safely
for col in categorical_cols:
    scoring_encoded[col] = encoded_cats[col]

# Ensure numerical columns match training set types
for col in numerical_cols:
    if col in scoring_encoded.columns:
        scoring_encoded[col] = scoring_encoded[col].astype(X_train_encoded[col].dtype)

# 🔥 Fix: Ensure `scoring_encoded` has the same columns as `X_train_encoded`
missing_features = set(X_train_encoded.columns) - set(scoring_encoded.columns)
extra_features = set(scoring_encoded.columns) - set(X_train_encoded.columns)

# Add missing features if necessary (e.g., "Date")
for feature in missing_features:
    scoring_encoded[feature] = X_train_encoded[feature].mean()  # Fill missing feature with mean value

# Drop extra features not in training data
for feature in extra_features:
    scoring_encoded = scoring_encoded.drop(columns=[feature])

# Ensure column order matches training data
scoring_encoded = scoring_encoded[X_train_encoded.columns]

# Train models on best hyperparameters and predict
rmse_scores = {}
predictions = {}

for i, params in enumerate(best_params):
    model = XGBRegressor(**params)
    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(scoring_encoded)
    predictions[f"Model_{i+1}"] = y_pred

    if y_scoring is not None:
        rmse = np.sqrt(mean_squared_error(y_scoring, y_pred))
        rmse_scores[f"Model_{i+1}"] = rmse
        print(f"Model {i+1} RMSE: {rmse}")


[I 2025-02-01 15:17:24,626] A new study created in memory with name: no-name-ce754264-2a4b-4bd7-849f-6959954df601
[I 2025-02-01 15:17:24,782] Trial 0 finished with value: 15986.497302411182 and parameters: {'max_depth': 5, 'learning_rate': 0.03479121980879509, 'n_estimators': 133, 'subsample': 0.894184831407942, 'colsample_bytree': 0.6293554536118854, 'colsample_bylevel': 0.5762194699112217, 'colsample_bynode': 0.6019341134791701, 'reg_alpha': 0.05958191772908269, 'reg_lambda': 0.5469249925291472, 'gamma': 3.663134771040312}. Best is trial 0 with value: 15986.497302411182.
[I 2025-02-01 15:17:25,375] Trial 1 finished with value: 16628.052441581967 and parameters: {'max_depth': 8, 'learning_rate': 0.042274777100556674, 'n_estimators': 335, 'subsample': 0.8694405785135927, 'colsample_bytree': 0.6759663828350396, 'colsample_bylevel': 0.7749043629232928, 'colsample_bynode': 0.697355720158363, 'reg_alpha': 0.14483210504730717, 'reg_lambda': 4.490776502825333, 'gamma': 4.762168770671028}. Be

Model 1 RMSE: 19235.836555762267
Model 2 RMSE: 19464.514584237644
Model 3 RMSE: 19208.947914969212

✅ Final Checks Passed. No Index Errors. Model is Ready!
