In [2]:

import pandas as pd
import numpy as np
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import optuna
import json

warnings.filterwarnings("ignore")
sns.set_style("whitegrid")

# Function to check dataset consistency
def check_data_integrity(df, name="Dataset"):
    print(f"Checking {name} integrity:")
    print("Shape:", df.shape)
    print("Missing values:", df.isnull().sum())
    print("Data types:", df.dtypes)
    print("-" * 40)

# Load training data
df = pd.read_csv("training_data.csv")
df.drop_duplicates(inplace=True)

# Ensure Model Year is not missing
df = df.dropna(subset=["Model Year"]).sort_values(by="Date", ascending=True).reset_index(drop=True)
df["Model Year"] = df["Model Year"].astype(int)

# Drop unnecessary columns
df = df.drop(columns="Region", axis=1)

# Convert categorical columns to category type
categorical_cols = ["Vehicle Category", "Fuel Type", "Fuel Technology", "Electric Mile Range"]
for col in categorical_cols:
    df[col] = df[col].astype("category")

# Encode 'Number of Vehicles Registered at the Same Address'
df["Number of Vehicles Registered at the Same Address"] = df["Number of Vehicles Registered at the Same Address"].replace(
    {'1': 1, '2': 2, '3': 3, "≥4": 4, "Unknown": -1})

# Replace unknown values in 'GVWR Class'
df["GVWR Class"] = df["GVWR Class"].replace({"Not Applicable": -1, "Unknown": -1})

# Check integrity
check_data_integrity(df, "Training Data")

# Split into train and test sets
X = df.drop(columns="Vehicle Population")
y = df["Vehicle Population"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert 'GVWR Class' to categorical codes
X_train["GVWR Class"] = X_train["GVWR Class"].astype("category").cat.codes
X_test["GVWR Class"] = X_test["GVWR Class"].astype("category").cat.codes


Checking Training Data integrity:
Shape: (40450, 9)
Missing values: Date                                                 0
Vehicle Category                                     0
GVWR Class                                           0
Fuel Type                                            0
Model Year                                           0
Fuel Technology                                      0
Electric Mile Range                                  0
Number of Vehicles Registered at the Same Address    0
Vehicle Population                                   0
dtype: int64
Data types: Date                                                    int64
Vehicle Category                                     category
GVWR Class                                             object
Fuel Type                                            category
Model Year                                              int64
Fuel Technology                                      category
Electric Mile Range                     

In [3]:

# Load cleaned data for scoring
df1 = pd.read_csv("cleaned_data.csv", index_col=0)
df1 = df1.drop(columns="Region", axis=1)

# Convert categorical columns to category type
for col in categorical_cols:
    df1[col] = df1[col].astype("category")

df1["Number of Vehicles Registered at the Same Address"] = df1["Number of Vehicles Registered at the Same Address"].replace(
    {'1': 1, '2': 2, '3': 3, "≥4": 4, "Unknown": -1})

df1["GVWR Class"] = df1["GVWR Class"].replace({"Not Applicable": -1, "Unknown": -1})

# Check integrity
check_data_integrity(df1, "Cleaned Data")

# Separate features and target variable
X_scoring = df1.drop(columns="Vehicle Population", errors="ignore")
y_scoring = df1["Vehicle Population"] if "Vehicle Population" in df1 else None

# Convert 'GVWR Class' to categorical codes
X_scoring["GVWR Class"] = X_scoring["GVWR Class"].astype("category").cat.codes


Checking Cleaned Data integrity:
Shape: (40450, 9)
Missing values: Date                                                 0
Vehicle Category                                     0
GVWR Class                                           0
Fuel Type                                            0
Model Year                                           0
Fuel Technology                                      0
Electric Mile Range                                  0
Number of Vehicles Registered at the Same Address    0
Vehicle Population                                   0
dtype: int64
Data types: Date                                                    int64
Vehicle Category                                     category
GVWR Class                                             object
Fuel Type                                            category
Model Year                                              int64
Fuel Technology                                      category
Electric Mile Range                      

In [4]:

# Hyperparameter tuning function
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 200, 600),
        'subsample': trial.suggest_uniform('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 0.8),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 2.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.1, 5.0),
        'gamma': trial.suggest_uniform('gamma', 0.0, 5.0),
        'random_state': 42
    }

    encoder = TargetEncoder(cols=categorical_cols)
    X_train_encoded = X_train.copy()
    X_test_encoded = X_test.copy()
    X_train_encoded[categorical_cols] = encoder.fit_transform(X_train[categorical_cols], y_train)
    X_test_encoded[categorical_cols] = encoder.transform(X_test[categorical_cols])

    model = XGBRegressor(**params)
    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(X_test_encoded)
    return np.sqrt(mean_squared_error(y_test, y_pred))

# Run Optuna tuning
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)


[I 2025-02-01 16:09:57,581] A new study created in memory with name: no-name-00f80338-774f-4ba4-99da-845fd0c60045
[I 2025-02-01 16:09:57,984] Trial 0 finished with value: 7837.805049884821 and parameters: {'max_depth': 4, 'learning_rate': 0.07503221633161003, 'n_estimators': 330, 'subsample': 0.7027219331895219, 'colsample_bytree': 0.6144505619990245, 'reg_alpha': 0.02023211244241637, 'reg_lambda': 1.15732409890649, 'gamma': 4.526365185902652}. Best is trial 0 with value: 7837.805049884821.
[I 2025-02-01 16:09:58,980] Trial 1 finished with value: 5047.648363347035 and parameters: {'max_depth': 7, 'learning_rate': 0.032291526849610494, 'n_estimators': 522, 'subsample': 0.6840923997128939, 'colsample_bytree': 0.6627491386063001, 'reg_alpha': 0.1627639977503332, 'reg_lambda': 0.4381241820206591, 'gamma': 3.0888520372787647}. Best is trial 1 with value: 5047.648363347035.
[I 2025-02-01 16:09:59,809] Trial 2 finished with value: 4097.343773714869 and parameters: {'max_depth': 6, 'learning_r

In [5]:

# Save best hyperparameters
top_trials = sorted(study.trials, key=lambda x: x.value)[:3]
top_params = [trial.params for trial in top_trials]

with open("best_models.json", "w") as f:
    json.dump(top_params, f, indent=4)

print("Best parameters saved successfully.")


Best parameters saved successfully.


In [6]:

# Load best models and apply to scoring dataset
with open("best_models.json", "r") as f:
    best_params = json.load(f)

# Ensure categorical encoding is consistent
encoder = TargetEncoder(cols=categorical_cols)
X_train_encoded = X_train.copy()
X_train_encoded[categorical_cols] = encoder.fit_transform(X_train[categorical_cols], y_train)
scoring_encoded = X_scoring.copy()
scoring_encoded[categorical_cols] = encoder.transform(X_scoring[categorical_cols])

# Predict with best models
predictions = {}
rmse_scores = {}

for i, params in enumerate(best_params):
    model = XGBRegressor(**params)
    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(scoring_encoded)
    predictions[f"Model_{i+1}"] = y_pred

    if y_scoring is not None:
        rmse = np.sqrt(mean_squared_error(y_scoring, y_pred))
        rmse_scores[f"Model_{i+1}"] = rmse
        print(f"Model {i+1} RMSE: {rmse}")

# Save predictions
pd.DataFrame(predictions).to_csv("predictions.csv", index=False)
print("Predictions saved successfully.")


Model 1 RMSE: 1679.9839284945556
Model 2 RMSE: 2049.5636120891686
Model 3 RMSE: 1924.6691923548835
Predictions saved successfully.
