In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
import warnings

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBRegressor

import optuna
warnings.filterwarnings("ignore")
sns.set_style("whitegrid")

### Load training and scoring data
df = pd.read_csv("final_train.csv", index_col=0)
df1 = pd.read_csv("final_score.csv", index_col=0)

categorical_cols = ["Vehicle Category", "Fuel Type", "Fuel Technology", "Electric Mile Range"]

def preprocess(df):
    """Preprocesses the dataset by handling missing values, encoding categorical variables, and dropping unnecessary columns."""
    df.drop_duplicates(inplace=True)
    df = df.drop(columns="Region", axis=1)
    df = df.dropna(subset=["Model Year"]).sort_values("Model Year").reset_index(drop=True)
    df["Model Year"] = df["Model Year"].astype(int)

    # Convert categorical columns to category type
    for col in categorical_cols:
        df[col] = df[col].astype("category")

    # Encode categorical columns
    df["Number of Vehicles Registered at the Same Address"] = df["Number of Vehicles Registered at the Same Address"].replace(
        {'1': 1, '2': 2, '3': 3, "≥4": 4, "Unknown": -1})

    # Replace unknown values in GVWR Class column
    df["GVWR Class"] = df["GVWR Class"].replace({"Not Applicable": -1, "Unknown": -1})
    return df

df = preprocess(df)
df1 = preprocess(df1)

# Split the train data (df) into training and testing sets
X = df.drop(columns="Vehicle Population")
y = df["Vehicle Population"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert GVWR Class to categorical type
X_train["GVWR Class"] = X_train["GVWR Class"].astype("category").cat.codes
X_test["GVWR Class"] = X_test["GVWR Class"].astype("category").cat.codes

# Separate features and target variable for scoring data
X_scoring = df1.drop(columns="Vehicle Population", errors="ignore")
y_scoring = df1["Vehicle Population"] if "Vehicle Population" in df1 else None

# Convert GVWR Class to categorical type
X_scoring["GVWR Class"] = X_scoring["GVWR Class"].astype("category").cat.codes

### Hyperparameter tuning function
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 200, 600),
        'subsample': trial.suggest_uniform('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 0.8),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 2.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.1, 5.0),
        'gamma': trial.suggest_uniform('gamma', 0.0, 5.0),
        'random_state': 42
    }

    encoder = TargetEncoder(cols=categorical_cols)
    X_train_encoded = X_train.copy()
    X_test_encoded = X_test.copy()
    X_train_encoded[categorical_cols] = encoder.fit_transform(X_train[categorical_cols], y_train)
    X_test_encoded[categorical_cols] = encoder.transform(X_test[categorical_cols])

    model = XGBRegressor(**params)
    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(X_test_encoded)
    return np.sqrt(mean_squared_error(y_test, y_pred))

# Run Optuna tuning
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

# Save models with the best parameters
top_trials = sorted(study.trials, key=lambda x: x.value)[:3]
top_params = [trial.params for trial in top_trials]

with open("final_params.json", "w") as f:
    json.dump(top_params, f, indent=4)

# Load best models' parameters and apply to scoring dataset
with open("final_params.json", "r") as f:
    best_params = json.load(f)

encoder = TargetEncoder(cols=categorical_cols)
X_train_encoded = X_train.copy()
X_train_encoded[categorical_cols] = encoder.fit_transform(X_train[categorical_cols], y_train)
scoring_encoded = X_scoring.copy()
scoring_encoded[categorical_cols] = encoder.transform(X_scoring[categorical_cols])

# Prediction with best models
predictions = {}
rmse_scores = {}

for i, params in enumerate(best_params):
    model = XGBRegressor(**params)
    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(scoring_encoded)
    predictions[f"Model_{i+1}"] = y_pred

    if y_scoring is not None:
        rmse = np.sqrt(mean_squared_error(y_scoring, y_pred))
        rmse_scores[f"Model_{i+1}"] = rmse
        print(f"Model {i+1} RMSE: {rmse}")

[I 2025-02-01 22:34:33,512] A new study created in memory with name: no-name-5a076df1-dc62-4660-88d3-c7140c9a5b72
[I 2025-02-01 22:34:33,795] Trial 0 finished with value: 6863.990093232944 and parameters: {'max_depth': 7, 'learning_rate': 0.07048885540814713, 'n_estimators': 202, 'subsample': 0.757592098335327, 'colsample_bytree': 0.7533842900606312, 'reg_alpha': 0.032761328222110093, 'reg_lambda': 1.5999378469506238, 'gamma': 4.14483770476227}. Best is trial 0 with value: 6863.990093232944.
[I 2025-02-01 22:34:34,095] Trial 1 finished with value: 7598.4645817428145 and parameters: {'max_depth': 4, 'learning_rate': 0.1374873068178687, 'n_estimators': 384, 'subsample': 0.7687341437323257, 'colsample_bytree': 0.6795188587112315, 'reg_alpha': 1.1538795835627889, 'reg_lambda': 0.20876486228317762, 'gamma': 1.043731627841213}. Best is trial 0 with value: 6863.990093232944.
[I 2025-02-01 22:34:34,484] Trial 2 finished with value: 7270.7144078144065 and parameters: {'max_depth': 5, 'learning_

Model 1 RMSE: 8133.165681332208
Model 2 RMSE: 8203.996587030008
Model 3 RMSE: 8025.477182074596
