In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
import warnings

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBRegressor

import optuna

In [2]:
warnings.filterwarnings("ignore")
sns.set_style("whitegrid")

### Load training and scoring data

In [3]:
df = pd.read_csv("final_train.csv")
df1 = pd.read_csv("final_score.csv")

In [4]:
df

Unnamed: 0,Date,Vehicle Category,GVWR Class,Fuel Type,Model Year,Fuel Technology,Electric Mile Range,Number of Vehicles Registered at the Same Address,Region,Vehicle Population
0,2019,P,Not Applicable,Gasoline,2020.0,ICE,Not Applicable,≥4,Statewide,395883
1,2020,P,Not Applicable,Gasoline,2020.0,ICE,Not Applicable,1,Statewide,370954
2,2021,P,Not Applicable,Gasoline,2020.0,ICE,Not Applicable,1,Statewide,349406
3,2019,P,Not Applicable,Gasoline,2019.0,ICE,Not Applicable,≥4,Statewide,348475
4,2019,P,Not Applicable,Gasoline,2018.0,ICE,Not Applicable,≥4,Statewide,333296
...,...,...,...,...,...,...,...,...,...,...
41048,2019,B,Not Applicable,Diesel,1983.0,ICE,Not Applicable,1,Statewide,1
41049,2019,B,Not Applicable,Diesel,1980.0,ICE,Not Applicable,2,Statewide,1
41050,2019,B,Not Applicable,Diesel,1978.0,ICE,Not Applicable,3,Statewide,1
41051,2019,B,Not Applicable,Diesel,1976.0,ICE,Not Applicable,2,Statewide,1


In [5]:
categorical_cols = ["Vehicle Category", "Fuel Type", "Fuel Technology", "Electric Mile Range"]
def preprocess(df):
    df.drop_duplicates(inplace=True)
    df = df.drop(columns = "Region", axis = 1)
    df = df.dropna(subset = ["Model Year"]).sort_values("Model Year").reset_index(drop = True)
    df["Model Year"] = df["Model Year"].astype(int) 

    # Convert certain columns to categorical type
    
    for col in categorical_cols:
        df[col] = df[col].astype("category")

    # Encode categorical columns to category types
    df["Number of Vehicles Registered at the Same Address"] = df["Number of Vehicles Registered at the Same Address"].replace(
    {'1': 1, '2': 2, '3': 3, "≥4": 4, "Unknown": -1})

    # Replace unknown values in GVWR Class column
    df["GVWR Class"] = df["GVWR Class"].replace({"Not Applicable": -1, "Unknown": -1})
    return df 

In [6]:
df = preprocess(df)
df1 = preprocess(df1)

In [7]:
df

Unnamed: 0,Date,Vehicle Category,GVWR Class,Fuel Type,Model Year,Fuel Technology,Electric Mile Range,Number of Vehicles Registered at the Same Address,Vehicle Population
0,2019,T6,4,Gasoline,1975,ICE,Not Applicable,4,265
1,2019,T6,4,Gasoline,1975,ICE,Not Applicable,3,13
2,2019,MC,-1,Gasoline,1975,ICE,Not Applicable,3,1691
3,2019,T7,8,Diesel,1975,ICE,Not Applicable,4,683
4,2019,T6,6,Diesel,1975,ICE,Not Applicable,4,34
...,...,...,...,...,...,...,...,...,...
40445,2023,B,-1,Natural Gas,2025,ICE,Not Applicable,4,4
40446,2023,T6,5,Gasoline,2025,ICE,Not Applicable,4,187
40447,2023,T6,4,Gasoline,2025,ICE,Not Applicable,4,56
40448,2023,T7,8,Diesel,2025,ICE,Not Applicable,1,242


In [8]:
df1

Unnamed: 0,Date,Vehicle Category,GVWR Class,Fuel Type,Model Year,Fuel Technology,Electric Mile Range,Number of Vehicles Registered at the Same Address,Vehicle Population
0,2024,T5,3,Gasoline,1980,ICE,Not Applicable,1,16
1,2024,MH,-1,Gasoline,1980,ICE,Not Applicable,2,582
2,2024,T6,6,Diesel,1980,ICE,Not Applicable,1,73
3,2024,T6,6,Diesel,1980,ICE,Not Applicable,2,4
4,2024,B,-1,Diesel,1980,ICE,Not Applicable,2,16
...,...,...,...,...,...,...,...,...,...
7435,2024,BS,-1,Natural Gas,2026,ICE,Not Applicable,3,3
7436,2024,T5,3,Gasoline,2026,ICE,Not Applicable,2,2
7437,2024,P,-1,Gasoline,2026,ICE,Not Applicable,4,5027
7438,2024,BS,-1,Gasoline,2026,ICE,Not Applicable,2,1


In [9]:
# Split the train data (df) into training and testing sets
X = df.drop(columns="Vehicle Population")
y= df["Vehicle Population"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Convert the GVWR Class to categorical type
X_train["GVWR Class"] = X_train["GVWR Class"].astype("category").cat.codes
X_test["GVWR Class"] = X_test["GVWR Class"].astype("category").cat.codes

In [11]:
# Separate features and target variable
X_scoring = df1.drop(columns="Vehicle Population", errors="ignore")
y_scoring = df1["Vehicle Population"]

# Convert the GVWR Class to categorical type
X_scoring["GVWR Class"] = X_scoring["GVWR Class"].astype("category").cat.codes

### Hyper parameter tuning function

In [12]:
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 200, 600),
        'subsample': trial.suggest_uniform('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 0.8),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 2.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.1, 5.0),
        'gamma': trial.suggest_uniform('gamma', 0.0, 5.0),
        'random_state': 42
    }

    encoder = TargetEncoder(cols=categorical_cols)
    X_train_encoded = X_train.copy()
    X_test_encoded = X_test.copy()
    X_train_encoded[categorical_cols] = encoder.fit_transform(X_train[categorical_cols], y_train)
    X_test_encoded[categorical_cols] = encoder.transform(X_test[categorical_cols])

    model = XGBRegressor(**params)
    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(X_test_encoded)
    return np.sqrt(mean_squared_error(y_test, y_pred))

In [13]:
# Run Optuna tuning
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

[I 2025-02-02 00:16:31,697] A new study created in memory with name: no-name-7122b17a-cf71-41fd-ac86-520e8f9934c9
[I 2025-02-02 00:16:32,138] Trial 0 finished with value: 4965.507627624793 and parameters: {'max_depth': 5, 'learning_rate': 0.18153534209287447, 'n_estimators': 468, 'subsample': 0.6423035709354669, 'colsample_bytree': 0.5145582281296053, 'reg_alpha': 0.42948635134238394, 'reg_lambda': 1.873497308494468, 'gamma': 0.1369731065448282}. Best is trial 0 with value: 4965.507627624793.
[I 2025-02-02 00:16:32,883] Trial 1 finished with value: 3885.6493665795424 and parameters: {'max_depth': 7, 'learning_rate': 0.016840160700701032, 'n_estimators': 582, 'subsample': 0.8279130489111363, 'colsample_bytree': 0.7988389374085964, 'reg_alpha': 0.015094448100679047, 'reg_lambda': 0.645379871274931, 'gamma': 2.4587521584233407}. Best is trial 1 with value: 3885.6493665795424.
[I 2025-02-02 00:16:34,028] Trial 2 finished with value: 4687.394158805082 and parameters: {'max_depth': 9, 'learn

In [18]:
# Save models with the best parameters
top_trials = sorted(study.trials, key=lambda x: x.value)[:3]
top_params = [trial.params for trial in top_trials]

with open("final_params.json", "w") as f:
    json.dump(top_params, f, indent=4)

In [19]:
# Load best models' parameters and apply to scoring dataset
with open("final_params.json", "r") as f:
    best_params = json.load(f)

encoder = TargetEncoder(cols = categorical_cols)
X_train_encoded = X_train.copy()
X_train_encoded[categorical_cols] = encoder.fit_transform(X_train[categorical_cols], y_train)
scoring_encoded = X_scoring.copy()
scoring_encoded[categorical_cols] = encoder.transform(X_scoring[categorical_cols])

In [21]:
# Prediction with best models
predictions = {}
rmse_scores = {}

for i, params in enumerate(best_params):
    model = XGBRegressor(**params)
    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(scoring_encoded)
    predictions[f"Model_{i+1}"] = y_pred

    if y_scoring is not None:
        rmse = np.sqrt(mean_squared_error(y_scoring, y_pred))
        rmse_scores[f"Model_{i+1}"] = rmse
        print(f"Model {i+1} RMSE: {rmse}")

Model 1 RMSE: 6158.917112609975
Model 2 RMSE: 6026.4702770361355
Model 3 RMSE: 6154.569684388991


In [17]:
comparison_df = pd.DataFrame({
    'Actual': y_scoring,
    'Predicted': y_pred
})

# Display the DataFrame
print(comparison_df.head(50))

    Actual    Predicted
0       16    77.455612
1      582    25.675240
2       73    31.213045
3        4   -16.260893
4       16   149.168655
5      520  -186.040115
6       15   319.053589
7    44018  -455.931641
8      508  -226.278305
9      500    39.273014
10      82    47.461765
11     493   157.656326
12     476   117.016266
13       1    66.994980
14       1   -90.095924
15       1   -51.821568
16     475    11.196621
17      14   -62.505203
18     473  -124.534172
19      88  1053.272827
20      13   -65.011024
21      89   -97.951233
22     440  -181.877426
23       1   143.751297
24       4   836.799438
25    7087  -268.342072
26      71   165.142349
27      17   161.699814
28    5065  -747.968628
29       1   501.626526
30       1   463.438965
31       1    74.747559
32      20    62.523483
33     878   127.266479
34      57  -126.415802
35   68132 -1657.781006
36      19    63.553051
37       3   140.639313
38      18  1419.570190
39       3    37.989075
40     780    -2