In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
import warnings

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBRegressor

import optuna

In [2]:
warnings.filterwarnings("ignore")
sns.set_style("whitegrid")

### Load training and scoring data

In [3]:
df = pd.read_csv("final_train.csv")
df1 = pd.read_csv("final_score.csv")

In [4]:
df

Unnamed: 0,Date,Vehicle Category,GVWR Class,Fuel Type,Model Year,Fuel Technology,Electric Mile Range,Number of Vehicles Registered at the Same Address,Region,Vehicle Population
0,2019,P,Not Applicable,Gasoline,2020.0,ICE,Not Applicable,≥4,Statewide,395883
1,2020,P,Not Applicable,Gasoline,2020.0,ICE,Not Applicable,1,Statewide,370954
2,2021,P,Not Applicable,Gasoline,2020.0,ICE,Not Applicable,1,Statewide,349406
3,2019,P,Not Applicable,Gasoline,2019.0,ICE,Not Applicable,≥4,Statewide,348475
4,2019,P,Not Applicable,Gasoline,2018.0,ICE,Not Applicable,≥4,Statewide,333296
...,...,...,...,...,...,...,...,...,...,...
41048,2019,B,Not Applicable,Diesel,1983.0,ICE,Not Applicable,1,Statewide,1
41049,2019,B,Not Applicable,Diesel,1980.0,ICE,Not Applicable,2,Statewide,1
41050,2019,B,Not Applicable,Diesel,1978.0,ICE,Not Applicable,3,Statewide,1
41051,2019,B,Not Applicable,Diesel,1976.0,ICE,Not Applicable,2,Statewide,1


In [5]:
categorical_cols = ["Vehicle Category", "Fuel Type", "Fuel Technology", "Electric Mile Range"]
def preprocess(df):
    df.drop_duplicates(inplace=True)
    df = df.drop(columns = "Region", axis = 1)
    df = df.dropna(subset = ["Model Year"]).sort_values("Model Year").reset_index(drop = True)
    df["Model Year"] = df["Model Year"].astype(int) 

    # Convert certain columns to categorical type
    
    for col in categorical_cols:
        df[col] = df[col].astype("category")

    # Encode categorical columns to category types
    df["Number of Vehicles Registered at the Same Address"] = df["Number of Vehicles Registered at the Same Address"].replace(
    {'1': 1, '2': 2, '3': 3, "≥4": 4, "Unknown": -1})

    # Replace unknown values in GVWR Class column
    df["GVWR Class"] = df["GVWR Class"].replace({"Not Applicable": -1, "Unknown": -1})
    return df 

In [6]:
df = preprocess(df)
df1 = preprocess(df1)

In [7]:
df

Unnamed: 0,Date,Vehicle Category,GVWR Class,Fuel Type,Model Year,Fuel Technology,Electric Mile Range,Number of Vehicles Registered at the Same Address,Vehicle Population
0,2019,T6,4,Gasoline,1975,ICE,Not Applicable,4,265
1,2019,T6,4,Gasoline,1975,ICE,Not Applicable,3,13
2,2019,MC,-1,Gasoline,1975,ICE,Not Applicable,3,1691
3,2019,T7,8,Diesel,1975,ICE,Not Applicable,4,683
4,2019,T6,6,Diesel,1975,ICE,Not Applicable,4,34
...,...,...,...,...,...,...,...,...,...
40445,2023,B,-1,Natural Gas,2025,ICE,Not Applicable,4,4
40446,2023,T6,5,Gasoline,2025,ICE,Not Applicable,4,187
40447,2023,T6,4,Gasoline,2025,ICE,Not Applicable,4,56
40448,2023,T7,8,Diesel,2025,ICE,Not Applicable,1,242


In [8]:
df1

Unnamed: 0,Date,Vehicle Category,GVWR Class,Fuel Type,Model Year,Fuel Technology,Electric Mile Range,Number of Vehicles Registered at the Same Address,Vehicle Population
0,2024,T5,3,Gasoline,1980,ICE,Not Applicable,1,16
1,2024,MH,-1,Gasoline,1980,ICE,Not Applicable,2,582
2,2024,T6,6,Diesel,1980,ICE,Not Applicable,1,73
3,2024,T6,6,Diesel,1980,ICE,Not Applicable,2,4
4,2024,B,-1,Diesel,1980,ICE,Not Applicable,2,16
...,...,...,...,...,...,...,...,...,...
7435,2024,BS,-1,Natural Gas,2026,ICE,Not Applicable,3,3
7436,2024,T5,3,Gasoline,2026,ICE,Not Applicable,2,2
7437,2024,P,-1,Gasoline,2026,ICE,Not Applicable,4,5027
7438,2024,BS,-1,Gasoline,2026,ICE,Not Applicable,2,1


In [9]:
# Split the train data (df) into training and testing sets
X = df.drop(columns="Vehicle Population")
y= df["Vehicle Population"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Convert the GVWR Class to categorical type
X_train["GVWR Class"] = X_train["GVWR Class"].astype("category").cat.codes
X_test["GVWR Class"] = X_test["GVWR Class"].astype("category").cat.codes

In [11]:
# Separate features and target variable
X_scoring = df1.drop(columns="Vehicle Population", errors="ignore")
y_scoring = df1["Vehicle Population"]

# Convert the GVWR Class to categorical type
X_scoring["GVWR Class"] = X_scoring["GVWR Class"].astype("category").cat.codes

### Hyper parameter tuning function

In [12]:
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 200, 600),
        'subsample': trial.suggest_uniform('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 0.8),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 2.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.1, 5.0),
        'gamma': trial.suggest_uniform('gamma', 0.0, 5.0),
        'random_state': 42
    }

    encoder = TargetEncoder(cols=categorical_cols)
    X_train_encoded = X_train.copy()
    X_test_encoded = X_test.copy()
    X_train_encoded[categorical_cols] = encoder.fit_transform(X_train[categorical_cols], y_train)
    X_test_encoded[categorical_cols] = encoder.transform(X_test[categorical_cols])

    model = XGBRegressor(**params)
    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(X_test_encoded)
    return np.sqrt(mean_squared_error(y_test, y_pred))

In [13]:
# Run Optuna tuning
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

[I 2025-02-02 03:27:55,917] A new study created in memory with name: no-name-031b9997-5a1f-408e-a1fd-e24f42687a58
[I 2025-02-02 03:27:56,688] Trial 0 finished with value: 4591.006643427997 and parameters: {'max_depth': 8, 'learning_rate': 0.010564151617420775, 'n_estimators': 475, 'subsample': 0.7581696513524653, 'colsample_bytree': 0.7930076485599047, 'reg_alpha': 0.44799401371155084, 'reg_lambda': 0.3956834738120941, 'gamma': 4.44412492744118}. Best is trial 0 with value: 4591.006643427997.
[I 2025-02-02 03:27:57,045] Trial 1 finished with value: 6069.671490286768 and parameters: {'max_depth': 6, 'learning_rate': 0.025227384766495223, 'n_estimators': 305, 'subsample': 0.6482485729245094, 'colsample_bytree': 0.7359278318150984, 'reg_alpha': 0.12918939647149402, 'reg_lambda': 4.891564514392121, 'gamma': 3.5009389187287288}. Best is trial 0 with value: 4591.006643427997.
[I 2025-02-02 03:27:57,452] Trial 2 finished with value: 4616.854556946753 and parameters: {'max_depth': 7, 'learning

In [14]:
# Save models with the best parameters
top_trials = sorted(study.trials, key=lambda x: x.value)[:3]
top_params = [trial.params for trial in top_trials]

with open("final_params.json", "w") as f:
    json.dump(top_params, f, indent=4)

In [15]:
# Load best models' parameters and apply to scoring dataset
with open("final_params.json", "r") as f:
    best_params = json.load(f)

encoder = TargetEncoder(cols = categorical_cols)
X_train_encoded = X_train.copy()
X_train_encoded[categorical_cols] = encoder.fit_transform(X_train[categorical_cols], y_train)
scoring_encoded = X_scoring.copy()
scoring_encoded[categorical_cols] = encoder.transform(X_scoring[categorical_cols])

In [16]:
# Prediction with best models
predictions = {}
rmse_scores = {}

for i, params in enumerate(best_params):
    model = XGBRegressor(**params)
    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(scoring_encoded)
    predictions[f"Model_{i+1}"] = y_pred

    if y_scoring is not None:
        rmse = np.sqrt(mean_squared_error(y_scoring, y_pred))
        rmse_scores[f"Model_{i+1}"] = rmse
        print(f"Model {i+1} RMSE: {rmse}")

Model 1 RMSE: 6015.684499705749
Model 2 RMSE: 6000.830942461219
Model 3 RMSE: 6016.396928394935


In [17]:
comparison_df = pd.DataFrame({
    'Actual': y_scoring,
    'Predicted': y_pred
})

# Display the DataFrame
print(comparison_df.head(50))

    Actual    Predicted
0       16    14.909265
1      582   189.342041
2       73    40.558868
3        4     8.948841
4       16   -12.710180
5      520   -17.047905
6       15    74.085022
7    44018   136.831070
8      508    34.203468
9      500    48.379364
10      82   120.558975
11     493   170.616470
12     476   261.382874
13       1   -92.779282
14       1  -202.376495
15       1    38.101372
16     475    -7.437378
17      14   -30.159050
18     473   -23.757666
19      88   108.131447
20      13   -17.800077
21      89   -43.748169
22     440     9.603817
23       1   162.279114
24       4  1127.638062
25    7087   140.096863
26      71    42.893417
27      17  -162.907028
28    5065   302.321045
29       1   378.944946
30       1   375.567535
31       1    71.938065
32      20    -2.193282
33     878    92.935699
34      57  -104.459854
35   68132  1192.214600
36      19    26.202421
37       3   -84.810745
38      18  1162.989624
39       3  -100.172897
40     780    43