In [60]:
import pandas as pd
import numpy as np
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

sns.set_style("whitegrid")

In [63]:
data = pd.read_csv("training_data.csv")
df = pd.DataFrame(data)
df.drop_duplicates()
missing_model_year = df["Model Year"].isna().sum()
df = df.dropna(subset=["Model Year"]).sort_values(by="Date", ascending=True).reset_index(drop=True)
df["Model Year"] = df["Model Year"].astype(int)

Assuming we have a somewhat cleaned dataset (removing Model Year missing values), sort by year,...

In [64]:
df = df.drop(columns="Region", axis = 1)

## Data Wrangling

In [65]:
categorical_cols = ["Vehicle Category", "Fuel Type", "Fuel Technology", "Electric Mile Range"]
for col in categorical_cols:
    df[col] = df[col].astype("category")
    

In [66]:
df.dtypes

Date                                                    int64
Vehicle Category                                     category
GVWR Class                                             object
Fuel Type                                            category
Model Year                                              int64
Fuel Technology                                      category
Electric Mile Range                                  category
Number of Vehicles Registered at the Same Address      object
Vehicle Population                                      int64
dtype: object

In [67]:
df["Number of Vehicles Registered at the Same Address"].apply(lambda x: repr(x)).unique()


array(["'≥4'", "'1'", "'2'", "'3'", "'Unknown'"], dtype=object)

In [7]:
# import unicodedata

# ordinal_mapping = {'1': int(1), '2': int(2), '3': int(3), unicodedata.name("\u03C0"): int(4), "Unknown": int(-1)}

# df["Number of Vehicles Registered at the Same Address"] = df["Number of Vehicles Registered at the Same Address"].astype(str).map(ordinal_mapping)

In [68]:
df["Number of Vehicles Registered at the Same Address"] = df["Number of Vehicles Registered at the Same Address"].replace({'1': 1, '2': 2, '3': 3, "\u22654": 4, "Unknown": -1})

df["GVWR Class"] = df["GVWR Class"].replace({"Not Applicable": -1, "Unknown": -1})

In [69]:
df.isnull().sum()

Date                                                 0
Vehicle Category                                     0
GVWR Class                                           0
Fuel Type                                            0
Model Year                                           0
Fuel Technology                                      0
Electric Mile Range                                  0
Number of Vehicles Registered at the Same Address    0
Vehicle Population                                   0
dtype: int64

In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40450 entries, 0 to 40449
Data columns (total 9 columns):
 #   Column                                             Non-Null Count  Dtype   
---  ------                                             --------------  -----   
 0   Date                                               40450 non-null  int64   
 1   Vehicle Category                                   40450 non-null  category
 2   GVWR Class                                         40450 non-null  object  
 3   Fuel Type                                          40450 non-null  category
 4   Model Year                                         40450 non-null  int64   
 5   Fuel Technology                                    40450 non-null  category
 6   Electric Mile Range                                40450 non-null  category
 7   Number of Vehicles Registered at the Same Address  40450 non-null  int64   
 8   Vehicle Population                                 40450 non-null  int64   
d

In [71]:
from sklearn.model_selection import train_test_split

# Separate the feature and the target
X = df.drop(columns= "Vehicle Population")
y = df["Vehicle Population"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build a pipeline of training

In [72]:
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBRegressor

In [73]:
categorical_features = ["Vehicle Category", "Fuel Type", "Fuel Technology", "Electric Mile Range"]
estimators = [
    ('encoder', TargetEncoder(cols=categorical_features)),  # Encode categorical features only
    ('clf', XGBRegressor(random_state=42))
]

pipe = Pipeline(steps = estimators)

# Set up hyper parameter tuning 

In [74]:
# from skopt import BayesSearchCV
# from skopt.space import Real, Integer

# search_space = {
#     'clf__max_depth': Integer(3, 8),  # Keep tree depth moderate
#     'clf__learning_rate': Real(0.02, 0.2, prior='log-uniform'),  # Not too slow, not too fast
#     'clf__n_estimators': Integer(100, 500),  # Limit the number of trees to balance performance
#     'clf__subsample': Real(0.6, 0.9),  # Randomly sample data to prevent overfitting
#     'clf__colsample_bytree': Real(0.5, 0.8),  # Use only a subset of features per tree
#     'clf__colsample_bylevel': Real(0.5, 0.8),
#     'clf__colsample_bynode': Real(0.5, 0.8),
#     'clf__reg_alpha': Real(0.01, 2.0),  # Light L1 regularization
#     'clf__reg_lambda': Real(0.1, 5.0),  # Mild L2 regularization
#     'clf__gamma': Real(0.0, 5.0)  # Prevent excessive tree splits
# }

# opt = BayesSearchCV(
#     estimator=pipe,
#     search_spaces=search_space,
#     cv=5,  # More cross-validation folds for stability
#     n_iter=20,  # Optimize in 20 rounds (higher = better)
#     scoring='neg_root_mean_squared_error',  
#     random_state=42
# )

In [75]:
col = 'GVWR Class'
X_train[col] = X_train[col].astype("category").cat.codes
X_test[col] = X_test[col].astype("category").cat.codes

In [76]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import warnings
warnings.filterwarnings("ignore")

sns.set_style("whitegrid")

Assuming we have a somewhat cleaned dataset (removing Model Year missing values), sort by year,...

In [77]:
df1 = pd.read_csv("cleaned_data.csv", index_col=0)
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40450 entries, 0 to 40449
Data columns (total 10 columns):
 #   Column                                             Non-Null Count  Dtype 
---  ------                                             --------------  ----- 
 0   Date                                               40450 non-null  int64 
 1   Vehicle Category                                   40450 non-null  object
 2   GVWR Class                                         40450 non-null  object
 3   Fuel Type                                          40450 non-null  object
 4   Model Year                                         40450 non-null  int64 
 5   Fuel Technology                                    40450 non-null  object
 6   Electric Mile Range                                40450 non-null  object
 7   Number of Vehicles Registered at the Same Address  40450 non-null  object
 8   Region                                             40450 non-null  object
 9   Vehicle Population    

In [79]:
df1 = df1.drop(columns="Region", axis = 1)

KeyError: "['Region'] not found in axis"

## Data Wrangling

In [80]:
categorical_cols = ["Vehicle Category", "Fuel Type", "Fuel Technology", "Electric Mile Range"]
for col in categorical_cols:
    df1[col] = df1[col].astype("category")
    

In [82]:
df1.dtypes

Date                                                    int64
Vehicle Category                                     category
GVWR Class                                             object
Fuel Type                                            category
Model Year                                              int64
Fuel Technology                                      category
Electric Mile Range                                  category
Number of Vehicles Registered at the Same Address      object
Vehicle Population                                      int64
dtype: object

In [83]:
df1["Number of Vehicles Registered at the Same Address"].apply(lambda x: repr(x)).unique()


array(["'≥4'", "'1'", "'2'", "'3'", "'Unknown'"], dtype=object)

In [22]:
# import unicodedata

# ordinal_mapping = {'1': int(1), '2': int(2), '3': int(3), unicodedata.name("\u03C0"): int(4), "Unknown": int(-1)}

# df["Number of Vehicles Registered at the Same Address"] = df["Number of Vehicles Registered at the Same Address"].astype(str).map(ordinal_mapping)

In [84]:
df1["Number of Vehicles Registered at the Same Address"] = df["Number of Vehicles Registered at the Same Address"].replace({'1': 1, '2': 2, '3': 3, "\u22654": 4, "Unknown": -1})

df1["GVWR Class"] = df1["GVWR Class"].replace({"Not Applicable": -1, "Unknown": -1})

In [86]:
df1.isnull().sum()
df.isnull().sum()

Date                                                 0
Vehicle Category                                     0
GVWR Class                                           0
Fuel Type                                            0
Model Year                                           0
Fuel Technology                                      0
Electric Mile Range                                  0
Number of Vehicles Registered at the Same Address    0
Vehicle Population                                   0
dtype: int64

In [87]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40450 entries, 0 to 40449
Data columns (total 9 columns):
 #   Column                                             Non-Null Count  Dtype   
---  ------                                             --------------  -----   
 0   Date                                               40450 non-null  int64   
 1   Vehicle Category                                   40450 non-null  category
 2   GVWR Class                                         40450 non-null  object  
 3   Fuel Type                                          40450 non-null  category
 4   Model Year                                         40450 non-null  int64   
 5   Fuel Technology                                    40450 non-null  category
 6   Electric Mile Range                                40450 non-null  category
 7   Number of Vehicles Registered at the Same Address  40450 non-null  int64   
 8   Vehicle Population                                 40450 non-null  int64   
dtypes

In [88]:
from sklearn.model_selection import train_test_split

# Separate the feature and the target
X = df1.drop(columns= "Vehicle Population")
y = df1["Vehicle Population"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [89]:
col = 'GVWR Class'
X_train[col] = X_train[col].astype("category").cat.codes.astype("int64")
X_test[col] = X_test[col].astype("category").cat.codes.astype("int64")

# Build a pipeline of training

In [91]:
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBRegressor

In [90]:
categorical_features = ["Vehicle Category", "Fuel Type", "Fuel Technology", "Electric Mile Range"]
# estimators = [
#     ('encoder', TargetEncoder(cols=categorical_features)),  # Encode categorical features only
#     ('clf', XGBRegressor(random_state=42))
# ]

# pipe = Pipeline(steps = estimators)

# Set up hyper parameter tuning 

In [30]:
# from skopt import BayesSearchCV
# from skopt.space import Real, Integer

# search_space = {
#     'clf__max_depth': Integer(3, 8),  # Keep tree depth moderate
#     'clf__learning_rate': Real(0.02, 0.2, prior='log-uniform'),  # Not too slow, not too fast
#     'clf__n_estimators': Integer(100, 500),  # Limit the number of trees to balance performance
#     'clf__subsample': Real(0.6, 0.9),  # Randomly sample data to prevent overfitting
#     'clf__colsample_bytree': Real(0.5, 0.8),  # Use only a subset of features per tree
#     'clf__colsample_bylevel': Real(0.5, 0.8),
#     'clf__colsample_bynode': Real(0.5, 0.8),
#     'clf__reg_alpha': Real(0.01, 2.0),  # Light L1 regularization
#     'clf__reg_lambda': Real(0.1, 5.0),  # Mild L2 regularization
#     'clf__gamma': Real(0.0, 5.0)  # Prevent excessive tree splits
# }

# opt = BayesSearchCV(
#     estimator=pipe,
#     search_spaces=search_space,
#     cv=5,  # More cross-validation folds for stability
#     n_iter=20,  # Optimize in 20 rounds (higher = better)
#     scoring='neg_root_mean_squared_error',  
#     random_state=42
# )

In [92]:
# from sklearn.pipeline import Pipeline
# from category_encoders.target_encoder import TargetEncoder

# def objective(trial):
#     params = {
#         'max_depth': trial.suggest_int('max_depth', 3, 8),
#         'learning_rate': trial.suggest_loguniform('learning_rate', 0.02, 0.2),
#         'n_estimators': trial.suggest_int('n_estimators', 100, 500),
#         'subsample': trial.suggest_uniform('subsample', 0.6, 0.9),
#         'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 0.8),
#         'colsample_bylevel': trial.suggest_uniform('colsample_bylevel', 0.5, 0.8),
#         'colsample_bynode': trial.suggest_uniform('colsample_bynode', 0.5, 0.8),
#         'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 2.0),
#         'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.1, 5.0),
#         'gamma': trial.suggest_uniform('gamma', 0.0, 5.0),
#         'random_state': 42
#     }

#     # Build pipeline with categorical encoding
#     pipe = Pipeline([
#         ('encoder', TargetEncoder(cols=categorical_cols)),
#         ('clf', XGBRegressor(**params))
#     ])

#     pipe.fit(X_train, y_train)
#     y_pred = pipe.predict(X_test)
#     rmse = np.sqrt(np.mean((y_test - y_pred) ** 2))
#     return rmse


In [94]:
import optuna

def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.02, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'subsample': trial.suggest_uniform('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 0.8),
        'colsample_bylevel': trial.suggest_uniform('colsample_bylevel', 0.5, 0.8),
        'colsample_bynode': trial.suggest_uniform('colsample_bynode', 0.5, 0.8),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 2.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.1, 5.0),
        'gamma': trial.suggest_uniform('gamma', 0.0, 5.0),
        'random_state': 42
    }

    # Encode categorical features
    X_train_encoded = X_train.copy()
    X_test_encoded = X_test.copy()
    encoder = TargetEncoder(cols=categorical_cols)
    X_train_encoded[categorical_cols] = encoder.fit_transform(X_train[categorical_cols], y_train)
    X_test_encoded[categorical_cols] = encoder.transform(X_test[categorical_cols])

    model = XGBRegressor(**params)
    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(X_test_encoded)
    rmse = np.sqrt(np.mean((y_test - y_pred) ** 2))
    return rmse

def print_rmse(study, trial):
    print(f"Trial {trial.number}: RMSE {trial.value}")

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=25, callbacks=[print_rmse]) # Can add callbacks=[print_rmse] to print RMSE here


[I 2025-02-01 16:04:43,187] A new study created in memory with name: no-name-814b69a2-2b5b-4560-a514-6f9187b74281
[I 2025-02-01 16:04:43,721] Trial 0 finished with value: 6647.616758420024 and parameters: {'max_depth': 7, 'learning_rate': 0.026435091969102858, 'n_estimators': 349, 'subsample': 0.6349706729659274, 'colsample_bytree': 0.7980728413941408, 'colsample_bylevel': 0.7367967734095607, 'colsample_bynode': 0.5011020640633209, 'reg_alpha': 0.012242804607734732, 'reg_lambda': 1.1175957431073547, 'gamma': 0.029314573809290922}. Best is trial 0 with value: 6647.616758420024.


Trial 0: RMSE 6647.616758420024


[I 2025-02-01 16:04:44,010] Trial 1 finished with value: 10090.323639941767 and parameters: {'max_depth': 5, 'learning_rate': 0.11352760312044687, 'n_estimators': 191, 'subsample': 0.7118736287072048, 'colsample_bytree': 0.5223808711839054, 'colsample_bylevel': 0.7329957728805684, 'colsample_bynode': 0.5459416619834728, 'reg_alpha': 0.5464547900651238, 'reg_lambda': 1.8766688715592286, 'gamma': 2.557407735508843}. Best is trial 0 with value: 6647.616758420024.
[I 2025-02-01 16:04:44,184] Trial 2 finished with value: 11115.544152050956 and parameters: {'max_depth': 4, 'learning_rate': 0.09812714606364059, 'n_estimators': 168, 'subsample': 0.8864679065756361, 'colsample_bytree': 0.5101851451534378, 'colsample_bylevel': 0.570762541594945, 'colsample_bynode': 0.5285500743058988, 'reg_alpha': 0.9727868683539994, 'reg_lambda': 3.796711620878259, 'gamma': 0.16204543037208086}. Best is trial 0 with value: 6647.616758420024.


Trial 1: RMSE 10090.323639941767
Trial 2: RMSE 11115.544152050956


[I 2025-02-01 16:04:44,395] Trial 3 finished with value: 8483.062383715784 and parameters: {'max_depth': 4, 'learning_rate': 0.09193822504499102, 'n_estimators': 186, 'subsample': 0.8960445958132534, 'colsample_bytree': 0.7867895712362286, 'colsample_bylevel': 0.5165900190693073, 'colsample_bynode': 0.6883690250736331, 'reg_alpha': 0.058428351136986495, 'reg_lambda': 2.221187586957993, 'gamma': 4.758448108125366}. Best is trial 0 with value: 6647.616758420024.


Trial 3: RMSE 8483.062383715784


[I 2025-02-01 16:04:44,659] Trial 4 finished with value: 5530.709125753121 and parameters: {'max_depth': 8, 'learning_rate': 0.17798677591432663, 'n_estimators': 132, 'subsample': 0.6696570122584211, 'colsample_bytree': 0.6908696133962542, 'colsample_bylevel': 0.6132287467928687, 'colsample_bynode': 0.7917282057420908, 'reg_alpha': 0.018944076707285484, 'reg_lambda': 0.10547182108766807, 'gamma': 1.0838613204001253}. Best is trial 4 with value: 5530.709125753121.


Trial 4: RMSE 5530.709125753121


[I 2025-02-01 16:04:45,171] Trial 5 finished with value: 6263.714238242505 and parameters: {'max_depth': 5, 'learning_rate': 0.06362413972891796, 'n_estimators': 491, 'subsample': 0.6131808018120077, 'colsample_bytree': 0.7650451762276783, 'colsample_bylevel': 0.7185033192657373, 'colsample_bynode': 0.5079962623819604, 'reg_alpha': 0.8310583766454845, 'reg_lambda': 0.872472735699382, 'gamma': 1.669213911306882}. Best is trial 4 with value: 5530.709125753121.


Trial 5: RMSE 6263.714238242505


[I 2025-02-01 16:04:45,391] Trial 6 finished with value: 12349.538524284568 and parameters: {'max_depth': 8, 'learning_rate': 0.03856187706370184, 'n_estimators': 123, 'subsample': 0.8162352207403698, 'colsample_bytree': 0.6619401516486662, 'colsample_bylevel': 0.5021597379914892, 'colsample_bynode': 0.5551331211741655, 'reg_alpha': 0.12276193017385166, 'reg_lambda': 2.92929876971317, 'gamma': 1.0442263305998605}. Best is trial 4 with value: 5530.709125753121.


Trial 6: RMSE 12349.538524284568


[I 2025-02-01 16:04:45,682] Trial 7 finished with value: 11846.36876091476 and parameters: {'max_depth': 3, 'learning_rate': 0.04492856927597391, 'n_estimators': 394, 'subsample': 0.762243418036203, 'colsample_bytree': 0.5217274020942001, 'colsample_bylevel': 0.6429633387343358, 'colsample_bynode': 0.6003326915818213, 'reg_alpha': 0.11257221970664773, 'reg_lambda': 2.364957653520921, 'gamma': 0.03352943043720136}. Best is trial 4 with value: 5530.709125753121.
[I 2025-02-01 16:04:45,872] Trial 8 finished with value: 10407.180486971143 and parameters: {'max_depth': 5, 'learning_rate': 0.12221662697009518, 'n_estimators': 174, 'subsample': 0.8538925599225375, 'colsample_bytree': 0.555233495918313, 'colsample_bylevel': 0.5770667685834099, 'colsample_bynode': 0.7980762973957191, 'reg_alpha': 0.8437720330110368, 'reg_lambda': 2.313844217693344, 'gamma': 1.7153113518114242}. Best is trial 4 with value: 5530.709125753121.


Trial 7: RMSE 11846.36876091476
Trial 8: RMSE 10407.180486971143


[I 2025-02-01 16:04:45,990] Trial 9 finished with value: 13624.373140489022 and parameters: {'max_depth': 3, 'learning_rate': 0.05695740407608515, 'n_estimators': 131, 'subsample': 0.6564844979227961, 'colsample_bytree': 0.5158964224257213, 'colsample_bylevel': 0.6292645530039818, 'colsample_bynode': 0.6947812578091805, 'reg_alpha': 0.20567434721517325, 'reg_lambda': 4.885894824385729, 'gamma': 1.8228354851127455}. Best is trial 4 with value: 5530.709125753121.


Trial 9: RMSE 13624.373140489022


[I 2025-02-01 16:04:46,362] Trial 10 finished with value: 4775.830890150832 and parameters: {'max_depth': 7, 'learning_rate': 0.1986201459327877, 'n_estimators': 262, 'subsample': 0.7028469160337381, 'colsample_bytree': 0.6840029313534042, 'colsample_bylevel': 0.788475460328079, 'colsample_bynode': 0.7971146026673445, 'reg_alpha': 0.012085414281759051, 'reg_lambda': 0.12101824809167083, 'gamma': 4.102919493472319}. Best is trial 10 with value: 4775.830890150832.


Trial 10: RMSE 4775.830890150832


[I 2025-02-01 16:04:46,750] Trial 11 finished with value: 4535.443007930113 and parameters: {'max_depth': 7, 'learning_rate': 0.19863486103584319, 'n_estimators': 266, 'subsample': 0.6999987823765885, 'colsample_bytree': 0.678563480210314, 'colsample_bylevel': 0.7797389773280938, 'colsample_bynode': 0.7990764169425126, 'reg_alpha': 0.014198134872149727, 'reg_lambda': 0.11569467006542211, 'gamma': 4.024697211258517}. Best is trial 11 with value: 4535.443007930113.


Trial 11: RMSE 4535.443007930113


[I 2025-02-01 16:04:47,133] Trial 12 finished with value: 5464.602300947897 and parameters: {'max_depth': 7, 'learning_rate': 0.16855422043863325, 'n_estimators': 266, 'subsample': 0.7314166255268729, 'colsample_bytree': 0.6129457424945244, 'colsample_bylevel': 0.7993506598950291, 'colsample_bynode': 0.751087617060807, 'reg_alpha': 0.01017660331170728, 'reg_lambda': 0.11975870399405357, 'gamma': 4.381966087681638}. Best is trial 11 with value: 4535.443007930113.


Trial 12: RMSE 5464.602300947897


[I 2025-02-01 16:04:47,574] Trial 13 finished with value: 4763.700362687063 and parameters: {'max_depth': 7, 'learning_rate': 0.19544206233786587, 'n_estimators': 286, 'subsample': 0.693512386636391, 'colsample_bytree': 0.7175625777924928, 'colsample_bylevel': 0.7996818605288557, 'colsample_bynode': 0.7348850571828401, 'reg_alpha': 0.03646892554477498, 'reg_lambda': 0.26868715766414863, 'gamma': 3.70058643278867}. Best is trial 11 with value: 4535.443007930113.


Trial 13: RMSE 4763.700362687063


[I 2025-02-01 16:04:47,964] Trial 14 finished with value: 5114.126414326154 and parameters: {'max_depth': 6, 'learning_rate': 0.13592228870018147, 'n_estimators': 323, 'subsample': 0.7622673645422777, 'colsample_bytree': 0.731479365529593, 'colsample_bylevel': 0.7604166116802145, 'colsample_bynode': 0.7347895304769738, 'reg_alpha': 0.034507095708523734, 'reg_lambda': 0.2976384022735052, 'gamma': 3.4490756602904065}. Best is trial 11 with value: 4535.443007930113.


Trial 14: RMSE 5114.126414326154


[I 2025-02-01 16:04:48,333] Trial 15 finished with value: 10441.328250200459 and parameters: {'max_depth': 6, 'learning_rate': 0.0796610749908449, 'n_estimators': 262, 'subsample': 0.6913957378405239, 'colsample_bytree': 0.6101283242004486, 'colsample_bylevel': 0.6906583701188586, 'colsample_bynode': 0.7410246317908681, 'reg_alpha': 0.036719223377273534, 'reg_lambda': 0.26897842351080153, 'gamma': 3.3612060234523087}. Best is trial 11 with value: 4535.443007930113.


Trial 15: RMSE 10441.328250200459


[I 2025-02-01 16:04:48,993] Trial 16 finished with value: 7281.038150978618 and parameters: {'max_depth': 7, 'learning_rate': 0.14212285557459844, 'n_estimators': 399, 'subsample': 0.7996628311810992, 'colsample_bytree': 0.7239867661199328, 'colsample_bylevel': 0.6906722402875592, 'colsample_bynode': 0.6481909136934435, 'reg_alpha': 0.027390343891663255, 'reg_lambda': 0.2523168202549589, 'gamma': 3.471831210572791}. Best is trial 11 with value: 4535.443007930113.


Trial 16: RMSE 7281.038150978618


[I 2025-02-01 16:04:49,521] Trial 17 finished with value: 8621.86998601702 and parameters: {'max_depth': 8, 'learning_rate': 0.021328108440186778, 'n_estimators': 227, 'subsample': 0.7376405732517117, 'colsample_bytree': 0.7270353983465495, 'colsample_bylevel': 0.7801720531979438, 'colsample_bynode': 0.6947881140948058, 'reg_alpha': 0.06786254271237585, 'reg_lambda': 0.4765532342170767, 'gamma': 2.816615634274203}. Best is trial 11 with value: 4535.443007930113.


Trial 17: RMSE 8621.86998601702


[I 2025-02-01 16:04:49,925] Trial 18 finished with value: 8018.214553164234 and parameters: {'max_depth': 6, 'learning_rate': 0.15532431388874265, 'n_estimators': 351, 'subsample': 0.6607553419658508, 'colsample_bytree': 0.6236485762877239, 'colsample_bylevel': 0.6770026622449052, 'colsample_bynode': 0.7587368721959891, 'reg_alpha': 1.9953696332894852, 'reg_lambda': 0.18492402965460575, 'gamma': 4.975549110112436}. Best is trial 11 with value: 4535.443007930113.


Trial 18: RMSE 8018.214553164234


[I 2025-02-01 16:04:50,312] Trial 19 finished with value: 9147.54411007245 and parameters: {'max_depth': 7, 'learning_rate': 0.06994547322446369, 'n_estimators': 297, 'subsample': 0.6064712761049826, 'colsample_bytree': 0.6484510612833243, 'colsample_bylevel': 0.7497763166924462, 'colsample_bynode': 0.6498430248950547, 'reg_alpha': 0.021108396010871507, 'reg_lambda': 0.49949868411868303, 'gamma': 3.9809538870724794}. Best is trial 11 with value: 4535.443007930113.


Trial 19: RMSE 9147.54411007245


[I 2025-02-01 16:04:51,120] Trial 20 finished with value: 3837.1034195879197 and parameters: {'max_depth': 8, 'learning_rate': 0.19253960941802928, 'n_estimators': 440, 'subsample': 0.799621474808938, 'colsample_bytree': 0.7555496554607678, 'colsample_bylevel': 0.7677598443863813, 'colsample_bynode': 0.7241278134445378, 'reg_alpha': 0.23776518923969842, 'reg_lambda': 0.17257702527362767, 'gamma': 3.106620321463616}. Best is trial 20 with value: 3837.1034195879197.


Trial 20: RMSE 3837.1034195879197


[I 2025-02-01 16:04:51,968] Trial 21 finished with value: 3743.020209262959 and parameters: {'max_depth': 8, 'learning_rate': 0.18652231001261987, 'n_estimators': 490, 'subsample': 0.7931743217866822, 'colsample_bytree': 0.7672817384234931, 'colsample_bylevel': 0.7675772178899972, 'colsample_bynode': 0.7255764938629962, 'reg_alpha': 0.24998805079593106, 'reg_lambda': 0.17453395332262958, 'gamma': 2.9855029280654284}. Best is trial 21 with value: 3743.020209262959.


Trial 21: RMSE 3743.020209262959


[I 2025-02-01 16:04:52,831] Trial 22 finished with value: 3813.4799982509703 and parameters: {'max_depth': 8, 'learning_rate': 0.15021715725381188, 'n_estimators': 500, 'subsample': 0.7973235130403301, 'colsample_bytree': 0.7679044359665586, 'colsample_bylevel': 0.7686982012203878, 'colsample_bynode': 0.713495581167829, 'reg_alpha': 0.2901190641921708, 'reg_lambda': 0.1690883621243782, 'gamma': 2.9410677736344524}. Best is trial 21 with value: 3743.020209262959.


Trial 22: RMSE 3813.4799982509703


[I 2025-02-01 16:04:53,581] Trial 23 finished with value: 4062.872532642464 and parameters: {'max_depth': 8, 'learning_rate': 0.11023243426817066, 'n_estimators': 492, 'subsample': 0.791356882742024, 'colsample_bytree': 0.7569172920049512, 'colsample_bylevel': 0.7119009795662778, 'colsample_bynode': 0.7134560525266905, 'reg_alpha': 0.24127511846797084, 'reg_lambda': 0.18481147974599646, 'gamma': 3.015061842498286}. Best is trial 21 with value: 3743.020209262959.


Trial 23: RMSE 4062.872532642464


[I 2025-02-01 16:04:54,274] Trial 24 finished with value: 3890.430278112535 and parameters: {'max_depth': 8, 'learning_rate': 0.1501249169937477, 'n_estimators': 458, 'subsample': 0.8358338748238252, 'colsample_bytree': 0.7608304316510353, 'colsample_bylevel': 0.7632557110291239, 'colsample_bynode': 0.6721127058455731, 'reg_alpha': 0.3361185178933097, 'reg_lambda': 0.17966988289787778, 'gamma': 2.2365954510097024}. Best is trial 21 with value: 3743.020209262959.


Trial 24: RMSE 3890.430278112535


Saving the top performing hypermeters to train on the scoring dataset


In [95]:
import json 
top_trials = sorted(study.trials, key=lambda x: x.value)[:3]
top_params = [trial.params for trial in top_trials]

# Save the top parameters to a JSON file
with open("best_models.json", "w") as f:
    json.dump(top_params, f, indent=4)

In [96]:
with open("best_models.json", "r") as f:
    best_params = json.load(f)

# Load the new dataset without index
scoring_data = pd.read_csv("scoring_cleaned_data.csv")

# Define categorical columns (must match those from training)
categorical_cols = ["Vehicle Category", "Fuel Type", "Fuel Technology", "Electric Mile Range"]

In [98]:
# Ensure columns exist in scoring data
missing_cols = [col for col in categorical_cols if col not in scoring_data.columns]
if missing_cols:
    raise ValueError(f"Missing categorical columns in scoring dataset: {missing_cols}")

# Separate features and target variable
X_scoring = scoring_data.drop(columns=["Vehicle Population"], errors="ignore")  # Adjust column name if needed
y_scoring = scoring_data["Vehicle Population"] if "Vehicle Population" in scoring_data else None  # Handle missing target

# Encode categorical features using the same method as training
encoder = TargetEncoder(cols=categorical_cols)
X_train_encoded = X_train.copy()
X_train_encoded[categorical_cols] = encoder.fit_transform(X_train[categorical_cols], y_train)

scoring_encoded = X_scoring.copy()
scoring_encoded[categorical_cols] = encoder.transform(X_scoring[categorical_cols])

In [100]:
# Ensure numerical columns are correctly formatted
numerical_cols = ["GVWR Class", "Number of Vehicles Registered at the Same Address"]
for col in numerical_cols:
    if col in scoring_encoded.columns:
        scoring_encoded[col] = scoring_encoded[col].astype("category").cat.codes

In [101]:
# Dictionary to store RMSE values
rmse_scores = {}
predictions = {}

# Apply the best models to make predictions and compute RMSE
for i, params in enumerate(best_params):
    model = XGBRegressor(**params)
    
    # Train model on original dataset
    model.fit(X_train_encoded, y_train)
    
    # Predict on scoring dataset
    y_pred = model.predict(scoring_encoded)
    predictions[f"Model_{i+1}"] = y_pred
    
    # Compute RMSE (only if y_scoring exists)
    if y_scoring is not None:
        rmse = np.sqrt(mean_squared_error(y_scoring, y_pred))
        rmse_scores[f"Model_{i+1}"] = rmse
        print(f"Model {i+1} RMSE: {rmse}")

Model 1 RMSE: 13128.496029629594
Model 2 RMSE: 16270.675953997732
Model 3 RMSE: 16570.272176400726
