# Training with XGBoost

- In this secttion I will attempt to minimize rMSE by using the XGBoost regressor and fine tunning it using the Optuna library.

In [25]:
%pip install category_encoders

Note: you may need to restart the kernel to use updated packages.


In [26]:
%pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [27]:
# Install XGBoost
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [28]:
#import necessary libraries
import pandas as pd
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import numpy as np
import optuna
import xgboost as xgb

- Categorical features will be encoded using TargetEncoder as it was done in the previous test.

In [29]:
df = pd.read_csv('cat_backpack.csv')

In [30]:
display(df.sample(5))

Unnamed: 0,id,brand,material,size,compartments,laptop_compartment,waterproof,style,color,weight_cap,Price
174410,174410,Puma,Polyester,Large,8,False,False,Messenger,Gray,9.536959,72.19629
207296,207296,Adidas,Nylon,Large,4,True,True,Tote,Red,28.777509,55.18705
151731,151731,Adidas,Polyester,Small,2,True,False,Messenger,Pink,18.812184,42.12754
197471,197471,Adidas,Leather,Small,10,True,False,Tote,Red,18.936837,144.18475
257544,257544,Nike,Polyester,Large,8,True,False,Backpack,Gray,19.281521,89.6315


- Ordinal encoding for 'size' feature.

In [31]:
# Create a dictionary to map size categories to numerical values
size_mapping = {
    'Small': 0,
    'Medium': 1,
    'Large': 2,
    'Unknown': 3  # Or you can assign it -1 or another distinct value
}

# Apply the mapping to the 'size' column
df['size_encoded'] = df['size'].map(size_mapping)

# Drop the original 'size' column (optional)
df.drop('size', axis=1, inplace=True)

- Target encoding for 'brand', 'material', 'style' and 'color'.

In [None]:
# Define features (X) and target (y)
X = df.drop(['Price', 'id'], axis=1)  # Exclude 'Price' column
y = df['Price']

# List of categorical features to encode
categorical_features = ['brand', 'material', 'style', 'color']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit the TargetEncoder
encoder = TargetEncoder(cols=categorical_features)

# Fit the encoder on the training data and transform both training and testing data
X_train_encoded = encoder.fit_transform(X_train, y_train)
X_test_encoded = encoder.transform(X_test)

# Now X_train_encoded and X_test_encoded have the categorical features target encoded

In [33]:
display(X_train_encoded.sample(5))

Unnamed: 0,brand,material,compartments,laptop_compartment,waterproof,style,color,weight_cap,size_encoded
126206,81.333835,80.479359,1,False,False,81.430891,81.014828,20.585557,2
157868,81.58736,82.028371,9,False,True,81.430891,81.014828,21.958857,0
30206,81.858243,82.028371,5,False,True,81.432036,81.675616,8.837172,1
100616,81.956967,80.479359,8,True,False,81.430891,80.985014,11.96772,2
143136,81.333835,80.479359,9,False,False,81.430891,82.010883,23.98551,0


- Train and evaluate an XGBoost regressor.

In [34]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)


In [None]:
# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
dtest = xgb.DMatrix(X_test_scaled, label=y_test)

# Set XGBoost parameters
params = {
    'objective': 'reg:squarederror',
    'max_depth': 3,
    'learning_rate': 0.1,
    'n_estimators': 100
}

# Train the XGBoost model
model = xgb.train(params, dtrain, num_boost_round=100)

# Predict on the test set
y_pred = model.predict(dtest)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print(f"Root Mean Squared Error: {rmse}")

Parameters: { "n_estimators" } are not used.



Root Mean Squared Error: 38.91025938703038


- Fine tune the hyperparameters of the XGBoost model using Optuna

In [36]:
# Define the objective function for Optuna
def objective(trial):
    # Define the hyperparameter search space
    param = {
        'objective': 'reg:squarederror',
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'alpha': trial.suggest_float('alpha', 0, 10),
        'lambda': trial.suggest_float('lambda', 0, 10)
    }

    # Create DMatrix for XGBoost
    dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
    dtest = xgb.DMatrix(X_test_scaled, label=y_test)

    # Train the XGBoost model
    model = xgb.train(param, dtrain, num_boost_round=100)

    # Predict on the test set
    y_pred = model.predict(dtest)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    return mse

In [37]:

# Create a study object and optimize the objective function
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=69)

[I 2025-02-17 11:55:16,847] A new study created in memory with name: no-name-ef1743be-56c6-412c-b315-812be811a048


Parameters: { "n_estimators" } are not used.

[I 2025-02-17 11:55:20,310] Trial 0 finished with value: 1523.0372416276668 and parameters: {'max_depth': 10, 'learning_rate': 0.06146518304224565, 'n_estimators': 200, 'subsample': 0.835923168705416, 'colsample_bytree': 0.6967638322420567, 'alpha': 6.798346014464882, 'lambda': 1.0724610637442444}. Best is trial 0 with value: 1523.0372416276668.
Parameters: { "n_estimators" } are not used.

[I 2025-02-17 11:55:22,845] Trial 1 finished with value: 1517.5700537262016 and parameters: {'max_depth': 9, 'learning_rate': 0.0419615147205333, 'n_estimators': 214, 'subsample': 0.5772728959171824, 'colsample_bytree': 0.8455375144490951, 'alpha': 4.941307209988421, 'lambda': 7.244740059451232}. Best is trial 1 with value: 1517.5700537262016.
Parameters: { "n_estimators" } are not used.

[I 2025-02-17 11:55:24,642] Trial 2 finished with value: 1526.2160832159934 and parameters: {'max_depth': 7, 'learning_rate': 0.19236146554896386, 'n_estimators': 109, 

In [38]:

# Print the best hyperparameters
print(f"Best hyperparameters: {study.best_params}")

# Train the final model with the best hyperparameters
best_params = study.best_params
dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
dtest = xgb.DMatrix(X_test_scaled, label=y_test)
final_model = xgb.train(best_params, dtrain, num_boost_round=50)

# Predict on the test set
y_pred = final_model.predict(dtest)

# Evaluate the final model
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print(f"Final Root Mean Squared Error: {rmse}")

Best hyperparameters: {'max_depth': 3, 'learning_rate': 0.10951546913095321, 'n_estimators': 117, 'subsample': 0.7990037955264295, 'colsample_bytree': 0.8236912001752946, 'alpha': 1.645145878027562, 'lambda': 5.04011646790288}


Parameters: { "n_estimators" } are not used.



Final Root Mean Squared Error: 38.911478482605396


- After 100 trials using optuna the lowest rMSE obtained was 38.9062, which is a very close result to the one accomplished by using ligthGBM but still far from reaching the competition winner(38.82005) :(