# One Hot Encoding and Training with LightGBM 

- In this first attempt I perform One Hot Encoding to the categorical features and ordinal encoding for the 'size' variable. Later I will train a LightGBM regressor with default parameters to predict The Price target variable.

In [None]:
%pip install lightgbm

In [None]:
%pip install optuna

In [4]:
# Import necessary libraries
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import optuna
from optuna.samplers import TPESampler


In [5]:
# If using local environment
df = pd.read_csv('cat_backpack.csv')

- One hot encoding for non hierarchycal features.

In [6]:

# Select the categorical columns to be encoded
categorical_cols = ['brand', 'material', 'style', 'color']

# Apply one-hot encoding using get_dummies
encoded_data = pd.get_dummies(df[categorical_cols], prefix=categorical_cols)

# Concatenate the encoded data with the original DataFrame
df = pd.concat([df, encoded_data], axis=1)

# Drop the original categorical columns
df.drop(categorical_cols, axis=1, inplace=True)

- Ordinal encoding for 'size' column.

In [7]:
# Create a dictionary to map size categories to numerical values
size_mapping = {
    'Small': 0,
    'Medium': 1,
    'Large': 2,
    'Unknown': 3  # Or you can assign it -1 or another distinct value
}

# Apply the mapping to the 'size' column
df['size_encoded'] = df['size'].map(size_mapping)

# Drop the original 'size' column (optional)
df.drop('size', axis=1, inplace=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 29 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  300000 non-null  int64  
 1   compartments        300000 non-null  int64  
 2   laptop_compartment  300000 non-null  bool   
 3   waterproof          300000 non-null  bool   
 4   weight_cap          300000 non-null  float64
 5   Price               300000 non-null  float64
 6   brand_Adidas        300000 non-null  bool   
 7   brand_Jansport      300000 non-null  bool   
 8   brand_Nike          300000 non-null  bool   
 9   brand_Puma          300000 non-null  bool   
 10  brand_Under Armour  300000 non-null  bool   
 11  brand_Unknown       300000 non-null  bool   
 12  material_Canvas     300000 non-null  bool   
 13  material_Leather    300000 non-null  bool   
 14  material_Nylon      300000 non-null  bool   
 15  material_Polyester  300000 non-nul

# Train with LigthGBM.

In [9]:
# Drop the id column
df = df.drop(columns=['id'])

# Separate features and target variable
X = df.drop(columns=['Price'])
y = df['Price']

# Split into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate the scaler and fit on training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create LightGBM datasets
train_data = lgb.Dataset(X_train_scaled, label=y_train)
test_data = lgb.Dataset(X_test_scaled, label=y_test, reference=train_data)

# Set up model parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'seed': 42
}

In [10]:
# Train the model
regressor = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[test_data],
    callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(100)],  # Include log_evaluation callback
    # verbose_eval=100  # Remove this line
)

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[41]	valid_0's rmse: 38.9136


In [11]:
# Predictions and evaluation
y_pred = regressor.predict(X_test_scaled, num_iteration=regressor.best_iteration)
# Calculate RMSE without the 'squared' argument, then take the square root
mse = mean_squared_error(y_test, y_pred)  # Remove squared=False
rmse = mse**0.5  # Calculate the square root to get RMSE
print("Test RMSE:", rmse)

Test RMSE: 38.91359184477554


# Fine Tune LightGBM with Optuna.

- In this section I will try to fine tune the lightGBM model using the optuna library.

In [12]:
def objective(trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'random_state': 42,
        'n_estimators': 100,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 200),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0),
    }

    model = lgb.LGBMRegressor(**params)
    # Pass early stopping as a callback
    model.fit(
        X_train_scaled,
        y_train,
        eval_set=[(X_test_scaled, y_test)],
        callbacks=[lgb.early_stopping(stopping_rounds=10)]
    )
    predictions = model.predict(X_test_scaled)
    rmse = mean_squared_error(y_test, predictions)
    return rmse

In [13]:
sampler = TPESampler(seed=1) #  Uses the Tree-Structured Parzen Estimator algorithm [4]
study = optuna.create_study(direction='minimize', sampler = sampler)

[I 2025-02-18 10:45:35,880] A new study created in memory with name: no-name-2a21adfe-7b23-4b38-b9bb-3a14793e98cd


In [None]:
import time
start_time = time.time()
study.optimize(objective, n_trials=100)
end_time = time.time()
print(f"Time taken: {end_time - start_time} seconds")

In [15]:
print('Best parameters:', study.best_params)

Best parameters: {'learning_rate': 0.07814577909900611, 'num_leaves': 44, 'max_depth': 4, 'min_child_samples': 29, 'subsample': 0.7921825554183903, 'colsample_bytree': 0.76774512355446, 'reg_alpha': 6.6527011724307155, 'reg_lambda': 1.484352269511822}


Best parameters: {'learning_rate': 0.09751747867142664, 'num_leaves': 72, 'max_depth': 4, 'min_child_samples': 57, 'subsample': 0.5096338621731877, 'colsample_bytree': 0.6570577434553186, 'reg_alpha': 3.7448144357284203, 'reg_lambda': 1.2766258463846785}

In [16]:
# Best parameters from Optuna
best_params = study.best_params
print("Best Parameters:", best_params)

# Create and train the model with the best parameters
model = lgb.LGBMRegressor(**best_params)
model.fit(X_train_scaled, y_train, eval_set=[(X_test_scaled, y_test)],
          callbacks=[lgb.early_stopping(stopping_rounds=10)])

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = mse**0.5
print("Test RMSE with Best Parameters:", rmse)

Best Parameters: {'learning_rate': 0.07814577909900611, 'num_leaves': 44, 'max_depth': 4, 'min_child_samples': 29, 'subsample': 0.7921825554183903, 'colsample_bytree': 0.76774512355446, 'reg_alpha': 6.6527011724307155, 'reg_lambda': 1.484352269511822}




Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[75]	valid_0's l2: 1513.91
Test RMSE with Best Parameters: 38.909007429148495




- Best test rMSE metric achived after optimizing parameters with optuna was 38.9085, still far away from best performing model in competition :(