In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from lightgbm import LGBMRegressor
from dotenv import load_dotenv
import os


# Raise pandas' display limits (line width, max rows, max columns) for notebook output.
for _opt, _val in (
    ('display.width', 1000),
    ('display.max_rows', 500),
    ('display.max_columns', 500),
):
    pd.set_option(_opt, _val)

In [3]:
def read_data(path):
    """Read the CSV at ``path`` with ``pd.read_csv`` and return the resulting DataFrame."""
    frame = pd.read_csv(path)
    return frame
# Load variables from a .env file into the process environment, then resolve the CSV path.
load_dotenv()
data_path = os.getenv("TRAINING_DATA")

# os.getenv returns None when the variable is missing; fail fast with an actionable
# message instead of letting pd.read_csv raise an opaque error on a None path.
if not data_path:
    raise RuntimeError(
        "Environment variable TRAINING_DATA is not set; define it in the environment "
        "or in a .env file so it points to the training CSV."
    )

df = read_data(data_path)

In [5]:
"""
Split the dataset into features and target, then divide it into training and testing sets.

- X: feature matrix (all columns except the target)
- y: target variable ('HATSURESI')
- 80% of the data is used for training, 20% for testing
- The random_state ensures reproducible results
"""

# Build the feature matrix by removing the target plus the two "Unnamed" columns
# (presumably stray index columns from earlier CSV exports — verify), and pull the
# target out as its own Series.
X = df.drop(columns=["HATSURESI", "Unnamed: 0", "Unnamed: 0.1"])
y = df["HATSURESI"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Show the shapes of the four resulting pieces
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((154851, 31), (38713, 31), (154851,), (38713,))

### XGBoost

In [8]:
"""
Train and evaluate an XGBoost regression model.

The model is trained on the training set and evaluated on the test set using:
- RMSE: Root Mean Squared Error
- MSE: Mean Squared Error
- MAE: Mean Absolute Error
- MAPE: Mean Absolute Percentage Error
- R²: Coefficient of Determination
"""

# Initialize XGBRegressor with a fixed random seed for reproducibility
model = xgb.XGBRegressor(random_state=42)

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
# MAPE: zero targets are replaced with 1e-10 so the division never produces inf/NaN.
# This matches the guard used by the other evaluation cells in this notebook; note that
# any such row yields a very large percentage error and will dominate the mean.
mape = np.mean(np.abs((y_test - y_pred) / y_test.replace(0, 1e-10))) * 100

# Print evaluation results
print(f"Root Mean Square Error (RMSE): {rmse:.2f}")
print(f"Mean Squared Error: {mse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")
print(f"R² Score: {r2:.4f}")


Root Mean Square Error (RMSE): 6.51
Mean Squared Error: 42.4257
Mean Absolute Error (MAE): 4.30
Mean Absolute Percentage Error (MAPE): 7.12%
R² Score: 0.6937


#### Hyperparameter Tuning

In [None]:
"""
Hyperparameter tuning for XGBoost using GridSearchCV.

This script:
- Defines a parameter grid
- Performs grid search with cross-validation
- Fits the best model
- Evaluates the final model on the test set using key regression metrics
"""

from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Candidate values per hyperparameter (2 * 3 * 3 * 2 * 2 * 2 * 2 = 288 combinations)
param_grid = dict(
    n_estimators=[100, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.8, 1],
    colsample_bytree=[0.8, 1],
    gamma=[0, 0.1],
    min_child_weight=[1, 3],
)

# Base estimator; only the seed is fixed, everything else comes from the grid
xgb_model = XGBRegressor(random_state=42)

# Exhaustive search: 3-fold CV, candidates ranked by negative MSE, all cores used
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split and report the winning combination
grid_search.fit(X_train, y_train)
print("🔍 Best Parameters:", grid_search.best_params_)

# Score the best estimator found by the search on the held-out test split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Regression metrics on the test split
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
# MAPE: zero targets are swapped for 1e-10 so the division never hits zero
mape = np.mean(
    np.abs((y_test - y_pred) / y_test.replace(0, 1e-10))
) * 100

# Final report (one line per metric)
print(
    f"\n✅ Optimized XGBoost Results:",
    f"R² Score: {r2:.4f}",
    f"RMSE: {rmse:.2f}",
    f"MAE: {mae:.2f}",
    f"MAPE: {mape:.2f}%",
    sep="\n",
)


Fitting 3 folds for each of 288 candidates, totalling 864 fits


### LightGBM

In [6]:
"""
Train a LightGBM regression model on the training set and evaluate it on the test set.

The model is assessed using:
- R² Score
- RMSE (Root Mean Squared Error)
- MAE (Mean Absolute Error)
- MAPE (Mean Absolute Percentage Error)
"""
def clean_column_names(df, copy=False):
    """Strip every character other than ASCII letters, digits and ``_`` from column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels should be sanitized.
    copy : bool, default False
        When False (the original behaviour) the columns of ``df`` itself are
        reassigned and ``df`` is returned, so the caller's frame is modified.
        When True, ``df`` is left untouched and a renamed copy is returned.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when ``copy`` is False, otherwise a new frame.

    Notes
    -----
    Distinct names can collapse to the same label once characters are removed
    (e.g. ``"a-b"`` and ``"ab"``); this function does not de-duplicate them.
    """
    target = df.copy() if copy else df
    target.columns = target.columns.str.replace(r'[^A-Za-z0-9_]', '', regex=True)
    return target

# Sanitize the feature names of both splits. clean_column_names reassigns the columns
# of the frame it is given and returns that same frame, so X_trainL / X_testL are the
# same objects as X_train / X_test (now with cleaned column names).
X_trainL, X_testL = clean_column_names(X_train), clean_column_names(X_test)

# Baseline LightGBM regressor: library defaults with a fixed seed
lgbm_model = LGBMRegressor(random_state=42)
lgbm_model.fit(X_trainL, y_train)

# Predict the held-out split
y_pred_lgbm = lgbm_model.predict(X_testL)

# Regression metrics on the test split
mae_lgbm = mean_absolute_error(y_test, y_pred_lgbm)
mse_lgbm = mean_squared_error(y_test, y_pred_lgbm)
rmse_lgbm = mse_lgbm ** 0.5
r2_lgbm = r2_score(y_test, y_pred_lgbm)
# MAPE: zero targets are swapped for 1e-10 so the division never hits zero
mape_lgbm = np.mean(
    np.abs((y_test - y_pred_lgbm) / y_test.replace(0, 1e-10))
) * 100

# Report (one line per metric)
print(
    "\n🌿 LightGBM Results:",
    f"R² Score: {r2_lgbm:.4f}",
    f"RMSE: {rmse_lgbm:.2f}",
    f"MAE: {mae_lgbm:.2f}",
    f"MAPE: {mape_lgbm:.2f}%",
    sep="\n",
)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007810 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1599
[LightGBM] [Info] Number of data points in the train set: 154851, number of used features: 31
[LightGBM] [Info] Start training from score 60.049415

🌿 LightGBM Results:
R² Score: 0.6887
RMSE: 6.57
MAE: 4.39
MAPE: 7.28%


#### Hyperparameter Tuning

In [None]:
# Search space for the randomized search (30 sampled combinations, see n_iter below)
param_dist = {
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 5, 7, -1],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [20, 31, 50],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0, 0.1, 0.5]
}

# Model
# subsample_freq=1 is required for the searched `subsample` values to have any effect:
# LightGBM only performs bagging when the bagging frequency is non-zero, and the
# LGBMRegressor default (subsample_freq=0) silently turns `subsample` into a no-op.
lgbm_model = LGBMRegressor(random_state=42, subsample_freq=1)

# Randomized SearchCV: 3-fold CV, candidates ranked by negative MSE, seeded sampling
random_search = RandomizedSearchCV(
    estimator=lgbm_model,
    param_distributions=param_dist,
    n_iter=30,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Train
random_search.fit(X_train, y_train)

# Best Model
best_lgbm = random_search.best_estimator_

# Prediction
y_pred_lgbm = best_lgbm.predict(X_test)

# Metrics
r2_lgbm = r2_score(y_test, y_pred_lgbm)
mse_lgbm = mean_squared_error(y_test, y_pred_lgbm)
rmse_lgbm = mse_lgbm ** 0.5
mae_lgbm = mean_absolute_error(y_test, y_pred_lgbm)
# MAPE: zero targets are swapped for 1e-10 so the division never hits zero
mape_lgbm = np.mean(np.abs((y_test - y_pred_lgbm) / y_test.replace(0, 1e-10))) * 100

# Results
print("\n🌿 Optimized LightGBM Results:")
print("Best Parameters:", random_search.best_params_)
print(f"R² Score: {r2_lgbm:.4f}")
print(f"RMSE: {rmse_lgbm:.2f}")
print(f"MAE: {mae_lgbm:.2f}")
print(f"MAPE: {mape_lgbm:.2f}%")

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029234 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2109
[LightGBM] [Info] Number of data points in the train set: 174207, number of used features: 33
[LightGBM] [Info] Start training from score 60.068524

🌿 Optimized LightGBM Results:
Best Parameters: {'subsample': 1.0, 'reg_lambda': 0.5, 'reg_alpha': 0.5, 'num_leaves': 50, 'n_estimators': 300, 'max_depth': -1, 'learning_rate': 0.1, 'colsample_bytree': 1.0}
R² Score: 0.7237
RMSE: 6.18
MAE: 3.98
MAPE: 6.62%
