In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from dotenv import load_dotenv
import os

# Widen pandas' text display so large frames are not truncated when shown:
# total output width in characters, then the row and column display limits.
pd.set_option(
    'display.width', 1000,
    'display.max_rows', 500,
    'display.max_columns', 500,
)

In [None]:
def read_data(path):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents, using ``pd.read_csv`` defaults.
    """
    frame = pd.read_csv(path)
    return frame
# Pull settings from a local .env file (if present) into the process
# environment, then resolve the training CSV location from TRAINING_DATA.
load_dotenv()
data_path = os.getenv("TRAINING_DATA")

# os.getenv returns None when the variable is missing; fail here with an
# actionable message instead of an opaque error from inside pd.read_csv.
if not data_path:
    raise ValueError(
        "Environment variable TRAINING_DATA is not set; "
        "define it (e.g. in a .env file) with the path to the training CSV."
    )

df = read_data(data_path)

In [None]:
"""
Split the dataset into features and target, then divide it into training and testing sets.

- X: feature matrix (all columns except the target)
- y: target variable ('HATSURESI')
- 80% of the data is used for training, 20% for testing (test_size=0.2)
- The random_state ensures reproducible results
"""

# Feature matrix: every column except the target. The target drop is strict
# (a missing 'HATSURESI' column is an error), whereas the "Unnamed: ..."
# columns are dropped only if present, so a CSV saved without those extra
# columns still loads instead of raising KeyError.
X = (
    df.drop(columns=["HATSURESI"])
      .drop(columns=["Unnamed: 0", "Unnamed: 0.1"], errors="ignore")
)

# Target variable to predict.
y = df["HATSURESI"]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shape of the resulting splits (last expression = cell output).
X_train.shape, X_test.shape, y_train.shape, y_test.shape

### Model Stacking – XGBoost + CatBoost + LinearRegression

In [None]:
# Base learners whose predictions are combined by the meta-learner.
base_models = [
    ('xgb', XGBRegressor(random_state=42)),                  # XGBoost regressor
    ('cat', CatBoostRegressor(verbose=0, random_state=42)),  # CatBoost regressor
    ('lr', LinearRegression()),                              # Linear regression
]

# Stacking ensemble: the base learners above feed a LinearRegression
# meta-learner. n_jobs=-1 requests all available CPU cores.
stacked_model = StackingRegressor(
    estimators=base_models,
    final_estimator=LinearRegression(),
    n_jobs=-1,
)

# Fit on the training split, then predict the held-out test split.
stacked_model.fit(X_train, y_train)
y_pred_stack = stacked_model.predict(X_test)

# Evaluation metrics on the test split.
r2 = r2_score(y_test, y_pred_stack)
mse = mean_squared_error(y_test, y_pred_stack)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred_stack)
# MAPE divides by the true values; zero targets are replaced with a tiny
# constant so the ratio cannot become inf/NaN.
mape = np.mean(np.abs((y_test - y_pred_stack) / y_test.replace(0, 1e-10))) * 100

# Report the results.
print("\n🔁 Model Stacking Results:")
print(f"R² Score: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"MAPE: {mape:.2f}%")


#### Hyperparameter Tuning

In [None]:
# Base learners and Ridge meta-learner for the tuned stacking ensemble.
xgb_model = XGBRegressor(random_state=42)
cat_model = CatBoostRegressor(verbose=0, random_state=42)
base_models = [('xgb', xgb_model), ('cat', cat_model)]
final_estimator = Ridge()

stacked_model = StackingRegressor(
    estimators=base_models,
    final_estimator=final_estimator,
    n_jobs=-1,
)

# Search space for GridSearchCV. Keys follow "<name>__<parameter>", where
# <name> is a base-learner label from base_models ('xgb', 'cat') or
# 'final_estimator' for the Ridge meta-learner.
param_grid = dict(
    # XGBoost: number of trees, tree depth, learning rate
    xgb__n_estimators=[100, 300],
    xgb__max_depth=[3, 5],
    xgb__learning_rate=[0.05, 0.1],
    # CatBoost: boosting iterations, tree depth, learning rate
    cat__iterations=[200, 300],
    cat__depth=[6, 8],
    cat__learning_rate=[0.05, 0.1],
    # Ridge meta-learner: regularization strength
    final_estimator__alpha=[0.1, 1.0, 10.0],
)

# Exhaustive search over param_grid: 3-fold cross-validation scored by R²,
# verbose progress logging, all CPU cores requested.
grid_search = GridSearchCV(
    stacked_model,
    param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Best configuration found by the search, used to predict the test split.
best_stack = grid_search.best_estimator_
y_pred_stack = best_stack.predict(X_test)

# Test-set metrics for the tuned model. For MAPE, zero targets are swapped
# for a tiny constant before dividing.
r2 = r2_score(y_test, y_pred_stack)
mse = mean_squared_error(y_test, y_pred_stack)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred_stack)
mape = 100 * np.mean(
    np.abs((y_test - y_pred_stack) / y_test.replace(0, 1e-10))
)

print("\n🔁 Optimized Full Stacking Model Results:")
print("🔧 Best Parameters:", grid_search.best_params_)
print(f"R² Score: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"MAPE: {mape:.2f}%")


### Model Stacking – XGBoost + CatBoost + Ridge & Lasso

In [None]:
"""
Define base models for stacking.

- base_models: a list of tuples containing model names and their corresponding regressors
- XGBRegressor and CatBoostRegressor are used with default parameters
- random_state ensures reproducibility
"""
# Base learners used by both stacking variants below.
base_models = [
    ('xgb', XGBRegressor(random_state=42)),
    ('cat', CatBoostRegressor(verbose=0, random_state=42)),
]

# Candidate meta-learners to compare:
#   ridge - L2-regularized linear regression
#   lasso - L1-regularized linear regression
ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=0.1)

# Stack with the Ridge meta-learner: fit on the training split (fit returns
# the estimator itself), then predict the test split.
stack_ridge = StackingRegressor(
    estimators=base_models,
    final_estimator=ridge,
    n_jobs=-1,
).fit(X_train, y_train)
y_pred_ridge = stack_ridge.predict(X_test)

# Same procedure with the Lasso meta-learner.
stack_lasso = StackingRegressor(
    estimators=base_models,
    final_estimator=lasso,
    n_jobs=-1,
).fit(X_train, y_train)
y_pred_lasso = stack_lasso.predict(X_test)

def evaluate_model(name, y_pred, y_true=None):
    """Print R², RMSE, MAE and MAPE for one set of predictions.

    Parameters
    ----------
    name : str
        Label printed in the report header.
    y_pred : array-like
        Predicted target values, aligned positionally with the true values.
    y_true : array-like, optional
        True target values. Defaults to the notebook-level ``y_test`` so
        existing two-argument calls behave as before.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    if y_true is None:
        y_true = y_test  # fall back to the notebook's held-out target

    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_true, y_pred)

    # MAPE divides by the true values; substitute a tiny constant for exact
    # zeros so the result cannot become inf/NaN.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    safe_denominator = np.where(actual == 0, 1e-10, actual)
    mape = np.mean(np.abs((actual - predicted) / safe_denominator)) * 100

    print(f"\n{name} Final Estimator:")
    print(f"R² Score: {r2:.4f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"MAPE: {mape:.2f}%")

"""
Evaluate and compare both stacking models (Ridge vs. Lasso).
"""
# Report the same metrics for each meta-learner, Ridge first, then Lasso.
for label, predictions in (
    ("📘 Ridge", y_pred_ridge),
    ("📙 Lasso", y_pred_lasso),
):
    evaluate_model(label, predictions)
