In [1]:
import numpy as np
import pandas as pd
import joblib
import datetime
import matplotlib.pyplot as plt
from hyperopt import hp, Trials, STATUS_OK, fmin, anneal
from functools import partial
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

In [2]:
# Load the datasets
train_df = pd.read_excel("train_data.xlsx")
test_df = pd.read_excel("test_data.xlsx")

In [3]:
# Define feature matrix and target variable
X_train = train_df.drop(['P'], axis=1)
y_train = train_df['P']
X_test = test_df.drop(['P'], axis=1)
y_test = test_df['P']

In [4]:
# Define Model Evaluation Metric
def model_metrics(model, X, y):
    """Evaluate model performance using RMSE."""
    y_pred = model.predict(X)
    return np.sqrt(mean_squared_error(y, y_pred))

In [6]:

# Bayesian Optimization Function
def bayes_fmin(X_train, X_test, y_train, y_test, eval_iters=100):
    """Optimize hyperparameters using Bayesian optimization."""
    
    def objective(params):
        model = GradientBoostingRegressor(
            learning_rate=float(params['learning_rate']),
            n_estimators=int(params['n_estimators']),
            max_depth=int(params['max_depth']),
            max_features=int(params['max_features']),
            random_state=42
        )
        model.fit(X_train, y_train)
        loss = model_metrics(model, X_test, y_test)
        
        with open('./GBR-Bayesian-fitness-RMSE.txt', 'a+') as f:
            f.write(f"{loss}\n")
        
        return {"loss": loss, "status": STATUS_OK}
    
    space = {
        'n_estimators': hp.quniform('n_estimators', 1, 200, 1),
        'max_depth': hp.quniform('max_depth', 1, 50, 1),
        'learning_rate': hp.quniform('learning_rate', 0.0001, 1, 0.01),
        'max_features': hp.quniform('max_features', 1, 11, 1)
    }
    
    best_params = fmin(
        fn=objective,
        space=space,
        algo=partial(anneal.suggest),
        max_evals=eval_iters,
        trials=Trials(),
        return_argmin=True
    )
    
    return {k: int(v) if k != 'learning_rate' else float(v) for k, v in best_params.items()}

In [7]:
# Optimize Parameters
best_params = bayes_fmin(X_train, X_test, y_train, y_test, 500)
print("Optimized Parameters:", best_params)

# Train Final Model
gbr_reg = GradientBoostingRegressor(**best_params, random_state=42)
gbr_reg.fit(X_train, y_train)
y_train_pred, y_test_pred = gbr_reg.predict(X_train), gbr_reg.predict(X_test)

# Evaluate Model
metrics = {
    "Train RMSE": np.sqrt(mean_squared_error(y_train, y_train_pred)),
    "Train MAE": mean_absolute_error(y_train, y_train_pred),
    "Train R2": r2_score(y_train, y_train_pred),
    "Test RMSE": np.sqrt(mean_squared_error(y_test, y_test_pred)),
    "Test MAE": mean_absolute_error(y_test, y_test_pred),
    "Test R2": r2_score(y_test, y_test_pred)
}

for metric, value in metrics.items():
    print(f"INFO: {metric} = {value:.2f}")
    



100%|██████████| 500/500 [22:55<00:00,  2.75s/trial, best loss: 16.48570118452016] 
Optimized Parameters: {'learning_rate': 0.1, 'max_depth': 6, 'max_features': 5, 'n_estimators': 200}
INFO: Train RMSE = 2.17
INFO: Train MAE = 1.57
INFO: Train R2 = 1.00
INFO: Test RMSE = 16.49
INFO: Test MAE = 7.23
INFO: Test R2 = 0.99


NameError: name 'model' is not defined

In [9]:
def compute_bic(model, X, y):
    """Compute Bayesian Information Criterion (BIC) for a regression model."""
    n = X.shape[0]
    y_pred = model.predict(X)
    rss = np.sum((y - y_pred) ** 2)

    # Approximate number of parameters:
    n_estimators = model.get_params()['n_estimators']
    max_depth = model.get_params()['max_depth']
    k = n_estimators * max_depth  # Rough approximation for tree models

    bic = n * np.log(rss / n) + k * np.log(n)
    return bic


In [10]:
bic_train = compute_bic(gbr_reg, X_train, y_train)
bic_test = compute_bic(gbr_reg, X_test, y_test)

print(f"BIC (Train): {bic_train:.2f}")
print(f"BIC (Test): {bic_test:.2f}")


BIC (Train): 14440.00
BIC (Test): 16079.65
