In [2]:
import numpy as np
import pandas as pd
import joblib
import datetime
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from hyperopt import hp, anneal, Trials, STATUS_OK
from hyperopt.fmin import fmin
from functools import partial


In [3]:
# Load the datasets
train_df = pd.read_excel("train_data.xlsx")
test_df = pd.read_excel("test_data.xlsx")

In [4]:
# Define feature matrix and target variable
X_train = train_df.drop(['P'], axis=1)
y_train = train_df['P']
X_test = test_df.drop(['P'], axis=1)
y_test = test_df['P']

In [6]:
def model_metrics(model, X, y):
    """Compute RMSE for model evaluation."""
    y_pred = model.predict(X)
    return np.sqrt(mean_squared_error(y, y_pred))

def bayes_fmin(X_train, X_test, y_train, y_test, eval_iters=100):
    """Hyperparameter optimization using Bayesian search."""
    def objective(params):
        model = AdaBoostRegressor(
            learning_rate=float(params['learning_rate']),
            n_estimators=int(params['n_estimators']),
            random_state=42
        )
        model.fit(X_train, y_train)
        loss = model_metrics(model, X_test, y_test)
        return {"loss": loss, "status": STATUS_OK}

    space = {
        'n_estimators': hp.quniform('n_estimators', 1, 200, 1),
        'learning_rate': hp.quniform('learning_rate', 0.001, 1, 0.01),
    }
    best_params = fmin(objective, space, algo=partial(anneal.suggest), max_evals=eval_iters, trials=Trials())
    best_params["n_estimators"] = int(best_params["n_estimators"])
    best_params["learning_rate"] = float(best_params["learning_rate"])
    return best_params

In [7]:
# Hyperparameter tuning
best_params = bayes_fmin(X_train, X_test, y_train, y_test, 500)
print("Best Hyperparameters:", best_params)

100%|██████████| 500/500 [04:36<00:00,  1.81trial/s, best loss: 55.34572303807613]
Best Hyperparameters: {'learning_rate': 0.24, 'n_estimators': 57}


In [8]:
# Train AdaBoost model
model = AdaBoostRegressor(**best_params, random_state=42)
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Number of model parameters (approximate with n_estimators)
k = best_params['n_estimators']

# Number of observations
n_train = len(y_train)
n_test = len(y_test)

# Residual Sum of Squares
rss_train = np.sum((y_train - y_train_pred) ** 2)
rss_test = np.sum((y_test - y_test_pred) ** 2)

# BIC Calculation
bic_train = n_train * np.log(rss_train / n_train) + k * np.log(n_train)
bic_test = n_test * np.log(rss_test / n_test) + k * np.log(n_test)

# Model Evaluation
metrics = {
    "RMSE_Train": np.sqrt(mean_squared_error(y_train, y_train_pred)),
    "MAE_Train": mean_absolute_error(y_train, y_train_pred),
    "R2_Train": r2_score(y_train, y_train_pred),
    "BIC_Train": bic_train,
    "RMSE_Test": np.sqrt(mean_squared_error(y_test, y_test_pred)),
    "MAE_Test": mean_absolute_error(y_test, y_test_pred),
    "R2_Test": r2_score(y_test, y_test_pred),
    "BIC_Test": bic_test
}

for key, value in metrics.items():
    print(f"{key}: {value:.4f}")


RMSE_Train: 57.3616
MAE_Train: 41.2269
R2_Train: 0.8899
BIC_Train: 25556.2419
RMSE_Test: 55.3457
MAE_Test: 40.4841
R2_Test: 0.8846
BIC_Test: 11078.1015
