In [1]:
import sys
import os

# Add the path to your ML directory
sys.path.append(r'C:\Users\admin\Documents\Masters\ES_Masters\Masters-Processing\ML')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import optuna.visualization as vis
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.feature_selection import RFE
from mrmr_wrapper import MRMRTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import optuna
import optuna.visualization as vis
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Import Data

In [2]:
# Load the feature table (path is relative to the notebook's working directory)
file_path = "../features-Master.csv"
data = pd.read_csv(file_path)

# Shuffle the rows once with a fixed seed so the row order is reproducible
shuffled = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Regression target
target_column = "Comfort Score"
labels_shuffled = shuffled[target_column]

# Features: everything from the fifth column onwards.
# NOTE(review): the first four columns are presumably identifiers/metadata -- confirm against the CSV.
# The target is additionally dropped by name so it can never leak into the
# feature matrix if the column order of the CSV changes.
n_leading_columns_skipped = 4
data_shuffled = shuffled.iloc[:, n_leading_columns_skipped:].drop(
    columns=[target_column], errors="ignore"
)

# Train/Test Split

In [3]:
# Hold out 20% of the rows as the test set (fixed seed for reproducibility).
# `stratify` receives the raw comfort scores -- no binning is applied here --
# so scikit-learn needs every distinct score to occur at least twice.
split_parts = train_test_split(
    data_shuffled,
    labels_shuffled,
    test_size=0.2,
    random_state=42,
    stratify=labels_shuffled,
)
X_train, X_test, y_train, y_test = split_parts

# Handle Missing Values

In [4]:
# Median imputation: the medians are learned on the training split only and
# then re-used for the test split, so no test information enters the fit.
imputer = SimpleImputer(strategy='median')

# Rebuild DataFrames around the imputed arrays. The original row index is kept
# so the features stay label-aligned with y_train / y_test (which still carry
# the index produced by the split).
X_train = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

# Optimize Feature Selection and Gradient Boosting Parameters

In [5]:
# Independent copies of the training features / target; these are the objects
# the optimisation objective and the later model-fitting cells work on.
X, y = X_train.copy(), y_train.copy()

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated R² of a selection + gradient-boosting pipeline.

    The trial chooses
      * the feature-selection method ('RFE', 'MRMR' or 'None'),
      * the number of features to keep (only sampled when a method is used),
      * the GradientBoostingRegressor hyperparameters.
    The resulting pipeline is scored on the module-level ``X`` / ``y``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the search space.

    Returns
    -------
    float
        Mean R² over the folds. If cross-validation raises, ``-np.inf`` is
        returned; the exception is stored in the trial's ``'error'`` user
        attribute and a warning is emitted so the failure is not silent.
    """
    import warnings  # local import: only needed on the failure path

    # 1. Feature selection: only the method and the number of features are
    #    optimized (no hyperparameters of the methods themselves)
    fs_method = trial.suggest_categorical('feature_selection', ['RFE', 'MRMR', 'None'])

    if fs_method != 'None':
        k_features = trial.suggest_int('k_features', 5, 105, step=10)
        if fs_method == 'RFE':
            # Seeded like the rest of the notebook so that the same trial
            # parameters always rank the features the same way
            estimator = RandomForestRegressor(random_state=42)
            selector = RFE(estimator, n_features_to_select=k_features)
        else:  # MRMR
            selector = MRMRTransformer(k_features=k_features) #https://feature-engine.trainindata.com/en/1.8.x/api_doc/selection/MRMR.html#feature_engine.selection.MRMR
    else:
        # Pipeline placeholder meaning "no selection step"
        selector = 'passthrough'

    # 2. Gradient Boosting hyperparameters
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'random_state': 42
    }
    model = GradientBoostingRegressor(**params)

    # 3. Pipeline: the selector is refit inside every CV fold
    pipeline = Pipeline([
        ('feature_selection', selector),
        ('model', model)
    ])

    # 4. Cross-validation
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    try:
        scores = cross_val_score(pipeline, X, y, cv=cv, scoring='r2', n_jobs=1)
        return np.mean(scores)
    except Exception as exc:
        # Keep the study running, but leave a trace of what went wrong
        trial.set_user_attr('error', repr(exc))
        warnings.warn(f"Trial {trial.number} failed and is scored as -inf: {exc!r}")
        return -np.inf

# Run Optuna Study
# The objective returns a mean R², so the study maximizes it.
# The sampler is seeded so the suggested parameters are not left entirely to
# chance. Trials still run in parallel (n_jobs=5), so the order in which they
# draw from the sampler -- and therefore the exact results -- can differ
# between executions; set n_jobs=1 for a fully repeatable run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=5, show_progress_bar=True, n_jobs=5)

# Best result
print("\n Regression Optimization Results")
print("===================================")
print(f"Best R²: {study.best_value:.4f}")
print("Best Parameters: ")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

[I 2025-09-17 10:06:25,214] A new study created in memory with name: no-name-809b7fa4-8426-4be5-87f5-70c691652dd9


  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 95/95 [00:03<00:00, 24.87it/s]


Got MRMR features


100%|██████████| 95/95 [00:03<00:00, 25.21it/s]


Got MRMR features


100%|██████████| 95/95 [00:03<00:00, 25.20it/s]


Got MRMR features


100%|██████████| 95/95 [00:03<00:00, 24.84it/s]


Got MRMR features
[I 2025-09-17 10:06:54,070] Trial 3 finished with value: 0.17006385119398212 and parameters: {'feature_selection': 'None', 'n_estimators': 437, 'max_depth': 3, 'learning_rate': 0.08521270911526138, 'min_samples_split': 6, 'min_samples_leaf': 1, 'subsample': 0.5022870296996784}. Best is trial 3 with value: 0.17006385119398212.


100%|██████████| 95/95 [00:03<00:00, 26.16it/s]


Got MRMR features
[I 2025-09-17 10:07:01,774] Trial 4 finished with value: -0.007538525875551039 and parameters: {'feature_selection': 'MRMR', 'k_features': 95, 'n_estimators': 786, 'max_depth': 5, 'learning_rate': 0.27104266779681546, 'min_samples_split': 10, 'min_samples_leaf': 3, 'subsample': 0.8144915985778627}. Best is trial 3 with value: 0.17006385119398212.
[I 2025-09-17 10:07:09,169] Trial 1 finished with value: 0.2092019943977954 and parameters: {'feature_selection': 'None', 'n_estimators': 389, 'max_depth': 6, 'learning_rate': 0.025675707926282047, 'min_samples_split': 5, 'min_samples_leaf': 4, 'subsample': 0.5911504926967107}. Best is trial 1 with value: 0.2092019943977954.
[I 2025-09-17 10:08:28,148] Trial 0 finished with value: 0.18502478271232908 and parameters: {'feature_selection': 'None', 'n_estimators': 946, 'max_depth': 4, 'learning_rate': 0.0020042615079299297, 'min_samples_split': 4, 'min_samples_leaf': 3, 'subsample': 0.9859760131576549}. Best is trial 1 with valu

# Apply Feature Selection to Training & Test Data

In [6]:
# Refit the feature-selection step chosen by the best trial on the full
# training data (X, y) and keep the same columns in the test set.
best_fs_method = study.best_params.get('feature_selection', 'None')

if best_fs_method != 'None':
    k_features = study.best_params['k_features']
    if best_fs_method == 'RFE':
        # Seeded so that re-running this cell selects the same features
        estimator = RandomForestRegressor(random_state=42)
        selector = RFE(estimator, n_features_to_select=k_features)
    else:  # MRMR
        selector = MRMRTransformer(k_features=k_features)

    selector.fit(X, y)
    if hasattr(selector, 'get_support'):
        # Selectors exposing a boolean support mask (e.g. RFE)
        selected_features = X.columns[selector.get_support()]
    else:
        # MRMR wrapper: the chosen names are read from `selected_features`
        selected_features = selector.selected_features
else:
    # No selection: keep every column
    selected_features = X.columns

# Select by column name on both splits, so train and test are guaranteed to
# contain the same features in the same order.
X_best = X[selected_features]
X_test_final = X_test[selected_features]

# Create and Fit GB Model with Best Parameters

In [7]:
# Gradient-boosting hyperparameters to take over from the best Optuna trial
gb_param_names = [
    'n_estimators',
    'max_depth',
    'learning_rate',
    'min_samples_split',
    'min_samples_leaf',
    'subsample',
]
best_gb_params = {name: study.best_params[name] for name in gb_param_names}

# Final model: best hyperparameters, fixed seed, trained on the selected training features
best_model = GradientBoostingRegressor(**best_gb_params, random_state=42)

best_model.fit(X_best, y)

# Run Model and Get Outcomes

In [8]:
# Predict the held-out test set with the fitted model
y_pred = best_model.predict(X_test_final)

# Error metrics: root-mean-squared error and coefficient of determination
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("\nFinal Model Evaluation on Test Set:")
print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")

# Share of test samples whose prediction is at most 1 point away from the true score
abs_error = np.abs(y_test - y_pred)
correct = np.sum(abs_error <= 1)
accuracy = correct / len(y_test)
print(f"Accuracy within ±1 point: {accuracy:.4f}")


Final Model Evaluation on Test Set:
RMSE: 1.1732
R²: 0.1424
Accuracy within ±1 point: 0.5625
