In [1]:
import sys
import os

# Add the path to your ML directory
sys.path.append(r'C:\Users\admin\Documents\Masters\ES_Masters\Masters-Processing\ML')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from mrmr_wrapper import MRMRTransformer
import optuna
import optuna.visualization as vis
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, precision_recall_curve, average_precision_score)

# Import Data

In [2]:
# Read the master feature table from disk
file_path = "features-Master.csv"
data = pd.read_csv(file_path)

# Randomly permute the rows (fixed seed) and renumber them from 0
shuffled = data.sample(frac=1, random_state=42).reset_index(drop=True)
labels_shuffled = shuffled["Comfort Score"]
# Everything from the fifth column onward is treated as a feature.
# NOTE(review): confirm "Comfort Score" sits in the first four columns;
# otherwise the label itself is included among the features.
data_shuffled = shuffled.iloc[:, 4:]


def _comfort_to_binary(score):
    """Map a comfort score to 0 (<= 2), 1 (>= 4) or NaN (anything else, e.g. 3)."""
    if score <= 2:
        return 0
    if score >= 4:
        return 1
    return np.nan


# Binarise the labels, then keep only the rows that received a 0/1 label
binary_labels = labels_shuffled.apply(_comfort_to_binary)
has_label = binary_labels.notna()
binary_data = data_shuffled[has_label]
binary_labels = binary_labels[has_label]

# Train/Test Split

In [3]:
# Hold out 20% of the binary-labelled rows as the test set. The split is
# stratified on the labels and uses a fixed seed so it is repeatable.
split_options = {"test_size": 0.2, "stratify": binary_labels, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(binary_data, binary_labels, **split_options)

# Handle Missing Values

In [4]:
# Fill missing feature values with each column's median. The medians are
# learned on the training split only and then re-used for the test split.
imputer = SimpleImputer(strategy='median')

# Pass the original row index through. Rebuilding the frames without it would
# renumber the rows 0..n-1 while y_train / y_test keep their shuffled index,
# so any index-aligned operation between features and labels would mismatch.
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

# Optimize Feature Selection and Gradient Boosting Parameters

In [None]:
# Independent working copies of the training split; the optimisation objective
# and the repeated-run cell read these as X / y.
X, y = X_train.copy(), y_train.copy()

def binary_classification_objective(trial):
    """Optuna objective: cross-validated ROC AUC of feature selection + Gradient Boosting.

    Each trial samples a feature-selection strategy ('MRMR', 'RFE' or 'None'),
    the number of features to keep (only when a selector is used) and the
    GradientBoostingClassifier hyperparameters. The resulting pipeline is
    scored on the module-level training data ``X`` / ``y``; the selector is a
    pipeline step, so it is re-fitted inside every CV fold.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the search space.

    Returns
    -------
    float
        Mean ROC AUC over a 5-fold stratified cross-validation, or ``-np.inf``
        when cross-validation raised (the error is printed, not re-raised).
    """
    # Feature selection, only optimizing method and number of features (no hyperparameters of the methods)
    fs_method = trial.suggest_categorical('feature_selection', ['MRMR', 'RFE', 'None'])

    if fs_method != 'None':
        # Only allow a maximum of 105 features to be selected, with a step of 10
        # (to go up to all features the upper bound would be X.shape[1]).
        k_features = trial.suggest_int('k_features', 5, 105, step=10)
        if fs_method == 'RFE':
            # Seed the ranking forest: without it the same trial parameters can
            # select different features (and score differently) on every call.
            estimator = RandomForestClassifier(random_state=42)
            selector = RFE(estimator, n_features_to_select=k_features)
        else:  # MRMR
            selector = MRMRTransformer(k_features=k_features)  # https://feature-engine.trainindata.com/en/1.8.x/api_doc/selection/MRMR.html#feature_engine.selection.MRMR
    else:
        # Pipeline placeholder that forwards every feature unchanged
        selector = 'passthrough'

    # Gradient Boosting hyperparameters https://www.geeksforgeeks.org/machine-learning/how-to-tune-hyperparameters-in-gradient-boosting-algorithm/, https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500, step=50),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'random_state': 42
    }
    model = GradientBoostingClassifier(**params)

    # Pipeline
    pipeline = Pipeline([
        ('feature_selection', selector),
        ('model', model)
    ])

    # Cross-validation (fixed folds so trials are compared on identical splits)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    try:
        scores = cross_val_score(pipeline, X, y, cv=cv, scoring='roc_auc', n_jobs=1)
        return np.mean(scores)
    except Exception as e:
        # Best effort: report the failure and give the trial the worst possible
        # score so the study keeps running.
        print(f"Error in trial: {e}")
        return -np.inf
    

# Run Optimization 10 Times and Collect Top Features and Best Parameters

In [None]:
# ---------------------------
# Configuration
# ---------------------------
N_RUNS = 10        # independent optimisation + evaluation repetitions
N_TRIALS = 5       # Optuna trials per repetition
SEED = 42          # seed for the estimators; the Optuna sampler uses SEED + run
TOP_N = 20         # number of most important features kept per run
OUTPUT_DIR = None  # set to a directory path to write CSVs / PNGs; None keeps everything in memory

# Gradient Boosting hyperparameters copied from the best trial into the final model
MODEL_PARAM_NAMES = (
    'n_estimators', 'max_depth', 'learning_rate',
    'min_samples_split', 'min_samples_leaf', 'subsample',
)

if OUTPUT_DIR is not None:
    os.makedirs(OUTPUT_DIR, exist_ok=True)

# Storage
all_results = []
top_features_all_runs = []

# Repeat optimization + evaluation N_RUNS times
for run in range(N_RUNS):
    print(f"\n{'='*50}")
    print(f"STARTING RUN {run + 1}/{N_RUNS}")
    print(f"{'='*50}")

    # A different but fixed sampler seed per run keeps the runs distinct yet
    # repeatable. With n_jobs > 1 trials finish in a nondeterministic order,
    # so exact reproducibility is still not guaranteed.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=SEED + run),
    )
    study.optimize(binary_classification_objective, n_trials=N_TRIALS, show_progress_bar=True, n_jobs=4)

    print("\nBinary Classification Optimization Results:")
    print(f"Best ROC AUC Score: {study.best_value:.4f}")
    print("Best Parameters:")
    for key, value in study.best_params.items():
        print(f"  {key}: {value}")

    # ---------------------------
    # Recreate selector using best params and extract selected feature names
    # ---------------------------
    best_fs_method = study.best_params.get('feature_selection', 'None')

    if best_fs_method != 'None':
        k_features = study.best_params['k_features']
        if best_fs_method == 'RFE':
            # Seeded so the refit is repeatable from run to run.
            selector = RFE(RandomForestClassifier(random_state=SEED), n_features_to_select=k_features)
        elif best_fs_method == 'MRMR':
            selector = MRMRTransformer(k_features=k_features)

        selector.fit(X, y)
        if hasattr(selector, 'get_support'):  # For RFE
            selected_features = X.columns[selector.get_support()]
        else:  # For MRMR
            selected_features = selector.selected_features
    else:
        selected_features = X.columns

    # Train and test frames are restricted to the same columns, in the same
    # order, by plain column selection (no selector.transform round-trip).
    X_best = X[selected_features]
    X_test_final = X_test[selected_features]

    # ---------------------------
    # Train final GB with best params and evaluate on test set
    # ---------------------------
    best_model_params = {name: study.best_params[name] for name in MODEL_PARAM_NAMES}
    best_model_params['random_state'] = SEED

    best_model = GradientBoostingClassifier(**best_model_params)
    best_model.fit(X_best, y)

    # Predictions
    y_pred = best_model.predict(X_test_final)
    y_pred_proba = best_model.predict_proba(X_test_final)[:, 1]

    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    # Save run result
    run_result = {
        'run_number': run + 1,
        'best_cv_score': study.best_value,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        # str() guards against non-string feature identifiers
        'selected_features': ";".join(map(str, selected_features)),
        'feature_selection_method': best_fs_method
    }
    for key, value in study.best_params.items():
        run_result[f'param_{key}'] = value

    all_results.append(run_result)

    print("\nFinal Model Evaluation on Test Set:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")

    # ---------------------------
    # FEATURE IMPORTANCE for this run (Gradient Boosting)
    # ---------------------------
    print("\n" + "="*30)
    print(f"FEATURE IMPORTANCE (run {run+1})")
    print("="*30)

    # Always convert selected_features -> indices relative to binary_data.columns
    if best_fs_method != 'None':
        if isinstance(selected_features[0], str):
            selected_indices = [binary_data.columns.get_loc(feat) for feat in selected_features]
        else:
            selected_indices = list(selected_features)
    else:
        selected_indices = list(range(binary_data.shape[1]))

    # Convert back indices -> names
    final_feature_names = binary_data.columns[selected_indices].tolist()

    n_features_in_model = len(final_feature_names)
    print(f"Number of features in final model: {n_features_in_model}")

    try:
        # Gradient Boosting has built-in feature importances
        feature_importance_df = pd.DataFrame({
            'feature': final_feature_names,
            'index': selected_indices,
            'importance': best_model.feature_importances_
        }).sort_values('importance', ascending=False).reset_index(drop=True)
    except Exception as e:
        # Best effort: keep the run in the combined table with zero importances
        print(f"Failed to extract feature_importances_ for run {run+1}: {e}")
        feature_importance_df = pd.DataFrame({
            'feature': final_feature_names,
            'index': selected_indices,
            'importance': np.zeros(len(final_feature_names))
        })

    # Top-N (head() already caps at the number of available rows)
    top_features = feature_importance_df.head(TOP_N).copy()
    top_n = len(top_features)
    top_features['run'] = run + 1
    top_features['best_cv_score'] = study.best_value

    if OUTPUT_DIR is not None:
        # Save per-run CSV
        run_csv = os.path.join(OUTPUT_DIR, f'feature_importance_run{run+1}.csv')
        feature_importance_df.to_csv(run_csv, index=False)
        print(f"Saved feature importance CSV for run {run+1} -> {run_csv}")

        # Save plot; a plotting failure must not abort the remaining runs
        try:
            plt.figure(figsize=(8, 6))
            plt.barh(range(top_n), top_features['importance'])
            plt.yticks(range(top_n), top_features['feature'])
            plt.gca().invert_yaxis()
            plt.xlabel('Feature Importance')
            plt.title(f'Top {top_n} Features (run {run+1})')
            plt.tight_layout()
            png_path = os.path.join(OUTPUT_DIR, f'feature_importance_run{run+1}.png')
            plt.savefig(png_path, dpi=300, bbox_inches='tight')
        except Exception as e:
            print(f"Couldn't save plot for run {run+1}: {e}")
        finally:
            # Always release the figure so N_RUNS figures do not pile up in memory
            plt.close()

    # Collect top features for final combined CSV
    top_features_all_runs.append(top_features)

# ---------------------------
# After all runs: save combined results & top features
# ---------------------------
print(f"\n{'='*50}")
print("SUMMARY ACROSS ALL RUNS")
print(f"{'='*50}")

results_df = pd.DataFrame(all_results)
metrics_to_avg = ['best_cv_score', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc']

summary_row = {'run_number': 'SUMMARY'}
for metric in metrics_to_avg:
    avg_value = results_df[metric].mean()
    std_value = results_df[metric].std()
    summary_row[metric] = f"{avg_value:.4f} ± {std_value:.4f}"
    print(f"{metric}: {avg_value:.4f} ± {std_value:.4f}")

# Mark params N/A for summary
for key in [k for k in results_df.columns if k.startswith('param_')]:
    summary_row[key] = 'N/A'

combined_results = all_results + [summary_row]
df = pd.DataFrame(combined_results)
if OUTPUT_DIR is not None:
    results_outpath = os.path.join(OUTPUT_DIR, 'combined_results.csv')
    df.to_csv(results_outpath, index=False)
    print(f"\nCombined results with averages saved to: {results_outpath}")

# Combine top features from all runs into one table (and CSV when saving is enabled)
if top_features_all_runs:
    combined_top = pd.concat(top_features_all_runs, ignore_index=True, sort=False)
    if OUTPUT_DIR is not None:
        combined_top_csv = os.path.join(OUTPUT_DIR, 'combined_top_features.csv')
        combined_top.to_csv(combined_top_csv, index=False)
        print(f"Combined top features saved to: {combined_top_csv}")
    else:
        # Nothing is written in this mode, so do not claim a file was saved
        print(f"Combined top features kept in memory as `combined_top` ({len(combined_top)} rows); set OUTPUT_DIR to save them.")
else:
    print("No top features were collected.")

print("Done.")


[I 2025-09-11 11:10:48,968] A new study created in memory with name: no-name-d5d0c9cd-218a-4c10-8656-289dae45aedc



STARTING RUN 1/10


  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:00<00:00, 14.42it/s]


Got MRMR features


100%|██████████| 5/5 [00:00<00:00, 13.72it/s]


Got MRMR features


100%|██████████| 5/5 [00:00<00:00, 14.36it/s]


Got MRMR features


100%|██████████| 5/5 [00:00<00:00, 14.48it/s]


Got MRMR features


100%|██████████| 5/5 [00:00<00:00, 17.70it/s]


Got MRMR features
[I 2025-09-11 11:11:05,362] Trial 3 finished with value: 0.7457049123715791 and parameters: {'feature_selection': 'MRMR', 'k_features': 5, 'n_estimators': 500, 'learning_rate': 0.004861684904250388, 'max_depth': 7, 'min_samples_split': 2, 'min_samples_leaf': 7, 'subsample': 0.668920084532501}. Best is trial 3 with value: 0.7457049123715791.


100%|██████████| 25/25 [00:02<00:00, 12.29it/s]


Got MRMR features


100%|██████████| 25/25 [00:01<00:00, 12.67it/s]


Got MRMR features


100%|██████████| 25/25 [00:02<00:00, 11.92it/s]


Got MRMR features


100%|██████████| 25/25 [00:02<00:00, 12.19it/s]


Got MRMR features


100%|██████████| 25/25 [00:02<00:00, 11.36it/s]


Got MRMR features
[I 2025-09-11 11:11:24,547] Trial 4 finished with value: 0.7539022705689373 and parameters: {'feature_selection': 'MRMR', 'k_features': 25, 'n_estimators': 450, 'learning_rate': 0.08393985445091634, 'max_depth': 5, 'min_samples_split': 13, 'min_samples_leaf': 10, 'subsample': 0.5959192162138831}. Best is trial 4 with value: 0.7539022705689373.
[I 2025-09-11 11:11:31,231] Trial 2 finished with value: 0.7995911495911494 and parameters: {'feature_selection': 'None', 'n_estimators': 500, 'learning_rate': 0.0031388211171699014, 'max_depth': 6, 'min_samples_split': 10, 'min_samples_leaf': 8, 'subsample': 0.6321290379307088}. Best is trial 2 with value: 0.7995911495911494.
[I 2025-09-11 11:16:21,624] Trial 0 finished with value: 0.8048698215364883 and parameters: {'feature_selection': 'RFE', 'k_features': 95, 'n_estimators': 200, 'learning_rate': 0.03669169515518512, 'max_depth': 7, 'min_samples_split': 6, 'min_samples_leaf': 1, 'subsample': 0.6875483630998682}. Best is tria

[I 2025-09-11 11:17:50,041] A new study created in memory with name: no-name-8bb1e4ca-ee83-4367-b03d-1d99ed421ead



Final Model Evaluation on Test Set:
Accuracy: 0.7000
Precision: 0.7027
Recall: 0.7879
F1 Score: 0.7429
ROC AUC: 0.7329

FEATURE IMPORTANCE (run 1)
Number of features in final model: 95

STARTING RUN 2/10


  0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-09-11 11:17:53,730] Trial 3 finished with value: 0.7922669922669922 and parameters: {'feature_selection': 'None', 'n_estimators': 50, 'learning_rate': 0.008364548935092512, 'max_depth': 6, 'min_samples_split': 20, 'min_samples_leaf': 3, 'subsample': 0.5887943086388387}. Best is trial 3 with value: 0.7922669922669922.
[I 2025-09-11 11:18:18,568] Trial 1 finished with value: 0.7845561845561846 and parameters: {'feature_selection': 'None', 'n_estimators': 500, 'learning_rate': 0.14743043526247235, 'max_depth': 3, 'min_samples_split': 8, 'min_samples_leaf': 10, 'subsample': 0.6660873490073895}. Best is trial 3 with value: 0.7922669922669922.
[I 2025-09-11 11:18:30,416] Trial 0 finished with value: 0.7971065971065971 and parameters: {'feature_selection': 'None', 'n_estimators': 500, 'learning_rate': 0.09670641666836292, 'max_depth': 4, 'min_samples_split': 6, 'min_samples_leaf': 2, 'subsample': 0.9331174029283502}. Best is trial 0 with value: 0.7971065971065971.
[I 2025-09-11 11:23:

[I 2025-09-11 11:24:16,175] A new study created in memory with name: no-name-dae90102-4849-4ec2-9e3e-d0b6af585cb1



Final Model Evaluation on Test Set:
Accuracy: 0.6500
Precision: 0.6429
Recall: 0.8182
F1 Score: 0.7200
ROC AUC: 0.7475

FEATURE IMPORTANCE (run 2)
Number of features in final model: 85

STARTING RUN 3/10


  0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-09-11 11:24:25,983] Trial 1 finished with value: 0.7799737299737302 and parameters: {'feature_selection': 'None', 'n_estimators': 150, 'learning_rate': 0.05096225034570484, 'max_depth': 6, 'min_samples_split': 16, 'min_samples_leaf': 6, 'subsample': 0.5185229066892525}. Best is trial 1 with value: 0.7799737299737302.
[I 2025-09-11 11:24:47,296] Trial 4 finished with value: 0.8048574881908215 and parameters: {'feature_selection': 'None', 'n_estimators': 300, 'learning_rate': 0.015768797021318347, 'max_depth': 3, 'min_samples_split': 9, 'min_samples_leaf': 9, 'subsample': 0.8178638150278545}. Best is trial 4 with value: 0.8048574881908215.
[I 2025-09-11 11:24:48,292] Trial 2 finished with value: 0.8059434392767727 and parameters: {'feature_selection': 'None', 'n_estimators': 300, 'learning_rate': 0.01055226588978486, 'max_depth': 6, 'min_samples_split': 3, 'min_samples_leaf': 1, 'subsample': 0.6036618543573458}. Best is trial 2 with value: 0.8059434392767727.
[I 2025-09-11 11:29:

[I 2025-09-11 11:30:10,343] A new study created in memory with name: no-name-7b658beb-c0a0-4d75-a0cb-5368b6938ca9



Final Model Evaluation on Test Set:
Accuracy: 0.7333
Precision: 0.7297
Recall: 0.8182
F1 Score: 0.7714
ROC AUC: 0.7868

FEATURE IMPORTANCE (run 3)
Number of features in final model: 336

STARTING RUN 4/10


  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 95/95 [00:11<00:00,  8.60it/s]


Got MRMR features


100%|██████████| 95/95 [00:11<00:00,  8.08it/s]


Got MRMR features


100%|██████████| 95/95 [00:11<00:00,  8.17it/s]


Got MRMR features


100%|██████████| 95/95 [00:12<00:00,  7.63it/s]


Got MRMR features


100%|██████████| 95/95 [00:12<00:00,  7.35it/s]


Got MRMR features
[I 2025-09-11 11:31:24,993] Trial 3 finished with value: 0.7761183261183262 and parameters: {'feature_selection': 'MRMR', 'k_features': 95, 'n_estimators': 200, 'learning_rate': 0.032221887408363646, 'max_depth': 4, 'min_samples_split': 8, 'min_samples_leaf': 7, 'subsample': 0.8665470515700875}. Best is trial 3 with value: 0.7761183261183262.
[I 2025-09-11 11:42:06,942] Trial 0 finished with value: 0.7891201391201392 and parameters: {'feature_selection': 'RFE', 'k_features': 25, 'n_estimators': 300, 'learning_rate': 0.11689950685246343, 'max_depth': 4, 'min_samples_split': 17, 'min_samples_leaf': 8, 'subsample': 0.7984995658360442}. Best is trial 0 with value: 0.7891201391201392.
[I 2025-09-11 11:42:12,222] Trial 2 finished with value: 0.7904385737719071 and parameters: {'feature_selection': 'RFE', 'k_features': 25, 'n_estimators': 400, 'learning_rate': 0.008235633888725162, 'max_depth': 3, 'min_samples_split': 20, 'min_samples_leaf': 9, 'subsample': 0.942060179884730

[I 2025-09-11 11:43:40,853] A new study created in memory with name: no-name-e67d9090-968f-44d2-9a54-b63bc06d63e7



Final Model Evaluation on Test Set:
Accuracy: 0.7000
Precision: 0.7027
Recall: 0.7879
F1 Score: 0.7429
ROC AUC: 0.7688

FEATURE IMPORTANCE (run 4)
Number of features in final model: 25

STARTING RUN 5/10


  0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-09-11 11:43:47,218] Trial 3 finished with value: 0.7961223961223961 and parameters: {'feature_selection': 'None', 'n_estimators': 100, 'learning_rate': 0.00932431822538412, 'max_depth': 3, 'min_samples_split': 5, 'min_samples_leaf': 6, 'subsample': 0.7757393434500328}. Best is trial 3 with value: 0.7961223961223961.
[I 2025-09-11 11:43:47,910] Trial 0 finished with value: 0.792604309270976 and parameters: {'feature_selection': 'None', 'n_estimators': 150, 'learning_rate': 0.015000003479811614, 'max_depth': 3, 'min_samples_split': 18, 'min_samples_leaf': 2, 'subsample': 0.5343930017994667}. Best is trial 3 with value: 0.7961223961223961.
[I 2025-09-11 11:44:01,509] Trial 2 finished with value: 0.7936347603014269 and parameters: {'feature_selection': 'None', 'n_estimators': 250, 'learning_rate': 0.1625969083438313, 'max_depth': 4, 'min_samples_split': 15, 'min_samples_leaf': 3, 'subsample': 0.8210647017167181}. Best is trial 3 with value: 0.7961223961223961.
[I 2025-09-11 11:44:0

[I 2025-09-11 11:48:15,919] A new study created in memory with name: no-name-1451e89d-ab63-42ad-a582-1f8b7208386d



Final Model Evaluation on Test Set:
Accuracy: 0.7000
Precision: 0.7027
Recall: 0.7879
F1 Score: 0.7429
ROC AUC: 0.7486

FEATURE IMPORTANCE (run 5)
Number of features in final model: 336

STARTING RUN 6/10


  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 35/35 [00:02<00:00, 17.06it/s]


Got MRMR features


100%|██████████| 35/35 [00:01<00:00, 17.69it/s]


Got MRMR features



[A                                            

[I 2025-09-11 11:48:27,588] Trial 2 finished with value: 0.8203346036679371 and parameters: {'feature_selection': 'None', 'n_estimators': 100, 'learning_rate': 0.11418879666915531, 'max_depth': 7, 'min_samples_split': 2, 'min_samples_leaf': 10, 'subsample': 0.7982237138402128}. Best is trial 2 with value: 0.8203346036679371.


100%|██████████| 35/35 [00:02<00:00, 16.92it/s]


Got MRMR features


100%|██████████| 35/35 [00:01<00:00, 18.15it/s]


Got MRMR features


100%|██████████| 35/35 [00:01<00:00, 17.75it/s]


Got MRMR features
[I 2025-09-11 11:48:34,804] Trial 1 finished with value: 0.7493506493506492 and parameters: {'feature_selection': 'MRMR', 'k_features': 35, 'n_estimators': 200, 'learning_rate': 0.0909838554783104, 'max_depth': 7, 'min_samples_split': 4, 'min_samples_leaf': 9, 'subsample': 0.6632365889206085}. Best is trial 2 with value: 0.8203346036679371.
[I 2025-09-11 11:48:54,391] Trial 4 finished with value: 0.799318582651916 and parameters: {'feature_selection': 'None', 'n_estimators': 300, 'learning_rate': 0.09314781683506027, 'max_depth': 4, 'min_samples_split': 18, 'min_samples_leaf': 4, 'subsample': 0.822469645776455}. Best is trial 2 with value: 0.8203346036679371.
[I 2025-09-11 11:48:55,932] Trial 3 finished with value: 0.8070139736806403 and parameters: {'feature_selection': 'None', 'n_estimators': 350, 'learning_rate': 0.005723655645194496, 'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 1, 'subsample': 0.6373917680745265}. Best is trial 2 with value: 0.82033

[I 2025-09-11 11:52:03,981] A new study created in memory with name: no-name-87d7a33c-0207-4e42-9594-5cbb0f48685f



Final Model Evaluation on Test Set:
Accuracy: 0.6500
Precision: 0.6667
Recall: 0.7273
F1 Score: 0.6957
ROC AUC: 0.7351

FEATURE IMPORTANCE (run 6)
Number of features in final model: 336

STARTING RUN 7/10


  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 15/15 [00:01<00:00, 13.25it/s]


Got MRMR features


100%|██████████| 15/15 [00:01<00:00, 13.37it/s]


Got MRMR features


100%|██████████| 15/15 [00:01<00:00, 12.78it/s]


Got MRMR features


100%|██████████| 15/15 [00:01<00:00, 13.04it/s]


Got MRMR features


100%|██████████| 15/15 [00:01<00:00, 12.24it/s]


Got MRMR features
[I 2025-09-11 11:52:17,589] Trial 3 finished with value: 0.7631473464806798 and parameters: {'feature_selection': 'MRMR', 'k_features': 15, 'n_estimators': 450, 'learning_rate': 0.05258946954663219, 'max_depth': 3, 'min_samples_split': 14, 'min_samples_leaf': 3, 'subsample': 0.9934970683575945}. Best is trial 3 with value: 0.7631473464806798.



[A                                           

[I 2025-09-11 11:52:18,269] Trial 2 finished with value: 0.7919080919080919 and parameters: {'feature_selection': 'None', 'n_estimators': 200, 'learning_rate': 0.08615645517656124, 'max_depth': 4, 'min_samples_split': 20, 'min_samples_leaf': 7, 'subsample': 0.6180150732455558}. Best is trial 2 with value: 0.7919080919080919.


100%|██████████| 65/65 [00:04<00:00, 14.42it/s]


Got MRMR features


100%|██████████| 65/65 [00:04<00:00, 14.14it/s]


Got MRMR features


100%|██████████| 65/65 [00:04<00:00, 13.78it/s]


Got MRMR features


100%|██████████| 65/65 [00:04<00:00, 13.74it/s]


Got MRMR features


100%|██████████| 65/65 [00:04<00:00, 13.26it/s]


Got MRMR features
[I 2025-09-11 11:52:50,459] Trial 4 finished with value: 0.7723153389820057 and parameters: {'feature_selection': 'MRMR', 'k_features': 65, 'n_estimators': 300, 'learning_rate': 0.0028385507930360635, 'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 1, 'subsample': 0.8452426080166141}. Best is trial 2 with value: 0.7919080919080919.
[I 2025-09-11 11:58:23,818] Trial 0 finished with value: 0.7602607269273935 and parameters: {'feature_selection': 'RFE', 'k_features': 25, 'n_estimators': 150, 'learning_rate': 0.0010087413765906752, 'max_depth': 3, 'min_samples_split': 10, 'min_samples_leaf': 8, 'subsample': 0.9040768867163687}. Best is trial 2 with value: 0.7919080919080919.
[I 2025-09-11 11:58:25,656] Trial 1 finished with value: 0.7960823127489794 and parameters: {'feature_selection': 'RFE', 'k_features': 25, 'n_estimators': 450, 'learning_rate': 0.016632784250656237, 'max_depth': 7, 'min_samples_split': 3, 'min_samples_leaf': 10, 'subsample': 0.901689378850

[I 2025-09-11 11:59:27,749] A new study created in memory with name: no-name-f02a5472-6eba-4bed-821a-968a6e44ec76



Final Model Evaluation on Test Set:
Accuracy: 0.7167
Precision: 0.7105
Recall: 0.8182
F1 Score: 0.7606
ROC AUC: 0.7856

FEATURE IMPORTANCE (run 7)
Number of features in final model: 25

STARTING RUN 8/10


  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:00<00:00,  5.08it/s]


Got MRMR features


100%|██████████| 5/5 [00:00<00:00, 16.16it/s]


Got MRMR features


100%|██████████| 5/5 [00:00<00:00, 14.94it/s]


Got MRMR features


100%|██████████| 5/5 [00:00<00:00, 16.20it/s]


Got MRMR features


100%|██████████| 5/5 [00:00<00:00, 14.27it/s]


Got MRMR features
[I 2025-09-11 11:59:41,617] Trial 1 finished with value: 0.7400735067401734 and parameters: {'feature_selection': 'MRMR', 'k_features': 5, 'n_estimators': 350, 'learning_rate': 0.004788492271594374, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 9, 'subsample': 0.9390434349219676}. Best is trial 1 with value: 0.7400735067401734.


100%|██████████| 105/105 [00:08<00:00, 12.13it/s]


Got MRMR features


100%|██████████| 105/105 [00:08<00:00, 11.72it/s]


Got MRMR features


100%|██████████| 105/105 [00:09<00:00, 11.45it/s]


Got MRMR features



[A                                             


[I 2025-09-11 12:00:15,021] Trial 2 finished with value: 0.7996034829368164 and parameters: {'feature_selection': 'None', 'n_estimators': 350, 'learning_rate': 0.07848448019697485, 'max_depth': 6, 'min_samples_split': 12, 'min_samples_leaf': 5, 'subsample': 0.8121107550883471}. Best is trial 2 with value: 0.7996034829368164.


100%|██████████| 105/105 [00:08<00:00, 12.13it/s][A


Got MRMR features


100%|██████████| 105/105 [00:08<00:00, 12.81it/s]


Got MRMR features
[I 2025-09-11 12:00:28,917] Trial 4 finished with value: 0.7680609513942847 and parameters: {'feature_selection': 'MRMR', 'k_features': 105, 'n_estimators': 100, 'learning_rate': 0.0028044235451387526, 'max_depth': 3, 'min_samples_split': 16, 'min_samples_leaf': 8, 'subsample': 0.7418143449762633}. Best is trial 2 with value: 0.7996034829368164.
[I 2025-09-11 12:04:56,586] Trial 0 finished with value: 0.7887581554248222 and parameters: {'feature_selection': 'RFE', 'k_features': 95, 'n_estimators': 50, 'learning_rate': 0.010286433764843797, 'max_depth': 4, 'min_samples_split': 20, 'min_samples_leaf': 3, 'subsample': 0.7165301620744986}. Best is trial 2 with value: 0.7996034829368164.
[I 2025-09-11 12:05:21,772] Trial 3 finished with value: 0.7837859671193004 and parameters: {'feature_selection': 'RFE', 'k_features': 65, 'n_estimators': 450, 'learning_rate': 0.014592869407601867, 'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 3, 'subsample': 0.6538543248962

[I 2025-09-11 12:05:29,854] A new study created in memory with name: no-name-9d0d26c3-cd92-476b-85f6-e52fbb214245



Final Model Evaluation on Test Set:
Accuracy: 0.7500
Precision: 0.7368
Recall: 0.8485
F1 Score: 0.7887
ROC AUC: 0.7677

FEATURE IMPORTANCE (run 8)
Number of features in final model: 336

STARTING RUN 9/10


  0%|          | 0/5 [00:00<?, ?it/s]


[A
[A

[A[A
[A

[A[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A
[A                                            

[A[A                                         


[A[A[A                                      
[A

[A[A

[A[A
[A

[I 2025-09-11 12:05:35,864] Trial 2 finished with value: 0.7961408961408962 and parameters: {'feature_selection': 'None', 'n_estimators': 50, 'learning_rate': 0.03231435994629829, 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 5, 'subsample': 0.8261741600122885}. Best is trial 2 with value: 0.7961408961408962.




[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

100%|██████████| 55/55 [00:05<00:00, 10.73it/s]

[A

[A[A

Got MRMR features




[A[A
[A
[A

[A[A
[A

[A[A
[A

[A[A
[A
[A

[A[A
[A

[A[A
[A

100%|██████████| 45/45 [00:04<00:00,  9.67it/s]

[A
[A

Got MRMR features



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
100%|██████████| 95/95 [00:09<00:00, 10.34it/s]


[A[A




Got MRMR features


 35%|███▍      | 19/55 [00:02<00:04,  7.94it/s][A[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

100%|██████████| 45/45 [00:04<00:00,  9.66it/s]


[A[A

[A[A

Got MRMR features




[A[A

[A[A

[A[A

[A[A

[A[A

100%|██████████| 55/55 [00:05<00:00,  9.85it/s]


Got MRMR features



[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 45/45 [00:03<00:00, 11.54it/s]

[A

Got MRMR features



[A
[A
[A

[A[A
[A

[A[A
[A
[A

[A[A
[A
[A

[A[A
[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

100%|██████████| 45/45 [00:05<00:00,  8.11it/s]

[A
[A

Got MRMR features



[A
[A
[A
100%|██████████| 55/55 [00:06<00:00,  8.11it/s]

[A

Got MRMR features



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 95/95 [00:10<00:00,  8.84it/s]


Got MRMR features



[A
100%|██████████| 45/45 [00:03<00:00, 11.49it/s]

[A
[A

Got MRMR features



[A
[A

[A[A                                         
[A
[A
[A

[I 2025-09-11 12:06:02,758] Trial 0 finished with value: 0.7642826309492976 and parameters: {'feature_selection': 'MRMR', 'k_features': 45, 'n_estimators': 150, 'learning_rate': 0.0034605776190593973, 'max_depth': 3, 'min_samples_split': 13, 'min_samples_leaf': 1, 'subsample': 0.5928621584979858}. Best is trial 2 with value: 0.7961408961408962.



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 55/55 [00:03<00:00, 14.41it/s]


Got MRMR features



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 95/95 [00:06<00:00, 14.68it/s]

[A

Got MRMR features



[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 55/55 [00:03<00:00, 14.10it/s]


Got MRMR features
[I 2025-09-11 12:06:14,636] Trial 1 finished with value: 0.7708427375094041 and parameters: {'feature_selection': 'MRMR', 'k_features': 55, 'n_estimators': 500, 'learning_rate': 0.018945855528617186, 'max_depth': 5, 'min_samples_split': 19, 'min_samples_leaf': 8, 'subsample': 0.8156743016382465}. Best is trial 2 with value: 0.7961408961408962.


100%|██████████| 95/95 [00:04<00:00, 23.52it/s]


Got MRMR features


100%|██████████| 95/95 [00:04<00:00, 23.16it/s]


Got MRMR features
[I 2025-09-11 12:06:28,956] Trial 3 finished with value: 0.7754251920918588 and parameters: {'feature_selection': 'MRMR', 'k_features': 95, 'n_estimators': 500, 'learning_rate': 0.0182458358529585, 'max_depth': 6, 'min_samples_split': 9, 'min_samples_leaf': 6, 'subsample': 0.719616844077881}. Best is trial 2 with value: 0.7961408961408962.
[I 2025-09-11 12:10:24,276] Trial 4 finished with value: 0.7640267140267141 and parameters: {'feature_selection': 'RFE', 'k_features': 15, 'n_estimators': 150, 'learning_rate': 0.05726396147089347, 'max_depth': 6, 'min_samples_split': 12, 'min_samples_leaf': 7, 'subsample': 0.761271959670347}. Best is trial 2 with value: 0.7961408961408962.

Binary Classification Optimization Results:
Best ROC AUC Score: 0.7961
Best Parameters:
  feature_selection: None
  n_estimators: 50
  learning_rate: 0.03231435994629829
  max_depth: 5
  min_samples_split: 2
  min_samples_leaf: 5
  subsample: 0.8261741600122885


[I 2025-09-11 12:10:25,359] A new study created in memory with name: no-name-ff7ad195-ca37-4218-ab60-f9da0c3052b0



Final Model Evaluation on Test Set:
Accuracy: 0.7167
Precision: 0.7222
Recall: 0.7879
F1 Score: 0.7536
ROC AUC: 0.7351

FEATURE IMPORTANCE (run 9)
Number of features in final model: 336

STARTING RUN 10/10


  0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-09-11 12:19:59,523] Trial 2 finished with value: 0.7740358407025074 and parameters: {'feature_selection': 'RFE', 'k_features': 95, 'n_estimators': 300, 'learning_rate': 0.23929964167939521, 'max_depth': 3, 'min_samples_split': 4, 'min_samples_leaf': 2, 'subsample': 0.5886418477708277}. Best is trial 2 with value: 0.7740358407025074.


100%|██████████| 45/45 [00:06<00:00,  7.11it/s]


Got MRMR features
[I 2025-09-11 12:20:10,863] Trial 3 finished with value: 0.7911717911717911 and parameters: {'feature_selection': 'RFE', 'k_features': 85, 'n_estimators': 150, 'learning_rate': 0.0018311255267523924, 'max_depth': 4, 'min_samples_split': 18, 'min_samples_leaf': 6, 'subsample': 0.7769548648498156}. Best is trial 3 with value: 0.7911717911717911.


100%|██████████| 45/45 [00:03<00:00, 12.99it/s]


Got MRMR features


100%|██████████| 45/45 [00:03<00:00, 12.71it/s]


Got MRMR features


100%|██████████| 45/45 [00:03<00:00, 12.92it/s]


Got MRMR features


100%|██████████| 45/45 [00:03<00:00, 12.40it/s]


Got MRMR features
[I 2025-09-11 12:20:27,324] Trial 4 finished with value: 0.7297887297887298 and parameters: {'feature_selection': 'MRMR', 'k_features': 45, 'n_estimators': 100, 'learning_rate': 0.16051196683124794, 'max_depth': 3, 'min_samples_split': 10, 'min_samples_leaf': 5, 'subsample': 0.9380906273374248}. Best is trial 3 with value: 0.7911717911717911.
[I 2025-09-11 12:20:44,432] Trial 0 finished with value: 0.7984990318323651 and parameters: {'feature_selection': 'RFE', 'k_features': 65, 'n_estimators': 250, 'learning_rate': 0.0011525724157819316, 'max_depth': 6, 'min_samples_split': 19, 'min_samples_leaf': 5, 'subsample': 0.7376264489110075}. Best is trial 0 with value: 0.7984990318323651.
[I 2025-09-11 12:20:53,187] Trial 1 finished with value: 0.7838877172210506 and parameters: {'feature_selection': 'RFE', 'k_features': 45, 'n_estimators': 50, 'learning_rate': 0.21113399729489707, 'max_depth': 6, 'min_samples_split': 14, 'min_samples_leaf': 1, 'subsample': 0.894380000992889