In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, RFE
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor, Pool, cv
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
import mrmr
from mrmr import mrmr_regression
import warnings

# Set random seed for reproducibility.
# This seeds NumPy's legacy global RNG, which MRMRTransformer's random fallback
# (np.random.choice) draws from.
# NOTE(review): estimators and samplers elsewhere in the notebook take their own
# random_state / seed arguments — confirm each one is set explicitly.
np.random.seed(42)

# Import and Prepare Data

In [2]:
# Read the master feature table from disk.
file_path = "features-Master.csv"
data = pd.read_csv(file_path)

# Permute all rows with a fixed seed, then renumber the index from zero.
shuffled = data.sample(frac=1, random_state=42)
shuffled = shuffled.reset_index(drop=True)

# Target is the "Comfort Score" column; the feature matrix is every column
# from position 4 onward.
labels_shuffled = shuffled["Comfort Score"]
data_shuffled = shuffled.iloc[:, 4:]

# Class to Accommodate MRMR Feature Selection Structure

In [3]:
class MRMRTransformer:
    """Wrap ``mrmr_regression`` behind a ``fit``/``transform`` interface.

    The two methods let the selector be used as an intermediate step of the
    ``Pipeline`` objects built later in this notebook.

    Parameters
    ----------
    k_features : int
        Number of features to request from ``mrmr_regression`` (its ``K``).

    Notes
    -----
    ``fit`` sets two attributes:
    ``column_names`` - list of the column labels seen during ``fit``;
    ``selected_features`` - list of the column labels kept by ``transform``.
    """

    def __init__(self, k_features):
        self.k_features = k_features
        self.selected_features = None
        self.column_names = None

    def fit(self, X, y):
        """Select ``k_features`` columns of ``X`` with respect to ``y``.

        If ``mrmr_regression`` raises, a warning is emitted and a random
        subset of columns (drawn from NumPy's global RNG) is used instead, so
        the surrounding pipeline keeps running.

        Returns
        -------
        MRMRTransformer
            ``self``, to allow ``fit(...).transform(...)`` chaining.
        """
        # Convert to DataFrame if not already
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        # Reset indices to avoid alignment issues between X and y
        X = X.reset_index(drop=True)
        y = pd.Series(y).reset_index(drop=True)

        self.column_names = X.columns.tolist()
        try:
            self.selected_features = mrmr_regression(X, y, K=self.k_features)
        except Exception as exc:
            # Catch Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate, and make the degraded mode visible.
            n_fallback = min(self.k_features, len(X.columns))
            warnings.warn(
                f"mrmr_regression failed ({exc!r}); "
                f"falling back to {n_fallback} randomly chosen features",
                RuntimeWarning,
            )
            # .tolist() keeps the attribute a plain list on both code paths.
            self.selected_features = np.random.choice(
                X.columns, size=n_fallback, replace=False
            ).tolist()
        return self

    def transform(self, X):
        """Return only the columns chosen by ``fit``.

        Array-like input is first wrapped in a DataFrame labelled with the
        column names recorded during ``fit``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.selected_features is None:
            raise RuntimeError("MRMRTransformer.transform called before fit")
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.column_names)
        return X[self.selected_features]

# Train/Test Split

In [4]:
# Hold out 20% of the rows as the test set. The split is stratified directly
# on the raw "Comfort Score" labels (no binning is applied here).
X_train, X_test, y_train, y_test = train_test_split(
    data_shuffled, labels_shuffled,
    random_state=42,
    stratify=labels_shuffled,
    test_size=0.2,
)

# Optimize Feature Selection and CatBoost Parameters

In [7]:
X = X_train.copy()
y = y_train.copy().values

def objective(trial):
    """Optuna objective: mean 5-fold CV R² of a feature-selection + CatBoost pipeline.

    Each trial samples a feature-selection strategy (ANOVA F-test, mutual
    information, RFE with a random forest, MRMR, or none) plus CatBoost
    hyper-parameters, and scores the resulting pipeline on the module-level
    ``X`` / ``y``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample every hyper-parameter.

    Returns
    -------
    float
        Mean R² across the folds, or ``-np.inf`` if building or scoring the
        pipeline raised. In that case the exception text is stored in the
        trial's ``'error'`` user attribute so the failure can be diagnosed.
    """
    # 1. Feature Selection
    fs_method = trial.suggest_categorical('feature_selection', ['ANOVA', 'MutualInfo', 'RFE', 'MRMR', 'None'])
    if fs_method != 'None':
        k_features = trial.suggest_int('k_features', 10, min(50, X.shape[1]))
        if fs_method == 'ANOVA':
            selector = SelectKBest(f_regression, k=k_features)
        elif fs_method == 'MutualInfo':
            selector = SelectKBest(mutual_info_regression, k=k_features)
        elif fs_method == 'RFE':
            # NOTE(review): a float step is meant as a fraction of features per
            # iteration; the upper bound 1.0 itself may be rejected or treated
            # differently by RFE — confirm against the sklearn version in use.
            rfe_step = trial.suggest_float('rfe_step', 0.1, 1.0)
            estimator = RandomForestRegressor(
                n_estimators=trial.suggest_int('rfe_n_estimators', 50, 200),
                max_depth=trial.suggest_int('rfe_max_depth', 3, 10),
                random_state=42
            )
            selector = RFE(estimator, n_features_to_select=k_features, step=rfe_step)
        elif fs_method == 'MRMR':
            selector = MRMRTransformer(k_features=k_features)
    else:
        selector = 'passthrough'

    # 2. CatBoost Parameters
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'random_state': 42,
        'verbose': False
    }
    model = CatBoostRegressor(**params)

    # 3. Pipeline
    pipeline = Pipeline([
       # ('imputer', imputer),
        ('feature_selection', selector),
        ('model', model)
    ])

    # 4. Cross-validation
    # Named `kfold` rather than `cv` so the imported catboost `cv` is not shadowed.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    try:
        scores = cross_val_score(pipeline, X, y, cv=kfold, scoring='r2', n_jobs=1)
        return np.mean(scores)
    except Exception as e:
        # Keep the study running, but record why this configuration failed
        # instead of discarding the error. NOTE(review): in the recorded run the
        # ANOVA / MutualInfo trials returned -inf within milliseconds; inspect
        # trial.user_attrs['error'] to find the cause (possibly missing values,
        # given the commented-out imputer step — confirm).
        trial.set_user_attr('error', str(e))
        return -np.inf

# Run Optuna Study
# The sampler is seeded explicitly: np.random.seed does not control Optuna's
# sampler, so without this the sequence of suggested trials changes on every run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

# Best result
print("Best trial:")
trial = study.best_trial
print(f"R²: {trial.value:.4f}")
print("Params: ")
for key, value in trial.params.items():
    print(f"  {key}: {value}")

[I 2025-08-27 06:39:44,848] A new study created in memory with name: no-name-8c3ca54c-6c71-4980-ac16-da17aa4fe18e


  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████| 37/37 [00:01<00:00, 28.48it/s]
100%|██████████| 37/37 [00:01<00:00, 28.38it/s]
100%|██████████| 37/37 [00:01<00:00, 29.15it/s]
100%|██████████| 37/37 [00:01<00:00, 27.69it/s]
100%|██████████| 37/37 [00:01<00:00, 25.86it/s]


[I 2025-08-27 06:41:23,367] Trial 0 finished with value: 0.166257073214946 and parameters: {'feature_selection': 'MRMR', 'k_features': 37, 'iterations': 662, 'depth': 9, 'learning_rate': 0.012741369316909492, 'l2_leaf_reg': 0.0009376800610053393}. Best is trial 0 with value: 0.166257073214946.
[I 2025-08-27 06:41:45,312] Trial 1 finished with value: 0.16959335540277212 and parameters: {'feature_selection': 'None', 'iterations': 796, 'depth': 4, 'learning_rate': 0.05286056372230047, 'l2_leaf_reg': 0.02002204986961919}. Best is trial 1 with value: 0.16959335540277212.


100%|██████████| 48/48 [00:01<00:00, 28.60it/s]
100%|██████████| 48/48 [00:01<00:00, 28.65it/s]
100%|██████████| 48/48 [00:01<00:00, 27.99it/s]
100%|██████████| 48/48 [00:01<00:00, 28.24it/s]
100%|██████████| 48/48 [00:01<00:00, 28.25it/s]


[I 2025-08-27 06:41:56,444] Trial 2 finished with value: 0.15663795505932424 and parameters: {'feature_selection': 'MRMR', 'k_features': 48, 'iterations': 571, 'depth': 3, 'learning_rate': 0.003616228299359011, 'l2_leaf_reg': 0.0002322906254235888}. Best is trial 1 with value: 0.16959335540277212.
[I 2025-08-27 06:41:56,483] Trial 3 finished with value: -inf and parameters: {'feature_selection': 'ANOVA', 'k_features': 20, 'iterations': 305, 'depth': 8, 'learning_rate': 0.0585111850331171, 'l2_leaf_reg': 0.0006404721393250113}. Best is trial 1 with value: 0.16959335540277212.
[I 2025-08-27 06:44:43,737] Trial 4 finished with value: 0.2369166860218595 and parameters: {'feature_selection': 'RFE', 'k_features': 20, 'rfe_step': 0.1909690415019311, 'rfe_n_estimators': 183, 'rfe_max_depth': 10, 'iterations': 485, 'depth': 10, 'learning_rate': 0.092922719517782, 'l2_leaf_reg': 0.00025924433123780075}. Best is trial 4 with value: 0.2369166860218595.


100%|██████████| 20/20 [00:00<00:00, 26.50it/s]
100%|██████████| 20/20 [00:00<00:00, 27.49it/s]
100%|██████████| 20/20 [00:00<00:00, 27.36it/s]
100%|██████████| 20/20 [00:00<00:00, 28.79it/s]
100%|██████████| 20/20 [00:00<00:00, 27.93it/s]


[I 2025-08-27 06:45:14,811] Trial 5 finished with value: 0.10068666678605413 and parameters: {'feature_selection': 'MRMR', 'k_features': 20, 'iterations': 363, 'depth': 9, 'learning_rate': 0.1954865577186104, 'l2_leaf_reg': 0.0011133438083864502}. Best is trial 4 with value: 0.2369166860218595.
[I 2025-08-27 06:45:14,842] Trial 6 finished with value: -inf and parameters: {'feature_selection': 'MutualInfo', 'k_features': 14, 'iterations': 516, 'depth': 9, 'learning_rate': 0.140544617334415, 'l2_leaf_reg': 2.265783261538871e-08}. Best is trial 4 with value: 0.2369166860218595.
[I 2025-08-27 06:45:14,872] Trial 7 finished with value: -inf and parameters: {'feature_selection': 'ANOVA', 'k_features': 43, 'iterations': 529, 'depth': 3, 'learning_rate': 0.14143526330159298, 'l2_leaf_reg': 1.6842797257783797}. Best is trial 4 with value: 0.2369166860218595.
[I 2025-08-27 06:45:14,900] Trial 8 finished with value: -inf and parameters: {'feature_selection': 'MutualInfo', 'k_features': 46, 'itera

100%|██████████| 35/35 [00:01<00:00, 28.38it/s]
100%|██████████| 35/35 [00:01<00:00, 29.17it/s]
100%|██████████| 35/35 [00:01<00:00, 29.00it/s]
100%|██████████| 35/35 [00:01<00:00, 29.20it/s]
100%|██████████| 35/35 [00:01<00:00, 29.41it/s]


[I 2025-08-27 07:16:46,037] Trial 29 finished with value: 0.16375616835337925 and parameters: {'feature_selection': 'MRMR', 'k_features': 35, 'iterations': 636, 'depth': 7, 'learning_rate': 0.015304763362418181, 'l2_leaf_reg': 9.04309736465959}. Best is trial 4 with value: 0.2369166860218595.
[I 2025-08-27 07:16:46,086] Trial 30 finished with value: -inf and parameters: {'feature_selection': 'MutualInfo', 'k_features': 10, 'iterations': 729, 'depth': 9, 'learning_rate': 0.13025825982356032, 'l2_leaf_reg': 0.0030597135875064635}. Best is trial 4 with value: 0.2369166860218595.
[I 2025-08-27 07:18:34,208] Trial 31 finished with value: 0.20751854765321456 and parameters: {'feature_selection': 'None', 'iterations': 871, 'depth': 6, 'learning_rate': 0.05217142352135921, 'l2_leaf_reg': 0.2412979112890647}. Best is trial 4 with value: 0.2369166860218595.
[I 2025-08-27 07:20:13,855] Trial 32 finished with value: 0.20429987477629177 and parameters: {'feature_selection': 'None', 'iterations': 79

100%|██████████| 16/16 [00:00<00:00, 25.85it/s]
100%|██████████| 16/16 [00:00<00:00, 29.54it/s]
100%|██████████| 16/16 [00:00<00:00, 29.28it/s]
100%|██████████| 16/16 [00:00<00:00, 29.58it/s]
100%|██████████| 16/16 [00:00<00:00, 29.50it/s]


[I 2025-08-27 07:32:26,047] Trial 35 finished with value: 0.022521962940110486 and parameters: {'feature_selection': 'MRMR', 'k_features': 16, 'iterations': 452, 'depth': 7, 'learning_rate': 0.1923766857405144, 'l2_leaf_reg': 0.0007999791763830159}. Best is trial 4 with value: 0.2369166860218595.
[I 2025-08-27 07:32:26,100] Trial 36 finished with value: -inf and parameters: {'feature_selection': 'ANOVA', 'k_features': 24, 'iterations': 813, 'depth': 8, 'learning_rate': 0.10186965078685659, 'l2_leaf_reg': 0.012213903532044481}. Best is trial 4 with value: 0.2369166860218595.
[I 2025-08-27 07:32:36,964] Trial 37 finished with value: 0.06901681347386786 and parameters: {'feature_selection': 'None', 'iterations': 598, 'depth': 3, 'learning_rate': 0.18690480748454155, 'l2_leaf_reg': 0.0002359607945439222}. Best is trial 4 with value: 0.2369166860218595.


100%|██████████| 41/41 [00:01<00:00, 28.33it/s]
100%|██████████| 41/41 [00:01<00:00, 28.19it/s]
100%|██████████| 41/41 [00:01<00:00, 28.53it/s]
100%|██████████| 41/41 [00:01<00:00, 28.46it/s]
100%|██████████| 41/41 [00:01<00:00, 28.45it/s]


[I 2025-08-27 07:34:10,478] Trial 38 finished with value: 0.17869987261640902 and parameters: {'feature_selection': 'MRMR', 'k_features': 41, 'iterations': 569, 'depth': 9, 'learning_rate': 0.019123822080914028, 'l2_leaf_reg': 5.46765280004366e-05}. Best is trial 4 with value: 0.2369166860218595.
[I 2025-08-27 07:34:32,686] Trial 39 finished with value: 0.19521356577981025 and parameters: {'feature_selection': 'None', 'iterations': 754, 'depth': 4, 'learning_rate': 0.07017674845905293, 'l2_leaf_reg': 0.07701507116440906}. Best is trial 4 with value: 0.2369166860218595.
[I 2025-08-27 07:34:32,739] Trial 40 finished with value: -inf and parameters: {'feature_selection': 'ANOVA', 'k_features': 50, 'iterations': 486, 'depth': 7, 'learning_rate': 0.1340289717299031, 'l2_leaf_reg': 0.0018599443529891742}. Best is trial 4 with value: 0.2369166860218595.
[I 2025-08-27 07:36:17,844] Trial 41 finished with value: 0.23173990701219518 and parameters: {'feature_selection': 'None', 'iterations': 840

In [None]:
# Ensure X_train, y_train are defined beforehand
# (these statements must start at column 0: a stray leading space at cell
# level is an IndentationError and prevents the whole cell from running)
X = X_train.copy()
y = y_train.copy().values

def objective(trial):
    """Optuna objective with an extended CatBoost search space.

    Samples a feature-selection strategy and CatBoost hyper-parameters, scores
    the pipeline with 5-fold cross-validated R² on the module-level ``X`` /
    ``y``, and returns a variance-penalised score.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample every hyper-parameter.

    Returns
    -------
    float
        ``mean(R²) - 0.1 * std(R²)`` across folds — note this is NOT the raw
        R². The unpenalised mean and the std are stored in the trial's
        ``'mean_r2'`` / ``'std_r2'`` user attributes. Returns ``-np.inf`` if
        the pipeline raised; the exception text is stored under ``'error'``.
    """
    # Feature Selection only (CatBoost handles the rest)
    fs_method = trial.suggest_categorical('feature_selection', ['ANOVA', 'MutualInfo', 'RFE', 'MRMR', 'None'])
    if fs_method != 'None':
        k_features = trial.suggest_int('k_features', 10, min(50, X.shape[1]))
        if fs_method == 'ANOVA':
            selector = SelectKBest(f_regression, k=k_features)
        elif fs_method == 'MutualInfo':
            selector = SelectKBest(mutual_info_regression, k=k_features)
        elif fs_method == 'RFE':
            rfe_step = trial.suggest_float('rfe_step', 0.1, 1.0)
            estimator = RandomForestRegressor(
                n_estimators=trial.suggest_int('rfe_n_estimators', 50, 200),
                max_depth=trial.suggest_int('rfe_max_depth', 3, 10),
                random_state=42
            )
            selector = RFE(estimator, n_features_to_select=k_features, step=rfe_step)
        elif fs_method == 'MRMR':
            selector = MRMRTransformer(k_features=k_features)
    else:
        selector = 'passthrough'

    # Comprehensive CatBoost parameter tuning
    params = {
        'iterations': trial.suggest_int('iterations', 100, 2000),
        'depth': trial.suggest_int('depth', 2, 12),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.5, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 100.0, log=True),
        'random_strength': trial.suggest_float('random_strength', 1e-8, 10.0, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 10.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_state': 42,
        'verbose': False,
        'thread_count': -1
    }

    # Tree growing policy and the leaf constraints that go with it
    # (no early stopping is configured here).
    grow_policy = trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide'])
    params['grow_policy'] = grow_policy

    if grow_policy == 'Lossguide':
        params['max_leaves'] = trial.suggest_int('max_leaves', 16, 256)
    elif grow_policy == 'Depthwise':
        # CatBoost documents min_data_in_leaf as usable only with the
        # Depthwise and Lossguide policies, so it is not set for SymmetricTree.
        params['min_data_in_leaf'] = trial.suggest_int('min_data_in_leaf', 1, 100)

    model = CatBoostRegressor(**params)

    # Simplified pipeline - just feature selection followed by the model
    pipeline = Pipeline([
        ('feature_selection', selector),
        ('model', model)
    ])

    # Cross-validation
    # Named `kfold` rather than `cv` so the imported catboost `cv` is not shadowed.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)

    try:
        scores = cross_val_score(
            pipeline, X, y,
            cv=kfold,
            scoring='r2',
            n_jobs=1  # Keep as 1 for Optuna compatibility
        )

        # Calculate mean and std for better evaluation
        mean_score = np.mean(scores)
        std_score = np.std(scores)

        # Keep the unpenalised statistics so the true R² can be reported later.
        trial.set_user_attr('mean_r2', float(mean_score))
        trial.set_user_attr('std_r2', float(std_score))

        # Small penalty for high variance to favor stable models
        penalty = std_score * 0.1
        return mean_score - penalty

    except Exception as e:
        trial.set_user_attr('error', str(e))
        return -np.inf

# Study setup: maximise the objective with a seeded TPE sampler.
# NOTE(review): a MedianPruner is attached, but the objective visible in this
# notebook never reports intermediate values, so presumably it has nothing to
# prune — confirm.
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10),
    direction='maximize',
)

# Progress callback
def progress_callback(study, trial):
    """Print the study's best value so far after every tenth trial (0, 10, 20, ...)."""
    if trial.number % 10 != 0:
        return
    print(f"Trial {trial.number}: Best R² = {study.best_value:.4f}")

# Run optimization: stops after 100 trials or one hour, whichever comes first.
study.optimize(
    objective,
    callbacks=[progress_callback],
    show_progress_bar=True,
    timeout=3600,
    n_trials=100,
)

# Results reporting
# NOTE: study.best_value is whatever the objective returned; with the
# variance-penalised objective defined in this cell it is mean R² minus the
# penalty, not the raw R².
print("\n" + "="*50)
print("OPTIMIZATION RESULTS")
print("="*50)
print(f"Best R²: {study.best_value:.4f}")
print(f"Number of trials: {len(study.trials)}")
print("Best parameters:")

best_params = study.best_params
for key, value in best_params.items():
    print(f"  {key}: {value}")

# Additional analysis
print("\nBest trial details:")
best_trial = study.best_trial
print(f"  Trial number: {best_trial.number}")
# `duration` is a datetime.timedelta, which does not accept a float format
# spec; convert to seconds before formatting.
print(f"  Duration: {best_trial.duration.total_seconds():.2f} seconds")

# Save study for later analysis
import joblib
joblib.dump(study, 'optuna_study.pkl')
print("\nStudy saved as 'optuna_study.pkl'")

In [13]:
import optuna.visualization as vis

# 1. Parallel-coordinate plot: how the chosen parameters relate to the score.
fig = vis.plot_parallel_coordinate(
    study,
    params=['feature_selection', 'k_features', 'depth', 'learning_rate'],
    target_name="R² Score",
)
fig.show()

# 2. Parameter-importance plot.
fig = vis.plot_param_importances(study, target_name="R² Score")
fig.show()

# 3. Slice plot: the score against each listed parameter individually.
fig = vis.plot_slice(
    study,
    params=['depth', 'learning_rate', 'k_features'],
    target_name="R² Score",
)
fig.show()

[W 2025-08-27 08:54:59,871] Trial 3 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:54:59,872] Trial 6 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:54:59,873] Trial 7 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:54:59,873] Trial 8 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:54:59,874] Trial 20 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:54:59,874] Trial 25 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:54:59,874] Trial 30 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:54:59,875] Trial 36 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:54:59,875] Trial 40 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:54:59,875] Tria

[W 2025-08-27 08:54:59,888] Trial 3 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:54:59,889] Trial 6 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:54:59,889] Trial 7 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:54:59,889] Trial 8 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:54:59,890] Trial 20 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:54:59,891] Trial 25 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:54:59,891] Trial 30 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:54:59,891] Trial 36 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:54:59,892] Trial 40 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:54:59,892] Tria

[W 2025-08-27 08:55:00,241] Trial 3 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:55:00,242] Trial 6 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:55:00,242] Trial 7 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:55:00,242] Trial 8 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:55:00,242] Trial 20 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:55:00,243] Trial 25 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:55:00,243] Trial 30 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:55:00,243] Trial 36 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:55:00,244] Trial 40 is omitted in visualization because its objective value is inf or nan.
[W 2025-08-27 08:55:00,244] Tria

# Apply Feature Selection to Training & Test Data

In [8]:
# Rebuild the feature selector chosen by the best Optuna trial, fit it on the
# training data, and keep the same columns for the training and test matrices.
best_fs_method = study.best_params.get('feature_selection', 'None')

if best_fs_method != 'None':
    k_features = study.best_params['k_features']

    if best_fs_method == 'ANOVA':
        selector = SelectKBest(f_regression, k=k_features)
    elif best_fs_method == 'MutualInfo':
        selector = SelectKBest(mutual_info_regression, k=k_features)
    elif best_fs_method == 'RFE':
        estimator = RandomForestRegressor(
            n_estimators=study.best_params.get('rfe_n_estimators', 100),
            max_depth=study.best_params.get('rfe_max_depth', 5),
            random_state=42
        )
        selector = RFE(
            estimator,
            n_features_to_select=k_features,
            step=study.best_params.get('rfe_step', 1)
        )
    elif best_fs_method == 'MRMR':
        selector = MRMRTransformer(k_features=k_features)
    else:
        # Fail loudly rather than hitting an unbound (or stale, from an earlier
        # cell run) `selector` below.
        raise ValueError(f"Unknown feature selection method: {best_fs_method!r}")

    selector.fit(X, y)
    if hasattr(selector, 'get_support'):  # For SelectKBest/RFE
        selected_features = X.columns[selector.get_support()]
    else:  # For MRMRTransformer
        selected_features = selector.selected_features
    X_best = X[selected_features]
else:
    X_best = X
    selected_features = X.columns

# Apply the same feature selection to test data.
# Selecting by column label (exactly as done for X_best) keeps X_test's row
# index and dtypes, instead of rebuilding a frame from a bare array.
if best_fs_method != 'None':
    X_test_final = X_test[selected_features]
else:
    X_test_final = X_test

# Create CatBoost Model with Optimized Parameters and Fit Model

In [9]:
# Parameters in study.best_params that configure feature selection rather
# than CatBoost; everything else is forwarded to the model, so any extra
# CatBoost parameter a study tuned is not silently dropped.
fs_param_names = {'feature_selection', 'k_features', 'rfe_step', 'rfe_n_estimators', 'rfe_max_depth'}
catboost_params = {
    name: value
    for name, value in study.best_params.items()
    if name not in fs_param_names
}

best_model = CatBoostRegressor(
    **catboost_params,
    random_state=42,
    verbose=False
)

# Train on the full training split, restricted to the selected features
best_model.fit(X_best, y)

<catboost.core.CatBoostRegressor at 0x27d73850d40>

# Run Model and Get Outcomes

In [10]:
# Predict the held-out test set with the fitted model.
y_pred = best_model.predict(X_test_final)

# Coefficient of determination and root-mean-squared error on the test set.
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("\nFinal Model Evaluation on Test Set:")
print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")

# Share of test rows whose prediction lies within one point of the true score.
correct = np.sum(np.abs(y_pred - y_test) <= 1)
accuracy = correct / len(y_test)
print(f"Accuracy within ±1 point: {accuracy:.4f}")


Final Model Evaluation on Test Set:
RMSE: 1.1496
R²: 0.1765
Accuracy within ±1 point: 0.6250
