# Import Packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, RFE
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVR, SVC
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, Pool, cv
import optuna
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.pipeline import Pipeline

import mrmr
from mrmr import mrmr_classif, mrmr_regression

import warnings

# Set random seed for reproducibility
np.random.seed(42)

# Import Data

In [None]:
# Load the data
file_path = "features-Master.csv"
data = pd.read_csv(file_path)

# Shuffle the rows with a fixed seed and renumber them 0..n-1
shuffled = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Features are every column from position 4 onward; the target is "Comfort Score".
# NOTE(review): the CSV layout is not visible here. If the target column sits at
# position 4 or later it would leak into the features, so it is dropped
# explicitly (a no-op when the column is not part of the slice).
data_shuffled = shuffled.iloc[:, 4:].drop(columns=["Comfort Score"], errors="ignore")
labels_shuffled = shuffled["Comfort Score"]

# Break Data into Train/Test Split

In [None]:
# Hold out 10% of the shuffled rows as the test set (fixed seed).
# Stratification uses the raw "Comfort Score" values; no binning is applied in
# this cell.
# NOTE(review): this presumably relies on the scores being discrete, with
# enough rows per distinct score for a stratified split - confirm.
X_train, X_test, y_train, y_test = train_test_split(
    data_shuffled, labels_shuffled,
    test_size=0.1, random_state=42, stratify=labels_shuffled,
)

# Scale Data

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

# Wrapping the transformed arrays in fresh DataFrames gives them default
# integer column labels and a 0..n-1 row index.
X_train_scaled = pd.DataFrame(scaler.transform(X_train))
X_test_scaled = pd.DataFrame(scaler.transform(X_test))

# MRMR Conversion Helper Class

In [None]:
class MRMRTransformer:
    """Feature selector with a fit/transform interface backed by ``mrmr_regression``.

    ``fit`` records which columns to keep; ``transform`` returns only those
    columns as a DataFrame. If ``mrmr_regression`` raises, a random subset of
    columns is used instead and a warning is emitted.
    """

    def __init__(self, k_features):
        # Requested number of features to keep (clamped to the column count in fit).
        self.k_features = k_features
        # Labels of the kept columns, as a list; None until fit() has run.
        self.selected_features = None
        # Column labels seen by fit(); used to label raw arrays in transform().
        self.column_names = None

    def fit(self, X, y):
        """Choose the columns to keep.

        Args:
            X: DataFrame or array-like of features.
            y: Target values, one per row of ``X``.

        Returns:
            self, so calls can be chained.
        """
        # Convert to DataFrame if not already
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        # Reset indices so X and y line up positionally
        X = X.reset_index(drop=True)
        y = pd.Series(y).reset_index(drop=True)

        self.column_names = X.columns.tolist()
        # Never ask for more features than there are columns.
        n_keep = min(self.k_features, len(self.column_names))
        try:
            chosen = list(mrmr_regression(X, y, K=n_keep))
        except Exception as exc:
            # Best-effort fallback: keep going with a random subset, but say so
            # instead of hiding the failure.
            warnings.warn(
                f"MRMR selection failed ({exc!r}); falling back to "
                f"{n_keep} randomly chosen features.",
                RuntimeWarning,
            )
            chosen = np.random.choice(
                self.column_names, size=n_keep, replace=False
            ).tolist()
        # Always a plain list, whichever path produced it.
        self.selected_features = chosen
        return self

    def transform(self, X):
        """Return the selected columns of ``X`` as a DataFrame.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.selected_features is None:
            raise RuntimeError("MRMRTransformer.transform called before fit")
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.column_names)
        return X[self.selected_features]

# Regression Optimization Pipeline

In [None]:
# Inputs for the tuning run; X_train_scaled and y_train must already exist.
# Both are copied, and the labels are reduced to their underlying values so
# that only positions (not index labels) matter downstream.
y = y_train.copy().values
X = X_train_scaled.copy()

def _suggest_imputer(trial):
    """Sample the imputation step for one trial and return it unfitted."""
    impute_method = trial.suggest_categorical('imputation', ['mean', 'median', 'knn', 'iterative'])
    if impute_method in ('mean', 'median'):
        return SimpleImputer(strategy=impute_method)
    if impute_method == 'knn':
        return KNNImputer(n_neighbors=trial.suggest_int('knn_neighbors', 3, 15))
    return IterativeImputer(
        max_iter=trial.suggest_int('iterative_max_iter', 10, 50),
        random_state=42,
        tol=0.01
    )


def _suggest_selector(trial, n_features):
    """Sample the feature-selection step; 'passthrough' when none is chosen."""
    fs_method = trial.suggest_categorical('feature_selection', ['ANOVA', 'MutualInfo', 'RFE', 'MRMR', 'None'])
    if fs_method == 'None':
        return 'passthrough'

    # Keep the range valid even when fewer than 10 columns are available.
    k_max = min(50, n_features)
    k_features = trial.suggest_int('k_features', min(10, k_max), k_max)

    if fs_method == 'ANOVA':
        return SelectKBest(f_regression, k=k_features)
    if fs_method == 'MutualInfo':
        return SelectKBest(mutual_info_regression, k=k_features)
    if fs_method == 'RFE':
        rfe_step = trial.suggest_float('rfe_step', 0.1, 1.0)
        estimator = RandomForestRegressor(
            n_estimators=trial.suggest_int('rfe_n_estimators', 50, 200),
            max_depth=trial.suggest_int('rfe_max_depth', 3, 10),
            random_state=42
        )
        return RFE(estimator, n_features_to_select=k_features, step=rfe_step)
    return MRMRTransformer(k_features=k_features)


def _suggest_model(trial):
    """Sample a regressor family and its hyperparameters; return it unfitted."""
    # NOTE: several parameter names ('n_estimators', 'max_depth', ...) are
    # shared between model families with different ranges; the names are kept
    # because code outside this function reads them from study.best_params.
    model_name = trial.suggest_categorical('model', ['RandomForest', 'GradientBoosting', 'CatBoost', 'XGBoost'])

    if model_name == 'XGBoost':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'random_state': 42
        }
        return XGBRegressor(**params)
    if model_name == 'RandomForest':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 20),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
            'random_state': 42
        }
        return RandomForestRegressor(**params)
    if model_name == 'GradientBoosting':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'random_state': 42
        }
        return GradientBoostingRegressor(**params)
    # CatBoost
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'random_state': 42,
        'verbose': False
    }
    return CatBoostRegressor(**params)


def objective(trial):
    """Optuna objective: mean 5-fold R² of a sampled imputer/selector/model pipeline.

    Reads the module-level ``X`` and ``y``. The steps are sampled in the order
    imputer, feature selection, model.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean cross-validated R², or ``-inf`` if cross-validation raised
        (the error is stored in the trial's ``error`` user attribute).
    """
    # 1. Imputation
    imputer = _suggest_imputer(trial)

    # 2. Feature Selection
    selector = _suggest_selector(trial, X.shape[1])

    # 3. Model Selection
    model = _suggest_model(trial)

    # 4. Pipeline
    pipeline = Pipeline([
        ('imputer', imputer),
        ('feature_selection', selector),
        ('model', model)
    ])

    # 5. Cross-validation for regression
    # Named kfold (not cv) so the imported catboost `cv` is not shadowed.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    try:
        scores = cross_val_score(pipeline, X, y, cv=kfold, scoring='r2', n_jobs=1)
    except Exception as exc:
        # Keep the study running, but leave a trace of what went wrong.
        trial.set_user_attr('error', repr(exc))
        warnings.warn(f"Trial {trial.number} failed: {exc!r}", RuntimeWarning)
        return -np.inf
    return np.mean(scores)

# Run Optuna Study
# The sampler is seeded like every other random component in this notebook, so
# the sequence of suggested hyperparameters is repeatable between runs.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

# Best result
print("Best trial:")
trial = study.best_trial
print(f"R²: {trial.value:.4f}")
print("Params: ")
for key, value in trial.params.items():
    print(f"  {key}: {value}")

# Visualization

In [None]:
import optuna.visualization as vis

# 1. Parallel coordinates: how parameter combinations relate to the score
parallel_params = [
    'imputation', 'feature_selection', 'k_features', 'model',
    'n_estimators', 'max_depth', 'learning_rate',
]
fig = vis.plot_parallel_coordinate(study, params=parallel_params, target_name="R² Score")
fig.show()

# 2. Parameter importances
fig = vis.plot_param_importances(study, target_name="R² Score")
fig.show()

# 3. Slice plot: effect of each parameter taken on its own
slice_params = ['n_estimators', 'max_depth', 'learning_rate', 'k_features']
fig = vis.plot_slice(study, params=slice_params, target_name="R² Score")
fig.show()

# 4. Contour Plots (For seeing parameter interactions)
# plot_contour takes a flat list of parameter names, not a list of pairs, so
# each pair of interest is drawn as its own figure.
for contour_pair in (('n_estimators', 'max_depth'), ('learning_rate', 'max_depth')):
    fig = vis.plot_contour(
        study,
        params=list(contour_pair),
        target_name="R² Score"
    )
    fig.show()

# Apply best imputation and feature selection to training data

In [None]:
# Get best imputation method from study
best_impute_method = study.best_params.get('imputation', 'mean')

# Rebuild the imputer with the same settings the tuning objective used
if best_impute_method in ('mean', 'median'):
    imputer = SimpleImputer(strategy=best_impute_method)
elif best_impute_method == 'knn':
    imputer = KNNImputer(n_neighbors=study.best_params.get('knn_neighbors', 5))
elif best_impute_method == 'iterative':
    imputer = IterativeImputer(
        max_iter=study.best_params.get('iterative_max_iter', 50),
        random_state=42,
        tol=0.01  # same tolerance as during tuning, so the final fit matches the tuned pipeline
    )
else:
    # Fail here with a clear message rather than later with an undefined `imputer`.
    raise ValueError(f"Unknown imputation method: {best_impute_method!r}")

# Fit and transform the data
X_imputed = imputer.fit_transform(X)

# Convert back to DataFrame (if needed) so columns keep their labels
if isinstance(X, pd.DataFrame):
    X_imputed = pd.DataFrame(X_imputed, columns=X.columns)

# Rebuild and fit the feature selector chosen by the study
best_fs_method = study.best_params.get('feature_selection', 'None')

if best_fs_method == 'None':
    # No selection step: keep every column. `selector` is still bound so later
    # code that inspects it does not hit an undefined name.
    selector = None
    X_best = X_imputed
    selected_features = X.columns
else:
    k_features = study.best_params['k_features']

    if best_fs_method == 'ANOVA':
        selector = SelectKBest(f_regression, k=k_features)
    elif best_fs_method == 'MutualInfo':
        selector = SelectKBest(mutual_info_regression, k=k_features)
    elif best_fs_method == 'RFE':
        estimator = RandomForestRegressor(
            n_estimators=study.best_params.get('rfe_n_estimators', 100),
            max_depth=study.best_params.get('rfe_max_depth', 5),
            random_state=42
        )
        selector = RFE(
            estimator,
            n_features_to_select=k_features,
            step=study.best_params.get('rfe_step', 1)
        )
    elif best_fs_method == 'MRMR':
        selector = MRMRTransformer(k_features=k_features)
    else:
        raise ValueError(f"Unknown feature selection method: {best_fs_method!r}")

    selector.fit(X_imputed, y)
    if hasattr(selector, 'get_support'):  # For SelectKBest/RFE
        selected_features = X.columns[selector.get_support()]
    else:  # For MRMRTransformer
        selected_features = selector.selected_features
    X_best = X_imputed[selected_features]

# Apply best model and fit

In [None]:
# Rebuild the winning regressor from the study's best hyperparameters.
tuned = study.best_params
best_model_name = tuned['model']

# Per model family: (constructor, tuned parameter names, fixed extra kwargs).
model_specs = {
    'XGBoost': (
        XGBRegressor,
        ('n_estimators', 'max_depth', 'learning_rate', 'subsample', 'colsample_bytree'),
        {},
    ),
    'RandomForest': (
        RandomForestRegressor,
        ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf'),
        {},
    ),
    'GradientBoosting': (
        GradientBoostingRegressor,
        ('n_estimators', 'max_depth', 'learning_rate',
         'min_samples_split', 'min_samples_leaf', 'subsample'),
        {},
    ),
}
# Any other model name falls through to CatBoost.
catboost_spec = (
    CatBoostRegressor,
    ('iterations', 'depth', 'learning_rate', 'l2_leaf_reg'),
    {'verbose': False},
)

model_cls, tuned_keys, fixed_kwargs = model_specs.get(best_model_name, catboost_spec)
best_model = model_cls(
    random_state=42,
    **fixed_kwargs,
    **{key: tuned[key] for key in tuned_keys},
)

# Train on the imputed, feature-selected training data
best_model.fit(X_best, y)

# Evaluate with test data

In [None]:
# Push the test split through the transformations fitted on the training data.
# 1. Imputation with the already fitted imputer
X_test_imputed = imputer.transform(X_test_scaled)
if isinstance(X_test_scaled, pd.DataFrame):
    # Restore the column labels lost by the imputer's array output
    X_test_imputed = pd.DataFrame(X_test_imputed, columns=X_test_scaled.columns)

# 2. Feature selection with the already fitted selector
if best_fs_method == 'None':
    X_test_final = X_test_imputed
elif best_fs_method == 'MRMR':
    X_test_final = X_test_imputed[selected_features]
else:
    X_test_final = selector.transform(X_test_imputed)
    if isinstance(X_test, pd.DataFrame):
        X_test_final = pd.DataFrame(X_test_final, columns=selected_features)

# Predict and score
y_pred = best_model.predict(X_test_final)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("\nFinal Model Evaluation on Test Set:")
print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")

# Share of predictions that land within one point of the true score
within_one = np.abs(y_test - y_pred) <= 1
correct = np.sum(within_one)
accuracy = correct / len(y_test)
print(f"Accuracy within ±1 point: {accuracy:.4f}")

# Scatter of predictions against true scores, with the y = x reference line
score_lo, score_hi = min(y_test), max(y_test)

plt.figure(figsize=(8, 8))
plt.scatter(y_test, y_pred, alpha=0.5, label='Predictions')
plt.plot([score_lo, score_hi], [score_lo, score_hi], '--r', label='Perfect prediction')
plt.xlabel('Actual Comfort Score')
plt.ylabel('Predicted Comfort Score')
plt.title(f'Test Set Performance\n({best_model_name} with {best_fs_method} feature selection)')
plt.legend()
plt.grid(True)
plt.show()

# Optional: Feature importance if the fitted model exposes it
if hasattr(best_model, 'feature_importances_'):
    plt.figure(figsize=(10, 6))
    # selected_features already holds the labels of the columns the model was
    # trained on for every feature-selection choice, including "no selection",
    # so the selector itself does not need to be consulted here.
    features = selected_features
    importances = pd.Series(best_model.feature_importances_, index=features)
    importances.sort_values().plot(kind='barh')
    plt.title('Feature Importances')
    plt.show()