In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
import optuna
from optuna.samplers import TPESampler

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load dataset
df = pd.read_excel(r"C:\Users\jazil\OneDrive\pep-ds\sleep-proj\Balanced_Sleep_Quality_Data.xlsx")

# Prepare features and target
X = df.drop(columns=['Person ID', 'Quality of Sleep'])
y = df['Quality of Sleep']

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocess data upfront: encode categorical and scale numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
        ('num', StandardScaler(), numerical_cols)
    ],
    remainder='passthrough'
)

In [9]:
# Fit and transform the entire dataset once
X_processed = preprocessor.fit_transform(X)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Define the Optuna objective function for hyperparameter tuning
def objective(trial):
    # Suggest hyperparameters for Random Forest
    rf_n_estimators = trial.suggest_int('rf_n_estimators', 100, 300)
    rf_max_depth = trial.suggest_int('rf_max_depth', 5, 15)

    # Suggest hyperparameters for Neural Network
    nn_units1 = trial.suggest_int('nn_units1', 50, 200)
    nn_units2 = trial.suggest_int('nn_units2', 25, 100)
    nn_alpha = trial.suggest_float('nn_alpha', 1e-5, 1e-3, log=True)

    # Define base learners
    rf_model = RandomForestRegressor(
        n_estimators=rf_n_estimators,
        max_depth=rf_max_depth,
        random_state=42,
        n_jobs=-1
    )

    nn_model = MLPRegressor(
        hidden_layer_sizes=(nn_units1, nn_units2),
        activation='relu',
        solver='adam',
        random_state=42,
        max_iter=500,
        alpha=nn_alpha,
        early_stopping=True
    )
     # Define meta-learner
    meta_model = RandomForestRegressor(random_state=42, n_jobs=-1)
    stacking_regressor = StackingRegressor(
        estimators=[
            ('rf', rf_model),
            ('nn', nn_model),
        ],
        final_estimator=meta_model,
        cv=5,
        passthrough=False
    )

    # Train model
    stacking_regressor.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = stacking_regressor.predict(X_test)
    return r2_score(y_test, y_pred)

In [10]:
# Create Optuna study and optimize
sampler = TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=50)

[I 2025-06-09 15:04:01,614] A new study created in memory with name: no-name-df4c92cc-fabb-4bd1-a55d-18133ece03db
[I 2025-06-09 15:04:11,726] Trial 0 finished with value: 0.8288140343239875 and parameters: {'rf_n_estimators': 175, 'rf_max_depth': 15, 'nn_units1': 160, 'nn_units2': 70, 'nn_alpha': 2.0513382630874486e-05}. Best is trial 0 with value: 0.8288140343239875.
[I 2025-06-09 15:04:20,477] Trial 1 finished with value: 0.8704909379546529 and parameters: {'rf_n_estimators': 131, 'rf_max_depth': 5, 'nn_units1': 180, 'nn_units2': 70, 'nn_alpha': 0.0002607024758370766}. Best is trial 1 with value: 0.8704909379546529.
[I 2025-06-09 15:04:28,948] Trial 2 finished with value: 0.8759915953609922 and parameters: {'rf_n_estimators': 104, 'rf_max_depth': 15, 'nn_units1': 175, 'nn_units2': 41, 'nn_alpha': 2.3102018878452926e-05}. Best is trial 2 with value: 0.8759915953609922.
[I 2025-06-09 15:04:35,487] Trial 3 finished with value: 0.8815891880052097 and parameters: {'rf_n_estimators': 136, 

In [11]:
# Output best trial results
print("Best trial:")
trial = study.best_trial
print(f"  R2 Score: {trial.value:.4f}")
print("  Params:")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

Best trial:
  R2 Score: 0.9076
  Params:
    rf_n_estimators: 262
    rf_max_depth: 12
    nn_units1: 59
    nn_units2: 29
    nn_alpha: 1.6023703509843143e-05


In [12]:
# Train final model with best hyperparameters
best_rf_n_estimators = trial.params['rf_n_estimators']
best_rf_max_depth = trial.params['rf_max_depth']
best_nn_units1 = trial.params['nn_units1']
best_nn_units2 = trial.params['nn_units2']
best_nn_alpha = trial.params['nn_alpha']

rf_model = RandomForestRegressor(
    n_estimators=best_rf_n_estimators,
    max_depth=best_rf_max_depth,
    random_state=42,
    n_jobs=-1
)

nn_model = MLPRegressor(
    hidden_layer_sizes=(best_nn_units1, best_nn_units2),
    activation='relu',
    solver='adam',
    random_state=42,
    max_iter=500,
    alpha=best_nn_alpha,
    early_stopping=True
)

meta_model = RandomForestRegressor(random_state=42, n_jobs=-1)

stacking_regressor = StackingRegressor(
    estimators=[
        ('rf', rf_model),
        ('nn', nn_model),
    ],
    final_estimator=meta_model,
    cv=5,
    passthrough=False
)


In [13]:
# Fit final model
stacking_regressor.fit(X_train, y_train)

# Final evaluation on test set
y_pred = stacking_regressor.predict(X_test)
print(f"Final R2 Score: {r2_score(y_test, y_pred):.4f}")
print(f"Final MSE: {mean_squared_error(y_test, y_pred):.4f}")

Final R2 Score: 0.9076
Final MSE: 0.3784


In [5]:
# Get the feature names (keys)
feature_keys = X.columns.tolist()
print("Feature keys (column names):")
print(feature_keys)


Feature keys (column names):
['Gender', 'Age', 'Occupation', 'Sleep Duration', 'Physical Activity Level', 'Stress Level', 'BMI Category', 'Blood Pressure', 'Heart Rate', 'Daily Steps']


In [6]:
for col in X.columns:
    unique_vals = X[col].unique()
    print(f"Feature '{col}' unique values ({len(unique_vals)}): {unique_vals}\n")


Feature 'Gender' unique values (2): ['Male' 'Female']

Feature 'Age' unique values (44): [27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 48 49 50 51 52
 53 54 55 56 57 58 59 46 62 20 47 26 63 23 24 22 64 18 19 25]

Feature 'Occupation' unique values (14): ['Software Engineer' 'Doctor' 'Sales Representative' 'Teacher' 'Nurse'
 'Engineer' 'Accountant' 'Scientist' 'Lawyer' 'Salesperson' 'Manager'
 'Unemployed' 'Student' 'Self-employed']

Feature 'Sleep Duration' unique values (463): [ 6.1         6.2         5.9         6.3         7.8         6.
  6.5         7.6         7.7         7.9         6.4         7.5
  7.2         5.8         6.7         7.3         7.1         6.6
  7.4         6.9         8.          6.8         8.1         8.3
  8.5         8.4         8.2         5.5         8.8         5.4
 10.2         9.2         5.3        11.8         9.4         4.2
 10.3         5.          4.4         1.8         9.1         9.7
  8.9         5.2         4.3         4.9   