# New Section

In [9]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint, uniform
import pandas as pd

file_path = r"/content/Balanced_Sleep_Quality_Data.xlsx"
df = pd.read_excel(file_path)

# Drop 'Blood Pressure' which was identified as problematic by XGBoost error
df.drop(['Blood Pressure'], axis=1, inplace=True)

# Separate features and target
X = df.drop('Quality of Sleep', axis=1)
y = df['Quality of Sleep']

# Convert continuous target to discrete categories
# You can adjust the bins and labels based on your data and desired categories
bins = [0, 4, 6, 8, 10] # Example bins
labels = ['Poor', 'Fair', 'Good', 'Excellent'] # Example labels
y_classified = pd.cut(y, bins=bins, labels=labels, right=False)

# Drop rows where y_classified is NaN (values outside bins)
nan_indices = y_classified.isna()
X = X[~nan_indices]
y_classified = y_classified[~nan_indices]

# Encode categorical features
label_cols = ['Gender', 'Occupation', 'BMI Category'] # Exclude 'Blood Pressure' as it's dropped
for col in label_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_classified, test_size=0.2, random_state=42, stratify=y_classified)


# Base learners with pipelines for scaling where needed
base_learners = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('xgb', Pipeline([
        ('scaler', StandardScaler()), # Add scaler for XGBoost after encoding
        ('xgb_model', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42))
    ])),
    ('svm', Pipeline([
        ('scaler', StandardScaler()),
        ('svc', SVC(probability=True, random_state=42))
    ])),
    ('et', RandomForestClassifier(random_state=42)) # Changed ExtraTrees to RandomForest for consistency
]

meta_model = Pipeline([
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(solver='saga', max_iter=5000, random_state=42))
])

stacked_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_model,
    cv=10,
    n_jobs=-1,
    passthrough=True  # Pass original features to meta-model
)

# Parameter grid for RandomizedSearch
param_dist = {
    'rf__n_estimators': randint(100, 300),
    'rf__max_depth': randint(5, 15),
    'xgb__xgb_model__n_estimators': randint(50, 200), # Adjusted param name for pipeline
    'xgb__xgb_model__max_depth': randint(3, 10), # Adjusted param name for pipeline
    'xgb__xgb_model__learning_rate': uniform(0.01, 0.3), # Adjusted param name for pipeline
    'svm__svc__C': uniform(0.1, 10),
    'et__n_estimators': randint(100, 300),
    'et__max_depth': randint(5, 15),
    'final_estimator__lr__C': uniform(0.1, 10),
}

search = RandomizedSearchCV(
    estimator=stacked_model,
    param_distributions=param_dist,
    n_iter=50,
    scoring='accuracy',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

search.fit(X_train, y_train)

print("Best Params:", search.best_params_)
print("Best CV Accuracy:", search.best_score_)

# Evaluate on test set
y_pred = search.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Test Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Params: {'et__max_depth': 14, 'et__n_estimators': 271, 'final_estimator__lr__C': np.float64(2.541255222477742), 'rf__max_depth': 10, 'rf__n_estimators': 259, 'svm__svc__C': np.float64(5.287906217433661), 'xgb__xgb_model__learning_rate': np.float64(0.22090568766855337), 'xgb__xgb_model__max_depth': 3, 'xgb__xgb_model__n_estimators': 111}
Best CV Accuracy: 0.875
Test Accuracy: 0.9117647058823529
              precision    recall  f1-score   support

   Excellent       1.00      0.91      0.95        34
        Fair       0.84      0.94      0.89        50
        Good       0.89      0.95      0.92        42
        Poor       0.97      0.84      0.90        44

    accuracy                           0.91       170
   macro avg       0.93      0.91      0.92       170
weighted avg       0.92      0.91      0.91       170

[[31  1  2  0]
 [ 0 47  2  1]
 [ 0  2 40  0]
 [ 0  6  1 37]]
