In [None]:
from google.colab import files
uploaded = files.upload()
import pandas as pd
df = pd.read_csv('df_merged.csv')

Saving df_merged.csv to df_merged (1).csv


In [None]:
import pandas as pd

# 1) Reload
df = pd.read_csv('df_merged.csv')

# 2) Inspect the raw quality ratings
print("Unique sleep_quality ratings:\n", df['sleep_quality'].value_counts().sort_index())

# 3) Decide a threshold (here median; you can pick e.g. 6 or any meaningful cut)
threshold = df['sleep_quality'].median()
print("\nBinarizing at median =", threshold)

# 4) Create the binary column: 1 = poor sleep (≤median), 0 = good sleep (>median)
df['sleep_quality_binary'] = (df['sleep_quality'] <= threshold).astype(int)

# 5) Confirm no more NaNs
print("NaNs in binary column:", df['sleep_quality_binary'].isna().sum())


Unique sleep_quality ratings:
 sleep_quality
1     66
2     46
3     54
4     46
5     41
6     57
7     45
8     40
9     55
10    50
Name: count, dtype: int64

Binarizing at median = 5.0
NaNs in binary column: 0


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint, uniform
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import pandas as pd

# 1) Load & clean
df = pd.read_csv('df_merged.csv')
threshold = df['sleep_quality'].median()
df['sleep_quality_binary'] = (df['sleep_quality'] <= threshold).astype(int)
df = df.dropna()
X = df.drop(columns=[
    'sleep_quality', 'sleep_quality_binary','bedtime','wake_time'
])
y = df['sleep_quality_binary']

# 2) Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# 3) Identify column types
numeric_cols     = X_train.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

# 4) Build ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
])

# 5) Tuning + evaluation helper
def tune_and_eval(model, param_dist, name):
    pipe = Pipeline([
        ('pre', preprocessor),
        ('clf', model)
    ])
    search = RandomizedSearchCV(
        pipe, param_distributions=param_dist,
        n_iter=20, cv=5, scoring='roc_auc',
        random_state=42, n_jobs=-1, error_score='raise'
    )
    search.fit(X_train, y_train)
    best = search.best_estimator_
    proba = best.predict_proba(X_test)[:,1]
    pred  = best.predict(X_test)

    print(f"\n=== {name} ===")
    print("Best params:", search.best_params_)
    print("ROC AUC:   ", roc_auc_score(y_test, proba))
    print(classification_report(y_test, pred))
    print("Confusion matrix:\n", confusion_matrix(y_test, pred))

# 6) Random Forest
rf_params = {
    'clf__n_estimators':      randint(50, 300),
    'clf__max_depth':         randint(3, 20),
    'clf__min_samples_split': randint(2, 10)
}
tune_and_eval(RandomForestClassifier(random_state=42), rf_params, 'Random Forest')

# 7) Gradient Boosting
gb_params = {
    'clf__n_estimators':  randint(50, 300),
    'clf__learning_rate': uniform(0.01, 0.3),
    'clf__max_depth':     randint(3, 10)
}
tune_and_eval(GradientBoostingClassifier(random_state=42), gb_params, 'Gradient Boosting')



=== Random Forest ===
Best params: {'clf__max_depth': 9, 'clf__min_samples_split': 5, 'clf__n_estimators': 142}
ROC AUC:    0.875
              precision    recall  f1-score   support

           0       0.75      0.75      0.75         4
           1       0.50      0.50      0.50         2

    accuracy                           0.67         6
   macro avg       0.62      0.62      0.62         6
weighted avg       0.67      0.67      0.67         6

Confusion matrix:
 [[3 1]
 [1 1]]

=== Gradient Boosting ===
Best params: {'clf__learning_rate': np.float64(0.12236203565420874), 'clf__max_depth': 7, 'clf__n_estimators': 64}
ROC AUC:    1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         2

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6

Confusion matrix:
 [[4 