Install Dependencies

In [25]:
!conda install -c conda-forge scikit-learn=1.5.0 shap xgboost pandas numpy -y
!pip install streamlit joblib

/bin/bash: line 1: conda: command not found


Import Libraries

In [26]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import joblib
import shap
import matplotlib.pyplot as plt

Load & Preprocess Data

In [27]:
# Load dataset
df = pd.read_csv('sleep_deprivation_dataset_detailed (1).csv')
df = df.drop('Participant_ID', axis=1)

# Define target and features
target = 'Stroop_Task_Reaction_Time'
y = df[target]
X = df.drop([target, 'N_Back_Accuracy', 'Emotion_Regulation_Score', 'PVT_Reaction_Time'], axis=1)

# Verify columns
print("Columns used for modeling:\n", X.columns.tolist())

Columns used for modeling:
 ['Sleep_Hours', 'Sleep_Quality_Score', 'Daytime_Sleepiness', 'Age', 'Gender', 'BMI', 'Caffeine_Intake', 'Physical_Activity_Level', 'Stress_Level']


Preprocessing Pipeline

In [28]:
numerical_features = ['Sleep_Hours', 'Sleep_Quality_Score', 'Daytime_Sleepiness',
                     'Age', 'BMI', 'Caffeine_Intake', 'Physical_Activity_Level', 'Stress_Level']
categorical_features = ['Gender']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

Train-Test Split

In [29]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)
print(f"Training Data: {X_train.shape}, Test Data: {X_test.shape}")

Training Data: (48, 9), Test Data: (12, 9)


Define Models & Hyperparameters

In [30]:
models = {
    'Linear Regression': (LinearRegression(), {}),
    'KNN': (KNeighborsRegressor(), {'model__n_neighbors': [3, 5, 7]}),
    'Decision Tree': (DecisionTreeRegressor(), {'model__max_depth': [3, 5, None]}),
    'Random Forest': (RandomForestRegressor(), {'model__n_estimators': [100, 200], 'model__max_depth': [5, 10]}),
    'XGBoost': (XGBRegressor(), {'model__n_estimators': [100, 200], 'model__max_depth': [3, 5], 'model__learning_rate': [0.01, 0.1]})
}

Train & Compare Models

In [32]:
results = {}

for name, (model, params) in models.items():
    # Create pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Hyperparameter tuning
    grid = GridSearchCV(
        pipeline,
        params,
        cv=5,
        scoring='neg_root_mean_squared_error'
    )
    grid.fit(X_train, y_train)

    # Evaluate
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Save results
    results[name] = {
        'RMSE': rmse,
        'R²': r2,
        'Best Params': grid.best_params_
    }

# Display results
results_df = pd.DataFrame(results).T
print(results_df[['RMSE', 'R²']])

                       RMSE        R²
Linear Regression  1.026994 -0.459962
KNN                1.260131 -0.791387
Decision Tree       1.02264 -0.453772
Random Forest      0.938969 -0.334827
XGBoost            0.983477 -0.398099


Save Best Model (XGBoost)

In [35]:
# Cell 8: Save Best Model (XGBoost)
# Extract best XGBoost parameters
best_params = results['XGBoost']['Best Params']
best_params = {key.replace('model__', ''): value for key, value in best_params.items()}  # Fix parameter names

# Create final pipeline
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(**best_params))
])
final_pipeline.fit(X_train, y_train)

# Save for deployment
joblib.dump(final_pipeline, 'best_model.pkl')
print("Model saved as best_model.pkl!")

Model saved as best_model.pkl!


SHAP Analysis

In [34]:
# Preprocess test data
X_test_preprocessed = preprocessor.transform(X_test)

# Explain model
explainer = shap.Explainer(final_pipeline.named_steps['model'])
shap_values = explainer(X_test_preprocessed)

# Save plots
plt.figure(figsize=(10, 6))
shap.plots.beeswarm(shap_values, show=False)
plt.savefig('shap_beeswarm.png', bbox_inches='tight')
plt.close()
print("SHAP plot saved as shap_beeswarm.png!")

SHAP plot saved as shap_beeswarm.png!
