# -------------------------------------------
# Sleep Duration Prediction using ML Pipeline
# -------------------------------------------

# Step 1: Data Preprocessing Pipeline

### Importing Libraries

In [32]:
import pandas as pd

In [33]:
import numpy as np

In [34]:
from sklearn.model_selection import train_test_split, GridSearchCV

In [35]:
from sklearn.impute import SimpleImputer

In [36]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures

In [37]:
from sklearn.compose import ColumnTransformer

In [38]:
from sklearn.pipeline import Pipeline

In [39]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [40]:
from sklearn.metrics import mean_absolute_error, r2_score

In [41]:
from sklearn.base import BaseEstimator, TransformerMixin

In [42]:
import joblib

In [43]:
!pip install fastapi



In [44]:
from fastapi import FastAPI

In [45]:
from pydantic import BaseModel

#### 1. Dataset Loading

In [46]:
df = pd.read_csv("data.csv")

In [47]:
print(df.head())

   Person ID Gender  Age            Occupation  Sleep Duration  \
0          1   Male   27     Software Engineer             6.1   
1          2   Male   28                Doctor             6.2   
2          3   Male   28                Doctor             6.2   
3          4   Male   28  Sales Representative             5.9   
4          5   Male   28  Sales Representative             5.9   

   Quality of Sleep  Physical Activity Level  Stress Level BMI Category  \
0                 6                       42             6   Overweight   
1                 6                       60             8       Normal   
2                 6                       60             8       Normal   
3                 4                       30             8        Obese   
4                 4                       30             8        Obese   

  Blood Pressure  Heart Rate  Daily Steps Sleep Disorder  
0         126/83          77         4200            NaN  
1         125/80          75      

#### 2. Basic Cleaning

In [48]:
df = df.dropna(subset=['Sleep Duration'])  # Drop rows where target is missing

#### 3. Feature Definitions

In [49]:
target = 'Sleep Duration'
categorical = ['Gender', 'Occupation', 'BMI Category']
numerical = ['Age', 'Physical Activity Level', 'Stress Level', 'Heart Rate', 'Daily Steps']

In [50]:
X = df[categorical + numerical]
y = df[target]

#### 4. Custom Transformer for Polynomial Features

In [51]:
class AddPolynomialFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, degree=2):
        self.degree = degree
        self.poly = PolynomialFeatures(degree=self.degree, include_bias=False)

    def fit(self, X, y=None):
        self.feature_names = X.columns
        self.poly.fit(X)
        return self

    def transform(self, X):
        poly_features = self.poly.transform(X)
        poly_df = pd.DataFrame(poly_features, columns=self.poly.get_feature_names_out(self.feature_names), index=X.index)
        return poly_df


#### 5. Preprocessing Pipelines

In [52]:
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('poly', AddPolynomialFeatures(degree=2))
])

categorical_pipeline = Pipeline([
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numerical),
    ('cat', categorical_pipeline, categorical)
])

#### 6. Full Pipelines for Each Model

In [53]:
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

gb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor(random_state=42))
])


#### 7. Train-Test Split

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### 8. Hyperparameter Tuning

In [55]:
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [3, 5, 7]
}

grid_rf = GridSearchCV(rf_pipeline, param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
grid_rf.fit(X_train, y_train)

grid_gb = GridSearchCV(gb_pipeline, param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
grid_gb.fit(X_train, y_train)

ValueError: 
All the 30 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/pipeline.py", line 588, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codespace/.local/lib/python3.12/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/utils/_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py", line 1001, in fit_transform
    result = self._call_func_on_transformers(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py", line 910, in _call_func_on_transformers
    return Parallel(n_jobs=self.n_jobs)(jobs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/utils/parallel.py", line 77, in __call__
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codespace/.local/lib/python3.12/site-packages/joblib/parallel.py", line 1918, in __call__
    return output if self.return_generator else list(output)
                                                ^^^^^^^^^^^^
  File "/home/codespace/.local/lib/python3.12/site-packages/joblib/parallel.py", line 1847, in _get_sequential_output
    res = func(*args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/utils/parallel.py", line 139, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/pipeline.py", line 730, in fit_transform
    return last_step.fit_transform(
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/utils/_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codespace/.local/lib/python3.12/site-packages/sklearn/base.py", line 921, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_1739/2567451119.py", line 7, in fit
AttributeError: 'numpy.ndarray' object has no attribute 'columns'


#### 9. Evaluation

In [None]:
def evaluate_model(model, X_test, y_test, model_name):
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"🔹 {model_name} Evaluation:")
    print(f"    MAE: {mae:.3f}")
    print(f"    R² Score: {r2:.3f}\n")
    return mae, r2

print("\n📊 Model Evaluation Results:")
mae_rf, r2_rf = evaluate_model(grid_rf, X_test, y_test, "Random Forest")
mae_gb, r2_gb = evaluate_model(grid_gb, X_test, y_test, "Gradient Boosting")

#### 10. Best Model Selection (based on MAE)

In [None]:
best_model = grid_rf if mae_rf <= mae_gb else grid_gb
print(f"✅ Best model selected: {'Random Forest' if best_model == grid_rf else 'Gradient Boosting'}")

#### 11. Feature Importance (for Random Forest)

In [None]:
if hasattr(best_model.best_estimator_.named_steps['model'], 'feature_importances_'):
    feature_names = best_model.best_estimator_.named_steps['preprocessor'].transformers_[0][1] \
        .named_steps['poly'].poly.get_feature_names_out(numerical)
    importances = best_model.best_estimator_.named_steps['model'].feature_importances_
    important_features = pd.Series(importances, index=feature_names).sort_values(ascending=False)
    print("\n🔍 Top 10 Important Features:\n", important_features.head(10))

#### 12. Save the Best Model

In [None]:
joblib.dump(best_model, "sleep_duration_model.joblib")
print("\n💾 Model saved successfully as 'sleep_duration_model.joblib'")

#### 13. (Optional) Deployment API using FastAPI

In [None]:
app = FastAPI()

class InputData(BaseModel):
    Gender: str
    Occupation: str
    BMI_Category: str
    Age: float
    Physical_Activity_Level: float
    Stress_Level: float
    Heart_Rate: float
    Daily_Steps: float

@app.post("/predict")
def predict_sleep(data: InputData):
    input_dict = {
        "Gender": [data.Gender],
        "Occupation": [data.Occupation],
        "BMI Category": [data.BMI_Category],
        "Age": [data.Age],
        "Physical Activity Level": [data.Physical_Activity_Level],
        "Stress Level": [data.Stress_Level],
        "Heart Rate": [data.Heart_Rate],
        "Daily Steps": [data.Daily_Steps],
    }
    input_df = pd.DataFrame.from_dict(input_dict)
    model = joblib.load("sleep_duration_model.joblib")
    prediction = model.predict(input_df)[0]
    return {"predicted_sleep_duration": prediction}