# -------------------------------------------
# Sleep Duration Prediction using ML Pipeline
# -------------------------------------------

# Step 1: Data Preprocessing Pipeline

### Importing Libraries

In [27]:
import pandas as pd

In [28]:
import numpy as np

In [29]:
from sklearn.model_selection import train_test_split, GridSearchCV

In [30]:
from sklearn.impute import SimpleImputer

In [31]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures

In [32]:
from sklearn.compose import ColumnTransformer

In [33]:
from sklearn.pipeline import Pipeline

In [34]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [35]:
from sklearn.metrics import mean_absolute_error, r2_score

In [36]:
from sklearn.base import BaseEstimator, TransformerMixin

In [37]:
import joblib

In [38]:
!pip install fastapi



In [39]:
from fastapi import FastAPI

In [40]:
from pydantic import BaseModel

# Preprocessing all steps

In [41]:
df = pd.read_csv('data.csv')

df.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Blood Pressure           374 non-null    object 
 10  Heart Rate               374 non-null    int64  
 11  Daily Steps              374 non-null    int64  
 12  Sleep Disorder           155 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 38.1+ KB


In [44]:
df.tail()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
369,370,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
370,371,Female,59,Nurse,8.0,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
371,372,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
372,373,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
373,374,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea


In [45]:
df.shape

(374, 13)

In [46]:
df.tail(20)

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
354,355,Female,58,Nurse,8.0,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
355,356,Female,58,Nurse,8.0,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
356,357,Female,58,Nurse,8.0,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
357,358,Female,58,Nurse,8.0,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
358,359,Female,59,Nurse,8.0,9,75,3,Overweight,140/95,68,7000,
359,360,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,
360,361,Female,59,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
361,362,Female,59,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
362,363,Female,59,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
363,364,Female,59,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea


In [47]:
df.sample()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
333,334,Female,54,Engineer,8.4,9,30,3,Normal,125/80,65,5000,


In [50]:
df.describe()

Unnamed: 0,Person ID,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Heart Rate,Daily Steps
count,374.0,374.0,374.0,374.0,374.0,374.0,374.0,374.0
mean,187.5,42.184492,7.132086,7.312834,59.171123,5.385027,70.165775,6816.84492
std,108.108742,8.673133,0.795657,1.196956,20.830804,1.774526,4.135676,1617.915679
min,1.0,27.0,5.8,4.0,30.0,3.0,65.0,3000.0
25%,94.25,35.25,6.4,6.0,45.0,4.0,68.0,5600.0
50%,187.5,43.0,7.2,7.0,60.0,5.0,70.0,7000.0
75%,280.75,50.0,7.8,8.0,75.0,7.0,72.0,8000.0
max,374.0,59.0,8.5,9.0,90.0,8.0,86.0,10000.0


#### 2. Basic Cleaning

In [51]:
df = df.dropna(subset=['Sleep Duration'])  # Drop rows where target is missing

#### 3. Feature Definitions

In [52]:
target = 'Sleep Duration'
categorical = ['Gender', 'Occupation', 'BMI Category']
numerical = ['Age', 'Physical Activity Level', 'Stress Level', 'Heart Rate', 'Daily Steps']

In [53]:
X = df[categorical + numerical]
y = df[target]

### 4. Outlier Detection and Removing (Z-Score Method)

In [61]:
numeric_cols = ['Age', 'Heart Rate', 'Daily Steps', 'Physical Activity Level', 'Stress Level', 'Sleep Duration']
z_scores = np.abs(stats.zscore(df[numeric_cols]))
df = df[(z_scores < 3).all(axis=1)]
print("\n✅ Outliers removed.")

NameError: name 'stats' is not defined

#### 4. Custom Transformer for Polynomial Features

In [54]:
class AddPolynomialFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, degree=2):
        self.degree = degree
        self.poly = PolynomialFeatures(degree=self.degree, include_bias=False)

    def fit(self, X, y=None):
        self.feature_names = X.columns
        self.poly.fit(X)
        return self

    def transform(self, X):
        poly_features = self.poly.transform(X)
        poly_df = pd.DataFrame(poly_features, columns=self.poly.get_feature_names_out(self.feature_names), index=X.index)
        return poly_df


#### 5. Preprocessing Pipelines

In [55]:
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('poly', AddPolynomialFeatures(degree=2))
])

categorical_pipeline = Pipeline([
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numerical),
    ('cat', categorical_pipeline, categorical)
])

#### 6. Full Pipelines for Each Model

In [56]:
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

gb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor(random_state=42))
])


#### 7. Train-Test Split

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### 8. Hyperparameter Tuning

In [58]:
class AddPolynomialFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, degree=2):
        self.degree = degree
        self.poly = PolynomialFeatures(degree=self.degree, include_bias=False)

    def fit(self, X, y=None):
        self.poly.fit(X)
        return self

    def transform(self, X):
        poly_features = self.poly.transform(X)
        try:
            # Try getting feature names if X is a DataFrame
            feature_names = X.columns
        except AttributeError:
            # Otherwise, create dummy feature names for numpy arrays
            feature_names = [f"x{i}" for i in range(X.shape[1])]
        poly_feature_names = self.poly.get_feature_names_out(feature_names)
        return pd.DataFrame(poly_features, columns=poly_feature_names, index=None)

#### 9. Evaluation

In [59]:
def evaluate_model(model, X_test, y_test, model_name):
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"🔹 {model_name} Evaluation:")
    print(f"    MAE: {mae:.3f}")
    print(f"    R² Score: {r2:.3f}\n")
    return mae, r2

print("\n📊 Model Evaluation Results:")
mae_rf, r2_rf = evaluate_model(grid_rf, X_test, y_test, "Random Forest")
mae_gb, r2_gb = evaluate_model(grid_gb, X_test, y_test, "Gradient Boosting")


📊 Model Evaluation Results:


NameError: name 'grid_rf' is not defined

#### 10. Best Model Selection (based on MAE)

In [None]:
best_model = grid_rf if mae_rf <= mae_gb else grid_gb
print(f"✅ Best model selected: {'Random Forest' if best_model == grid_rf else 'Gradient Boosting'}")

#### 11. Feature Importance (for Random Forest)

In [None]:
if hasattr(best_model.best_estimator_.named_steps['model'], 'feature_importances_'):
    feature_names = best_model.best_estimator_.named_steps['preprocessor'].transformers_[0][1] \
        .named_steps['poly'].poly.get_feature_names_out(numerical)
    importances = best_model.best_estimator_.named_steps['model'].feature_importances_
    important_features = pd.Series(importances, index=feature_names).sort_values(ascending=False)
    print("\n🔍 Top 10 Important Features:\n", important_features.head(10))

#### 12. Save the Best Model

In [None]:
joblib.dump(best_model, "sleep_duration_model.joblib")
print("\n💾 Model saved successfully as 'sleep_duration_model.joblib'")

#### 13. (Optional) Deployment API using FastAPI

In [None]:
app = FastAPI()

class InputData(BaseModel):
    Gender: str
    Occupation: str
    BMI_Category: str
    Age: float
    Physical_Activity_Level: float
    Stress_Level: float
    Heart_Rate: float
    Daily_Steps: float

@app.post("/predict")
def predict_sleep(data: InputData):
    input_dict = {
        "Gender": [data.Gender],
        "Occupation": [data.Occupation],
        "BMI Category": [data.BMI_Category],
        "Age": [data.Age],
        "Physical Activity Level": [data.Physical_Activity_Level],
        "Stress Level": [data.Stress_Level],
        "Heart Rate": [data.Heart_Rate],
        "Daily Steps": [data.Daily_Steps],
    }
    input_df = pd.DataFrame.from_dict(input_dict)
    model = joblib.load("sleep_duration_model.joblib")
    prediction = model.predict(input_df)[0]
    return {"predicted_sleep_duration": prediction}