# -------------------------------------------
# Sleep Duration Prediction using ML Pipeline
# -------------------------------------------

# Step 1: Data Preprocessing Pipeline

### Importing Libraries

In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
import joblib

In [None]:
!pip install fastapi

In [None]:
from fastapi import FastAPI

In [None]:
from pydantic import BaseModel

#### 1. Dataset Loading

In [None]:
# Load the raw dataset; the relative path is resolved against the current
# working directory.
df = pd.read_csv("data.csv")

In [None]:
# Preview the first rows to sanity-check the loaded columns.
print(df.head())

#### 2. Basic Cleaning

In [None]:
# Rows without a target value cannot be used for supervised training.
# Missing feature values are kept here; the numeric pipeline imputes them.
df = df.dropna(subset=['Sleep Duration'])  # Drop rows where target is missing

#### 3. Feature Definitions

In [None]:
# Column to predict.
target = 'Sleep Duration'
# Columns routed to the one-hot encoding branch of the preprocessor.
categorical = ['Gender', 'Occupation', 'BMI Category']
# Columns routed to the impute -> scale -> polynomial branch of the preprocessor.
numerical = ['Age', 'Physical Activity Level', 'Stress Level', 'Heart Rate', 'Daily Steps']

In [None]:
# Feature matrix: categorical columns first, then numerical ones.
X = df[categorical + numerical]
# Target vector.
y = df[target]

#### 4. Custom Transformer for Polynomial Features

In [None]:
class AddPolynomialFeatures(BaseEstimator, TransformerMixin):
    """Expand the input columns into polynomial and interaction terms.

    Thin wrapper around ``PolynomialFeatures(include_bias=False)`` that
    always returns a ``pandas.DataFrame`` with readable column names.

    The transformer accepts both DataFrames and plain NumPy arrays. The
    latter matters inside a ``Pipeline``: steps such as ``SimpleImputer``
    and ``StandardScaler`` emit arrays by default, so relying on
    ``X.columns`` / ``X.index`` would raise ``AttributeError``.

    Parameters
    ----------
    degree : int, default=2
        Maximum degree of the generated polynomial features.

    Attributes
    ----------
    poly : PolynomialFeatures
        The wrapped transformer; re-created and fitted on every ``fit``.
    feature_names : pandas.Index or None
        Column labels seen during ``fit``, or ``None`` when the input
        carried no column labels.
    """

    def __init__(self, degree=2):
        self.degree = degree
        # Kept for backward compatibility with code that reads ``.poly``
        # before fitting; ``fit`` rebuilds it from the current ``degree``.
        self.poly = PolynomialFeatures(degree=self.degree, include_bias=False)

    def fit(self, X, y=None):
        """Fit the polynomial expansion on ``X`` and return ``self``."""
        # Rebuild so a degree changed through ``set_params`` (e.g. by a
        # grid search) is honoured instead of the value seen in __init__.
        self.poly = PolynomialFeatures(degree=self.degree, include_bias=False)
        # Arrays have no column labels; None makes the wrapped transformer
        # fall back to its generated default names.
        self.feature_names = getattr(X, "columns", None)
        self.poly.fit(X)
        return self

    def transform(self, X):
        """Return the polynomial features of ``X`` as a DataFrame.

        The row index of ``X`` is preserved when it has one; otherwise a
        default integer index is used.
        """
        poly_features = self.poly.transform(X)
        column_names = self.poly.get_feature_names_out(self.feature_names)
        row_index = getattr(X, "index", None)
        return pd.DataFrame(poly_features, columns=column_names, index=row_index)


#### 5. Preprocessing Pipelines

In [None]:
# Numeric branch: fill gaps with the median, standardize, then expand
# into degree-2 polynomial terms.
numeric_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('poly', AddPolynomialFeatures(degree=2)),
    ]
)

# Categorical branch: one-hot encode, dropping the first level of each column.
# NOTE(review): with handle_unknown='ignore' an unseen category is expected to
# encode as all zeros, i.e. the same as the dropped first level - confirm this
# is acceptable for the model.
categorical_pipeline = Pipeline(
    steps=[
        ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
    ]
)

# Route each column group to its branch; numeric output columns come first.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numerical),
        ('cat', categorical_pipeline, categorical),
    ]
)

#### 6. Full Pipelines for Each Model

In [None]:
# Random Forest candidate: shared preprocessing followed by the regressor.
rf_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(random_state=42)),
    ]
)

# Gradient Boosting candidate: same preprocessing, different regressor.
gb_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', GradientBoostingRegressor(random_state=42)),
    ]
)


#### 7. Train-Test Split

In [None]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### 8. Hyperparameter Tuning

In [None]:
# Search space shared by both candidates; the ``model__`` prefix targets the
# 'model' step of each pipeline.
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [3, 5, 7],
}

# 5-fold cross-validated search for the Random Forest pipeline, scored by
# negated MAE and run on all available cores.
grid_rf = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)

# Identical search settings for the Gradient Boosting pipeline.
grid_gb = GridSearchCV(
    estimator=gb_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
)
grid_gb.fit(X_train, y_train)

#### 9. Evaluation

In [None]:
def evaluate_model(model, X_test, y_test, model_name):
    """Score ``model`` on the held-out data and print a short report.

    Parameters
    ----------
    model : object exposing ``predict(X)``
    X_test : features passed to ``model.predict``
    y_test : true target values compared against the predictions
    model_name : label used in the printed report

    Returns
    -------
    tuple
        ``(mae, r2)`` - mean absolute error and R² score.
    """
    predictions = model.predict(X_test)
    mae_value, r2_value = (
        mean_absolute_error(y_test, predictions),
        r2_score(y_test, predictions),
    )

    print(f"🔹 {model_name} Evaluation:")
    print(f"    MAE: {mae_value:.3f}")
    print(f"    R² Score: {r2_value:.3f}\n")

    return mae_value, r2_value

# Evaluate both tuned searches on the held-out test split; the returned
# metrics are kept for the best-model selection that follows.
print("\n📊 Model Evaluation Results:")
mae_rf, r2_rf = evaluate_model(grid_rf, X_test, y_test, "Random Forest")
mae_gb, r2_gb = evaluate_model(grid_gb, X_test, y_test, "Gradient Boosting")

#### 10. Best Model Selection (based on MAE)

In [None]:
# Lower test MAE wins; a tie goes to Random Forest.
best_model = grid_rf if mae_rf <= mae_gb else grid_gb
# Identity check: the question is which search object was picked, not
# whether two estimators compare equal.
print(f"✅ Best model selected: {'Random Forest' if best_model is grid_rf else 'Gradient Boosting'}")

#### 11. Feature Importance (for models exposing `feature_importances_`)

In [None]:
# Report feature importances when the selected regressor provides them.
fitted_pipeline = best_model.best_estimator_
fitted_model = fitted_pipeline.named_steps['model']
if hasattr(fitted_model, 'feature_importances_'):
    fitted_preprocessor = fitted_pipeline.named_steps['preprocessor']
    # Names of the polynomial terms produced by the numeric branch.
    numeric_feature_names = fitted_preprocessor.named_transformers_['num'] \
        .named_steps['poly'].poly.get_feature_names_out(numerical)
    # Names of the one-hot columns produced by the categorical branch.
    categorical_feature_names = fitted_preprocessor.named_transformers_['cat'] \
        .named_steps['encoder'].get_feature_names_out(categorical)
    # The preprocessor emits the 'num' columns before the 'cat' columns, so the
    # names are concatenated in that order. Labelling the importances with
    # numeric names only would fail with a length mismatch.
    feature_names = list(numeric_feature_names) + list(categorical_feature_names)
    importances = fitted_model.feature_importances_
    important_features = pd.Series(importances, index=feature_names).sort_values(ascending=False)
    print("\n🔍 Top 10 Important Features:\n", important_features.head(10))

#### 12. Save the Best Model

In [None]:
# Persist the selected search object (the whole fitted GridSearchCV, not only
# its best_estimator_) next to the notebook.
joblib.dump(best_model, "sleep_duration_model.joblib")
print("\n💾 Model saved successfully as 'sleep_duration_model.joblib'")

#### 13. (Optional) Deployment API using FastAPI

In [None]:
# ASGI application object on which the prediction route is registered.
app = FastAPI()

class InputData(BaseModel):
    """Request body of the /predict endpoint.

    Field names use underscores; the handler maps them to the
    space-separated column names the model was trained on.
    """
    # Categorical features.
    Gender: str
    Occupation: str
    BMI_Category: str
    # Numerical features.
    Age: float
    Physical_Activity_Level: float
    Stress_Level: float
    Heart_Rate: float
    Daily_Steps: float

# Location of the model written by the training step.
_MODEL_PATH = "sleep_duration_model.joblib"
# Filled on first use so the model is deserialized once per process rather
# than on every request.
_cached_model = None


def _get_model():
    """Return the trained model, loading it from disk on the first call."""
    global _cached_model
    if _cached_model is None:
        # joblib files are pickles: only load files this project produced.
        # NOTE: unpickling needs AddPolynomialFeatures to be importable under
        # the same module it was defined in when the model was saved.
        _cached_model = joblib.load(_MODEL_PATH)
    return _cached_model


@app.post("/predict")
def predict_sleep(data: InputData):
    """Predict the sleep duration for a single record.

    Parameters
    ----------
    data : InputData
        Validated request body with one value per model feature.

    Returns
    -------
    dict
        ``{"predicted_sleep_duration": <float>}``.
    """
    # Build a one-row frame whose column names match the training data
    # (spaces instead of the underscores used in the request schema).
    input_df = pd.DataFrame.from_dict({
        "Gender": [data.Gender],
        "Occupation": [data.Occupation],
        "BMI Category": [data.BMI_Category],
        "Age": [data.Age],
        "Physical Activity Level": [data.Physical_Activity_Level],
        "Stress Level": [data.Stress_Level],
        "Heart Rate": [data.Heart_Rate],
        "Daily Steps": [data.Daily_Steps],
    })
    prediction = _get_model().predict(input_df)[0]
    # Cast the NumPy scalar to a built-in float so the response is always
    # JSON-serializable.
    return {"predicted_sleep_duration": float(prediction)}