# Step 1: Data Preprocessing Pipeline

### Importing Libraries

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from sklearn.impute import SimpleImputer

In [6]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures

In [7]:
from sklearn.compose import ColumnTransformer

In [8]:
from sklearn.pipeline import Pipeline

In [9]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [10]:
from sklearn.model_selection import GridSearchCV, cross_val_score

In [11]:
from sklearn.metrics import mean_absolute_error, r2_score

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin

In [13]:
import joblib

In [14]:
!pip install fastapi



In [15]:
from fastapi import FastAPI

In [16]:
from pydantic import BaseModel

### Dataset Loading

In [17]:
# Read the dataset from the CSV file in the current working directory.
df = pd.read_csv(filepath_or_buffer="data.csv")

In [18]:
# Show the first five rows as plain text to check that the file loaded.
print(df.head(n=5))

   Person ID Gender  Age            Occupation  Sleep Duration  \
0          1   Male   27     Software Engineer             6.1   
1          2   Male   28                Doctor             6.2   
2          3   Male   28                Doctor             6.2   
3          4   Male   28  Sales Representative             5.9   
4          5   Male   28  Sales Representative             5.9   

   Quality of Sleep  Physical Activity Level  Stress Level BMI Category  \
0                 6                       42             6   Overweight   
1                 6                       60             8       Normal   
2                 6                       60             8       Normal   
3                 4                       30             8        Obese   
4                 4                       30             8        Obese   

  Blood Pressure  Heart Rate  Daily Steps Sleep Disorder  
0         126/83          77         4200            NaN  
1         125/80          75      

## Feature Engineering and Preprocessing
### Step 1: Identify Features

In [19]:
# Name of the column stored as the prediction target.
# NOTE(review): the df.head() preview printed earlier shows this column as
# 'Sleep Duration' (with a space, no underscore) — confirm the columns are
# renamed before this name is used to index the DataFrame.
target = 'Sleep_Duration'

In [20]:
# Columns handed to the categorical branch of the preprocessor (one-hot encoded).
categorical = ['Gender']

In [21]:
# Columns handed to the numeric branch of the preprocessor (median imputation
# followed by standard scaling).
# NOTE(review): the df.head() preview printed earlier lists 'Physical Activity
# Level' (with spaces) and shows no 'Caffeine_Intake' or 'Screen_Time' columns
# — confirm these columns exist under these names before fitting.
numerical = ['Age', 'Caffeine_Intake', 'Screen_Time', 'Physical_Activity_Level']

### Custom Transformer for Interaction Features

In [22]:
class InteractionFeatures(BaseEstimator, TransformerMixin):
    """Append the element-wise product of two columns as a new feature.

    With the default arguments this multiplies ``Screen_Time`` by
    ``Caffeine_Intake`` and stores the result in
    ``Screen_Caffeine_Interaction``. The column names are constructor
    parameters so the same transformer can be reused for other column pairs.

    Parameters
    ----------
    left : str, default='Screen_Time'
        Name of the first input column.
    right : str, default='Caffeine_Intake'
        Name of the second input column.
    output : str, default='Screen_Caffeine_Interaction'
        Name of the column that receives the product.
    """

    def __init__(self, left='Screen_Time', right='Caffeine_Intake',
                 output='Screen_Caffeine_Interaction'):
        # Constructor arguments are stored unmodified under their own names,
        # which is what BaseEstimator's parameter introspection expects.
        self.left = left
        self.right = right
        self.output = output

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this transformer learns no state."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the interaction column added.

        The product column is only added when ``X`` exposes a ``columns``
        attribute containing both input columns; otherwise the copy is
        returned unchanged. The caller's object is never modified.
        """
        X = X.copy()
        # Look the names up in the column index explicitly instead of relying
        # on the container's own ``in`` semantics, which differ for arrays.
        columns = getattr(X, 'columns', None)
        if columns is not None and self.left in columns and self.right in columns:
            X[self.output] = X[self.left] * X[self.right]
        return X

### Preprocessing Pipeline

In [23]:
# Numeric branch: replace missing values with the column median, then
# standardise each column.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: replace missing values with the most frequent category
# (mirroring the imputation done on the numeric branch), then one-hot encode.
# handle_unknown='ignore' keeps transform() from raising when it meets a
# category that was not present during fit; such rows are encoded as all
# zeros, i.e. the same as the dropped first category.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

### Full Column Transformer

In [24]:
# Route each column group through its own sub-pipeline.
# ColumnTransformer drops every column that is not listed (remainder='drop'),
# so the 'Screen_Caffeine_Interaction' column that InteractionFeatures adds
# just before this step in full_pipeline must be listed here, otherwise the
# engineered feature is silently discarded.
# NOTE(review): this makes the preprocessor expect that column in its input;
# confirm it is not fitted anywhere without the interaction step in front.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical + ['Screen_Caffeine_Interaction']),
        ('cat', categorical_transformer, categorical)
    ],
    remainder='drop'
)

### Polynomial Features

In [25]:
# Degree-2 polynomial expansion: squares and pairwise products are both kept
# (interaction_only=False) and no constant bias column is added.
poly = PolynomialFeatures(
    degree=2,
    interaction_only=False,
    include_bias=False,
)

### Full Pipeline with Feature Engineering

In [26]:
# Feature-engineering pipeline; the steps run in the order listed:
#   1. add the interaction column,
#   2. impute / scale / encode via the column transformer,
#   3. expand the result with polynomial terms.
full_pipeline = Pipeline([
    ('interaction', InteractionFeatures()),
    ('preprocess', preprocessor),
    ('poly', poly),
])

### Reload the Dataset

In [27]:
# Re-read the same CSV file that was loaded at the start of the notebook,
# rebinding df to a fresh copy of the data.
df = pd.read_csv(filepath_or_buffer='data.csv')

### Split the data into features and target