In [26]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer

In [27]:
# Read the raw dataset from the working directory and preview its first two rows.
DF = pd.read_csv('synthetic_health_lifestyle_dataset.csv')
DF.head(n=2)

Unnamed: 0,ID,Age,Gender,Height_cm,Weight_kg,BMI,Smoker,Exercise_Freq,Diet_Quality,Alcohol_Consumption,Chronic_Disease,Stress_Level,Sleep_Hours
0,1,56,Other,177.6,37.3,11.8,Yes,,Poor,,No,9,8.5
1,2,69,Other,169.3,70.7,24.7,No,1-2 times/week,Good,High,No,2,5.9


In [28]:
# Per-column count of missing values in the raw frame.
DF.isna().sum()

ID                        0
Age                       0
Gender                    0
Height_cm                 0
Weight_kg                 0
BMI                       0
Smoker                    0
Exercise_Freq          1879
Diet_Quality              0
Alcohol_Consumption    1892
Chronic_Disease           0
Stress_Level              0
Sleep_Hours               0
dtype: int64

In [29]:
# Working copy of the raw frame without the identifier and the two columns
# that are not used as model inputs.
df = DF.drop(['ID', 'Sleep_Hours', 'Stress_Level'], axis='columns')

In [30]:
# Preview the first five rows of the reduced frame.
df.head(n=5)

Unnamed: 0,Age,Gender,Height_cm,Weight_kg,BMI,Smoker,Exercise_Freq,Diet_Quality,Alcohol_Consumption,Chronic_Disease
0,56,Other,177.6,37.3,11.8,Yes,,Poor,,No
1,69,Other,169.3,70.7,24.7,No,1-2 times/week,Good,High,No
2,46,Female,159.1,69.0,27.3,No,Daily,Excellent,Moderate,No
3,32,Male,170.6,76.4,26.3,No,3-5 times/week,Excellent,Moderate,No
4,60,Male,158.4,60.4,24.1,No,3-5 times/week,Excellent,Low,Yes


In [31]:
# Frequency of each Diet_Quality level (used to enumerate the ordinal categories).
df.loc[:, 'Diet_Quality'].value_counts()

Diet_Quality
Good         1918
Poor         1899
Average      1879
Excellent    1804
Name: count, dtype: int64

In [40]:
tf1 = ColumnTransformer([
    ('simple_imp',SimpleImputer(strategy='constant',fill_value='Missing'),[6,8])
],remainder='passthrough')

In [52]:
tf2 = ColumnTransformer([
    ('ohe_',(OneHotEncoder(drop='first',sparse_output=False)),[1,5])
],remainder='passthrough')

In [46]:
tf3 = ColumnTransformer([
    ('ordinal', OrdinalEncoder(categories=[
        ['Low', 'High', 'Moderate', 'Missing'],                  # for Alcohol_Consumption (col 8)
        ['Daily', '1-2 times/week', '3-5 times/week', 'Missing'],# for Exercise_Freq (col 6)
        ['Poor', 'Good', 'Average', 'Excellent']                 # for Diet_Quality (col 7)
    ]), [8, 6, 7])  # pass all 3 columns at once
], remainder='passthrough')

In [35]:
tf4 = ColumnTransformer([
    ('scaler',MinMaxScaler(),[0,2,3,4])
],remainder='passthrough')

In [36]:
tf5 = DecisionTreeClassifier()

In [53]:
from sklearn.pipeline import Pipeline,make_pipeline
pipe = Pipeline([
    ('tf1' ,tf1),
    ('tf2' ,tf2),
    ('tf3' ,tf3),
    ('tf4' ,tf4),
    ('tf5' ,tf5)
])

In [50]:
# Hold out 20% of the rows for evaluation. Features are every column except
# the target; the target is the Chronic_Disease column.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns=['Chronic_Disease']),
    df['Chronic_Disease'],
    test_size=0.2,
    random_state=42,
)

# Integer-encode the target. The encoder is fitted on the training labels only
# and then re-used for the test labels, so both splits share one label->integer
# mapping (re-fitting on the test split could produce a different mapping).
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

In [54]:
# Fit every pipeline stage on the training split; y_train holds the
# un-encoded Chronic_Disease labels.
pipe.fit(X=x_train, y=y_train)

ValueError: Found unknown categories [0.0, 1.0] in column 0 during fit