In [3]:
import pandas as pd
import numpy as np

In [4]:
DF = pd.read_csv('synthetic_health_lifestyle_dataset.csv')
DF.head(2)

Unnamed: 0,ID,Age,Gender,Height_cm,Weight_kg,BMI,Smoker,Exercise_Freq,Diet_Quality,Alcohol_Consumption,Chronic_Disease,Stress_Level,Sleep_Hours
0,1,56,Other,177.6,37.3,11.8,Yes,,Poor,,No,9,8.5
1,2,69,Other,169.3,70.7,24.7,No,1-2 times/week,Good,High,No,2,5.9


In [5]:
df = DF[['Age','Gender','Smoker','Exercise_Freq','BMI','Stress_Level','Alcohol_Consumption','Diet_Quality','Sleep_Hours','Chronic_Disease']]

In [6]:
df.head()

Unnamed: 0,Age,Gender,Smoker,Exercise_Freq,BMI,Stress_Level,Alcohol_Consumption,Diet_Quality,Sleep_Hours,Chronic_Disease
0,56,Other,Yes,,11.8,9,,Poor,8.5,No
1,69,Other,No,1-2 times/week,24.7,2,High,Good,5.9,No
2,46,Female,No,Daily,27.3,3,Moderate,Excellent,4.8,No
3,32,Male,No,3-5 times/week,26.3,9,Moderate,Excellent,6.6,No
4,60,Male,No,3-5 times/week,24.1,6,Low,Excellent,6.1,Yes


In [5]:
df.isnull().sum()

Age                       0
Gender                    0
Smoker                    0
Exercise_Freq          1879
BMI                       0
Stress_Level              0
Alcohol_Consumption    1892
Diet_Quality              0
Sleep_Hours               0
Chronic_Disease           0
dtype: int64

In [7]:
df.shape

(7500, 10)

## Aam Zindgi

In [8]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [9]:
x_train,x_test,y_train,y_test = train_test_split(df[['Age','Gender','Smoker','Exercise_Freq','BMI','Stress_Level'
                                                     ,'Alcohol_Consumption','Diet_Quality','Sleep_Hours']],df[['Chronic_Disease']],test_size=0.2,random_state=0)

In [10]:
si = SimpleImputer(strategy='constant',fill_value="No")
x_train_exc = pd.DataFrame(si.fit_transform(x_train[['Exercise_Freq']]),columns=['Exercise_Freq'])
x_test_exc = pd.DataFrame(si.fit_transform(x_test[['Exercise_Freq']]),columns=['Exercise_Freq'])

In [11]:
x_train_exc.shape

(6000, 1)

In [12]:
x_train.head()

Unnamed: 0,Age,Gender,Smoker,Exercise_Freq,BMI,Stress_Level,Alcohol_Consumption,Diet_Quality,Sleep_Hours
1328,24,Female,No,3-5 times/week,25.8,10,Low,Average,5.2
2368,55,Male,Yes,1-2 times/week,25.7,7,High,Excellent,6.8
4113,51,Male,No,3-5 times/week,24.7,5,Moderate,Poor,8.9
1670,39,Female,Yes,1-2 times/week,39.4,8,Moderate,Poor,9.2
2364,51,Female,No,1-2 times/week,29.9,8,Moderate,Poor,6.6


In [13]:
si = SimpleImputer(strategy='constant',fill_value='Missing')
x_train_alc = pd.DataFrame(si.fit_transform(x_train[['Alcohol_Consumption']]),columns=['Alcohol_Consumption'])
x_test_alc = pd.DataFrame(si.fit_transform(x_test[['Alcohol_Consumption']]),columns=['Alcohol_Consumption'])
x_train_alc.head()

Unnamed: 0,Alcohol_Consumption
0,Low
1,High
2,Moderate
3,Moderate
4,Moderate


In [14]:
df['Smoker'].value_counts()

Smoker
No     5263
Yes    2237
Name: count, dtype: int64

In [15]:
ohe = OneHotEncoder(sparse_output=False,drop='first')
x_train_ohe = ohe.fit_transform(x_train[['Gender','Smoker']])
x_test_ohe = ohe.transform(x_test[['Gender','Smoker']])

In [16]:
x_train_ohe.shape

(6000, 3)

In [17]:
oe = OrdinalEncoder(categories=[['Low','High','Moderate','Missing'],
                                ['Poor','Good','Average','Excellent'],
                                ['Daily','1-2 times/week','3-5 times/week','No']])

x_train_combined = pd.concat([x_train_alc[['Alcohol_Consumption']].reset_index(drop=True),
                              x_train[['Diet_Quality']].reset_index(drop=True),x_train_exc[['Exercise_Freq']].reset_index(drop=True)],axis=1)
x_test_combined = pd.concat([x_test_alc[['Alcohol_Consumption']].reset_index(drop=True),
                              x_test[['Diet_Quality']].reset_index(drop=True),x_test_exc[['Exercise_Freq']].reset_index(drop=True)],axis=1)
x_train_oe = oe.fit_transform(x_train_combined)
x_test_oe = oe.transform(x_test_combined)


In [18]:
x_test_oe.shape

(1500, 3)

In [19]:
x_train_rest = x_train.drop(columns=['Gender','Smoker','Exercise_Freq','Alcohol_Consumption','Diet_Quality']).values
x_test_rest = x_test.drop(columns=['Gender','Smoker','Exercise_Freq','Alcohol_Consumption','Diet_Quality']).values

x_train_rest

array([[24. , 25.8, 10. ,  5.2],
       [55. , 25.7,  7. ,  6.8],
       [51. , 24.7,  5. ,  8.9],
       ...,
       [41. , 30.5, 10. ,  6.9],
       [39. , 25.4,  8. ,  5. ],
       [53. , 22.8,  3. ,  5.6]])

In [20]:
x_train_transformed = np.concatenate((x_train_ohe,x_train_oe,x_train_rest),axis=1)
x_test_transformed = np.concatenate((x_test_ohe,x_test_oe,x_test_rest),axis=1)

# Mentos Zindgi

In [22]:
from sklearn.compose import ColumnTransformer

In [29]:
transformer = ColumnTransformer(transformers=[
    ('tnf1',SimpleImputer(strategy='constant',fill_value='No'),['Alcohol_Consumption']),
    ('tnf2',SimpleImputer(strategy='constant',fill_value='Missing'),['Exercise_Freq']),
    ('tnf3',OneHotEncoder(drop='first',sparse_output=False),['Gender','Smoker']),
    ('tnf4',OrdinalEncoder(categories=[['Low','High','Moderate','Missing'],
                                ['Poor','Good','Average','Excellent'],
                                ['Daily','1-2 times/week','3-5 times/week','No']]),['Alcohol_Consumption','Diet_Quality','Exercise_Freq'])
],remainder='passthrough')

In [30]:
x_train_transformed = transformer.fit_transform(x_train)

ValueError: Found unknown categories [nan] in column 0 during fit