In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
import configparser

In [11]:
dataset = pd.read_csv("../data/raw/ObesityDataSet_raw_and_data_sinthetic.csv", delimiter=';')
dataset.head()

Unnamed: 0,Age,Gender,Height,Weight,CALC,FAVC,FCVC,NCP,SCC,SMOKE,CH2O,family_history_with_overweight,FAF,TUE,CAEC,MTRANS,NObeyesdad
0,21.0,,1.62,64.0,no,no,2.0,,no,no,2.0,yes,,1.0,Sometimes,Public_Transportation,Normal_Weight
1,21.0,Female,1.52,56.0,Sometimes,no,3.0,3.0,yes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,23.0,Male,1.8,77.0,Frequently,no,2.0,3.0,no,no,2.0,yes,2.0,1.0,Sometimes,Public_Transportation,Normal_Weight
3,27.0,,1.8,87.0,Frequently,no,3.0,,no,no,2.0,no,,0.0,Sometimes,Walking,Overweight_Level_I
4,22.0,Male,1.78,89.8,Sometimes,no,2.0,1.0,no,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [12]:
config = configparser.ConfigParser()
config.read('../pipeline.cfg')

['../pipeline.cfg']

In [13]:
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.encoding import CountFrequencyEncoder
from feature_engine.outliers import Winsorizer
from feature_engine.transformation import LogTransformer

from sklearn.preprocessing import MinMaxScaler

In [14]:
#Leyenda de la codificacion
dataset['NObeyesdad'].value_counts()

NObeyesdad
Obesity_Type_I         351
Obesity_Type_III       324
Obesity_Type_II        297
Overweight_Level_I     290
Overweight_Level_II    290
Normal_Weight          287
Insufficient_Weight    272
Name: count, dtype: int64

In [15]:
#Codificacion de la variable target
frecuency_target = dataset['NObeyesdad'].value_counts(ascending=False).to_dict()
dataset['NObeyesdad'] = dataset['NObeyesdad'].map(frecuency_target)
dataset.head()

Unnamed: 0,Age,Gender,Height,Weight,CALC,FAVC,FCVC,NCP,SCC,SMOKE,CH2O,family_history_with_overweight,FAF,TUE,CAEC,MTRANS,NObeyesdad
0,21.0,,1.62,64.0,no,no,2.0,,no,no,2.0,yes,,1.0,Sometimes,Public_Transportation,287
1,21.0,Female,1.52,56.0,Sometimes,no,3.0,3.0,yes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,287
2,23.0,Male,1.8,77.0,Frequently,no,2.0,3.0,no,no,2.0,yes,2.0,1.0,Sometimes,Public_Transportation,287
3,27.0,,1.8,87.0,Frequently,no,3.0,,no,no,2.0,no,,0.0,Sometimes,Walking,290
4,22.0,Male,1.78,89.8,Sometimes,no,2.0,1.0,no,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,290


In [16]:
drop_vars = list(config.get('GENERAL', 'VARS_TO_DROP').split(', '))
drop_vars

['NObeyesdad']

In [17]:
x_features = dataset.drop(labels=drop_vars, axis=1)
y_target = dataset['NObeyesdad']
x_train, x_test, y_train, y_test = train_test_split(x_features, y_target, test_size=0.3, shuffle=True, random_state=2025)

In [18]:
obesity_predict_model = Pipeline([
    # imputación de variables continuas.
    ('continues_var_imputation', MeanMedianImputer(imputation_method='mean', variables=config.get('CONTINUES', 'CONTINUE_VARS_TO_IMPUTATION').split(', ') )),

    # imputación de variables categóricas
    ('categorical_var_imputation', CategoricalImputer(imputation_method='frequent', variables=config.get('CATEGORICAL', 'CATEGORICAL_VARS_TO_IMPUTATION').split(', ') )),

    # codificación de variables categoricas
    ('categorical_encode_frequency', CountFrequencyEncoder(encoding_method='count', variables=config.get('CATEGORICAL', 'FREQENC_VARS_TO_ENCODE').split(', ') )),

    # tratamiento de outliers
    ('continues_var_outliers', Winsorizer(capping_method='iqr', variables=config.get('CONTINUES', 'CONTINUE_VARS_TO_OUTLIERS').split(', ') )),

    # transformacion de variables
    ('continues_var_transform', LogTransformer(variables=config.get('CONTINUES', 'CONTINUE_VARS_TO_TRANSFORM').split(', ') )),

    # feature scaling
    ('feature_scaling', MinMaxScaler())
])

In [19]:
# configuramos pipeline
obesity_predict_model.fit(x_train)

In [20]:
x_features_processed = obesity_predict_model.transform(x_train)
df_features_process = pd.DataFrame(x_features_processed, columns=x_train.columns)
df_features_process['NObeyesdad'] = y_train.reset_index()['NObeyesdad']

# guardamos los datos para entrenar los modelos.
df_features_process.to_csv('../data/processed/features_for_model.csv', index=False)
df_features_process.head()

Unnamed: 0,Age,Gender,Height,Weight,CALC,FAVC,FCVC,NCP,SCC,SMOKE,CH2O,family_history_with_overweight,FAF,TUE,CAEC,MTRANS,NObeyesdad
0,0.894047,1.0,0.606729,0.779864,1.0,1.0,0.696594,0.883565,1.0,1.0,0.689907,1.0,0.131151,0.172944,1.0,0.261757,297
1,0.716781,1.0,0.54813,0.566201,1.0,1.0,0.63093,0.0,1.0,1.0,0.63093,1.0,0.0,0.0,0.0,0.261757,290
2,0.54179,0.0,0.376318,0.525878,0.45,1.0,0.993173,0.393193,1.0,1.0,0.949893,1.0,0.363825,0.444759,1.0,1.0,351
3,0.339512,1.0,0.637531,0.564626,1.0,1.0,0.922335,0.883565,1.0,1.0,0.987112,1.0,0.747585,0.393305,1.0,1.0,290
4,0.867552,0.0,0.355861,0.384696,1.0,0.0,1.0,0.883565,1.0,1.0,0.63093,0.0,0.333333,0.0,1.0,0.261757,290


In [21]:
import pickle

x_test['NObeyesdad'] = y_test
x_test.to_csv('../data/processed/test_dataset.csv', index=False)

with open('../artifacts/pipeline.pkl', 'wb') as f:
    pickle.dump(obesity_predict_model, f)

In [22]:
y_test

1497    351
1465    351
399     287
379     287
1504    351
       ... 
1264    351
1018    290
169     290
692     272
1962    324
Name: NObeyesdad, Length: 634, dtype: int64

In [23]:
x_test

Unnamed: 0,Age,Gender,Height,Weight,CALC,FAVC,FCVC,NCP,SCC,SMOKE,CH2O,family_history_with_overweight,FAF,TUE,CAEC,MTRANS,NObeyesdad
1497,18.000000,,1.692913,89.938890,no,yes,2.818502,,no,no,1.991251,yes,,0.000000,Sometimes,Public_Transportation,351
1465,21.624552,Male,1.790151,106.320686,Sometimes,yes,2.490776,3.000000,no,no,2.654517,yes,0.112454,0.756339,Sometimes,Public_Transportation,351
399,21.000000,Male,1.650000,60.000000,Sometimes,no,3.000000,1.000000,no,no,1.000000,no,0.000000,0.000000,Frequently,Motorbike,287
379,17.000000,Male,1.700000,70.000000,Sometimes,yes,3.000000,3.000000,no,no,2.000000,yes,0.000000,2.000000,Sometimes,Walking,287
1504,18.106820,Female,1.602129,82.412665,no,yes,2.319648,3.000000,no,no,1.107164,yes,0.692123,0.304020,Sometimes,Public_Transportation,351
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1264,37.631769,Female,1.513202,75.410647,Sometimes,yes,2.000000,2.582591,no,no,1.535134,yes,1.884520,0.000000,Sometimes,Automobile,351
1018,22.000000,Male,1.691303,80.539000,Sometimes,yes,2.000000,2.038373,no,no,2.000000,yes,2.708250,1.506576,Sometimes,Public_Transportation,290
169,45.000000,Female,1.630000,77.000000,no,yes,2.000000,3.000000,no,no,1.000000,yes,0.000000,0.000000,Frequently,Automobile,290
692,18.000000,,1.767058,51.132809,Sometimes,yes,2.708965,,no,no,1.873004,yes,,1.000000,Sometimes,Public_Transportation,272
