In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [13]:
# Step 1: Load the dataset
dataset = pd.read_csv('ObesityDataSet_raw_and_data_sinthetic.csv')
dataset

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.000000,1.620000,64.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,0.000000,1.000000,no,Public_Transportation,Normal_Weight
1,Female,21.000000,1.520000,56.000000,yes,no,3.0,3.0,Sometimes,yes,3.000000,yes,3.000000,0.000000,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.000000,1.800000,77.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,2.000000,1.000000,Frequently,Public_Transportation,Normal_Weight
3,Male,27.000000,1.800000,87.000000,no,no,3.0,3.0,Sometimes,no,2.000000,no,2.000000,0.000000,Frequently,Walking,Overweight_Level_I
4,Male,22.000000,1.780000,89.800000,no,no,2.0,1.0,Sometimes,no,2.000000,no,0.000000,0.000000,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,20.976842,1.710730,131.408528,yes,yes,3.0,3.0,Sometimes,no,1.728139,no,1.676269,0.906247,Sometimes,Public_Transportation,Obesity_Type_III
2107,Female,21.982942,1.748584,133.742943,yes,yes,3.0,3.0,Sometimes,no,2.005130,no,1.341390,0.599270,Sometimes,Public_Transportation,Obesity_Type_III
2108,Female,22.524036,1.752206,133.689352,yes,yes,3.0,3.0,Sometimes,no,2.054193,no,1.414209,0.646288,Sometimes,Public_Transportation,Obesity_Type_III
2109,Female,24.361936,1.739450,133.346641,yes,yes,3.0,3.0,Sometimes,no,2.852339,no,1.139107,0.586035,Sometimes,Public_Transportation,Obesity_Type_III


In [12]:
# Step 2: Perform EDA
# Example EDA procedures:
# Check data distribution
print(dataset.describe())
# Handle missing values
dataset = dataset.dropna()
dataset.info()
# Encode categorical variables
#dataset = pd.get_dummies(dataset)

               Age       Height       Weight         FCVC          NCP  \
count  2111.000000  2111.000000  2111.000000  2111.000000  2111.000000   
mean     24.312600     1.701677    86.586058     2.419043     2.685628   
std       6.345968     0.093305    26.191172     0.533927     0.778039   
min      14.000000     1.450000    39.000000     1.000000     1.000000   
25%      19.947192     1.630000    65.473343     2.000000     2.658738   
50%      22.777890     1.700499    83.000000     2.385502     3.000000   
75%      26.000000     1.768464   107.430682     3.000000     3.000000   
max      61.000000     1.980000   173.000000     3.000000     4.000000   

              CH2O          FAF          TUE  
count  2111.000000  2111.000000  2111.000000  
mean      2.008011     1.010298     0.657866  
std       0.612953     0.850592     0.608927  
min       1.000000     0.000000     0.000000  
25%       1.584812     0.124505     0.000000  
50%       2.000000     1.000000     0.625350  
75% 

In [16]:
# Step 3: Perform encoding for categorical variables
cat_vars = ['NObeyesdad', 'Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
label_encoders = {}
for var in cat_vars:
    label_encoders[var] = LabelEncoder()
    dataset[var] = label_encoders[var].fit_transform(dataset[var])

In [17]:
# Step 3: Split the dataset into input features and target variable
X = dataset.drop('NObeyesdad', axis=1)
y = dataset['NObeyesdad']

In [18]:
# Step 5: Perform data scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [19]:
# Step 6: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

In [20]:
# Step 7: Decision Tree model building
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

DecisionTreeClassifier()

In [21]:
# Step 8: Make predictions using the Decision Tree model
dt_predictions = dt_model.predict(X_test)

In [22]:
# Step 9: Evaluate the Decision Tree model
print('Decision Tree Model:')
print(classification_report(y_test, dt_predictions))

Decision Tree Model:
              precision    recall  f1-score   support

           0       0.90      0.95      0.93        86
           1       0.82      0.81      0.81        93
           2       0.98      0.90      0.94       102
           3       0.92      0.98      0.95        88
           4       1.00      0.99      0.99        98
           5       0.85      0.81      0.83        88
           6       0.90      0.95      0.93        79

    accuracy                           0.91       634
   macro avg       0.91      0.91      0.91       634
weighted avg       0.91      0.91      0.91       634



In [23]:
# Step 10: Random Forest model building
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

RandomForestClassifier()

In [24]:
# Step 11: Make predictions using the Random Forest model
rf_predictions = rf_model.predict(X_test)

In [25]:
# Step 12: Evaluate the Random Forest model
print('Random Forest Model:')
print(classification_report(y_test, rf_predictions))

Random Forest Model:
              precision    recall  f1-score   support

           0       0.98      0.93      0.95        86
           1       0.81      0.92      0.86        93
           2       0.98      0.96      0.97       102
           3       0.97      0.99      0.98        88
           4       1.00      0.99      0.99        98
           5       0.91      0.85      0.88        88
           6       0.97      0.95      0.96        79

    accuracy                           0.94       634
   macro avg       0.95      0.94      0.94       634
weighted avg       0.95      0.94      0.94       634

