In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,RobustScaler,LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression


In [59]:
df = pd.read_csv("ObesityDataSet_raw_and_data_sinthetic.csv")

In [60]:
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   int64  
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [62]:
df.shape

(2111, 17)

In [63]:
df.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.315964,1.70162,86.586035,2.418986,2.685651,2.008053,1.010313,0.657861
std,6.357078,0.093368,26.191163,0.533996,0.778079,0.61295,0.850613,0.608926
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,20.0,1.63,65.47,2.0,2.66,1.585,0.125,0.0
50%,23.0,1.7,83.0,2.39,3.0,2.0,1.0,0.625
75%,26.0,1.77,107.43,3.0,3.0,2.48,1.67,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


## checking for missing values

In [64]:
df.isna().sum()

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [65]:
num_cols = df.select_dtypes("number")
cat_cols = df.select_dtypes("object")

## handling outliers

In [66]:
def outlier_report():
    for col in num_cols:
                    Q1 = df[col].quantile(0.25)
                    Q3 = df[col].quantile(0.75)
                    IQR=Q3-Q1
                    lower_bound = Q1-1.5*IQR
                    upper_bound = Q3+1.5*IQR

                    outlier_count = ((df[col]<lower_bound)|(df[col]>upper_bound)).sum()
                    outlier_per = (outlier_count/len(df))*100
                    print(f"{col} : {outlier_per.__round__(3)}")

outlier_report()

Age : 7.579
Height : 0.0
Weight : 0.047
FCVC : 0.0
NCP : 27.475
CH2O : 0.0
FAF : 0.0
TUE : 0.0


In [67]:
## capping outliers
for col in num_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR=Q3-Q1
        lower_bound = Q1-1.5*IQR
        upper_bound = Q3+1.5*IQR
        df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

In [68]:
outlier_report()

Age : 0.0
Height : 0.0
Weight : 0.0
FCVC : 0.0
NCP : 0.0
CH2O : 0.0
FAF : 0.0
TUE : 0.0


## Handling skewness

In [69]:
def skewness (col):
    return df[col].skew().round(2)

for col in num_cols:
    print (f"{col} : {skewness(col)}")

Age : 0.79
Height : -0.01
Weight : 0.25
FCVC : -0.43
NCP : -0.67
CH2O : -0.11
FAF : 0.5
TUE : 0.62


No high skewness

## splitting data

In [70]:
x = df.drop('NObeyesdad',axis=1)
y= df['NObeyesdad']

x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=.2,random_state=42)

## Encoding


In [71]:
for col in cat_cols:
    print(f"{col} : {df[col].nunique()}")

Gender : 2
family_history_with_overweight : 2
FAVC : 2
CAEC : 4
SMOKE : 2
SCC : 2
CALC : 4
MTRANS : 5
NObeyesdad : 7


In [72]:
cat_cols = x_train.select_dtypes('object').columns
for col in cat_cols:
    le = LabelEncoder()
    x_train[col] = le.fit_transform(x_train[col])
    x_test[col] = le.transform(x_test[col])

x_train

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
162,0,21,1.63,60.00,1,1,3.00,3.00,0,1,2.00,0,2.00,0.000,2,3
2001,0,21,1.75,133.62,1,1,3.00,3.00,2,0,2.89,0,1.48,0.780,2,3
1435,0,23,1.66,82.60,1,1,1.20,2.15,2,0,2.77,0,0.13,1.659,2,3
649,0,22,1.59,44.24,0,0,3.00,2.15,1,0,2.55,0,1.10,0.000,3,3
1280,1,26,1.81,106.04,1,1,3.00,3.00,2,0,2.86,0,1.81,0.680,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1638,1,32,1.75,120.10,1,1,2.97,3.00,2,0,2.53,0,0.96,1.339,2,0
1095,1,23,1.72,81.67,1,1,2.00,2.15,2,0,1.40,0,0.89,1.012,2,3
1130,0,23,1.65,80.00,1,1,2.00,3.00,2,0,2.00,0,0.15,2.000,3,3
1294,0,23,1.63,84.50,1,1,2.06,2.96,2,0,2.01,0,0.85,0.631,3,3


## Scaling

In [73]:
scaler = StandardScaler()
x_train_s = scaler.fit_transform(x_train)
x_test_s = scaler.transform(x_test)
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22,1.78,89.8,no,no,2.0,2.15,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


## Models

In [74]:
## logistic regression
lo = LogisticRegression()
lo.fit(x_train_s,y_train)
y_pred = lo.predict(x_test)
print(classification_report(y_test, y_pred))

                     precision    recall  f1-score   support

Insufficient_Weight       0.00      0.00      0.00        56
      Normal_Weight       0.00      0.00      0.00        62
     Obesity_Type_I       0.00      0.00      0.00        78
    Obesity_Type_II       0.14      1.00      0.24        58
   Obesity_Type_III       0.00      0.00      0.00        63
 Overweight_Level_I       0.00      0.00      0.00        56
Overweight_Level_II       0.00      0.00      0.00        50

           accuracy                           0.14       423
          macro avg       0.02      0.14      0.03       423
       weighted avg       0.02      0.14      0.03       423



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [75]:
## descition tree
dt = DecisionTreeClassifier()
dt.fit(x_test_s,y_test)
y_pred_d = dt.predict(x_test)
print(classification_report(y_test, y_pred_d))

                     precision    recall  f1-score   support

Insufficient_Weight       0.00      0.00      0.00        56
      Normal_Weight       0.00      0.00      0.00        62
     Obesity_Type_I       0.21      0.83      0.33        78
    Obesity_Type_II       0.13      0.24      0.17        58
   Obesity_Type_III       0.00      0.00      0.00        63
 Overweight_Level_I       0.00      0.00      0.00        56
Overweight_Level_II       0.00      0.00      0.00        50

           accuracy                           0.19       423
          macro avg       0.05      0.15      0.07       423
       weighted avg       0.06      0.19      0.08       423



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
