Imagine you have a dataset where you have different features like Age, Gender, Height, Weight, BMI, and Blood Pressure and you have to classify the people into different classes like Normal, Overweight, Obesity, Underweight, and Extreme Obesity by using any 4 different classification algorithms. Now you have to build a model which can classify people into different classes.



In [107]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score,roc_curve, \
                            confusion_matrix

In [72]:
# Location of the raw obesity CSV on GitHub; read_csv fetches it over HTTPS,
# so this cell needs network access.
url = "https://raw.githubusercontent.com/jeckymaster/Assignments_16-05-2023/main/ObesityDataSet_raw_and_data_sinthetic.csv"
df = pd.read_csv(url)

In [73]:
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [74]:
df.shape

(2111, 17)

##### The dataset consists of 2111 rows and 17 columns, including the output column.

In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [76]:
df.isnull().sum()

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

##### There are no null values anywhere in the data set.

##### Let's handle the duplicates in the data set.

In [77]:
dup_rows = df[df.duplicated()]
dup_rows

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
98,Female,21.0,1.52,42.0,no,no,3.0,1.0,Frequently,no,1.0,no,0.0,0.0,Sometimes,Public_Transportation,Insufficient_Weight
106,Female,25.0,1.57,55.0,no,yes,2.0,1.0,Sometimes,no,2.0,no,2.0,0.0,Sometimes,Public_Transportation,Normal_Weight
174,Male,21.0,1.62,70.0,no,yes,2.0,1.0,no,no,3.0,no,1.0,0.0,Sometimes,Public_Transportation,Overweight_Level_I
179,Male,21.0,1.62,70.0,no,yes,2.0,1.0,no,no,3.0,no,1.0,0.0,Sometimes,Public_Transportation,Overweight_Level_I
184,Male,21.0,1.62,70.0,no,yes,2.0,1.0,no,no,3.0,no,1.0,0.0,Sometimes,Public_Transportation,Overweight_Level_I
209,Female,22.0,1.69,65.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,1.0,1.0,Sometimes,Public_Transportation,Normal_Weight
309,Female,16.0,1.66,58.0,no,no,2.0,1.0,Sometimes,no,1.0,no,0.0,1.0,no,Walking,Normal_Weight
460,Female,18.0,1.62,55.0,yes,yes,2.0,3.0,Frequently,no,1.0,no,1.0,1.0,no,Public_Transportation,Normal_Weight
467,Male,22.0,1.74,75.0,yes,yes,3.0,3.0,Frequently,no,1.0,no,1.0,0.0,no,Automobile,Normal_Weight
496,Male,18.0,1.72,53.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,0.0,2.0,Sometimes,Public_Transportation,Insufficient_Weight


In [78]:
df = df.drop_duplicates(keep='first')

In [79]:
df.shape

(2087, 17)

In [80]:
df.duplicated().sum()

0

##### The duplicates have been handled by removing every repeated row and keeping only the first occurrence.

In [81]:
# Adding the body mass index column to the dataframe: BMI = weight / height^2
# (Height values such as 1.62 and Weight values such as 64.0 suggest metres and
# kilograms — confirm against the dataset description)
df['BMI'] = df['Weight'] / df['Height']**2

In [82]:
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad,BMI
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight,24.386526
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight,24.238227
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight,23.765432
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I,26.851852
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II,28.342381


The classification of BMI categories for adults:

- Underweight: BMI less than 18.5
- Normal weight: BMI between 18.5 and 24.9
- Overweight: BMI between 25 and 29.9
- Obesity (Class I): BMI between 30 and 34.9
- Obesity (Class II): BMI between 35 and 39.9
- Extreme obesity (Class III): BMI 40 or higher

The ranges above are the standard BMI categories for adults.

In [83]:
# Find out which columns have fewer than 10 unique values
def find_uniq_values(dataframe, max_unique=10):
    """Print the distinct values of every low-cardinality column.

    Parameters:
        dataframe: the pandas DataFrame to inspect. Only this argument is
            read; the function does not depend on any global frame.
        max_unique: a column is reported when its number of unique values
            is strictly below this threshold (default 10).

    Returns: None. One line per qualifying column is printed.
    """
    for col in dataframe.columns:
        series = dataframe[col]
        if series.nunique() < max_unique:
            print(f"Column: {col} -- Unique values are -- {series.unique()}")

In [84]:
find_uniq_values(df)

Column: Gender -- Unique values are -- ['Female' 'Male']
Column: family_history_with_overweight -- Unique values are -- ['yes' 'no']
Column: FAVC -- Unique values are -- ['no' 'yes']
Column: CAEC -- Unique values are -- ['Sometimes' 'Frequently' 'Always' 'no']
Column: SMOKE -- Unique values are -- ['no' 'yes']
Column: SCC -- Unique values are -- ['no' 'yes']
Column: CALC -- Unique values are -- ['no' 'Sometimes' 'Frequently' 'Always']
Column: MTRANS -- Unique values are -- ['Public_Transportation' 'Walking' 'Automobile' 'Motorbike' 'Bike']
Column: NObeyesdad -- Unique values are -- ['Normal_Weight' 'Overweight_Level_I' 'Overweight_Level_II'
 'Obesity_Type_I' 'Insufficient_Weight' 'Obesity_Type_II'
 'Obesity_Type_III']


In [85]:
# One-hot encode the categorical columns that have more than two categories
columns_to_encode = ['CAEC', 'CALC', 'MTRANS']

# get_dummies replaces each listed column with one indicator column per
# category, named <column>_<category> (e.g. CAEC_no, MTRANS_Walking)
df = pd.get_dummies(df, columns=columns_to_encode)

In [86]:
# Label-encode every column that holds exactly two distinct values

def encode_binary(dataframe):
    """Replace each two-valued column of `dataframe` with integer codes.

    The frame is modified in place and nothing is returned. A column
    qualifies when `nunique()` reports exactly 2; its values are then
    passed through `LabelEncoder.fit_transform`.
    """
    encoder = LabelEncoder()
    two_valued = [
        name for name in dataframe.columns
        if dataframe[name].nunique() == 2
    ]
    for name in two_valued:
        # Refit on each column so the codes are local to that column.
        dataframe[name] = encoder.fit_transform(dataframe[name])
            


In [87]:
encode_binary(df)


In [88]:
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,SMOKE,CH2O,...,CAEC_no,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,0,21.0,1.62,64.0,1,0,2.0,3.0,0,2.0,...,0,0,0,0,1,0,0,0,1,0
1,0,21.0,1.52,56.0,1,0,3.0,3.0,1,3.0,...,0,0,0,1,0,0,0,0,1,0
2,1,23.0,1.8,77.0,1,0,2.0,3.0,0,2.0,...,0,0,1,0,0,0,0,0,1,0
3,1,27.0,1.8,87.0,0,0,3.0,3.0,0,2.0,...,0,0,1,0,0,0,0,0,0,1
4,1,22.0,1.78,89.8,0,0,2.0,1.0,0,2.0,...,0,0,0,1,0,0,0,0,1,0


#### Let's do the train-test split for further preprocessing and training the model

In [89]:
y = df['NObeyesdad']

In [121]:
# Collapse the dataset's seven NObeyesdad labels into the five target classes
# named in the task statement (both overweight levels merge into 'Overweight',
# obesity types I and II merge into 'Obesity').
weight_mapping = {
    'Normal_Weight': 'Normal',
    'Overweight_Level_I': 'Overweight',
    'Overweight_Level_II': 'Overweight',
    'Insufficient_Weight': 'Underweight',
    'Obesity_Type_I': 'Obesity',
    'Obesity_Type_II': 'Obesity',
    'Obesity_Type_III': 'Extreme Obesity'
}

In [122]:
# Map straight from the original column so re-running this cell always gives
# the same result; mapping an already-mapped `y` would turn every label into
# NaN because the new class names are not keys of weight_mapping.
y = df['NObeyesdad'].map(weight_mapping)

In [125]:
# Features are every column except the target.
# NOTE(review): X keeps Height, Weight and the BMI column derived from them.
# If the target labels were themselves assigned from BMI, this is close to
# label leakage and would explain the near-perfect scores reported below —
# confirm against the dataset description.
X = df.drop('NObeyesdad', axis=1)

In [134]:
# Create a function which evaluates models and prints a report for each
def evaluate_models(X, y, models, test_size=0.2, random_state=42):
    '''
    Train every model on one shared train/test split and print its report.

    Parameters:
        X: feature table handed to train_test_split.
        y: target labels aligned row-for-row with X.
        models: dict mapping a display name to an unfitted estimator that
            exposes fit and predict.
        test_size: fraction of rows held out for testing (default 0.2).
        random_state: seed for the split, so every model is scored on the
            same partition (default 42).

    Returns: None. For each model, a classification report for the test
    set is printed between two rows of '#' characters. The estimators in
    `models` are fitted in place.
    '''
    # separate dataset into train and test once, outside the model loop
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    for name, model in models.items():
        model.fit(X_train, y_train)  # Train model

        # Make predictions on the held-out rows
        predictions = model.predict(X_test)

        print('#'*76)
        print('*'*30, name, '*'*31)
        print(classification_report(y_test, predictions))
        print('#'*76)

In [135]:
# Dictionary which contains models for experiment (display name -> estimator),
# all created with their default hyper-parameters.
# NOTE(review): no random_state is passed, so the Random Forest and Decision
# Tree results can differ slightly between runs. No feature scaling is applied
# before SVC / KNN in the cells shown here.
models = {
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "SVM Classifier (SVC)" : SVC(),
    "KNN Classfier": KNeighborsClassifier()
}

In [136]:
evaluate_models(X, y, models)

############################################################################
****************************** Random Forest *******************************
                 precision    recall  f1-score   support

Extreme Obesity       1.00      1.00      1.00        60
         Normal       0.98      0.98      0.98        61
        Obesity       1.00      1.00      1.00       134
     Overweight       1.00      0.99      1.00       104
    Underweight       0.98      1.00      0.99        59

       accuracy                           1.00       418
      macro avg       0.99      0.99      0.99       418
   weighted avg       1.00      1.00      1.00       418

############################################################################
############################################################################
****************************** Decision Tree *******************************
                 precision    recall  f1-score   support

Extreme Obesity       1.00      1.00     