In [None]:
!pip install ucimlrepo pyarrow

import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
# Load the dataset registered under id 544 through ucimlrepo's fetch helper.
UCI_DATASET_ID = 544
dataset = fetch_ucirepo(id=UCI_DATASET_ID)

In [None]:
# split into features and targets
# Take explicit copies: pandas treats the frames exposed by the dataset object
# as slices of another DataFrame, so assigning to their columns afterwards
# raises SettingWithCopyWarning. Independent copies make later edits unambiguous.
X = dataset.data.features.copy()
y = dataset.data.targets.copy()

In [None]:
# Preview the raw feature frame (the rendered output abbreviates the middle rows).
X

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,Female,21.000000,1.620000,64.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,0.000000,1.000000,no,Public_Transportation
1,Female,21.000000,1.520000,56.000000,yes,no,3.0,3.0,Sometimes,yes,3.000000,yes,3.000000,0.000000,Sometimes,Public_Transportation
2,Male,23.000000,1.800000,77.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,2.000000,1.000000,Frequently,Public_Transportation
3,Male,27.000000,1.800000,87.000000,no,no,3.0,3.0,Sometimes,no,2.000000,no,2.000000,0.000000,Frequently,Walking
4,Male,22.000000,1.780000,89.800000,no,no,2.0,1.0,Sometimes,no,2.000000,no,0.000000,0.000000,Sometimes,Public_Transportation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,20.976842,1.710730,131.408528,yes,yes,3.0,3.0,Sometimes,no,1.728139,no,1.676269,0.906247,Sometimes,Public_Transportation
2107,Female,21.982942,1.748584,133.742943,yes,yes,3.0,3.0,Sometimes,no,2.005130,no,1.341390,0.599270,Sometimes,Public_Transportation
2108,Female,22.524036,1.752206,133.689352,yes,yes,3.0,3.0,Sometimes,no,2.054193,no,1.414209,0.646288,Sometimes,Public_Transportation
2109,Female,24.361936,1.739450,133.346641,yes,yes,3.0,3.0,Sometimes,no,2.852339,no,1.139107,0.586035,Sometimes,Public_Transportation


In [None]:
# Show the dtype of every feature column, a blank line, then the target dtype.
feature_dtypes = X.dtypes
target_dtypes = y.dtypes
print(f"{feature_dtypes} \n")
print(target_dtypes)

Gender                             object
Age                               float64
Height                            float64
Weight                            float64
family_history_with_overweight     object
FAVC                               object
FCVC                              float64
NCP                               float64
CAEC                               object
SMOKE                              object
CH2O                              float64
SCC                                object
FAF                               float64
TUE                               float64
CALC                               object
MTRANS                             object
dtype: object 

NObeyesdad    object
dtype: object


In [None]:
# Count missing values per column: features first, a blank line, then the target.
missing_in_features = X.isna().sum()
missing_in_target = y.isna().sum()
print(f"{missing_in_features} \n")
print(missing_in_target)

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
dtype: int64 

NObeyesdad    0
dtype: int64


In [None]:
# Encode every non-numeric feature column as numbers.
# Rebind X to an explicit copy first so the column assignments below act on an
# independent frame rather than on a slice (avoids SettingWithCopyWarning).
X = X.copy()

YES_NO = {'yes': 1, 'no': 0}
FREQUENCY_LEVELS = {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3}
BINARY_COLUMNS = ['family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']
ORDINAL_COLUMNS = ['CAEC', 'CALC']

# Binary to numeric
X['Gender'] = X['Gender'].map({'Female': 1, 'Male': 0})
# Non-inplace rename: returns a new frame and keeps the column in its position.
X = X.rename(columns={'Gender': 'IsFemale?'})
for col in BINARY_COLUMNS:
    X[col] = X[col].map(YES_NO)

# Categorical to numeric (ordered frequency scale)
for col in ORDINAL_COLUMNS:
    X[col] = X[col].map(FREQUENCY_LEVELS)

# Series.map() turns any label missing from its dictionary into NaN; stop here
# with a clear message instead of letting NaNs flow into later cells.
encoded_columns = ['IsFemale?'] + BINARY_COLUMNS + ORDINAL_COLUMNS
unmapped = X[encoded_columns].isna().sum()
assert not unmapped.any(), f"unmapped category labels found:\n{unmapped[unmapped > 0]}"

# One-hot encoding for nominal feature - transportation
X = pd.get_dummies(X, columns=['MTRANS'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Gender'] = X['Gender'].map({'Female': 1, 'Male': 0})
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.rename(columns={'Gender': 'IsFemale?'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['family_history_with_overweight'] = X['family_history_with_overweight'].map({'yes': 1, 'no': 0})
A value is trying to be set on a copy of a slice fr

In [None]:
# convert target to binary label: 0 for not obese and 1 for obese
# (insufficient, normal and both overweight levels are grouped as "not obese")
OBESITY_LABELS = {
    'Insufficient_Weight': 0,
    'Normal_Weight': 0,
    'Overweight_Level_I': 0,
    'Overweight_Level_II': 0,
    'Obesity_Type_I': 1,
    'Obesity_Type_II': 1,
    'Obesity_Type_III': 1
}

# Operate on an explicit copy so the assignment does not hit a slice of another
# DataFrame (avoids SettingWithCopyWarning).
y = y.copy()
y['NObeyesdad'] = y['NObeyesdad'].map(OBESITY_LABELS)

# Series.map() yields NaN for any class name not listed above; fail loudly.
assert not y['NObeyesdad'].isna().any(), "unmapped obesity class label in target"

# Non-inplace rename returns a new frame instead of mutating through a reference.
y = y.rename(columns={'NObeyesdad': 'IsObese?'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['NObeyesdad'] = y['NObeyesdad'].map({
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y.rename(columns={'NObeyesdad': 'IsObese?'}, inplace=True)


In [None]:
# convert all data to numeric dtypes
# Features are cast to float, NOT int: an integer cast truncates the continuous
# columns (e.g. Height 1.62 -> 1, Weight 89.8 -> 89, Age 20.98 -> 20) and throws
# away most of their information. The encoded/one-hot columns become 0.0 / 1.0.
X = X.astype(float)
# The target is a 0/1 class label, so an integer dtype is the right choice.
y = y.astype(int)

In [None]:
scaler1 = StandardScaler()
scaler2 = MinMaxScaler()

# Apply both scalers (double normalization used in KNN as well)
# Both scalers operate column-by-column, so fitting once on the whole frame gives
# the same per-column scaling as looping over the columns, while leaving
# scaler1 / scaler2 fitted on every feature (a per-column loop leaves them fitted
# on the last column only, so they cannot be reused on new data).
# NOTE(review): min-max scaling an already standardized column is mathematically
# the same as min-max scaling the original column; the standardization step looks
# redundant — confirm before removing.
# NOTE(review): the scalers are fitted on the full dataset before the train/test
# split, so test-set statistics influence the scaling; consider fitting on the
# training partition only.
X = pd.DataFrame(
    scaler2.fit_transform(scaler1.fit_transform(X)),
    columns=X.columns,
    index=X.index,
)

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the share of
# obese / not-obese rows the same in both partitions; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Base estimator handed to the hyperparameter search; the fixed seed makes the
# forest's randomness repeatable between runs.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Hyperparameter grid explored during tuning: 3 * 4 * 3 * 3 * 1 = 108 candidates.
param_grid = dict(
    n_estimators=[50, 100, 200],      # number of trees in the forest
    max_depth=[None, 5, 10, 20],      # None leaves tree depth unlimited
    min_samples_split=[2, 4, 6],      # samples required to split an internal node
    min_samples_leaf=[1, 2, 4],       # samples required at a leaf node
    criterion=['entropy'],            # split-quality measure (single option)
)

In [None]:
# Evaluation metrics used to rank candidates during tuning (scikit-learn scorer names).
scoring_metrics = [
    'recall',
    'f1',
    'accuracy',
]

In [None]:
# grid search for each metric
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import recall_score, f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay

# Run ONE multi-metric search instead of repeating the identical search once per
# metric: every candidate/fold is fitted a single time and scored with all metrics,
# which cuts the tuning cost to roughly a third.
# refit is set to the last metric so that, after this cell, grid_search exposes
# best_params_ / best_estimator_ for that metric, just as a per-metric loop would
# leave it after its final iteration.
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring=scoring_metrics,
    refit=scoring_metrics[-1],
    cv=5,
    n_jobs=-1
)
# ravel() flattens the single-column target frame into the 1-D array sklearn expects
grid_search.fit(X_train, y_train.values.ravel())

for metric in scoring_metrics:
    # rank 1 marks the best mean CV score; argmin() picks the first candidate with it
    best_idx = grid_search.cv_results_[f"rank_test_{metric}"].argmin()
    print(f"Best parameters for {metric}: {grid_search.cv_results_['params'][best_idx]}")

Best parameters for recall: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best parameters for f1: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best parameters for accuracy: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [None]:
# evaluate random forest with best parameters
def rf_funct(X, y, test_size=0.2,
             criterion='entropy',
             n_estimators=100,
             max_depth=None,
             min_samples_split=2,
             min_samples_leaf=1):
    """Train a random forest on a stratified hold-out split and print its test metrics.

    Parameters
    ----------
    X : feature table passed to ``train_test_split``.
    y : single-column target frame; also used as the stratification key and
        flattened with ``.values.ravel()`` before fitting.
    test_size : fraction of rows held out for testing (default 0.2).
    criterion, n_estimators, max_depth, min_samples_split, min_samples_leaf :
        forwarded unchanged to ``RandomForestClassifier``.

    Returns
    -------
    None. Recall (printed as "Sensitivity"), F1 and accuracy on the held-out
    rows are printed with two decimals.
    """
    # Stratify exactly like the notebook's tuning split: with the same X, y,
    # test_size and seed this reproduces that hold-out, so rows used while tuning
    # hyperparameters never end up in the evaluation set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=42
    )

    rf = RandomForestClassifier(
        criterion=criterion,
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42  # fixed seed so the forest is repeatable
    )

    # sklearn expects a 1-D target array, hence the ravel()
    rf.fit(X_train, y_train.values.ravel())

    y_pred = rf.predict(X_test)

    rec = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)

    print(f"Sensitivity: {rec:.2f}")
    print(f"F1 Score: {F1:.2f}")
    print(f"Accuracy: {acc:.2f}")

In [None]:
# Final evaluation with the hyperparameters reported as best by the grid search.
best_rf_params = {
    'criterion': 'entropy',
    'n_estimators': 100,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
}
rf_funct(X, y, **best_rf_params)

Sensitivity: 0.98
F1 Score: 0.98
Accuracy: 0.98
