# 0. Import libraries

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, roc_curve, auc, precision_recall_curve
import pandas as pd
import numpy as np

Features shape: (2111, 14)
Target shape: (2111,)
Gender                             object
Age                               float64
family_history_with_overweight     object
FAVC                               object
FCVC                              float64
NCP                               float64
CAEC                               object
SMOKE                              object
CH2O                              float64
SCC                                object
FAF                               float64
TUE                               float64
CALC                               object
MTRANS                             object
dtype: object


# 1. Data preparation
## 1.1 Load dataset

In [None]:
# Load the dataset; the relative path is resolved against the current working directory.
data = pd.read_csv('ObesityDataset.csv')

# 'NObeyesdad' is used as the target; every remaining column is kept as a feature.
y = data['NObeyesdad']
X = data.drop(columns='NObeyesdad')

# Sanity check: dimensions of both objects and the dtype of each feature column.
print("Features shape:", X.shape)
print("Target shape:", y.shape)
print(X.dtypes)

## 1.2 Preprocessing

In [2]:
# Rebuild the raw inputs from `data` rather than reading `X` / `y`: this cell rebinds
# both names to transformed arrays at the bottom, so reading them here would fail on
# a second run (an ndarray has no `select_dtypes` / `.values`). Deriving them from
# `data` keeps the cell idempotent.
X_raw = data.drop('NObeyesdad', axis=1)

# identify numerical and categorical features from the column dtypes
numerical_features = X_raw.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_raw.select_dtypes(include=['object']).columns.tolist()

# numeric columns: fill missing values with the column median, then standardize
numeric = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# categorical columns: fill missing values with the most frequent category, then
# one-hot encode; categories unseen during fit are ignored at transform time
categorical = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# route each column group through its own pipeline; columns in neither list are dropped
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric, numerical_features),
        ("cat", categorical, categorical_features),
    ],
    remainder="drop"
)

# NOTE(review): the transformer is fitted on every row, i.e. before the train/test
# split performed later in the notebook, so the imputation and scaling statistics
# also reflect rows that end up in the test set. Consider fitting `preprocess` on
# the training split only (e.g. as the first step of the model pipeline).
X = preprocess.fit_transform(X_raw)
y = data['NObeyesdad'].values

print("Preprocessed features shape:", X.shape)
print("Target shape:", y.shape)

Preprocessed features shape: (2111, 29)
Target shape: (2111,)


## 1.3 Train-test split

We split the dataset into training and test sets using an 80/20 ratio, passing `stratify=y` to preserve the class distribution in both subsets and a fixed `random_state=42` so the split is reproducible.

In [3]:
# Split the preprocessed data: 20% of the rows are held out for testing, the split is
# stratified on y, and the seed is fixed so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Label every shape with the array it belongs to (the previous labels called
# X_test "Target" and y_train "Preprocessed features").
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

Preprocessed features shape: (1688, 29)
Target shape: (423, 29)
Preprocessed features shape: (1688,)
Target shape: (423,)


# 2. Build Classification Model with K-Nearest Neighbors
## 2.1 Train model

In [None]:
# Wrap a 5-nearest-neighbours classifier in a single-step pipeline named 'knn'.
knn_pipeline = Pipeline([
    ('knn', KNeighborsClassifier(n_neighbors=5)),
])

# Train on the training split; the fitted pipeline is the cell's displayed value.
knn_pipeline.fit(X_train, y_train)


[[49  1  0  2  0  2  0]
 [12 15 13  4  2  9  3]
 [ 0  1 60  5  0  3  1]
 [ 0  0  1 55  0  2  2]
 [ 0  0  0  0 64  1  0]
 [ 5  4 10  0  2 35  2]
 [ 4  1 10  6  1  2 34]]
                     precision    recall  f1-score   support

Insufficient_Weight       0.70      0.91      0.79        54
      Normal_Weight       0.68      0.26      0.38        58
     Obesity_Type_I       0.64      0.86      0.73        70
    Obesity_Type_II       0.76      0.92      0.83        60
   Obesity_Type_III       0.93      0.98      0.96        65
 Overweight_Level_I       0.65      0.60      0.62        58
Overweight_Level_II       0.81      0.59      0.68        58

           accuracy                           0.74       423
          macro avg       0.74      0.73      0.71       423
       weighted avg       0.74      0.74      0.72       423



## 2.2 Model output

In [8]:
# Predicted class labels for the held-out test rows.
y_pred = knn_pipeline.predict(X_test)

# Per-class precision, recall, F1 and support, followed by the overall averages.
print(classification_report(y_true=y_test, y_pred=y_pred))

                     precision    recall  f1-score   support

Insufficient_Weight       0.70      0.91      0.79        54
      Normal_Weight       0.68      0.26      0.38        58
     Obesity_Type_I       0.64      0.86      0.73        70
    Obesity_Type_II       0.76      0.92      0.83        60
   Obesity_Type_III       0.93      0.98      0.96        65
 Overweight_Level_I       0.65      0.60      0.62        58
Overweight_Level_II       0.81      0.59      0.68        58

           accuracy                           0.74       423
          macro avg       0.74      0.73      0.71       423
       weighted avg       0.74      0.74      0.72       423

