# Modeling: Softmax Regression, (Logistic regression)

In [14]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier



In [2]:
data = pd.read_csv('ObesityDataset.csv')
print(data.head())

   Gender   Age family_history_with_overweight FAVC  FCVC  NCP       CAEC  \
0  Female  21.0                            yes   no   2.0  3.0  Sometimes   
1  Female  21.0                            yes   no   3.0  3.0  Sometimes   
2    Male  23.0                            yes   no   2.0  3.0  Sometimes   
3    Male  27.0                             no   no   3.0  3.0  Sometimes   
4    Male  22.0                             no   no   2.0  1.0  Sometimes   

  SMOKE  CH2O  SCC  FAF  TUE        CALC                 MTRANS  \
0    no   2.0   no  0.0  1.0          no  Public_Transportation   
1   yes   3.0  yes  3.0  0.0   Sometimes  Public_Transportation   
2    no   2.0   no  2.0  1.0  Frequently  Public_Transportation   
3    no   2.0   no  2.0  0.0  Frequently                Walking   
4    no   2.0   no  0.0  0.0   Sometimes  Public_Transportation   

            NObeyesdad  
0        Normal_Weight  
1        Normal_Weight  
2        Normal_Weight  
3   Overweight_Level_I  
4  Overwe

# 1.1 Data preparation for modeling

## Train-test split

In [4]:
CSV_PATH = "ObesityDataset.csv"  
data = pd.read_csv(CSV_PATH)

print(data)

      Gender        Age family_history_with_overweight FAVC  FCVC  NCP  \
0     Female  21.000000                            yes   no   2.0  3.0   
1     Female  21.000000                            yes   no   3.0  3.0   
2       Male  23.000000                            yes   no   2.0  3.0   
3       Male  27.000000                             no   no   3.0  3.0   
4       Male  22.000000                             no   no   2.0  1.0   
...      ...        ...                            ...  ...   ...  ...   
2106  Female  20.976842                            yes  yes   3.0  3.0   
2107  Female  21.982942                            yes  yes   3.0  3.0   
2108  Female  22.524036                            yes  yes   3.0  3.0   
2109  Female  24.361936                            yes  yes   3.0  3.0   
2110  Female  23.664709                            yes  yes   3.0  3.0   

           CAEC SMOKE      CH2O  SCC       FAF       TUE        CALC  \
0     Sometimes    no  2.000000   no  0

In [18]:
TARGET_COL = "NObeyesdad" 

X = data.drop(columns=[TARGET_COL])
y = data[TARGET_COL]


X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [19]:
print(f"Size of data original{X.shape}")
print(f"Size of label original{y.shape}")

print(f"Size of data train{X_train.shape}")
print(f"Size of label train{y_train.shape}")
print(f"Size of data test{X_test.shape}")
print(f"Size of label test{y_test.shape}")


Size of data original(2111, 14)
Size of label original(2111,)
Size of data train(1688, 14)
Size of label train(1688,)
Size of data test(423, 14)
Size of label test(423,)


In [None]:
import numpy as np

def random_state(seed=42):
    np.random.seed(seed)
    
random_state(42)


## Preprocessing:

In [16]:
num_cols = X_train.select_dtypes(include=["number"]).columns.tolist()
print(f"Num_cols {num_cols}")
cat_cols = [c for c in X_train.columns if c not in num_cols]
print(f"Cat_cols {cat_cols}")

numeric_tf = Pipeline(steps=[
    ("scaler", StandardScaler()),
])

categorical_tf = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_tf, num_cols),
        ("cat", categorical_tf, cat_cols),
    ],
    remainder="drop",
)

Num_cols ['Age', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
Cat_cols ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']


In [20]:
print(f"preprocess{preprocess}")

preprocessColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('scaler', StandardScaler())]),
                                 ['Age', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']),
                                ('cat',
                                 Pipeline(steps=[('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['Gender', 'family_history_with_overweight',
                                  'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC',
                                  'MTRANS'])])


In [34]:
print(X_train)

      Gender        Age family_history_with_overweight FAVC      FCVC  \
459     Male  19.000000                            yes  yes  2.000000   
426     Male  22.000000                             no   no  2.000000   
326     Male  18.000000                            yes  yes  3.000000   
971     Male  19.506389                            yes  yes  2.793561   
892   Female  17.085250                             no  yes  1.972545   
...      ...        ...                            ...  ...       ...   
90    Female  25.000000                             no   no  3.000000   
1439  Female  40.654155                            yes  yes  2.000000   
609     Male  19.979810                            yes  yes  2.000000   
1589    Male  38.523646                            yes  yes  2.177896   
478     Male  19.000000                             no  yes  2.000000   

           NCP        CAEC SMOKE      CH2O  SCC       FAF       TUE  \
459   3.000000  Frequently    no  3.000000   no  1.0

## 1.2 Build classification models

+ Sử dụng model:
    + Logistic Regression: 
    + Decision Tree
    + Random Forest
    + K-Nearest Neighbor

In [30]:
random_state(42)
models = {
    "logistic_regresion": LogisticRegression(max_iter=2000, random_state=42),
    "decision_tree": DecisionTreeClassifier(random_state=42),
    "random_forest": RandomForestClassifier(random_state=42),
    "knn": KNeighborsClassifier(n_neighbors=5),
}


In [31]:
print(models.items())

dict_items([('logistic_regresion', LogisticRegression(max_iter=2000, random_state=42)), ('decision_tree', DecisionTreeClassifier(random_state=42)), ('random_forest', RandomForestClassifier(random_state=42)), ('knn', KNeighborsClassifier())])


## 1.3 Model outputs for evaluation

In [42]:
def eval_pipeline(pipe: Pipeline, X_test, y_test):
    y_pred = pipe.predict(X_test)

    if hasattr(pipe, "predict_proba"):
        y_proba = pipe.predict_proba(X_test)
        print(f"Test acc {y_proba}")
        auc = roc_auc_score(y_test, y_proba, multi_class="ovr", average="macro")
    else:
        y_proba = None
        auc = np.nan

    acc = accuracy_score(y_test, y_pred)
    rep = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    macro_p = rep["macro avg"]["precision"]
    macro_r = rep["macro avg"]["recall"]
    macro_f1 = rep["macro avg"]["f1-score"]
    cm = confusion_matrix(y_test, y_pred)

    return {
        "acc": acc,
        "macro_p": macro_p,
        "macro_r": macro_r,
        "macro_f1": macro_f1,
        "macro_auc_ovr": auc,
        "cm": cm,
    }

# 2. Evaluate

In [44]:
results = []
trained = {}

for name, model in models.items():
    pipe = Pipeline(steps=[
        ("preprocess", preprocess),
        ("clf", model),
    ])
    pipe.fit(X_train, y_train)
    
    
    metrics = eval_pipeline(pipe, X_test, y_test)
    trained[name] = pipe

    results.append({
        "model": name,
        "acc": metrics["acc"],
        "macro_p": metrics["macro_p"],
        "macro_r": metrics["macro_r"],
        "macro_f1": metrics["macro_f1"],
        "macro_auc_ovr": metrics["macro_auc_ovr"],
    })

results_df = pd.DataFrame(results).sort_values(by="macro_f1", ascending=False)
print(results_df)

Test acc [[6.68934987e-01 1.86341002e-01 2.27455086e-03 ... 6.31824481e-02
  7.69796567e-02 2.24584774e-03]
 [4.02968064e-01 4.24542026e-01 3.63158139e-02 ... 3.13958054e-08
  7.66468232e-02 5.15509454e-02]
 [6.17018879e-03 1.92662160e-02 2.60235285e-01 ... 2.17682430e-04
  1.51144209e-01 2.01093795e-01]
 ...
 [7.06273623e-04 1.02321076e-02 4.97950606e-02 ... 4.46462229e-04
  8.80826365e-03 5.10527611e-02]
 [3.75608953e-01 3.51607962e-01 5.18863968e-03 ... 1.03138235e-07
  1.12212280e-01 1.51447467e-01]
 [3.32598012e-02 3.93085210e-02 6.69097220e-01 ... 4.29536399e-04
  1.21225206e-01 1.36138270e-01]]
Test acc [[0. 0. 0. ... 1. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]]
Test acc [[0.86 0.06 0.   ... 0.02 0.04 0.02]
 [0.03 0.76 0.06 ... 0.   0.13 0.02]
 [0.   0.01 0.42 ... 0.   0.03 0.31]
 ...
 [0.09 0.02 0.08 ... 0.   0.09 0.05]
 [0.12 0.74 0.   ... 0.   0.03 0.11]
 [0.01 0.02 0.88 ... 0.   0

In [45]:
import joblib
best_name = results_df.iloc[0]["model"]
best_pipe = trained[best_name]
joblib.dump(best_pipe, "best_model.joblib")
print("Saved:", best_name, "-> best_model.joblib")

Saved: random_forest -> best_model.joblib
