In [None]:
#Load Libaries
import pandas as pd

In [4]:
# File paths for the obesity risk dataset
train_path = "/Users/machome/Downloads/Multi-Class Prediction of Obesity Risk/train.csv"
test_path  = "/Users/machome/Downloads/Multi-Class Prediction of Obesity Risk/test.csv"
sub_path   = "/Users/machome/Downloads/Multi-Class Prediction of Obesity Risk/sample_submission.csv"

# Load the datasets
train_df = pd.read_csv(train_path)
test_df  = pd.read_csv(test_path)
sub_df   = pd.read_csv(sub_path)

# Check dimensions of each file
train_df.shape, test_df.shape, sub_df.shape


((20758, 18), (13840, 17), (13840, 2))

In [5]:
# Preview the training data
train_df.head()


Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [6]:
# View column names
train_df.columns


Index(['id', 'Gender', 'Age', 'Height', 'Weight',
       'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC',
       'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')

In [7]:
# Basic info about variables and missing values
train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  int64  
 1   Gender                          20758 non-null  object 
 2   Age                             20758 non-null  float64
 3   Height                          20758 non-null  float64
 4   Weight                          20758 non-null  float64
 5   family_history_with_overweight  20758 non-null  object 
 6   FAVC                            20758 non-null  object 
 7   FCVC                            20758 non-null  float64
 8   NCP                             20758 non-null  float64
 9   CAEC                            20758 non-null  object 
 10  SMOKE                           20758 non-null  object 
 11  CH2O                            20758 non-null  float64
 12  SCC                             

In [8]:
# Summary statistics for numeric variables
train_df.describe()


Unnamed: 0,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0
mean,10378.5,23.841804,1.700245,87.887768,2.445908,2.761332,2.029418,0.981747,0.616756
std,5992.46278,5.688072,0.087312,26.379443,0.533218,0.705375,0.608467,0.838302,0.602113
min,0.0,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,5189.25,20.0,1.631856,66.0,2.0,3.0,1.792022,0.008013,0.0
50%,10378.5,22.815416,1.7,84.064875,2.393837,3.0,2.0,1.0,0.573887
75%,15567.75,26.0,1.762887,111.600553,3.0,3.0,2.549617,1.587406,1.0
max,20757.0,61.0,1.975663,165.057269,3.0,4.0,3.0,3.0,2.0


In [9]:
# Class distribution of target variable
train_df["NObeyesdad"].value_counts()
train_df["NObeyesdad"].value_counts(normalize=True)


NObeyesdad
Obesity_Type_III       0.194913
Obesity_Type_II        0.156470
Normal_Weight          0.148473
Obesity_Type_I         0.140187
Insufficient_Weight    0.121544
Overweight_Level_II    0.121495
Overweight_Level_I     0.116919
Name: proportion, dtype: float64

In [11]:
# Separate predictors and target variable
X = train_df.drop(columns=["NObeyesdad"])
y = train_df["NObeyesdad"]

X.shape, y.shape


((20758, 17), (20758,))

In [12]:
# Identify numeric and categorical columns
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

num_cols, cat_cols


(['id', 'Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE'],
 ['Gender',
  'family_history_with_overweight',
  'FAVC',
  'CAEC',
  'SMOKE',
  'SCC',
  'CALC',
  'MTRANS'])

In [13]:
# Remove id from predictors
num_cols = [col for col in num_cols if col != "id"]

num_cols


['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Preprocessing: scale numeric, one-hot encode categorical
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ]
)

preprocess


0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [15]:
from sklearn.model_selection import train_test_split

# Train/validation split for model evaluation and interpretation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train.shape, X_val.shape


((16606, 17), (4152, 17))

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Multinomial logistic regression model
logit_model = LogisticRegression(
    multi_class="multinomial",
    max_iter=5000,
    n_jobs=-1
)

logit_pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", logit_model)
])

# Fit model
logit_pipe.fit(X_train, y_train)


  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights


0,1,2
,steps,"[('prep', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,5000


In [17]:
logit_pred = logit_pipe.predict(X_val)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("Accuracy:", accuracy_score(y_val, logit_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, logit_pred))
print("\nClassification Report:\n", classification_report(y_val, logit_pred))


Accuracy: 0.8684971098265896

Confusion Matrix:
 [[479  25   0   0   0   1   0]
 [ 58 505   2   0   0  41  11]
 [  1   0 493  36   3  11  38]
 [  0   0  22 625   0   0   3]
 [  0   0   0   1 807   1   0]
 [  1  46  15   0   0 342  81]
 [  0   3  79   8   0  59 355]]

Classification Report:
                      precision    recall  f1-score   support

Insufficient_Weight       0.89      0.95      0.92       505
      Normal_Weight       0.87      0.82      0.84       617
     Obesity_Type_I       0.81      0.85      0.83       582
    Obesity_Type_II       0.93      0.96      0.95       650
   Obesity_Type_III       1.00      1.00      1.00       809
 Overweight_Level_I       0.75      0.71      0.73       485
Overweight_Level_II       0.73      0.70      0.72       504

           accuracy                           0.87      4152
          macro avg       0.85      0.85      0.85      4152
       weighted avg       0.87      0.87      0.87      4152



  ret = a @ b
  ret = a @ b
  ret = a @ b


In [18]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda_model = LinearDiscriminantAnalysis()

lda_pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", lda_model)
])

lda_pipe.fit(X_train, y_train)

lda_pred = lda_pipe.predict(X_val)

print("Accuracy:", accuracy_score(y_val, lda_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, lda_pred))
print("\nClassification Report:\n", classification_report(y_val, lda_pred))


Accuracy: 0.8229768786127167

Confusion Matrix:
 [[470  33   0   0   0   1   1]
 [101 442   1   0   0  54  19]
 [  1   1 456  54   7  13  50]
 [  0   0  29 612   5   0   4]
 [  0   0   1   2 805   1   0]
 [  6  74  16   0   0 289 100]
 [  0   9  82   4   0  66 343]]

Classification Report:
                      precision    recall  f1-score   support

Insufficient_Weight       0.81      0.93      0.87       505
      Normal_Weight       0.79      0.72      0.75       617
     Obesity_Type_I       0.78      0.78      0.78       582
    Obesity_Type_II       0.91      0.94      0.93       650
   Obesity_Type_III       0.99      1.00      0.99       809
 Overweight_Level_I       0.68      0.60      0.64       485
Overweight_Level_II       0.66      0.68      0.67       504

           accuracy                           0.82      4152
          macro avg       0.80      0.81      0.80      4152
       weighted avg       0.82      0.82      0.82      4152



  ret = a @ b
  ret = a @ b
  ret = a @ b


In [19]:
from sklearn.naive_bayes import GaussianNB

# Naive Bayes needs dense matrix
class ToDenseTransformer:
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.toarray() if hasattr(X, "toarray") else X

nb_pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("dense", ToDenseTransformer()),
    ("model", GaussianNB())
])

nb_pipe.fit(X_train, y_train)

nb_pred = nb_pipe.predict(X_val)

print("Accuracy:", accuracy_score(y_val, nb_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, nb_pred))
print("\nClassification Report:\n", classification_report(y_val, nb_pred))


Accuracy: 0.5859826589595376

Confusion Matrix:
 [[377  25  89   4   2   5   3]
 [257 123  82  61  17  37  40]
 [  1  17 239 275   5   9  36]
 [  0   2   5 624   0   3  16]
 [  0   1   0   1 806   1   0]
 [ 42  53 113 103   9 114  51]
 [  2  29  77 223   1  22 150]]

Classification Report:
                      precision    recall  f1-score   support

Insufficient_Weight       0.56      0.75      0.64       505
      Normal_Weight       0.49      0.20      0.28       617
     Obesity_Type_I       0.40      0.41      0.40       582
    Obesity_Type_II       0.48      0.96      0.64       650
   Obesity_Type_III       0.96      1.00      0.98       809
 Overweight_Level_I       0.60      0.24      0.34       485
Overweight_Level_II       0.51      0.30      0.38       504

           accuracy                           0.59      4152
          macro avg       0.57      0.55      0.52      4152
       weighted avg       0.59      0.59      0.55      4152



In [20]:
from sklearn.svm import SVC

svm_model = SVC(kernel="rbf", C=1.0, gamma="scale")

svm_pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", svm_model)
])

svm_pipe.fit(X_train, y_train)

svm_pred = svm_pipe.predict(X_val)

print("Accuracy:", accuracy_score(y_val, svm_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, svm_pred))
print("\nClassification Report:\n", classification_report(y_val, svm_pred))


Accuracy: 0.8810211946050096

Confusion Matrix:
 [[472  32   0   0   0   1   0]
 [ 47 510   1   0   0  51   8]
 [  1   0 513  21   3  12  32]
 [  0   0  19 630   0   0   1]
 [  0   0   2   1 806   0   0]
 [  2  47  18   1   0 340  77]
 [  0   6  47   6   0  58 387]]

Classification Report:
                      precision    recall  f1-score   support

Insufficient_Weight       0.90      0.93      0.92       505
      Normal_Weight       0.86      0.83      0.84       617
     Obesity_Type_I       0.85      0.88      0.87       582
    Obesity_Type_II       0.96      0.97      0.96       650
   Obesity_Type_III       1.00      1.00      1.00       809
 Overweight_Level_I       0.74      0.70      0.72       485
Overweight_Level_II       0.77      0.77      0.77       504

           accuracy                           0.88      4152
          macro avg       0.87      0.87      0.87      4152
       weighted avg       0.88      0.88      0.88      4152



In [21]:
# Fit all models on full training data and create Kaggle submissions

def make_submission(pipe, filename):
    pipe.fit(X, y)
    test_pred = pipe.predict(test_df)
    out = pd.DataFrame({
        "id": test_df["id"],
        "NObeyesdad": test_pred
    })
    out.to_csv(filename, index=False)
    print("Saved:", filename)

make_submission(logit_pipe, "submission_logistic.csv")
make_submission(lda_pipe, "submission_lda.csv")
make_submission(nb_pipe, "submission_nb.csv")
make_submission(svm_pipe, "submission_svm.csv")


  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


Saved: submission_logistic.csv
Saved: submission_lda.csv
Saved: submission_nb.csv
Saved: submission_svm.csv
