In [2]:
#Load Libaries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [3]:
# File paths for the obesity risk dataset
train_path = "/Users/machome/Downloads/Multi-Class Prediction of Obesity Risk/train.csv"
test_path  = "/Users/machome/Downloads/Multi-Class Prediction of Obesity Risk/test.csv"
sub_path   = "/Users/machome/Downloads/Multi-Class Prediction of Obesity Risk/sample_submission.csv"

# Load the datasets
train_df = pd.read_csv(train_path)
test_df  = pd.read_csv(test_path)
sub_df   = pd.read_csv(sub_path)

# Check dimensions of each file
train_df.shape, test_df.shape, sub_df.shape


((20758, 18), (13840, 17), (13840, 2))

In [4]:
# Preview the training data
train_df.head()


Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [5]:
# View column names
train_df.columns


Index(['id', 'Gender', 'Age', 'Height', 'Weight',
       'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC',
       'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')

In [7]:
# Basic info about variables and missing values
train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  int64  
 1   Gender                          20758 non-null  object 
 2   Age                             20758 non-null  float64
 3   Height                          20758 non-null  float64
 4   Weight                          20758 non-null  float64
 5   family_history_with_overweight  20758 non-null  object 
 6   FAVC                            20758 non-null  object 
 7   FCVC                            20758 non-null  float64
 8   NCP                             20758 non-null  float64
 9   CAEC                            20758 non-null  object 
 10  SMOKE                           20758 non-null  object 
 11  CH2O                            20758 non-null  float64
 12  SCC                             

In [8]:
# Summary statistics for numeric variables
train_df.describe()


Unnamed: 0,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0
mean,10378.5,23.841804,1.700245,87.887768,2.445908,2.761332,2.029418,0.981747,0.616756
std,5992.46278,5.688072,0.087312,26.379443,0.533218,0.705375,0.608467,0.838302,0.602113
min,0.0,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,5189.25,20.0,1.631856,66.0,2.0,3.0,1.792022,0.008013,0.0
50%,10378.5,22.815416,1.7,84.064875,2.393837,3.0,2.0,1.0,0.573887
75%,15567.75,26.0,1.762887,111.600553,3.0,3.0,2.549617,1.587406,1.0
max,20757.0,61.0,1.975663,165.057269,3.0,4.0,3.0,3.0,2.0


In [9]:
# Separate predictors and target
X = train_df.drop(columns=["NObeyesdad", "id"])
y = train_df["NObeyesdad"]

X.shape, y.shape


((20758, 16), (20758,))

In [10]:
# Identify numeric and categorical columns
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = X.select_dtypes(include=["object"]).columns

numeric_cols, categorical_cols


(Index(['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE'], dtype='object'),
 Index(['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE',
        'SCC', 'CALC', 'MTRANS'],
       dtype='object'))

In [11]:
# Numeric preprocessing: impute just in case
numeric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

# Categorical preprocessing: impute + one-hot encode
categorical_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine preprocessing
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, numeric_cols),
        ("cat", categorical_pipe, categorical_cols)
    ]
)

preprocess


0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [12]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y
)


In [13]:
# Decision Tree model
dt_model = DecisionTreeClassifier(
    random_state=1
)

dt_pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", dt_model)
])

# Fit the model
dt_pipe.fit(X_train, y_train)

# Predict on validation set
dt_preds = dt_pipe.predict(X_val)

# Evaluation
dt_acc = accuracy_score(y_val, dt_preds)

dt_acc

print("Decision Tree Validation Accuracy:", dt_acc)
print("\nClassification Report:")
print(classification_report(y_val, dt_preds))
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, dt_preds))


Decision Tree Validation Accuracy: 0.8429672447013488

Classification Report:
                     precision    recall  f1-score   support

Insufficient_Weight       0.89      0.86      0.88       505
      Normal_Weight       0.80      0.79      0.79       617
     Obesity_Type_I       0.81      0.80      0.80       582
    Obesity_Type_II       0.95      0.94      0.95       650
   Obesity_Type_III       0.99      0.99      0.99       809
 Overweight_Level_I       0.67      0.71      0.69       485
Overweight_Level_II       0.70      0.71      0.70       504

           accuracy                           0.84      4152
          macro avg       0.83      0.83      0.83      4152
       weighted avg       0.84      0.84      0.84      4152


Confusion Matrix:
[[436  57   1   1   1   7   2]
 [ 45 485   2   0   0  66  19]
 [  1   3 465  25   2  21  65]
 [  0   0  29 612   2   1   6]
 [  0   1   3   2 803   0   0]
 [  7  55  20   0   0 342  61]
 [  1   8  57   5   0  76 357]]


In [14]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging model
bag_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=1),
    n_estimators=300,
    random_state=1,
    n_jobs=-1
)

bag_pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", bag_model)
])

# Fit the model
bag_pipe.fit(X_train, y_train)

# Predict on validation set
bag_preds = bag_pipe.predict(X_val)

# Evaluation
bag_acc = accuracy_score(y_val, bag_preds)

print("Bagging Validation Accuracy:", bag_acc)
print("\nClassification Report:")
print(classification_report(y_val, bag_preds))
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, bag_preds))


Bagging Validation Accuracy: 0.8971579961464354

Classification Report:
                     precision    recall  f1-score   support

Insufficient_Weight       0.93      0.91      0.92       505
      Normal_Weight       0.85      0.87      0.86       617
     Obesity_Type_I       0.88      0.88      0.88       582
    Obesity_Type_II       0.97      0.97      0.97       650
   Obesity_Type_III       1.00      1.00      1.00       809
 Overweight_Level_I       0.79      0.76      0.77       485
Overweight_Level_II       0.80      0.82      0.81       504

           accuracy                           0.90      4152
          macro avg       0.89      0.89      0.89      4152
       weighted avg       0.90      0.90      0.90      4152


Confusion Matrix:
[[460  37   0   1   0   5   2]
 [ 33 535   2   0   0  37  10]
 [  1   0 512  15   2  13  39]
 [  0   0  16 631   1   0   2]
 [  0   0   1   2 805   1   0]
 [  3  54   7   0   0 368  53]
 [  0   4  42   3   0  41 414]]


In [15]:
from sklearn.ensemble import RandomForestClassifier
# Random Forest model
rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=1,
    n_jobs=-1
)

rf_pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", rf_model)
])

# Fit the model
rf_pipe.fit(X_train, y_train)

# Predict on validation set
rf_preds = rf_pipe.predict(X_val)

# Evaluation
rf_acc = accuracy_score(y_val, rf_preds)

print("Random Forest Validation Accuracy:", rf_acc)
print("\nClassification Report:")
print(classification_report(y_val, rf_preds))
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, rf_preds))


Random Forest Validation Accuracy: 0.8928227360308285

Classification Report:
                     precision    recall  f1-score   support

Insufficient_Weight       0.94      0.90      0.92       505
      Normal_Weight       0.82      0.88      0.85       617
     Obesity_Type_I       0.89      0.87      0.88       582
    Obesity_Type_II       0.97      0.97      0.97       650
   Obesity_Type_III       1.00      1.00      1.00       809
 Overweight_Level_I       0.79      0.73      0.76       485
Overweight_Level_II       0.78      0.82      0.80       504

           accuracy                           0.89      4152
          macro avg       0.88      0.88      0.88      4152
       weighted avg       0.89      0.89      0.89      4152


Confusion Matrix:
[[453  48   0   1   0   3   0]
 [ 25 546   2   0   0  34  10]
 [  1   2 507  12   2  16  42]
 [  0   0  16 632   0   0   2]
 [  0   1   1   1 805   1   0]
 [  4  56  10   0   0 353  62]
 [  0  15  33   4   0  41 411]]


In [16]:
from sklearn.ensemble import GradientBoostingClassifier
# Boosting model
gb_model = GradientBoostingClassifier(
    random_state=1
)

gb_pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", gb_model)
])

# Fit the model
gb_pipe.fit(X_train, y_train)

# Predict on validation set
gb_preds = gb_pipe.predict(X_val)

# Evaluation
gb_acc = accuracy_score(y_val, gb_preds)

print("Boosting Validation Accuracy:", gb_acc)
print("\nClassification Report:")
print(classification_report(y_val, gb_preds))
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, gb_preds))


Boosting Validation Accuracy: 0.9053468208092486

Classification Report:
                     precision    recall  f1-score   support

Insufficient_Weight       0.94      0.92      0.93       505
      Normal_Weight       0.87      0.89      0.88       617
     Obesity_Type_I       0.89      0.89      0.89       582
    Obesity_Type_II       0.96      0.98      0.97       650
   Obesity_Type_III       1.00      1.00      1.00       809
 Overweight_Level_I       0.81      0.77      0.79       485
Overweight_Level_II       0.81      0.82      0.81       504

           accuracy                           0.91      4152
          macro avg       0.90      0.89      0.90      4152
       weighted avg       0.91      0.91      0.91      4152


Confusion Matrix:
[[464  33   0   1   1   5   1]
 [ 28 549   0   0   0  31   9]
 [  1   1 520  15   2  13  30]
 [  0   0  14 634   0   0   2]
 [  0   0   1   2 805   1   0]
 [  3  44   7   0   0 374  57]
 [  0   5  40   7   0  39 413]]


In [17]:
results = {
    "Decision Tree": dt_acc,
    "Bagging": bag_acc,
    "Random Forest": rf_acc,
    "Boosting": gb_acc
}

results_df = (
    pd.DataFrame.from_dict(results, orient="index", columns=["Validation Accuracy"])
      .sort_values("Validation Accuracy", ascending=False)
)

results_df


Unnamed: 0,Validation Accuracy
Boosting,0.905347
Bagging,0.897158
Random Forest,0.892823
Decision Tree,0.842967


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Regularized multinomial logistic regression (L2 / Ridge)
log_reg = LogisticRegression(
    penalty="l2",
    C=1.0,
    multi_class="multinomial",
    solver="lbfgs",
    max_iter=2000,
    random_state=1
)

log_reg_pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", log_reg)
])

# Fit model
log_reg_pipe.fit(X_train, y_train)

# Predict on validation set
log_preds = log_reg_pipe.predict(X_val)

# Evaluation
log_acc = accuracy_score(y_val, log_preds)

print("Regularized Logistic Regression Validation Accuracy:", log_acc)
print("\nClassification Report:")
print(classification_report(y_val, log_preds))


  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights


Regularized Logistic Regression Validation Accuracy: 0.8333333333333334

Classification Report:
                     precision    recall  f1-score   support

Insufficient_Weight       0.85      0.92      0.88       505
      Normal_Weight       0.80      0.76      0.78       617
     Obesity_Type_I       0.78      0.80      0.79       582
    Obesity_Type_II       0.93      0.96      0.94       650
   Obesity_Type_III       0.99      1.00      0.99       809
 Overweight_Level_I       0.67      0.65      0.66       485
Overweight_Level_II       0.68      0.63      0.65       504

           accuracy                           0.83      4152
          macro avg       0.81      0.82      0.81      4152
       weighted avg       0.83      0.83      0.83      4152



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  ret = a @ b
  ret = a @ b
  ret = a @ b


In [21]:
# Full training data (no validation split)
X_full = train_df.drop(columns=["NObeyesdad", "id"])
y_full = train_df["NObeyesdad"]


In [22]:
# Refit boosting model on full data
gb_final = GradientBoostingClassifier(random_state=1)

gb_final_pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", gb_final)
])

gb_final_pipe.fit(X_full, y_full)

# Predict on Kaggle test set
gb_test_preds = gb_final_pipe.predict(test_df.drop(columns=["id"]))

# Create submission file
gb_submission = pd.DataFrame({
    "id": test_df["id"],
    "NObeyesdad": gb_test_preds
})

gb_submission.head()

# Save submission file
gb_submission.to_csv(
    "/Users/machome/Downloads/Multi-Class Prediction of Obesity Risk/submission_boosting.csv",
    index=False
)



In [23]:
# Refit bagging model on full data
bag_final = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=1),
    n_estimators=300,
    random_state=1,
    n_jobs=-1
)

bag_final_pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", bag_final)
])

bag_final_pipe.fit(X_full, y_full)

# Predict on Kaggle test set
bag_test_preds = bag_final_pipe.predict(test_df.drop(columns=["id"]))

# Create submission file
bag_submission = pd.DataFrame({
    "id": test_df["id"],
    "NObeyesdad": bag_test_preds
})

bag_submission.to_csv(
    "/Users/machome/Downloads/Multi-Class Prediction of Obesity Risk/submission_bagging.csv",
    index=False
)

bag_submission.head()


Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III


In [24]:
# Refit random forest model on full data
rf_final = RandomForestClassifier(
    n_estimators=500,
    random_state=1,
    n_jobs=-1
)

rf_final_pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", rf_final)
])

rf_final_pipe.fit(X_full, y_full)

# Predict on Kaggle test set
rf_test_preds = rf_final_pipe.predict(test_df.drop(columns=["id"]))

# Create submission file
rf_submission = pd.DataFrame({
    "id": test_df["id"],
    "NObeyesdad": rf_test_preds
})

rf_submission.to_csv(
    "/Users/machome/Downloads/Multi-Class Prediction of Obesity Risk/submission_random_forest.csv",
    index=False
)

rf_submission.head()


Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III


In [25]:
# Refit decision tree on full data
dt_final = DecisionTreeClassifier(random_state=1)

dt_final_pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", dt_final)
])

dt_final_pipe.fit(X_full, y_full)

# Predict on Kaggle test set
dt_test_preds = dt_final_pipe.predict(test_df.drop(columns=["id"]))

# Create submission file
dt_submission = pd.DataFrame({
    "id": test_df["id"],
    "NObeyesdad": dt_test_preds
})

dt_submission.to_csv(
    "/Users/machome/Downloads/Multi-Class Prediction of Obesity Risk/submission_decision_tree.csv",
    index=False
)

dt_submission.head()


Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
