Kaggle link: <br> https://www.kaggle.com/code/srsses/multiclass-classification-obesity

In [1]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler,OneHotEncoder

from warnings import filterwarnings
# NOTE(review): this silences every warning category for the rest of the session.
filterwarnings('ignore')
# Random seed reused by the stochastic steps further down (e.g. the train/test split).
seed = 42

In [2]:
# Load the training set; the path is relative to the notebook's working directory.
data = pd.read_csv("./data/train.csv")
# Preview the first rows to see which columns are available.
data.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


## Meaning of the columns:
FAVC: Frequent consumption of high-calorie food. This refers to how often an individual eats foods that are high in calories. Frequent consumption of high-calorie foods can contribute to weight gain and obesity.

FCVC: Frequency of consumption of vegetables. This refers to how often an individual eats vegetables. Vegetables are low in calories and high in nutrients, so eating them regularly can help with weight management and overall health.

NCP: Number of main meals. This refers to how many main meals an individual eats per day. There is no single "ideal" number of main meals, but some studies suggest that eating more frequent, smaller meals may be beneficial for weight management.

CAEC: Consumption of food between meals. This refers to how often an individual eats snacks or other foods between meals. Frequent snacking can contribute to excess calorie intake and weight gain.

SMOKE: Smokes (yes or no). This refers to whether or not an individual smokes cigarettes. Smoking is a major risk factor for many health problems, including obesity, heart disease, and cancer.

CH2O: Consumption of water daily. This refers to how much water an individual drinks per day. Drinking plenty of water is important for overall health and may also help with weight management.

SCC: Calories consumption monitoring. This refers to whether or not an individual tracks their calorie intake. Calorie tracking can be a helpful tool for weight management, as it allows individuals to be more aware of how much they are eating.

FAF: Physical activity frequency. This refers to how often an individual engages in physical activity. Regular physical activity is essential for maintaining a healthy weight and reducing the risk of chronic diseases.

TUE: Time using technology devices. This refers to how much time an individual spends using electronic devices such as computers, smartphones, and tablets. Excessive screen time has been linked to a number of health problems, including obesity.

CALC: Consumption of alcohol. This refers to how often and how much alcohol an individual drinks. Excessive alcohol consumption can contribute to weight gain and other health problems.

MTRANS: Mode of transportation. This refers to how an individual gets around, such as by car, public transportation, bicycle, or walking. Active forms of transportation, such as walking and cycling, can help with weight management.

In [3]:
## constants
# Columns every input file must provide. 'id' is required because the error
# messages report offending rows by id. Extra columns (for example the target
# 'NObeyesdad' in the training data) are allowed.
columns = [
    'id', 'Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
    'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC',
    'MTRANS',
]
sex = ['Male', 'Female']
yn = ["yes", "no"]
# Transport modes observed in the training data.
transport = ['Public_Transportation', 'Automobile', 'Walking', 'Motorbike', 'Bike']


def _raise_on_invalid_values(df, is_valid, colname):
    """Raise ValueError listing the ids of the rows where ``is_valid`` is False."""
    bad_ids = df.loc[~is_valid, "id"].tolist()
    if bad_ids:
        raise ValueError(f"Invalid Values found in {colname} column for ids: {bad_ids}")


def data_sanity_check(df):
    """Validate an input frame before it is used for modelling or prediction.

    Checks, in order:
      1. every name in ``columns`` is present in ``df``;
      2. ``Gender`` is one of ``sex`` (case-insensitive);
      3. ``Age``, ``Height`` and ``Weight`` are numeric and strictly positive;
      4. the yes/no columns only contain values from ``yn`` (case-insensitive);
      5. ``MTRANS`` only contains values from ``transport``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to validate.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        On the first failed check, with a message naming the column (and, for
        categorical checks, the ids of the offending rows).

    Notes
    -----
    Side effect kept from the original implementation: once ``Age``, ``Height``
    and ``Weight`` pass validation they are written back to ``df`` as numeric
    columns.
    """
    # Check that the required columns exist (iterate over the expected names,
    # not over the frame, so a missing column is actually detected).
    for col in columns:
        if col not in df.columns:
            raise ValueError(f"Input file does not have a {col} column.")

    # Gender check. astype(str) makes missing / non-string cells fail the
    # membership test instead of crashing on a missing string method.
    gender = df["Gender"].astype(str).str.capitalize()
    _raise_on_invalid_values(df, gender.isin(sex), "Gender")

    # Numeric column check
    num_cols = ["Age", "Height", "Weight"]

    for col in num_cols:
        numeric = pd.to_numeric(df[col], errors="coerce")

        # Cells that were present but could not be parsed as numbers.
        non_numeric = numeric.isna() & df[col].notna()
        if non_numeric.any():
            raise ValueError(
                f"{col} contains non-numeric values for ids: {df.loc[non_numeric, 'id'].tolist()}"
            )

        if (numeric <= 0).any():
            raise ValueError(f"{col} cannot be 0 or negative.")

        df[col] = numeric

    # Columns that should only hold yes / no values
    yn_cols = ["family_history_with_overweight", "FAVC", "SMOKE", "SCC"]

    for colname in yn_cols:
        answers = df[colname].astype(str).str.lower()
        _raise_on_invalid_values(df, answers.isin(yn), colname)

    # MTRANS (exact, case-sensitive match)
    _raise_on_invalid_values(df, df["MTRANS"].isin(transport), "MTRANS")
    

# df = pd.read_excel("./input_test_temp.xlsx")
# data_sanity_check(df)

In [3]:
# Show every column name of the loaded frame as a plain Python list.
print(list(data.columns))

['id', 'Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'NObeyesdad']


In [11]:
# Split the column names of the full frame by dtype: object columns vs. int64/float64 columns.
cat_cols = list(data.select_dtypes(include=["object"]).columns)
num_cols = list(data.select_dtypes(include=["int64", "float64"]).columns)

print(num_cols)

['id', 'Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']


Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,NObeyesdad
0,Male,yes,yes,Sometimes,no,no,Sometimes,Public_Transportation,Overweight_Level_II
1,Female,yes,yes,Frequently,no,no,no,Automobile,Normal_Weight
2,Female,yes,yes,Sometimes,no,no,no,Public_Transportation,Insufficient_Weight
3,Female,yes,yes,Sometimes,no,no,Sometimes,Public_Transportation,Obesity_Type_III
4,Male,yes,yes,Sometimes,no,no,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...
20753,Male,yes,yes,Sometimes,no,no,Sometimes,Public_Transportation,Obesity_Type_II
20754,Male,no,yes,Frequently,no,no,Sometimes,Public_Transportation,Insufficient_Weight
20755,Male,yes,yes,Sometimes,no,no,no,Public_Transportation,Obesity_Type_II
20756,Male,yes,yes,Sometimes,no,no,no,Automobile,Overweight_Level_II


## Preprocessing

In [4]:
# Build a separate frame without the 'id' column; `data` itself is left untouched.
df = data.drop(columns=["id"])

# Features are every remaining column except the target; the target is 'NObeyesdad'.
y = df["NObeyesdad"]
X = df.drop(columns=["NObeyesdad"])

In [12]:
# Distinct transport modes, in order of first appearance.
pd.unique(data["MTRANS"]).tolist()

['Public_Transportation', 'Automobile', 'Walking', 'Motorbike', 'Bike']

In [5]:
# Relative frequency of each target class (normalize=True returns proportions, not counts).
df["NObeyesdad"].value_counts(normalize=True)

NObeyesdad
Obesity_Type_III       0.194913
Obesity_Type_II        0.156470
Normal_Weight          0.148473
Obesity_Type_I         0.140187
Insufficient_Weight    0.121544
Overweight_Level_II    0.121495
Overweight_Level_I     0.116919
Name: proportion, dtype: float64

In [6]:
# from imblearn.over_sampling import SMOTE
# from sklearn.preprocessing import MultiLabelBinarizer

# # Convert multi-labels to binary format
# mlb = MultiLabelBinarizer()
# y_bin = mlb.fit_transform(y)

# # Apply SMOTE
# smote = SMOTE(sampling_strategy='auto', random_state=seed)
# X_resampled, y_resampled = smote.fit_resample(X, y_bin)

# # Convert back to original multi-label format
# y_resampled_multi = mlb.inverse_transform(y_resampled)


In [7]:

# Class names listed in the order that defines their integer codes (list index == code).
# NOTE(review): "Overweight_Level_III" is not among the target values counted earlier
# in this notebook, so its code (4) is never produced — confirm whether it should stay.
unique_classes = [
    "Insufficient_Weight",
    "Normal_Weight",
    "Overweight_Level_I",
    "Overweight_Level_II",
    "Overweight_Level_III",
    "Obesity_Type_I",
    "Obesity_Type_II",
    "Obesity_Type_III",
]
labels = dict(zip(unique_classes, range(len(unique_classes))))
# A class name missing from `labels` raises KeyError rather than being silently dropped.
y_encoded = y.map(labels.__getitem__)

In [8]:
# Hold out 30% of the rows for evaluation, then turn both target splits into integer codes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=seed
)
ytrain_enc, ytest_enc = (
    split.map(labels.__getitem__) for split in (y_train, y_test)
)

In [9]:
## preprocessing steps:
cat_features = X_train.select_dtypes(include=['object'])
num_features = X_train.select_dtypes(include=['int64','float64'])

# Round the numerical values upto two decimal numbers

num_features = num_features.astype('float16')  # Convert to float8
num_features = num_features.apply(lambda x:round(x,2))

# Get the names of the columns
cat_cols = cat_features.columns.to_list()
num_cols = num_features.columns.to_list()

In [10]:
# Peek at a single training row to confirm the feature layout.
X_train.iloc[:1]

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
1846,Male,32.0,1.75,120.0,yes,no,2.0,3.0,Sometimes,no,1.0,no,0.0,2.0,Sometimes,Automobile


In [11]:
# One-hot encode the categorical columns (dropping the first level of each) and
# scale the numeric columns without centering.
# NOTE(review): OneHotEncoder keeps its default handle_unknown='error', so the
# fitted pipeline raises on a category value it did not see during fit —
# confirm that this is the desired behaviour for new input files.
preprocessor = ColumnTransformer([
    ('categorical_encoder', OneHotEncoder(drop='first'),cat_cols),
    ('scaler', RobustScaler(with_centering=False),num_cols)
    ]
)

rf_pipeline = Pipeline([
    ("preprocessing", preprocessor),
    # Seed the forest so that the grid-search scores and the selected model are
    # reproducible from run to run.
    ("classifier", RandomForestClassifier(random_state=seed)),
])

param_grid = {
    "classifier__n_estimators": [100, 200, 300],
    "classifier__max_depth": [10, 15, 20, 25],
}

# KFold cross-validation with 5 folds, shuffled with the notebook-wide seed
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)

grid_search = GridSearchCV(estimator=rf_pipeline, param_grid=param_grid, cv=kfold, scoring="accuracy")

# Fit the GridSearchCV
grid_search.fit(X_train, ytrain_enc)

# Print best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Get the best model (the full pipeline refit on all of X_train with the best parameters)
best_model = grid_search.best_estimator_

Best parameters: {'classifier__max_depth': 20, 'classifier__n_estimators': 300}
Best score: 0.8940812112869925


In [15]:
## Make predictions on the test set
y_pred = best_model.predict(X_test)
# Translate the predicted integer codes back into class names.
y_pred_decoded = list(map(unique_classes.__getitem__, y_pred))

In [16]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the held-out rows, compared by class name.
report = classification_report(y_true=y_test, y_pred=y_pred_decoded)
print(report)

                     precision    recall  f1-score   support

Insufficient_Weight       0.95      0.89      0.92       755
      Normal_Weight       0.81      0.90      0.85       907
     Obesity_Type_I       0.88      0.86      0.87       858
    Obesity_Type_II       0.97      0.98      0.97      1005
   Obesity_Type_III       1.00      1.00      1.00      1207
 Overweight_Level_I       0.78      0.73      0.76       733
Overweight_Level_II       0.79      0.79      0.79       763

           accuracy                           0.89      6228
          macro avg       0.88      0.88      0.88      6228
       weighted avg       0.89      0.89      0.89      6228



In [17]:
from pickle import dump

# Persist the fitted pipeline (preprocessing + classifier) next to the notebook.
with open("rf_classifier.pkl", mode="wb") as model_file:
    dump(best_model, model_file)