In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the input directory,
# in the order os.walk discovers them, and print one path per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv
/kaggle/input/playground-series-s4e2/sample_submission.csv
/kaggle/input/playground-series-s4e2/train.csv
/kaggle/input/playground-series-s4e2/test.csv


# Introduction

Today our task is to create a model that predicts obesity risk.

![](https://sa1s3optim.patientpop.com/assets/images/provider/photos/2647979.jpg)

# Libraries

Let's start by importing all the libraries we need.

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from catboost import CatBoostClassifier
import xgboost as xgb
import pandas as pd
import numpy as np

# Downloading data

Now we have to load the data and create a DataFrame from it.

In [3]:
# Competition training data. The first CSV column is used as the index so it
# does not end up among the feature columns.
train_df = pd.read_csv('/kaggle/input/playground-series-s4e2/train.csv', index_col=0)

# Second data source that is stacked under the competition data.
original_df = pd.read_csv('/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv')

# ignore_index=True gives the stacked frame a fresh 0..n-1 index; without it
# the index labels of the two sources collide and label-based lookups become
# ambiguous.
concat_df = pd.concat([train_df, original_df], axis=0, ignore_index=True)

# Remove rows that are exact duplicates across all columns, then renumber so
# the index stays contiguous. Row order is unchanged.
df = concat_df.drop_duplicates().reset_index(drop=True)

df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


# Splitting Data

We will split the data into train and test sets using Stratified Shuffle Split.

In [4]:
# A single stratified 90/10 shuffle split on the target column, with a fixed
# seed so the partition is reproducible.
stratified_splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)

# n_splits=1, so the splitter yields exactly one (train, test) pair of
# positional index arrays.
train_index, test_index = next(stratified_splitter.split(df, df['NObeyesdad']))
train_set, test_set = df.iloc[train_index], df.iloc[test_index]

# Creating Pipeline

We will run the whole process in a single pipeline, which is the best way to do it.

In [5]:
# Function to add a new feature 'over_20'
def add_over_20_feature(df):
    """Return a copy of ``df`` with a binary ``over_20`` column added.

    ``over_20`` is 1 where ``Age`` is strictly greater than 20 and 0
    otherwise. A missing age never compares greater than 20, so it maps to 0.

    The input frame is not modified. This function runs as a pipeline step on
    every frame passed to ``fit``/``predict``; writing the column in place
    would silently alter the caller's data and can raise pandas
    SettingWithCopy warnings when the caller passes a slice of another frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric ``Age`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns followed by ``over_20`` (int64).
    """
    return df.assign(over_20=(df['Age'] > 20).astype('int64'))

# Columns handed to the StandardScaler branch of the preprocessor
numerical_cols = [
    'Age',
    'Height',
    'Weight',
    'FCVC',
    'NCP',
    'CH2O',
    'FAF',
    'TUE',
]

# Columns handed to the OneHotEncoder branch of the preprocessor
categorical_cols = [
    'Gender',
    'CAEC',
    'CALC',
    'MTRANS',
    'family_history_with_overweight',
    'FAVC',
    'SMOKE',
    'SCC',
]

# Create a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        # Standardize the numeric columns.
        ('numerical', StandardScaler(), numerical_cols),
        # One-hot encode the categorical columns, dropping the first level of
        # each to avoid a redundant indicator.
        # handle_unknown='ignore' encodes a level that was not present when
        # the encoder was fitted as all zeros instead of raising at predict
        # time. A rare level can easily fall entirely outside the 90% training
        # partition. Combining it with drop='first' needs scikit-learn >= 1.0.
        ('categorical', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
    ],
    # Columns not listed above (such as the engineered 'over_20' flag) are
    # forwarded unchanged.
    remainder='passthrough'
)

# Create the full pipeline:
#   1. add the engineered 'over_20' column,
#   2. scale / one-hot encode the columns,
#   3. classify with a soft-voting ensemble of three tree-based models.
pipeline = Pipeline([
    ('add_feature', FunctionTransformer(add_over_20_feature)),
    ('preprocessor', preprocessor),
    ('voting_ensemble', VotingClassifier(
        estimators=[
            ('xgb', xgb.XGBClassifier(
                # Soft voting averages class probabilities, so use the
                # probability-output objective. 'multi:softmax' trains the
                # same model but is defined to output class labels, and only
                # recent xgboost releases convert that back to probabilities.
                # The number of classes is inferred from the labels at fit
                # time, so no hard-coded num_class is needed.
                objective='multi:softprob',
                eval_metric='mlogloss',
                max_depth=15,
                # learning_rate / random_state are the scikit-learn wrapper's
                # documented names for the native 'eta' / 'seed' parameters.
                learning_rate=0.1,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42
            )),
            ('catboost', CatBoostClassifier(
                iterations=500,
                learning_rate=0.1,
                depth=6,
                loss_function='MultiClass',
                custom_metric='Accuracy',
                random_seed=42,
                verbose=False
            )),
            ('random_forest', RandomForestClassifier(
                n_estimators=170,
                random_state=42
            ))
        ],
        # Average the predicted class probabilities of the three models.
        voting='soft'
    ))
])

# Training model

In [6]:
# Pull the label column out of the training partition; everything else is
# the feature matrix.
y_train = train_set['NObeyesdad'].copy()
X_train = train_set.drop(columns=['NObeyesdad'])

# Fit every step of the pipeline (feature step, preprocessing, ensemble)
pipeline.fit(X_train, y_train)

# Evaluating the model

In [7]:
# Hold-out labels and features, split the same way as the training partition
y_test = test_set['NObeyesdad'].copy()
X_test = test_set.drop(columns=['NObeyesdad'])

# Predict the hold-out rows with the fitted pipeline
pipeline_predictions = pipeline.predict(X_test)

# Share of hold-out rows whose predicted class matches the true class
test_accuracy_pipeline = accuracy_score(y_true=y_test, y_pred=pipeline_predictions)
print(f'Pipeline Test Accuracy: {test_accuracy_pipeline}')

Pipeline Test Accuracy: 0.912035010940919


The model reaches 91.2% accuracy on the hold-out set, which is reasonably good.

# Preparing .csv file for submission

In [8]:
# Creating test_df dataframe from test.csv
test_df = pd.read_csv('/kaggle/input/playground-series-s4e2/test.csv')

# 'id' only identifies the row. The training frame was loaded with its first
# column as the index, so the pipeline never saw an 'id' feature. Drop it
# here so the model receives the same columns it was fitted on, instead of
# relying on the preprocessor to discard the extra column. drop() returns a
# new frame, so test_df itself is never touched by the pipeline's steps.
submission_features = test_df.drop(columns=['id'])

# Make predictions on the submission set
submission_predictions = pipeline.predict(submission_features)

# Create a DataFrame for the submission: one row per test id with its
# predicted class
submission_df = test_df[['id']].copy()
submission_df['NObeyesdad'] = submission_predictions

# Save the submission DataFrame to a CSV file (no index column)
submission_df.to_csv('submission.csv', index=False)