# Feature Engineering

In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
import time

In [2]:
# Read train.csv as-is.
df_tr = pd.read_csv("train.csv")

# Read ObesityDataSet.csv and rename its 'NObeyesdad' column to
# 'WeightCategory' in the same step, so the column name matches the one
# used by train.csv.
df_ob = pd.read_csv("ObesityDataSet.csv").rename(
    columns={'NObeyesdad': 'WeightCategory'}
)


In [5]:
# Summarise the CALC column of df_ob (count, number of unique values,
# most frequent value and its frequency).
df_ob['CALC'].describe()

count          2111
unique            3
top       Sometimes
freq           1402
Name: CALC, dtype: object

In [6]:
# Fold the 'Always' level of CALC into 'Sometimes' in df_ob.
# NOTE(review): presumably done because 'Always' is too rare (or absent in
# train.csv) to be useful as its own category -- confirm against the EDA.
df_ob['CALC'] = df_ob['CALC'].replace('Always', 'Sometimes')

In [7]:
# Compare the column names of the two labelled sources.
tr_columns = set(df_tr.columns)
ob_columns = set(df_ob.columns)

common_cols = tr_columns.intersection(ob_columns)     # present in both frames
only_in_train = tr_columns.difference(ob_columns)     # present in df_tr only
only_in_obesity = ob_columns.difference(tr_columns)   # present in df_ob only

for report_line in (
    f"Common columns: {len(common_cols)}",
    f"Only in train: {only_in_train}",
    f"Only in obesity: {only_in_obesity}",
):
    print(report_line)

Common columns: 17
Only in train: {'id'}
Only in obesity: set()


**Roundoff Columns**

Our EDA suggested rounding these columns to whole numbers, but some of them are kept continuous to measure the effect on model accuracy.

In [8]:
# Combined dataset: stack df_tr and df_ob row-wise into one training frame
# with a fresh 0..n-1 index. df_ob has no 'id' column (see the column
# comparison above), so its rows get NaN in 'id' after the concat.
train_df = pd.concat([df_tr, df_ob], ignore_index=True)
# Load test.csv and keep a separate handle on its 'id' column.
test_df= pd.read_csv("test.csv")
test_ids = test_df['id']

In [9]:
def roundoff(df):
    """Round the NCP and FCVC columns to integers, modifying ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing numeric 'NCP' and 'FCVC' columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with both columns rounded and
        cast to ``int``.
    """
    # FAF, TUE and CH2O are deliberately left continuous for now; add them
    # to this tuple to round them as well.
    columns_to_round = ('NCP', 'FCVC')
    for column in columns_to_round:
        rounded = df[column].round()
        df[column] = rounded.astype(int)
    return df

In [10]:
# Round NCP / FCVC in the combined training frame and in the test frame so
# both share the same representation of these columns.
train_df = roundoff(train_df)
test_df = roundoff(test_df)

In [11]:
# Apply the same rounding to the two source frames, which are exported
# separately at the end of the notebook.
df_tr = roundoff(df_tr)
# Rebind df_ob explicitly so the later export of df_ob does not depend on
# roundoff mutating its argument in place.
df_ob = roundoff(df_ob)

**BMI interactions**

BMI was identified as the most important feature, so we try out several BMI-derived combinations to measure their impact on model accuracy.

In [12]:
def add_bmi_interactions(df):
    """Add BMI and BMI-derived interaction columns to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric 'Weight', 'Height' and 'FAF' columns and a
        'Gender' column holding 'Male' / 'Female'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with these columns added:
        'BMI' (Weight / Height**2), 'HeightWeightRatio' (Height / Weight),
        'BMIxGender' (BMI for 'Male' rows, 0 for 'Female' rows, NaN for any
        other Gender value), 'BMI_FAF' (BMI * FAF), 'BMI_sq' and 'Weight_sq'.
    """
    df['BMI'] = df['Weight'] / (df['Height'] ** 2)
    # Use this frame's own Weight. Dividing by another frame's column would
    # align on the index and silently pair rows from unrelated datasets
    # (or produce NaN where the indexes do not overlap).
    df['HeightWeightRatio'] = df['Height'] / df['Weight']
    df['BMIxGender'] = df['BMI'] * df['Gender'].map({'Male': 1, 'Female': 0})
    df['BMI_FAF'] = df['BMI'] * df['FAF']
    df['BMI_sq'] = df['BMI'] ** 2
    df['Weight_sq'] = df['Weight'] ** 2
    return df

In [13]:
# Add the BMI-derived columns to the combined training frame and to the
# test frame so both end up with the same feature columns.
train_df = add_bmi_interactions(train_df)
test_df = add_bmi_interactions(test_df)

In [14]:
# Add the BMI-derived columns to the two source frames, which are exported
# separately at the end of the notebook.
df_tr = add_bmi_interactions(df_tr)
# Rebind df_ob explicitly so the later export of df_ob does not depend on
# add_bmi_interactions mutating its argument in place.
df_ob = add_bmi_interactions(df_ob)

**Obesity type 3 interactions**

In [17]:
def obesity_type_3(df):
    """Add a 0/1 'Obesity_Type_III_Profile' column to ``df`` in place.

    A row is flagged with 1 only when all of the following hold:
    MTRANS is 'Public_Transportation', NCP rounds to 3, FCVC rounds to 3,
    CALC is 'Sometimes' and family_history_with_overweight is 'yes'.
    Every other row gets 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the five columns listed above.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new column added.
    """
    uses_public_transport = df['MTRANS'].eq('Public_Transportation')
    ncp_is_three = df['NCP'].round().eq(3)
    fcvc_is_three = df['FCVC'].round().eq(3)
    calc_is_sometimes = df['CALC'].eq('Sometimes')
    has_family_history = df['family_history_with_overweight'].eq('yes')

    matches_profile = (
        uses_public_transport
        & ncp_is_three
        & fcvc_is_three
        & calc_is_sometimes
        & has_family_history
    )
    df['Obesity_Type_III_Profile'] = matches_profile.astype(int)
    return df

In [18]:
# Add the Obesity_Type_III_Profile flag to the combined training frame and
# to the test frame.
train_df = obesity_type_3(train_df)
test_df = obesity_type_3(test_df)

In [19]:
# Add the Obesity_Type_III_Profile flag to the two source frames, which are
# exported separately at the end of the notebook.
df_tr = obesity_type_3(df_tr)
# Rebind df_ob explicitly so the later export of df_ob does not depend on
# obesity_type_3 mutating its argument in place.
df_ob = obesity_type_3(df_ob)

**Combine categories in MTRANS**

In [20]:
# Disabled experiment: merge the MTRANS levels 'Walking', 'Motorbike' and
# 'Bike' into a single 'others' level in all four frames. While these lines
# stay commented out, MTRANS keeps its original levels in the exported files.
#test_df['MTRANS'] = test_df['MTRANS'].replace(['Walking','Motorbike','Bike'], 'others')
#train_df['MTRANS'] = train_df['MTRANS'].replace(['Walking','Motorbike','Bike'], 'others')
#df_tr['MTRANS'] = df_tr['MTRANS'].replace(['Walking','Motorbike','Bike'], 'others')
#df_ob['MTRANS'] = df_ob['MTRANS'].replace(['Walking','Motorbike','Bike'], 'others')

**New CSV files with the changes**

In [21]:
# Export the combined (df_tr + df_ob) training frame without the index column.
train_df.to_csv('train_combined.csv', index=False)

In [22]:
# Export the feature-engineered test frame without the index column.
test_df.to_csv('test_final.csv', index=False)

In [23]:
# Export the feature-engineered train.csv frame on its own, without the index column.
df_tr.to_csv('train1.csv', index=False)

In [24]:
# Export the feature-engineered ObesityDataSet.csv frame on its own, without
# the index column.
df_ob.to_csv('Obesity1.csv', index=False)