In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/original/train.csv
/kaggle/input/original/test.csv


# TRAINING DATA PREPROCESSING

In [2]:
import numpy as np
import pandas as pd
from sklearn.svm import OneClassSVM
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler


In [3]:
df = pd.read_csv("/kaggle/input/original/train.csv")

In [4]:
print(df.head())

   id  Gender        Age    Height      Weight family_history_with_overweight  \
0   0    Male  24.443011  1.699998   81.669950                            yes   
1   1  Female  18.000000  1.560000   57.000000                            yes   
2   2  Female  18.000000  1.711460   50.165754                            yes   
3   3  Female  20.952737  1.710730  131.274851                            yes   
4   4    Male  31.641081  1.914186   93.798055                            yes   

  FAVC      FCVC       NCP        CAEC SMOKE      CH2O SCC       FAF  \
0  yes  2.000000  2.983297   Sometimes    no  2.763573  no  0.000000   
1  yes  2.000000  3.000000  Frequently    no  2.000000  no  1.000000   
2  yes  1.880534  1.411685   Sometimes    no  1.910378  no  0.866045   
3  yes  3.000000  3.000000   Sometimes    no  1.674061  no  1.467863   
4  yes  2.679664  1.971472   Sometimes    no  1.979848  no  1.967973   

        TUE       CALC                 MTRANS       WeightCategory  
0  0.976473

In [5]:
print("Dataset Information: \n")
print(df.info())

print(f"Dataset Shape: {df.shape}")

Dataset Information: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15533 entries, 0 to 15532
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              15533 non-null  int64  
 1   Gender                          15533 non-null  object 
 2   Age                             15533 non-null  float64
 3   Height                          15533 non-null  float64
 4   Weight                          15533 non-null  float64
 5   family_history_with_overweight  15533 non-null  object 
 6   FAVC                            15533 non-null  object 
 7   FCVC                            15533 non-null  float64
 8   NCP                             15533 non-null  float64
 9   CAEC                            15533 non-null  object 
 10  SMOKE                           15533 non-null  object 
 11  CH2O                            15533 non-null  float64
 12  SCC      

In [6]:
categorical_cols = df.select_dtypes(include=['object']).columns

for col in categorical_cols:
    print(f"\n🔹 Column: {col}")
    print(f"Unique Categories ({df[col].nunique()}): {df[col].unique()}")


🔹 Column: Gender
Unique Categories (2): ['Male' 'Female']

🔹 Column: family_history_with_overweight
Unique Categories (2): ['yes' 'no']

🔹 Column: FAVC
Unique Categories (2): ['yes' 'no']

🔹 Column: CAEC
Unique Categories (4): ['Sometimes' 'Frequently' 'no' 'Always']

🔹 Column: SMOKE
Unique Categories (2): ['no' 'yes']

🔹 Column: SCC
Unique Categories (2): ['no' 'yes']

🔹 Column: CALC
Unique Categories (3): ['Sometimes' 'no' 'Frequently']

🔹 Column: MTRANS
Unique Categories (5): ['Public_Transportation' 'Automobile' 'Walking' 'Motorbike' 'Bike']

🔹 Column: WeightCategory
Unique Categories (7): ['Overweight_Level_II' 'Normal_Weight' 'Insufficient_Weight'
 'Obesity_Type_III' 'Obesity_Type_II' 'Overweight_Level_I'
 'Obesity_Type_I']


In [7]:
print(df.isnull().sum())

id                                0
Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
WeightCategory                    0
dtype: int64


In [8]:
X = df.iloc[:, :-1]  # all columns except last
Y = df.iloc[:, -1:]

In [9]:
X.shape

(15533, 17)

In [10]:
X = X.drop(columns=['id'])

print("New dataset shape:", X.shape)
print("Columns after removing 'id':", X.columns.tolist())

New dataset shape: (15533, 16)
Columns after removing 'id': ['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS']


In [11]:
X_encoded = pd.get_dummies(X, drop_first=False)
Y_encoded = pd.get_dummies(Y, drop_first=False)
X_encoded = X_encoded.astype(int)
Y_encoded = Y_encoded.astype(int)

Check which features are useful for predicting output

In [12]:
X_encoded.shape


(15533, 30)

In [13]:
Y_encoded.shape

(15533, 7)

In [14]:
df_encoded = pd.concat([X_encoded, Y_encoded], axis=1)

In [15]:
df_encoded.head()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Female,Gender_Male,...,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking,WeightCategory_Insufficient_Weight,WeightCategory_Normal_Weight,WeightCategory_Obesity_Type_I,WeightCategory_Obesity_Type_II,WeightCategory_Obesity_Type_III,WeightCategory_Overweight_Level_I,WeightCategory_Overweight_Level_II
0,24,1,81,2,2,2,0,0,0,1,...,0,1,0,0,0,0,0,0,0,1
1,18,1,57,2,3,2,1,1,1,0,...,0,0,0,0,1,0,0,0,0,0
2,18,1,50,1,1,1,0,1,1,0,...,0,1,0,1,0,0,0,0,0,0
3,20,1,131,3,3,1,1,0,1,0,...,0,1,0,0,0,0,0,1,0,0
4,31,1,93,2,1,1,1,0,0,1,...,0,1,0,0,0,0,0,0,0,1


**SVM**

In [16]:
def remove_outliers_svm(df_encoded, num_target_cols=7, outlier_percent=10):

    X_svm = df_encoded.iloc[:, :-num_target_cols]
    Y_svm = df_encoded.iloc[:, -num_target_cols:]
    
    clf = OneClassSVM(nu=outlier_percent/100, kernel="rbf", gamma='scale')
    clf.fit(X_svm)
    
    preds = clf.predict(X_svm)
    
    X_filtered = X_svm[preds == 1].reset_index(drop=True)
    Y_filtered = Y_svm[preds == 1].reset_index(drop=True)
    
    df_cleaned = pd.concat([X_filtered, Y_filtered], axis=1)
    
    return df_cleaned, X_filtered, Y_filtered


In [17]:
df_encoded = pd.concat([X_encoded, Y_encoded], axis=1)
df_encoded.shape

(15533, 37)

In [18]:
df_svm, X_svm, Y_svm = remove_outliers_svm(df_encoded, num_target_cols=7, outlier_percent=10)

print("New dataset shape:", df_svm.shape)
print("Filtered features shape:", X_svm.shape)
print("Filtered target shape:", Y_svm.shape)


New dataset shape: (13991, 37)
Filtered features shape: (13991, 30)
Filtered target shape: (13991, 7)


**SImilarity score using random forest**

In [19]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(X_svm, Y_svm)

importances = pd.Series(model.feature_importances_, index=X_svm.columns)

print(importances.sort_values(ascending=False).head(31))


Weight                                0.395636
Age                                   0.139417
FCVC                                  0.062797
Gender_Male                           0.046667
Gender_Female                         0.045017
FAF                                   0.042910
NCP                                   0.038642
CH2O                                  0.032788
TUE                                   0.032763
CALC_Sometimes                        0.018895
family_history_with_overweight_no     0.016263
family_history_with_overweight_yes    0.015623
CALC_no                               0.015350
MTRANS_Public_Transportation          0.014021
MTRANS_Automobile                     0.012220
CAEC_Sometimes                        0.011731
CAEC_Frequently                       0.011680
FAVC_no                               0.009677
FAVC_yes                              0.008720
CALC_Frequently                       0.004720
SCC_yes                               0.004617
CAEC_no      

Create_new_dataset

In [20]:
importances_sorted = importances.sort_values(ascending=False)

cumulative_importance = importances_sorted.cumsum()

selected_features = cumulative_importance[cumulative_importance <= 0.95].index.tolist()

if len(selected_features) < len(importances_sorted):
    selected_features.append(cumulative_importance.index[len(selected_features)])

X_selected = X_svm[selected_features]

print("New dataset shape:", X_selected.shape)
print("Selected features:", X_selected.columns.tolist())


New dataset shape: (13991, 17)
Selected features: ['Weight', 'Age', 'FCVC', 'Gender_Male', 'Gender_Female', 'FAF', 'NCP', 'CH2O', 'TUE', 'CALC_Sometimes', 'family_history_with_overweight_no', 'family_history_with_overweight_yes', 'CALC_no', 'MTRANS_Public_Transportation', 'MTRANS_Automobile', 'CAEC_Sometimes', 'CAEC_Frequently']


**now sacling then pca on this new dataset**

In [21]:
X_selected_scaled_ = pd.DataFrame(StandardScaler().fit_transform(X_selected), columns=X_selected.columns)

In [22]:
X_selected_scaled_.shape

(13991, 17)

In [23]:
Y_svm.shape

(13991, 7)

In [24]:
X_selected_scaled_.head()

Unnamed: 0,Weight,Age,FCVC,Gender_Male,Gender_Female,FAF,NCP,CH2O,TUE,CALC_Sometimes,family_history_with_overweight_no,family_history_with_overweight_yes,CALC_no,MTRANS_Public_Transportation,MTRANS_Automobile,CAEC_Sometimes,CAEC_Frequently
0,-0.269274,0.037201,-0.43143,0.900889,-0.900889,-0.901294,-0.880404,0.362909,-0.713787,0.633626,-0.432884,0.432884,-0.589924,0.518454,-0.476514,0.400746,-0.334906
1,-1.3136,-1.067056,-0.43143,-1.110014,1.110014,0.286312,0.441809,0.362909,0.949814,-1.578217,-0.432884,0.432884,1.695133,-1.928811,2.098575,-2.495347,2.985911
2,-1.618195,-1.067056,-2.171953,-1.110014,1.110014,-0.901294,-2.202616,-1.380724,0.949814,-1.578217,-0.432884,0.432884,1.695133,0.518454,-0.476514,0.400746,-0.334906
3,1.906406,-0.69897,1.309093,-1.110014,1.110014,0.286312,0.441809,-1.380724,-0.713787,0.633626,-0.432884,0.432884,-0.589924,0.518454,-0.476514,0.400746,-0.334906
4,0.25289,1.3255,-0.43143,0.900889,-0.900889,0.286312,-2.202616,-1.380724,-0.713787,0.633626,-0.432884,0.432884,-0.589924,0.518454,-0.476514,0.400746,-0.334906


**PCA**

In [25]:
from sklearn.decomposition import PCA
import pandas as pd

def apply_pca(X, variance_threshold=0.95):
   
    pca = PCA(n_components=variance_threshold)
    X_pca = pca.fit_transform(X)
    
    df_pca = pd.DataFrame(X_pca, columns=[f'PC{i+1}' for i in range(X_pca.shape[1])])
    
    components_df = pd.DataFrame(
        pca.components_,
        columns=X.columns,
        index=[f'PC{i+1}' for i in range(X_pca.shape[1])]
    )
    
    explained_variance = pca.explained_variance_ratio_
    
    return df_pca, components_df, explained_variance


In [26]:
df_pca, components_df, explained_variance = apply_pca(X_selected_scaled_, variance_threshold=0.95)


In [27]:
print("Explained variance ratio (in %):")
for i, var in enumerate(explained_variance, 1):
    print(f"PC{i}: {var*100:.2f}%")


Explained variance ratio (in %):
PC1: 20.17%
PC2: 15.25%
PC3: 12.45%
PC4: 11.51%
PC5: 8.93%
PC6: 6.56%
PC7: 5.54%
PC8: 5.30%
PC9: 4.27%
PC10: 3.74%
PC11: 2.68%


In [29]:
X_pca_selected = df_pca

In [30]:
X_pca_selected.shape

(13991, 11)

# INTIAL MODEL FITTING AND CHECKING ACCURACY

**Random forest**

In [31]:

X_train_pca_selected, X_test_pca_selected, y_train_pca_selected, y_test_pca_selected = train_test_split(
    X_pca_selected, Y_svm, test_size=0.3, random_state=42
)

y_train_int_pca_selected = np.argmax(y_train_pca_selected.values, axis=1)
y_test_int_pca_selected = np.argmax(y_test_pca_selected.values, axis=1)

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_pca_selected, y_train_pca_selected)

y_pred_pca_selected = rf.predict(X_test_pca_selected)

print("Random Forest(with pca) Accuracy:", accuracy_score(y_test_pca_selected, y_pred_pca_selected))


Random Forest(with pca) Accuracy: 0.6512625059552167


In [32]:

X_train_encoded, X_test_encoded, y_train_encoded, y_test_encoded = train_test_split(
    X_encoded, Y_encoded, test_size=0.3, random_state=42
)

y_train_int_encoded = np.argmax(y_train_encoded.values, axis=1)
y_test_int_encoded = np.argmax(y_test_encoded.values, axis=1)

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_encoded, y_train_int_encoded)

y_pred_encoded = rf.predict(X_test_encoded)

print("Random Forest(encoded) Accuracy:", accuracy_score(y_test_int_encoded, y_pred_encoded))


Random Forest(encoded) Accuracy: 0.8457081545064378


In [33]:
X_train_encoded, X_test_encoded, y_train_encoded, y_test_encoded = train_test_split(
    X_encoded, Y_encoded, test_size=0.3, random_state=42
)

y_train_int_encoded = np.argmax(y_train_encoded.values, axis=1)
y_test_int_encoded = np.argmax(y_test_encoded.values, axis=1)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb.fit(X_train_encoded, y_train_int_encoded)

y_pred_encoded = xgb.predict(X_test_encoded)

print("XGBoost Accuracy (Encoded features):", accuracy_score(y_test_int_encoded, y_pred_encoded))


XGBoost Accuracy (Encoded features): 0.8669527896995708


In [34]:
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

X_train_pca_selected, X_test_pca_selected, y_train_pca_selected, y_test_pca_selected = train_test_split(
    X_pca_selected, Y_svm, test_size=0.3, random_state=42
)

y_train_int_pca_selected = np.argmax(y_train_pca_selected.values, axis=1)
y_test_int_pca_selected = np.argmax(y_test_pca_selected.values, axis=1)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb.fit(X_train_pca_selected, y_train_int_pca_selected)

y_pred_pca_selected = xgb.predict(X_test_pca_selected)

print("XGBoost Accuracy (PCA-selected features):", accuracy_score(y_test_int_pca_selected, y_pred_pca_selected))


XGBoost Accuracy (PCA-selected features): 0.7529776083849452
