In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Initial seaborn style for plots; a later cell calls sns.set_theme(style="darkgrid"),
# so this setting only applies to figures drawn before that cell runs.
sns.set_style("whitegrid")


# Load Data

In [None]:
# Read the training data, the unlabeled test data and the submission template,
# in that order, from CSV files in the working directory.
df, df_test, df_final = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)


# EDA
1. Dataset overview

2. Univariate analysis - Numeric Features

3. Univariate analysis - Categorical Features

4. Bivariate/Multivariate analysis

5. Data quality checks



# Dataset Overview
    - shape
    - info
    - missing values
    - duplicated values
    - head, tail, sample

In [None]:
def dataset_overview(df):
    """Print a quick structural summary of ``df``.

    Reports the shape, the ``DataFrame.info()`` report, per-column null
    counts, fully duplicated rows, a head/tail/random-sample preview and the
    split between numeric (int64/float64) and object-typed columns.

    Args:
        df: pandas DataFrame to summarise. It is not modified.

    Returns:
        None. Output is printed or rendered through IPython's ``display``.
    """
    print("================== Dataset Overview ==================")
    print(f"Rows: {df.shape[0]}")
    print(f"Columns: {df.shape[1]}")

    # info() prints its own report and returns None, so wrapping it in
    # display() would render a stray "None" underneath the report.
    df.info()
    display(df.isnull().sum())

    print("============ Duplicates Values ============")
    # keep=False flags every member of a duplicate group, not only the repeats.
    duplicate_mask = df.duplicated(keep=False)
    print(f"Duplicated values : {duplicate_mask.sum()}")
    if duplicate_mask.any():
        display(df[duplicate_mask].reset_index())

    print("============ Data Preview ============")
    print("Head:")
    display(df.head(3))
    print("Tail:")
    display(df.tail(3))
    print("Sample:")
    # sample() raises when asked for more rows than exist, so cap the size.
    display(df.sample(min(3, len(df))))

    print("============ Numerical and Categorical Values ============")
    num_cols = df.select_dtypes(include=['int64', 'float64']).columns
    cat_cols = df.select_dtypes(include=['object']).columns
    print(f"Numerical Datatypes: {num_cols}")
    print(f"Number of numeric features: {len(num_cols)}")
    print(f"Categorical Datatypes: {cat_cols}")
    print(f"Number of categorical features: {len(cat_cols)}")


In [None]:
# Dataset dimensions (same figures that dataset_overview prints).
print(f"Rows: {df.shape[0]}")
print(f"Columns: {df.shape[1]}")

In [None]:
# Per-column dtype and non-null summary of the training data.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Count rows that belong to a duplicate group (keep=False marks every member),
# and show them when at least one duplicate exists.
print(f"Duplicated values : {df.duplicated(keep=False).sum()}")
if df.duplicated().sum()>0:
    display(df[df.duplicated(keep=False)].reset_index())

In [None]:
# Preview the first rows, the last rows and a random sample of the data.
# The sample is drawn without a fixed random_state, so it changes between runs.
print("Head:")
display(df.head(3))
print("Tail:")
display(df.tail(3))
print("Sample:")
display(df.sample(3))

In [None]:
# Split the columns by dtype. These two globals are reused by later cells:
# num_cols still contains 'id' and cat_cols still contains the target
# 'NObeyesdad' at this point; later cells drop them before use.
num_cols = df.select_dtypes(include=['int64','float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns
print(f"Numerical Datatypes: {num_cols}")
print(f"Number of numeric features: {len(num_cols)}")
print(f"Categorical Datatypes: {cat_cols}")
print(f"Number of categorical features: {len(cat_cols)}")


In [None]:
# Class frequencies of the target column.
df['NObeyesdad'].value_counts()

# Observations
1. There are 20758 rows of data and 18 columns
2. There is one id column, 8 numerical columns, and 9 categorical columns (including the target)
3. The target column (NObeyesdad) is categorical with 7 unique values 


# Univariate analysis - Numeric Features
    - Distribution plots (hist and box)
    - Describe
    - Outliers 

In [None]:
# Configure style and palette in a single call: set_theme() applies its own
# default palette, so calling it after set_palette("pastel") discarded the
# pastel palette chosen on the previous line.
sns.set_theme(style="darkgrid", palette="pastel")

In [None]:
def num_analysis(df, col, whisker=1.5):
    """Plot and summarise a single numeric column of ``df``.

    Draws a box plot and a histogram (with KDE) side by side, shows the
    ``describe()`` statistics, and lists rows whose value lies outside the
    IQR fences ``[Q1 - whisker * IQR, Q3 + whisker * IQR]``.

    Args:
        df: pandas DataFrame containing ``col``.
        col: Name of the numeric column to analyse.
        whisker: Multiplier applied to the IQR to build the outlier fences.
            Defaults to the conventional 1.5.

    Returns:
        None. Results are printed, plotted or rendered through ``display``.
    """
    print(f"****************************** {col} analysis ******************************")
    # Plot box and hist plots
    fig, axs = plt.subplots(1, 2, figsize=(10, 5))
    axs[0].set_title(f"{col} boxplot")
    axs[0].tick_params(axis='x', rotation=45)
    sns.boxplot(data=df, x=col, ax=axs[0])
    axs[1].set_title(f"{col} histplot")
    axs[1].tick_params(axis='x', rotation=45)

    sns.histplot(data=df, x=col, ax=axs[1], kde=True)
    plt.tight_layout()
    plt.show()

    # Get describe()
    print(f"********************  {col} values description  ********************")
    display(df[col].describe().to_frame().style.background_gradient(cmap='cool'))

    print(f"********************  {col} outliers  ********************")
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1

    print(f"IQR : {iqr}")

    # Outliers lie beyond the IQR fences. Comparing against Q1/Q3 directly
    # would flag roughly half of the rows as outliers.
    upper_fence = q3 + whisker * iqr
    lower_fence = q1 - whisker * iqr
    upper_outliers = df[df[col] > upper_fence]
    lower_outliers = df[df[col] < lower_fence]

    if len(upper_outliers) > 0:
        print(f"****** Upper Outliers ******")
        print(f"Upper outlier count: {len(upper_outliers)}")
        display(upper_outliers.head(3))

    if len(lower_outliers) > 0:
        print(f"****** Lower Outliers ******")
        print(f"Lower outlier count: {len(lower_outliers)}")
        display(lower_outliers.head(3))

    # Blank lines to separate consecutive column reports.
    print("")
    print("")
    print("")
    print("")



In [None]:
# Leave the 'id' column out of the numeric analysis.
num_cols_for_analysis = num_cols.drop('id')

In [None]:
# Run the numeric report for every numeric column except 'id'.
for col in num_cols_for_analysis:
    num_analysis(df,col)

# Univariate analysis - Categorical Features
    - Value counts
    - Unique values
    - Bar chart

In [None]:
def categorical_analysis(df,col):
    """Summarise one categorical column of ``df``.

    Prints the number of distinct values. Columns with fewer than 10 distinct
    values get a bar-style histogram whose bars are annotated with their count
    and their share of the non-null rows; other columns get a table of the
    five most frequent values.

    Args:
        df: pandas DataFrame containing ``col``.
        col: Name of the column to analyse.

    Returns:
        None. Results are printed, plotted or rendered through ``display``.
    """
    print(f"****************************** {col} analysis ******************************")

    n_unique = df[col].nunique()
    print(f"Number of Unique Values: {n_unique}")

    if n_unique >= 10:
        # Too many levels for a readable chart: show the most common ones.
        print(f"Top values for {col}")
        display(df[col].value_counts().reset_index().head(5))
    else:
        fig, ax = plt.subplots(figsize=(8, 4))
        sns.histplot(data=df, x=col, ax=ax, hue=col, legend=False)
        ax.set_title("Value Distribution")
        for label in ax.get_xticklabels():
            label.set_rotation(45)

        # Annotate each non-empty bar with "count (percentage)".
        non_null_total = df[col].notna().sum()
        drawn_bars = (bar for container in ax.containers for bar in container)
        for bar in drawn_bars:
            bar_height = bar.get_height()
            if bar_height == 0:
                continue  # nothing to label on an empty bar
            share = f'{100 * bar_height / non_null_total:.1f}%'
            ax.annotate(
                f'{int(bar_height)}\n({share})',
                xy=(bar.get_x() + bar.get_width() / 2, bar_height),
                xytext=(0, 3),
                textcoords="offset points",
                ha='center',
                va='bottom',
                fontsize=9,
            )

        plt.tight_layout()
        plt.show()

    print("")
    print("")

In [None]:
# Run the categorical report for every object-typed column
# (cat_cols still includes the target 'NObeyesdad' at this point).
for col in cat_cols:
    categorical_analysis(df,col)

# Observations 
    - As all categorical features have fewer than 15 unique values, they can safely be one-hot encoded
    - The classes are quite imbalanced, so the models used should account for this imbalance

In [None]:
# List the column names for reference.
df.columns

# Multivariate Analysis:
 


In [None]:
# What is the relationship between gender and weight diagnosis?
# Cross-tabulate the target classes (rows) against gender (columns) and draw
# the counts as an annotated heatmap.
ct = pd.crosstab(df['NObeyesdad'], df['Gender'])
sns.heatmap(ct, annot=True, fmt='d', cmap='Blues')
plt.title('Gender / Nobeyesdad Crosstab heatmap')
plt.show()

In [None]:
# List the column names again for reference.
df.columns

In [None]:
# Is there a relationship between weight and cups of water (CH2O)?
# Pairwise correlation of the two columns, drawn as an annotated heatmap.

df_corr = df[['CH2O','Weight']].corr()
sns.heatmap(df_corr,annot=True)



In [None]:
# What is the relationship between the number of servings of vegetables and obesity diagnosis?
# One violin per target class showing the distribution of FCVC.
plt.figure(figsize=(15,5))
plt.title("FCVC vs Diagnosis")
sns.violinplot(data=df, x='FCVC', hue='NObeyesdad')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')  # Move legend outside right
plt.tight_layout()
plt.show()


# Preprocessing
1. Drop ID

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
import xgboost as xgb
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

In [None]:
# Drop the identifier, then separate the features (X) from the target (y).
df_t = df.drop(columns=['id'])
X = df_t.drop(columns=['NObeyesdad'])
y = df_t['NObeyesdad']

In [None]:
# Hold out 30% of the rows for evaluation (fixed seed for reproducibility).
# NOTE(review): the split is not stratified although the notebook observes that
# the classes are imbalanced; consider passing stratify=y.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=43)

In [None]:
# Target class frequencies, shown again before defining the label encoding.
df['NObeyesdad'].value_counts()

In [None]:
# Integer code for each target label: a label's position in this list is the
# code assigned to it (0 through 6).
BMI_mapping = {
    label: code
    for code, label in enumerate([
        'Insufficient_Weight',
        'Normal_Weight',
        'Overweight_Level_I',
        'Overweight_Level_II',
        'Obesity_Type_I',
        'Obesity_Type_II',
        'Obesity_Type_III',
    ])
}

In [None]:
# Integer-encode the full target column with BMI_mapping
# (a label that is absent from the mapping would become NaN).
y_encoded = y.map(BMI_mapping)




In [None]:
# CatBoost classifier with default settings; rebound to the tuned model in a later cell.
model_cat = CatBoostClassifier()

In [None]:
# Feature columns available to the model.
X_train.columns

In [None]:
# Keep only the categorical *feature* columns by removing the target.
# errors='ignore' makes the cell safe to re-run: once 'NObeyesdad' has been
# removed, a plain drop() would raise KeyError on the second execution.
cat_cols = cat_cols.drop('NObeyesdad', errors='ignore')

In [None]:
def pipeline_stack(model):
    """Wrap ``model`` in a preprocessing + estimator pipeline.

    The columns listed in the module-level ``cat_cols`` are one-hot encoded
    (categories unseen during fit are ignored at transform time); every other
    column is passed through unchanged.

    Args:
        model: Estimator to use as the final pipeline step.

    Returns:
        An unfitted ``Pipeline`` with the steps ``'preprocecssor'`` and ``'model'``.
    """
    one_hot = OneHotEncoder(handle_unknown='ignore')
    encode_categoricals = ColumnTransformer(
        transformers=[('cat', one_hot, cat_cols)],
        remainder='passthrough',
    )

    # The step name keeps its original spelling so code addressing the step
    # by name continues to work.
    return Pipeline(steps=[
        ('preprocecssor', encode_categoricals),
        ('model', model),
    ])

# Pipeline around the current model_cat instance.
pipeline = pipeline_stack(model_cat)


In [None]:
def objective(trial):
    """Optuna objective: mean 3-fold CV accuracy of a CatBoost pipeline.

    Tunes ``iterations``, ``learning_rate`` and ``depth``; every other
    CatBoost setting is fixed. Cross-validation uses the training split only,
    so the hold-out split (``X_test`` / ``y_test``) stays unseen during
    tuning and its accuracy remains an unbiased estimate.

    Args:
        trial: ``optuna.trial.Trial`` used to sample the hyper-parameters.

    Returns:
        Mean accuracy across the three cross-validation folds.
    """
    params = {
            'iterations': trial.suggest_int('iterations',650,1000),
            'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.2),
            'depth': trial.suggest_int('depth', 3, 10),
            'l2_leaf_reg': 7.3,
            'verbose': 0,
            'random_seed': 42,
            'loss_function': 'MultiClass',
            'task_type': 'GPU',
            'early_stopping_rounds':20,
            'eval_metric':'Accuracy',
            'classes_count':7

        }

    model = CatBoostClassifier(**params)

    pipeline = pipeline_stack(model)

    # Score on the training split only. Cross-validating on the full X would
    # let the hold-out rows influence the choice of hyper-parameters.
    scores = cross_val_score(
        pipeline,
        X_train,
        y_train.map(BMI_mapping),
        cv=3,
        scoring='accuracy',
        n_jobs=1,
    )
    return scores.mean()

# Maximise the objective with a seeded TPE sampler, running 50 trials.
tpe_sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)
study.optimize(objective, n_trials=50)

In [None]:
# Plot how much each tuned hyper-parameter contributed to the objective value.
optuna.visualization.plot_param_importances(study)

In [None]:
# study.best_params only contains the values sampled through `trial`
# (iterations, learning_rate, depth). The settings that were fixed for every
# trial are re-applied here so the final model matches the tuned configuration
# instead of silently falling back to CatBoost's defaults.
# NOTE(review): 'classes_count' is left out on purpose. It applies to integer
# labels only, and CatBoost infers the classes from the labels given to fit().
params = {
    'l2_leaf_reg': 7.3,
    'verbose': 0,
    'random_seed': 42,
    'loss_function': 'MultiClass',
    'task_type': 'GPU',
    'early_stopping_rounds': 20,
    'eval_metric': 'Accuracy',
    **study.best_params,
}

In [None]:
# Show the parameters that will be used for the final model.
print(params)

In [None]:
# Build the final CatBoost model from `params` and wrap it in the preprocessing pipeline.
model_cat = CatBoostClassifier(**params)

pipeline_final = pipeline_stack(model_cat)

In [None]:
# NOTE(review): this fit uses the raw string labels. A later cell refits the
# same pipeline on the integer-encoded labels, which replaces this fit, so
# this cell looks redundant.
pipeline_final.fit(X_train,y_train)

In [None]:
from sklearn.metrics import accuracy_score 

In [None]:
# Inspect the hold-out labels (still strings at this point).
y_test

In [None]:
# Train on the integer-encoded training labels and report hold-out accuracy.

# Encode labels
y_train_enc = y_train.map(BMI_mapping)
y_test_enc = y_test.map(BMI_mapping)

# Fit pipeline
pipeline_final.fit(X_train, y_train_enc)

# Predict
y_preds = pipeline_final.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test_enc, y_preds)
print(accuracy)

In [None]:
# Features of the competition test set (identifier removed, as for training).
df_to_predict = df_test.drop(columns=['id'])



In [None]:
# Preview the rows that will be scored.
df_to_predict.head(3)

In [None]:
# Predict the class codes for the competition test set.
y_pred_final = pipeline_final.predict(df_to_predict)

In [None]:
# Preview the submission template before filling it in.
df_final.head(3)

In [None]:
# Inspect the raw predictions.
y_pred_final

In [None]:
# Reverse lookup: integer class code -> original label string.
index_to_category = {v: k for k, v in BMI_mapping.items()}

def get_bmi_category(index):
    """Return the label string for an integer class code."""
    return index_to_category[index]

# Flatten first so the result is a 1-D array suitable for a DataFrame column.
raw_predictions = np.ravel(y_pred_final)
if np.issubdtype(raw_predictions.dtype, np.number):
    y_pred_final = np.array([get_bmi_category(int(code)) for code in raw_predictions])
else:
    # The predictions were already decoded by an earlier run of this cell.
    # Leaving them untouched keeps the cell re-runnable instead of raising
    # KeyError on the label strings.
    y_pred_final = raw_predictions

In [None]:
# Write the predicted labels into the submission frame.
df_final['NObeyesdad'] = y_pred_final

In [None]:
# Check the filled-in submission frame.
df_final.head(3)

In [None]:
# Save the submission file without the DataFrame index.
df_final.to_csv("submission.csv",index=False)