# Análisis exploratorio

In [29]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from ydata_profiling import ProfileReport

In [2]:
# Read the obesity CSV into a DataFrame; the path is relative to the
# directory the notebook is run from
dataset = pd.read_csv("../data/obesity-prediction-dataset.csv")

In [10]:
# Preview the first 10 rows to check column names and value formats
dataset.head(10)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II
5,Male,29,1.62,53.0,no,yes,2.0,3.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Automobile,Normal_Weight
6,Female,23,1.5,55.0,yes,yes,3.0,3.0,Sometimes,no,2.0,no,1.0,0.0,Sometimes,Motorbike,Normal_Weight
7,Male,22,1.64,53.0,no,no,2.0,3.0,Sometimes,no,2.0,no,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
8,Male,24,1.78,64.0,yes,yes,3.0,3.0,Sometimes,no,2.0,no,1.0,1.0,Frequently,Public_Transportation,Normal_Weight
9,Male,22,1.72,68.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,1.0,1.0,no,Public_Transportation,Normal_Weight


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); with default
# arguments describe() reports the numeric columns only
dataset.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.315964,1.70162,86.586035,2.418986,2.685651,2.008053,1.010313,0.657861
std,6.357078,0.093368,26.191163,0.533996,0.778079,0.61295,0.850613,0.608926
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,20.0,1.63,65.47,2.0,2.66,1.585,0.125,0.0
50%,23.0,1.7,83.0,2.39,3.0,2.0,1.0,0.625
75%,26.0,1.77,107.43,3.0,3.0,2.48,1.67,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


In [14]:
# Build the profiling report for the full dataset, with plots rendered as SVG
profile = ProfileReport(
    dataset,
    title="Obesity prediction dataset",
    plot={"image_format": "svg"},
)

# Write the report to an HTML file, then embed it in the notebook output
profile.to_file("../data/profile.html")
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 17/17 [00:00<00:00, 543.05it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Take the duplicated rows reported by the profile, save them as CSV
# (index=False leaves the DataFrame index out of the file) and display them
duplicates = profile.get_duplicates()
duplicates.to_csv("../data/duplicates.csv", index=False)
duplicates

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad,# duplicates
7,Male,21,1.62,70.0,no,yes,2.0,1.0,no,no,3.0,no,1.0,0.0,Sometimes,Public_Transportation,Overweight_Level_I,15
3,Female,21,1.52,42.0,no,yes,3.0,1.0,Frequently,no,1.0,no,0.0,0.0,Sometimes,Public_Transportation,Insufficient_Weight,4
0,Female,16,1.66,58.0,no,no,2.0,1.0,Sometimes,no,1.0,no,0.0,1.0,no,Walking,Normal_Weight,2
2,Female,21,1.52,42.0,no,no,3.0,1.0,Frequently,no,1.0,no,0.0,0.0,Sometimes,Public_Transportation,Insufficient_Weight,2
1,Female,18,1.62,55.0,yes,yes,2.0,3.0,Frequently,no,1.0,no,1.0,1.0,no,Public_Transportation,Normal_Weight,2
4,Female,22,1.69,65.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,1.0,1.0,Sometimes,Public_Transportation,Normal_Weight,2
5,Female,25,1.57,55.0,no,yes,2.0,1.0,Sometimes,no,2.0,no,2.0,0.0,Sometimes,Public_Transportation,Normal_Weight,2
6,Male,18,1.72,53.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,0.0,2.0,Sometimes,Public_Transportation,Insufficient_Weight,2
8,Male,22,1.74,75.0,yes,yes,3.0,3.0,Frequently,no,1.0,no,1.0,0.0,no,Automobile,Normal_Weight,2


In [6]:
# Bar chart of the "# duplicates" column, one bar per row of the duplicates
# table, saved as an SVG for the report
fig, ax = plt.subplots()
ax.bar(duplicates.index, duplicates["# duplicates"])
ax.set_ylabel("Number of duplicates")
fig.savefig("../report/img/duplicates.svg", bbox_inches="tight")

In [61]:
def accumulated_bar_chart(labels: list[str], counts: dict[str, pd.Series], output: str):
    """Draw a stacked ("accumulated") bar chart and save it to ``output``.

    One bar segment per entry of ``counts`` is stacked on top of the previous
    ones, every segment is annotated with its value at its centre, and the
    figure is written with ``bbox_inches='tight'``.

    Parameters
    ----------
    labels
        Category names shown on the x axis, one per bar.
    counts
        Maps a legend label to the values of that series. Values are used
        positionally, so each series must already be ordered like ``labels``.
        Missing values (NaN) are treated as 0.
    output
        Path of the image file to write.
    """
    fig = plt.figure()
    ax = fig.subplots()

    # running top of the stack for every bar
    bottom = np.zeros(len(labels))

    for series_name, count in counts.items():
        # Work on a plain float array and zero out NaNs: a single NaN would
        # otherwise leak into `bottom` and blank every segment stacked later.
        heights = np.asarray(count, dtype=float)
        heights = np.where(np.isnan(heights), 0.0, heights)

        bars = ax.bar(labels, heights, label=series_name, bottom=bottom)
        bottom += heights

        ax.bar_label(bars, label_type='center')

    ax.set_xticks(labels)
    ax.set_xticklabels(labels, rotation=90)
    ax.legend()
    # Save through the figure created here rather than pyplot's "current
    # figure", which may be a different one if other figures are open.
    fig.savefig(output, bbox_inches='tight')

In [62]:
# gender and obesity

# Weight categories are listed explicitly so the x axis follows increasing
# weight class instead of the order of first appearance .unique() would give
obesity_types = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Overweight_Level_I',
    'Overweight_Level_II',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III'
]

# row counts per (gender, weight category)
obesity_per_gender = dataset.groupby("Gender")["NObeyesdad"].value_counts()

# One series per gender, re-ordered to match obesity_types; fill_value=0 keeps
# the counts as integers (no NaN) if a category is absent for a gender
obesity_counts = {
    gender: obesity_per_gender[gender].reindex(obesity_types, fill_value=0)
    for gender in ("Male", "Female")
}

# plot in accumulated bar chart
accumulated_bar_chart(
    labels=obesity_types,
    counts=obesity_counts,
    output="../report/img/gender.svg")

obesity_per_gender

Gender  NObeyesdad         
Female  Obesity_Type_III       323
        Insufficient_Weight    173
        Obesity_Type_I         156
        Overweight_Level_I     145
        Normal_Weight          141
        Overweight_Level_II    103
        Obesity_Type_II          2
Male    Obesity_Type_II        295
        Obesity_Type_I         195
        Overweight_Level_II    187
        Normal_Weight          146
        Overweight_Level_I     145
        Insufficient_Weight     99
        Obesity_Type_III         1
Name: count, dtype: int64