In [8]:
from collections import OrderedDict

import pandas as pd

import numpy as np

import torch

import torch.nn as nn

from torch.utils.data import Dataset, DataLoader

from torch.nn import functional as F
from torchsummary import summary


from plotly import express as px
from plotly import graph_objects as go
from plotly import subplots as sp


from sklearn.model_selection import train_test_split


import progressbar

import copy


In [9]:
# Fixed seed and per-class sample size so that "Restart Kernel -> Run All"
# draws the same balanced subset every time.
SEED = 42
SAMPLES_PER_CLASS = 2400

# Load the training table (indexed by "id") and replace the abbreviated survey
# column codes with descriptive names.
data = pd.read_csv("train.csv", index_col="id").rename(
    columns={
        "FAVC": "Frequent consumption of high caloric food",
        "FCVC": "Frequency of consumption of vegetables",
        "NCP": "Number of main meals",
        "CAEC": "Consumption of food between meals",
        "CH2O": "Consumption of water daily",
        "SCC": "Calories consumption monitoring",
        "FAF": "Physical activity frequency",
        "TUE": "Time using technology devices",
        "CALC": "Consumption of alcohol",
        "MTRANS": "Transportation used",
    }
)

# Balance the classes: draw the same number of rows (without replacement) from
# every "NObeyesdad" class, visiting classes in order of first appearance in
# the file. `sample` raises ValueError if a class has fewer than
# SAMPLES_PER_CLASS rows, which is preferable to silently unbalancing the set.
data = pd.concat(
    [
        data[data["NObeyesdad"] == value].sample(
            n=SAMPLES_PER_CLASS, random_state=SEED
        )
        for value in data["NObeyesdad"].unique().tolist()
    ]
)


In [10]:
# Count of rows per "NObeyesdad" class, to inspect the class distribution.
px.histogram(data_frame=data, x="NObeyesdad")


In [11]:
# Columns that are rounded to the nearest whole number and stored as integers.
ROUNDED_INT_COLUMNS = [
    "Frequency of consumption of vegetables",
    "Number of main meals",
    "Physical activity frequency",
    "Time using technology devices",
]


def _weight_height_activity_ratio(frame):
    """Return Weight / Height / activity frequency, with a frequency of 0 treated as 1.

    `DataFrame.assign` calls this on the frame *after* the preceding keyword
    assignments, so it sees the rounded Weight, the scaled-and-rounded Height
    and the integer activity frequency.

    NOTE(review): the result is stored under the name "BMI", but this is not
    the standard body-mass-index formula (kg / m^2) -- confirm the intent.
    """
    activity = frame["Physical activity frequency"]
    return frame["Weight"] / frame["Height"] / np.where(activity == 0, 1, activity)


# NOTE: `data` is reassigned from itself, so running this cell twice without
# re-running the load cell scales Height (x100) and water (x8) a second time.
data = data.assign(
    Age=data["Age"].round(),
    # Height is scaled by 100 before rounding (presumably metres ->
    # centimetres; the summary output shows values around 170).
    Height=data["Height"].mul(100).round(),
    Weight=data["Weight"].round(),
    **{
        column: data[column].round().astype("int")
        for column in ROUNDED_INT_COLUMNS
    },
    **{
        # Rounded level multiplied by 8 -- NOTE(review): unit of the factor
        # is not stated in the notebook.
        "Consumption of water daily": data["Consumption of water daily"]
        .round()
        .astype("int")
        .mul(8),
        # Must stay last: it is a callable that reads the columns set above.
        "BMI": _weight_height_activity_ratio,
    },
)


In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()


Unnamed: 0,Age,Height,Weight,Frequency of consumption of vegetables,Number of main meals,Consumption of water daily,Physical activity frequency,Time using technology devices,BMI
count,16800.0,16800.0,16800.0,16800.0,16800.0,16800.0,16800.0,16800.0,16800.0
mean,23.748631,170.003869,84.850298,2.412679,2.741667,16.232381,1.017857,0.6375,0.427937
std,5.89445,8.817301,25.639355,0.564433,0.756929,5.3624,0.865661,0.666852,0.178196
min,14.0,145.0,39.0,1.0,1.0,8.0,0.0,0.0,0.075581
25%,20.0,163.0,65.0,2.0,3.0,16.0,0.0,0.0,0.2834
50%,22.0,170.0,82.0,2.0,3.0,16.0,1.0,1.0,0.431663
75%,26.0,176.0,106.0,3.0,3.0,16.0,2.0,1.0,0.586592
max,61.0,198.0,165.0,3.0,4.0,24.0,3.0,2.0,0.953757


In [13]:
# Column names, non-null counts and dtypes of the frame.
data.info()


<class 'pandas.core.frame.DataFrame'>
Index: 16800 entries, 20039 to 6228
Data columns (total 18 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Gender                                     16800 non-null  object 
 1   Age                                        16800 non-null  float64
 2   Height                                     16800 non-null  float64
 3   Weight                                     16800 non-null  float64
 4   family_history_with_overweight             16800 non-null  object 
 5   Frequent consumption of high caloric food  16800 non-null  object 
 6   Frequency of consumption of vegetables     16800 non-null  int32  
 7   Number of main meals                       16800 non-null  int32  
 8   Consumption of food between meals          16800 non-null  object 
 9   SMOKE                                      16800 non-null  object 
 10  Consumption of water dai

In [14]:
# Preview the first rows of the frame.
data.head()


Unnamed: 0_level_0,Gender,Age,Height,Weight,family_history_with_overweight,Frequent consumption of high caloric food,Frequency of consumption of vegetables,Number of main meals,Consumption of food between meals,SMOKE,Consumption of water daily,Calories consumption monitoring,Physical activity frequency,Time using technology devices,Consumption of alcohol,Transportation used,NObeyesdad,BMI
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
20039,Female,24.0,158.0,80.0,yes,yes,2,3,Sometimes,no,16,no,1,0,Sometimes,Public_Transportation,Overweight_Level_II,0.506329
5079,Male,34.0,170.0,85.0,yes,yes,2,3,Sometimes,no,8,no,1,1,no,Automobile,Overweight_Level_II,0.5
2500,Female,22.0,173.0,78.0,yes,yes,2,2,Sometimes,no,16,no,1,0,Sometimes,Public_Transportation,Overweight_Level_II,0.450867
12462,Male,21.0,165.0,78.0,yes,no,2,1,Frequently,no,24,no,1,0,no,Public_Transportation,Overweight_Level_II,0.472727
13968,Male,18.0,170.0,80.0,yes,yes,3,3,Sometimes,no,16,no,1,1,Frequently,Public_Transportation,Overweight_Level_II,0.470588


In [15]:
# Lookup list for the target labels: each distinct "NObeyesdad" value, in order
# of first appearance. A label's position in this list serves as its integer
# code, so label_values[i] recovers the label from index i.
label_values = data["NObeyesdad"].drop_duplicates().tolist()


In [7]:
# Build the model-ready table:
#   * text (object) feature columns -> one-hot columns, first level dropped
#   * numeric columns               -> carried over unchanged
#   * "NObeyesdad" target           -> its position in `label_values`
text_features = data.select_dtypes("object").drop(columns=["NObeyesdad"])
number_features = data.select_dtypes("number")
encoded_target = data["NObeyesdad"].apply(label_values.index)

updated_data = (
    text_features.pipe(pd.get_dummies, drop_first=True)
    .join(number_features)
    .join(encoded_target)
)


In [16]:
# Pairwise scatter plots of every column in the encoded table, coloured by the
# integer-encoded target. The canvas is 5000 x 5000 px so the individual
# panels stay legible with this many columns.
px.scatter_matrix(
    data_frame=updated_data,
    color="NObeyesdad",
    width=5000,
    height=5000,
)
