# Imports

In [1]:
from collections import OrderedDict

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F
from torchsummary import summary

from plotly import express as px
from plotly import graph_objects as go
from plotly import subplots as sp

from sklearn.model_selection import train_test_split

import progressbar

import copy


# Import Data
I begin the analysis by import the data and changing some of the names to be more descriptive.
I then run describe() and info() to get a sense of the data that I am looking at.


In [2]:
data = pd.read_csv("train.csv", index_col="id").rename(
    columns={
        "FAVC": "Frequent consumption of high caloric food",
        "FCVC": "Frequency of consumption of vegetables",
        "NCP": "Number of main meals",
        "CAEC": "Consumption of food between meals",
        "CH2O": "Consumption of water daily",
        "SCC": "Calories consumption monitoring",
        "FAF": "Physical activity frequency",
        "TUE": "Time using technology devices",
        "CALC": "Consumption of alcohol",
        "MTRANS": "Transportation used",
    }
)
data = pd.concat(
    [
        data[data["NObeyesdad"] == value].sample(2400)
        for value in data["NObeyesdad"].unique().tolist()
    ]
)


In [3]:
data = data.assign(
    **{
        "Age": data.Age.round(),
        "Height": (data.Height * 100).round(),
        "Weight": data.Weight.round(),
        "Frequency of consumption of vegetables": data[
            "Frequency of consumption of vegetables"
        ]
        .round()
        .astype("int"),
        "Number of main meals": data["Number of main meals"].round().astype("int"),
        "Consumption of water daily": 8
        * data["Consumption of water daily"].round().astype("int"),
        "Physical activity frequency": data["Physical activity frequency"]
        .round()
        .astype("int"),
        "Time using technology devices": data["Time using technology devices"]
        .round()
        .astype("int"),
        "BMI": lambda x: x["Weight"]
        / x["Height"]
        / np.where(
            x["Physical activity frequency"] == 0,
            1,
            x["Physical activity frequency"],
        ),
    }
)


In [4]:
# Made a key that I can use to identify the labels once I convert them to their respective index.
label_values = data.NObeyesdad.unique().tolist()


In [5]:
# Feature engineered data

updated_data = (
    pd.get_dummies(
        data.select_dtypes("object").drop(columns=["NObeyesdad"]), drop_first=True
    )
    .join(data.select_dtypes("number"))
    .join(data["NObeyesdad"].apply(label_values.index))
)


In [26]:
updated_data.info()


<class 'pandas.core.frame.DataFrame'>
Index: 16800 entries, 15030 to 10505
Data columns (total 24 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Gender_Male                                    16800 non-null  bool   
 1   family_history_with_overweight_yes             16800 non-null  bool   
 2   Frequent consumption of high caloric food_yes  16800 non-null  bool   
 3   Consumption of food between meals_Frequently   16800 non-null  bool   
 4   Consumption of food between meals_Sometimes    16800 non-null  bool   
 5   Consumption of food between meals_no           16800 non-null  bool   
 6   SMOKE_yes                                      16800 non-null  bool   
 7   Calories consumption monitoring_yes            16800 non-null  bool   
 8   Consumption of alcohol_Sometimes               16800 non-null  bool   
 9   Consumption of alcohol_no                      1680

# Train Test Split


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    updated_data.iloc[:, :-1], updated_data.iloc[:, -1], test_size=0.1, random_state=42
)

X_train


Unnamed: 0_level_0,Gender_Male,family_history_with_overweight_yes,Frequent consumption of high caloric food_yes,Consumption of food between meals_Frequently,Consumption of food between meals_Sometimes,Consumption of food between meals_no,SMOKE_yes,Calories consumption monitoring_yes,Consumption of alcohol_Sometimes,Consumption of alcohol_no,...,Transportation used_Walking,Age,Height,Weight,Frequency of consumption of vegetables,Number of main meals,Consumption of water daily,Physical activity frequency,Time using technology devices,BMI
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
128,False,True,True,False,True,False,False,False,True,False,...,False,21.0,163.0,66.0,3,3,16,0,2,0.404908
17117,True,True,True,True,False,False,False,False,True,False,...,False,19.0,180.0,87.0,3,1,24,3,0,0.161111
4300,False,True,True,False,True,False,False,False,True,False,...,False,21.0,173.0,79.0,2,1,16,2,1,0.228324
19871,True,True,True,False,True,False,False,False,False,True,...,False,33.0,170.0,83.0,2,3,24,3,0,0.162745
6485,True,True,True,False,True,False,False,False,False,True,...,False,22.0,176.0,55.0,3,3,16,2,2,0.156250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7351,True,True,True,False,True,False,False,False,True,False,...,False,26.0,187.0,119.0,2,3,24,1,1,0.636364
20739,True,True,True,False,True,False,False,False,True,False,...,False,41.0,162.0,110.0,3,3,8,1,0,0.679012
6846,True,True,True,True,False,False,False,False,False,True,...,False,18.0,174.0,53.0,3,3,16,1,1,0.304598
10910,True,True,True,False,True,False,False,False,False,True,...,False,30.0,171.0,83.0,3,3,16,2,1,0.242690


# Scale data


In [8]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Enter Pytorch
This particular use case I will use Pytorch to develop a Deep learning model in the spirit of VGG19 which is a well-performing classification model in deep learning.  There are not enough data points to use a true VGG19, so this will be an abridged version with different parameters.


In [25]:
X_train.shape


(15120, 23)

In [9]:
device = torch.cuda.current_device() if torch.cuda.is_available() else "cpu"


# create tensors, datasets, and batches


In [10]:
class dataset(Dataset):
    def __init__(self, X, y):
        super().__init__()
        self.X = torch.FloatTensor(X)
        self.y = torch.zeros(size=(len(y.tolist()), len(label_values)))
        self.y[torch.arange(len(self.y)), y.tolist()] = 1

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):

        return self.X[idx].to(device), self.y[idx].to(device)


In [11]:
train_dataset = dataset(X_train, y_train)
test_dataset = dataset(X_test, y_test)

train_loader = DataLoader(train_dataset, drop_last=True, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=len(X_test))


# Model Definition


In [12]:
# Fully Connected Neural Network


class Classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 64)
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, 256)
        self.fc4 = nn.Linear(256, 7)

    def forward(self, x):
        x = F.relu(F.dropout(self.fc1(x), 0.15))
        x = F.relu(F.dropout(self.fc2(x), 0.15))
        x = F.relu(F.dropout(self.fc3(x), 0.15))
        return self.fc4(x)


In [13]:
# VGG19 style CNN


class CNNClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Sequential(
            OrderedDict(
                [
                    ("Conv1_1", nn.Conv1d(1, 64, 3, 2, 1)),
                    ("Conv1_2", nn.Conv1d(64, 64, 3, 1, 1)),
                ]
            )
        )
        self.conv2 = nn.Sequential(
            OrderedDict(
                [
                    ("Conv2_1", nn.Conv1d(64, 128, 3, 1, 1)),
                    ("Conv2_2", nn.Conv1d(128, 128, 3, 1, 1)),
                ]
            )
        )
        self.conv3 = nn.Sequential(
            OrderedDict(
                [
                    ("Conv3_1", nn.Conv1d(128, 256, 3, 1, 1)),
                    ("Conv3_2", nn.Conv1d(256, 256, 3, 1, 1)),
                ]
            )
        )
        self.fc1 = nn.Sequential(
            OrderedDict(
                [("flattened", nn.Flatten()), ("Output Layer", nn.Linear(256, 7))]
            )
        )

    def forward(self, x):
        x = x.view(-1, 1, x[0].shape[0])
        x = F.max_pool1d(self.conv1(x), 1, 3)
        x = F.max_pool1d(self.conv2(x), 1, 3)
        x = F.max_pool1d(self.conv3(x), 1, 3)
        x = self.fc1(x)
        return x


summary(CNNClassifier().to(device), (1, X_train.shape[1]), 32)


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv1d-1                [32, 64, 1]             256
            Conv1d-2                [32, 64, 1]          12,352
            Conv1d-3               [32, 128, 1]          24,704
            Conv1d-4               [32, 128, 1]          49,280
            Conv1d-5               [32, 256, 1]          98,560
            Conv1d-6               [32, 256, 1]         196,864
           Flatten-7                  [32, 256]               0
            Linear-8                    [32, 7]           1,799
Total params: 383,815
Trainable params: 383,815
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.28
Params size (MB): 1.46
Estimated Total Size (MB): 1.75
----------------------------------------------------------------


# Experiment class definition


In [14]:
class Experiment:
    def __init__(self, model, lossfunc, optimizer, n_epochs):
        self.model = model
        self.lossfunc = lossfunc
        self.optimizer = optimizer
        self.n_epochs = n_epochs
        self.train_loss = torch.zeros(size=(self.n_epochs,), requires_grad=False)
        self.test_loss = torch.zeros(size=(self.n_epochs,), requires_grad=False)
        self.train_accuracy = torch.zeros(size=(self.n_epochs,), requires_grad=False)
        self.test_accuracy = torch.zeros(size=(self.n_epochs,), requires_grad=False)
        self.best_model = None
        self.best_loss = None
        self.best_accuracy = None

    def run(self):
        for epoch in progressbar.progressbar(range(self.n_epochs)):
            self.model.train()
            batch_loss = torch.zeros(size=(len(train_loader),), requires_grad=False)
            batch_accuracy = torch.zeros(size=(len(train_loader),), requires_grad=False)

            for i, (X, y) in enumerate(train_loader):
                y_pred = self.model(X)
                loss = self.lossfunc(y_pred, y)

                batch_loss[i] = loss
                batch_accuracy[i] = (
                    (y_pred.argmax(dim=1) == y.argmax(dim=1)).float().mean()
                )

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            self.train_accuracy[epoch] = batch_accuracy.mean()
            self.train_loss[epoch] = batch_loss.mean()

            if self.best_loss is None or self.best_loss > self.train_loss[epoch]:
                self.best_model = copy.deepcopy(self.model)
                self.best_loss = self.train_loss[epoch]
                self.best_accuracy = self.train_accuracy[epoch]

            self.model.eval()
            X, y = next(iter(test_loader))
            y_pred = self.best_model(X)
            self.test_loss[epoch] = self.lossfunc(y_pred, y)
            self.test_accuracy[epoch] = (
                (y_pred.argmax(dim=1) == y.argmax(dim=1)).float().mean()
            )

    def get_best_model(self):
        return self.best_model


# Model Fitting


In [15]:
models = [Classifier()]


In [16]:
experiments = []
for model in models:
    n_epochs = 40

    lossfunc = torch.nn.CrossEntropyLoss()

    optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

    experiment = Experiment(model.to(device), lossfunc, optimizer, n_epochs)
    experiment.run()
    experiments.append(experiment)


  0% (0 of 40) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--
  2% (1 of 40) |                         | Elapsed Time: 0:00:03 ETA:   0:02:10
  5% (2 of 40) |#                        | Elapsed Time: 0:00:06 ETA:   0:02:01
  7% (3 of 40) |#                        | Elapsed Time: 0:00:10 ETA:   0:02:13
 10% (4 of 40) |##                       | Elapsed Time: 0:00:13 ETA:   0:01:57
 12% (5 of 40) |###                      | Elapsed Time: 0:00:16 ETA:   0:02:01
 15% (6 of 40) |###                      | Elapsed Time: 0:00:20 ETA:   0:01:57
 17% (7 of 40) |####                     | Elapsed Time: 0:00:23 ETA:   0:01:58
 20% (8 of 40) |#####                    | Elapsed Time: 0:00:27 ETA:   0:01:52
 22% (9 of 40) |#####                    | Elapsed Time: 0:00:31 ETA:   0:01:57
 25% (10 of 40) |######                  | Elapsed Time: 0:00:35 ETA:   0:02:01
 27% (11 of 40) |######                  | Elapsed Time: 0:00:39 ETA:   0:01:49
 30% (12 of 40) |#######                

In [28]:
experiments[0].model.best_model_state


AttributeError: 'Classifier' object has no attribute 'best_model_state'

# Model Scoring


In [17]:
fig = sp.make_subplots(
    1,
    2,
)
fig.add_trace(
    trace=go.Scatter(
        x=np.arange(experiment.test_accuracy.shape[0]),
        y=experiment.test_accuracy.detach(),
        name="Accuracy",
    ),
    row=1,
    col=2,
)
fig.add_trace(
    trace=go.Scatter(
        x=np.arange(experiment.test_loss.shape[0]),
        y=experiment.test_loss.detach(),
        name="Test Loss",
    ),
    row=1,
    col=1,
)
fig.add_trace(
    trace=go.Scatter(
        x=np.arange(experiment.train_loss.shape[0]),
        y=experiment.train_loss.detach(),
        name="Training Loss",
    )
)

fig.update_layout({"title": {"text": "Model Outputs"}})

fig.show()


In [24]:
torch.save(experiments[0].get_best_model(), "model.pt")
