In [1]:
from collections import OrderedDict
import pickle

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam

from torchsummary import summary

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

import skorch


In [2]:
data = pd.read_csv("train.csv", index_col="id").rename(
    columns={
        "FAVC": "Frequent consumption of high caloric food",
        "FCVC": "Frequency of consumption of vegetables",
        "NCP": "Number of main meals",
        "CAEC": "Consumption of food between meals",
        "CH2O": "Consumption of water daily",
        "SCC": "Calories consumption monitoring",
        "FAF": "Physical activity frequency",
        "TUE": "Time using technology devices",
        "CALC": "Consumption of alcohol",
        "MTRANS": "Transportation used",
    }
)
data = pd.concat(
    [
        data[data["NObeyesdad"] == value].sample(2400)
        for value in data["NObeyesdad"].unique().tolist()
    ]
)


In [3]:
data = data.assign(
    **{
        "Age": data.Age.round(),
        "Height": (data.Height * 100).round(),
        "Weight": data.Weight.round(),
        "Frequency of consumption of vegetables": data[
            "Frequency of consumption of vegetables"
        ]
        .round()
        .astype("int"),
        "Number of main meals": data["Number of main meals"].round().astype("int"),
        "Consumption of water daily": 8
        * data["Consumption of water daily"].round().astype("int"),
        "Physical activity frequency": data["Physical activity frequency"]
        .round()
        .astype("int"),
        "Time using technology devices": data["Time using technology devices"]
        .round()
        .astype("int"),
        "BMI": lambda x: x["Weight"]
        / x["Height"]
        / np.where(
            x["Physical activity frequency"] == 0,
            1,
            x["Physical activity frequency"],
        ),
    }
)


In [4]:
# Made a key that I can use to identify the labels once I convert them to their respective index.
label_values = data.NObeyesdad.unique().tolist()


In [5]:
# Feature engineered data

updated_data = (
    pd.get_dummies(
        data.select_dtypes("object").drop(columns=["NObeyesdad"]), drop_first=True
    )
    .join(data.select_dtypes("number"))
    .join(data["NObeyesdad"].apply(label_values.index))
)


In [6]:
updated_data.info()


<class 'pandas.core.frame.DataFrame'>
Index: 16800 entries, 12709 to 7070
Data columns (total 24 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Gender_Male                                    16800 non-null  bool   
 1   family_history_with_overweight_yes             16800 non-null  bool   
 2   Frequent consumption of high caloric food_yes  16800 non-null  bool   
 3   Consumption of food between meals_Frequently   16800 non-null  bool   
 4   Consumption of food between meals_Sometimes    16800 non-null  bool   
 5   Consumption of food between meals_no           16800 non-null  bool   
 6   SMOKE_yes                                      16800 non-null  bool   
 7   Calories consumption monitoring_yes            16800 non-null  bool   
 8   Consumption of alcohol_Sometimes               16800 non-null  bool   
 9   Consumption of alcohol_no                      16800

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    updated_data.iloc[:, :-1], updated_data.iloc[:, -1], test_size=0.1, random_state=42
)

X_train


Unnamed: 0_level_0,Gender_Male,family_history_with_overweight_yes,Frequent consumption of high caloric food_yes,Consumption of food between meals_Frequently,Consumption of food between meals_Sometimes,Consumption of food between meals_no,SMOKE_yes,Calories consumption monitoring_yes,Consumption of alcohol_Sometimes,Consumption of alcohol_no,...,Transportation used_Walking,Age,Height,Weight,Frequency of consumption of vegetables,Number of main meals,Consumption of water daily,Physical activity frequency,Time using technology devices,BMI
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
444,False,True,True,False,True,False,False,False,False,True,...,False,24.0,150.0,45.0,1,3,8,0,1,0.300000
19482,False,True,True,False,True,False,False,False,False,True,...,False,34.0,168.0,77.0,2,2,24,1,0,0.458333
15371,False,False,True,False,True,False,False,False,False,True,...,False,34.0,168.0,78.0,3,1,8,0,1,0.464286
13779,True,False,False,False,True,False,False,True,False,True,...,False,33.0,175.0,85.0,2,3,24,0,2,0.485714
2848,True,True,True,False,True,False,False,False,True,False,...,False,18.0,170.0,50.0,1,3,8,1,1,0.294118
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15253,True,True,True,False,True,False,False,False,True,False,...,False,32.0,176.0,121.0,2,3,24,2,0,0.343750
5391,True,True,True,False,True,False,False,False,False,True,...,False,31.0,169.0,102.0,3,3,8,1,0,0.603550
15853,False,False,True,True,False,False,False,False,False,True,...,False,18.0,170.0,50.0,1,3,16,1,1,0.294118
14138,True,False,True,False,True,False,False,False,False,False,...,False,35.0,173.0,85.0,3,3,16,1,1,0.491329


In [8]:
device = torch.cuda.current_device() if torch.cuda.is_available() else "cpu"


In [9]:
class dataset(Dataset):
    def __init__(self, X, y):
        super().__init__()
        self.X = torch.FloatTensor(X)
        self.y = torch.LongTensor(y.values)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):

        return self.X[idx].to(device), self.y[idx].to(device)


In [10]:
# train_dataset = dataset(X_train, y_train)
# test_dataset = dataset(X_test, y_test)

# train_loader = DataLoader(train_dataset, drop_last=True, batch_size=16, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=len(X_test))


In [11]:
# VGG19 style CNN


class CNNClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Sequential(
            OrderedDict(
                [
                    ("Conv1_1", nn.Conv1d(1, 64, 3, 2, 1)),
                    ("Conv1_2", nn.Conv1d(64, 64, 3, 1, 1)),
                ]
            )
        )
        self.conv2 = nn.Sequential(
            OrderedDict(
                [
                    ("Conv2_1", nn.Conv1d(64, 128, 3, 1, 1)),
                    ("Conv2_2", nn.Conv1d(128, 128, 3, 1, 1)),
                ]
            )
        )
        self.conv3 = nn.Sequential(
            OrderedDict(
                [
                    ("Conv3_1", nn.Conv1d(128, 256, 3, 1, 1)),
                    ("Conv3_2", nn.Conv1d(256, 256, 3, 1, 1)),
                ]
            )
        )
        self.fc1 = nn.Sequential(
            OrderedDict(
                [("flattened", nn.Flatten()), ("Output Layer", nn.Linear(256, 7))]
            )
        )

    def forward(self, x):
        x = x.view(-1, 1, x[0].shape[0])
        x = F.max_pool1d(self.conv1(x), 1, 3)
        x = F.max_pool1d(self.conv2(x), 1, 3)
        x = F.max_pool1d(self.conv3(x), 1, 3)
        x = self.fc1(x)
        return x


summary(CNNClassifier().to(device), (1, X_train.shape[1]), 32)


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv1d-1                [32, 64, 1]             256
            Conv1d-2                [32, 64, 1]          12,352
            Conv1d-3               [32, 128, 1]          24,704
            Conv1d-4               [32, 128, 1]          49,280
            Conv1d-5               [32, 256, 1]          98,560
            Conv1d-6               [32, 256, 1]         196,864
           Flatten-7                  [32, 256]               0
            Linear-8                    [32, 7]           1,799
Total params: 383,815
Trainable params: 383,815
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.28
Params size (MB): 1.46
Estimated Total Size (MB): 1.75
----------------------------------------------------------------


In [12]:
# Fully Connected Neural Network


class Classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 64)
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, 256)
        self.fc4 = nn.Linear(256, 7)

    def forward(self, x):
        x = F.relu(F.dropout(self.fc1(x), 0.15))
        x = F.relu(F.dropout(self.fc2(x), 0.15))
        x = F.relu(F.dropout(self.fc3(x), 0.15))
        return self.fc4(x)


In [13]:
scaler = MinMaxScaler()
net = skorch.classifier.NeuralNetClassifier(
    Classifier,
    optimizer=Adam,
    criterion=nn.CrossEntropyLoss,
    lr=0.001,
    max_epochs=100,
    device=device,
    batch_size=32,
    dataset=dataset,
    classes=label_values,
    callbacks=[skorch.callbacks.EarlyStopping(monitor="valid_loss", patience=3)],
)

model = Pipeline(
    [
        ("scaler", scaler),
        ("net", net),
    ]
)

model.fit(X_train, y_train)


  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m1.1333[0m       [32m0.6528[0m        [35m0.8405[0m  2.0914
      2        [36m0.7033[0m       [32m0.7242[0m        [35m0.6667[0m  1.8407
      3        [36m0.6234[0m       [32m0.7593[0m        [35m0.6069[0m  1.8946
      4        [36m0.5813[0m       0.7573        [35m0.6055[0m  2.3153
      5        [36m0.5453[0m       [32m0.8009[0m        [35m0.5187[0m  2.8388
      6        [36m0.5231[0m       0.7880        0.5258  2.2915
      7        [36m0.5117[0m       [32m0.8112[0m        [35m0.4831[0m  2.1348
      8        [36m0.4894[0m       [32m0.8135[0m        [35m0.4805[0m  2.0665
      9        [36m0.4845[0m       [32m0.8234[0m        [35m0.4625[0m  2.1501
     10        [36m0.4784[0m       0.8181        0.4674  2.2537
     11        [36m0.4699[0m       [32m0.8237[0m        0.4660  2.3657
Stopping since v

In [14]:
with open("model.pkl", "wb") as f:
    pickle.dump(model, f)
