In [1]:
# Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import seaborn as sns
import time
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline as backend_inline

# Render inline matplotlib figures as SVG
backend_inline.set_matplotlib_formats("svg")


# PyTorch device configuration:
# pick CUDA when it is available, otherwise the CPU, and make that the
# default device for tensors/modules created from here on
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
torch.set_default_device(device)

# One global font size shared by every plot
plt.rcParams["font.size"] = 18

In [3]:
# download the red-wine quality table (semicolon-separated CSV)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
data = pd.read_csv(url, sep=";")
# keep only rows with total sulfur dioxide below 200 (drops a few outliers)
data = data.loc[data["total sulfur dioxide"] < 200]

# standardize every column except the target ("quality")
cols2zscore = data.columns.drop("quality")
data[cols2zscore] = data[cols2zscore].apply(stats.zscore)

# binarized quality target: 1 when quality > 5, otherwise 0
data["boolQuality"] = (data["quality"] > 5).astype("int64")

In [5]:
# features: the z-scored columns of the dataframe as a float32 tensor
dataT = torch.tensor(data[cols2zscore].values, dtype=torch.float32)
# targets: binarized quality as a float32 column vector of shape (n_samples, 1)
labels = torch.tensor(data["boolQuality"].values, dtype=torch.float32).unsqueeze(1)
labels.shape

torch.Size([1597, 1])

In [6]:
# hold out 10% of the samples as a test set (scikit-learn split)
train_data, test_data, train_labels, test_labels = train_test_split(
    dataT,
    labels,
    test_size=0.1,
)

# pair inputs with targets as PyTorch Datasets (the splits are already tensors)
train_data = TensorDataset(train_data, train_labels)
test_data = TensorDataset(test_data, test_labels)

# wrap the datasets in DataLoaders; each loader gets its own generator
# created on `device`
batchsize = 32
train_loader = DataLoader(
    train_data,
    batch_size=batchsize,
    shuffle=True,  # reshuffle the training samples every epoch
    drop_last=True,  # discard the final incomplete batch
    generator=torch.Generator(device),
)
# the whole test set is served as one single batch
test_loader = DataLoader(
    test_data,
    batch_size=len(test_data),
    generator=torch.Generator(device),
)

In [12]:
# Class to create a model
class ANNwine(nn.Module):
    """Fully connected binary classifier used for the wine-quality data.

    Layer widths are ``n_features -> 32 -> 64 -> 64 -> 1`` with a ReLU after
    every layer except the last one. The network returns raw logits (no
    sigmoid is applied), so it is meant to be paired with a loss such as
    ``nn.BCEWithLogitsLoss`` and thresholded at 0 for class decisions.

    Parameters
    ----------
    n_features : int, default 11
        Number of input features per sample. The default matches the 11
        predictor columns used in this notebook, so ``ANNwine()`` builds
        the same network as before.
    """

    def __init__(self, n_features=11):
        super().__init__()
        #### Layers
        # Input
        self.input = nn.Linear(n_features, 32)

        # Hidden
        self.fc1 = nn.Linear(32, 64)
        self.fc2 = nn.Linear(64, 64)

        # Output (single logit)
        self.output = nn.Linear(64, 1)

    def forward(self, x):
        """Map a batch of shape ``(batch, n_features)`` to logits of shape ``(batch, 1)``."""
        x = F.relu(self.input(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))

        # no activation here: the caller receives raw logits
        return self.output(x)

In [13]:
# smoke test: push 10 random samples with 11 features each through a fresh
# network and display the shape of what comes out
net = ANNwine()
net(torch.randn((10, 11))).size()

torch.Size([10, 1])

In [14]:
# a function that trains the model

# global parameter
numepochs = 1000


def trainTheModel(model=None, trainLoader=None, testLoader=None, epochs=None, lr=0.01):
    """Train a binary classifier with mini-batch SGD and track its performance.

    All arguments are optional. Called without arguments the function uses
    the notebook-level ``winenet``, ``train_loader``, ``test_loader`` and
    ``numepochs``; passing them explicitly removes that dependency on
    notebook state.

    Parameters
    ----------
    model : nn.Module, optional
        Network producing one logit per sample. Defaults to ``winenet``.
    trainLoader : iterable of (X, y) batches, optional
        Training batches. Defaults to ``train_loader``.
    testLoader : iterable of (X, y) batches, optional
        Evaluation batches; only the first batch is used each epoch.
        Defaults to ``test_loader``.
    epochs : int, optional
        Number of passes over ``trainLoader``. Defaults to ``numepochs``.
    lr : float, default 0.01
        Learning rate handed to ``torch.optim.SGD``.

    Returns
    -------
    trainAcc : list
        Per-epoch mean of the per-batch training accuracies, in percent.
    testAcc : list
        Per-epoch accuracy on the first test batch, in percent.
    losses : torch.Tensor
        Tensor of shape ``(epochs,)`` with the per-epoch mean training loss.
    """
    # fall back to the notebook globals only for arguments that were not
    # supplied; the globals are looked up at call time, not at definition time
    if model is None:
        model = winenet
    if trainLoader is None:
        trainLoader = train_loader
    if testLoader is None:
        testLoader = test_loader
    if epochs is None:
        epochs = numepochs

    # loss function and optimizer
    lossfun = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    # initialize losses
    losses = torch.zeros(epochs)
    trainAcc = []
    testAcc = []

    # loop over epochs
    for epochi in range(epochs):

        # turn on training mode
        model.train()

        # loop over training data batches
        batchAcc = []
        batchLoss = []
        for X, y in trainLoader:

            # forward pass and loss
            yHat = model(X)
            loss = lossfun(yHat, y)

            # backprop
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # loss from this batch
            batchLoss.append(loss.item())

            # training accuracy for this batch: a logit > 0 means class 1
            batchAcc.append(100 * torch.mean(((yHat > 0) == y).float()).item())
        # end of batch loop...

        # average training accuracy and loss across this epoch's batches
        trainAcc.append(np.mean(batchAcc))
        losses[epochi] = np.mean(batchLoss)

        # test accuracy on the first batch of the test loader
        model.eval()
        X, y = next(iter(testLoader))
        with torch.no_grad():  # deactivates autograd
            yHat = model(X)
        testAcc.append(100 * torch.mean(((yHat > 0) == y).float()).item())

    # function output
    return trainAcc, testAcc, losses

In [15]:
# Build a fresh network, then fit it; the training function hands back the
# per-epoch train accuracy, test accuracy and mean loss
winenet = ANNwine()
(trainAcc, testAcc, losses) = trainTheModel()

In [None]:
# Compute the model performance
# Raw logits for the complete train and test sets. This is inference only,
# so switch the model to eval mode and disable autograd: no computation
# graph is built and the resulting tensors carry no grad_fn.
winenet.eval()
with torch.no_grad():
    train_predictions = winenet(train_loader.dataset.tensors[0])
    test_predictions = winenet(test_loader.dataset.tensors[0])
print(f"Raw Train Predictions: {train_predictions}")
print(f"Raw Test Predictions: {test_predictions}")

Raw Train Predictions: tensor([[-15.6744],
        [  1.2578],
        [ -5.0351],
        ...,
        [  7.8460],
        [ -4.3748],
        [  7.3109]], device='cuda:0', grad_fn=<AddmmBackward0>)
Raw Test Predictions: tensor([[-8.5984e+00],
        [ 6.7529e+00],
        [-1.1623e+01],
        [-4.2437e-01],
        [-6.5844e+00],
        [ 1.3056e+00],
        [-8.8821e+00],
        [ 8.0917e-01],
        [ 1.3229e+01],
        [-4.4594e+00],
        [ 2.6934e+00],
        [-1.2870e-02],
        [-9.4251e+00],
        [-6.8931e+00],
        [-7.3885e+00],
        [ 3.3978e-01],
        [-2.1057e-01],
        [-4.7706e+00],
        [-2.2814e+00],
        [ 1.9854e+00],
        [-7.3105e+00],
        [ 6.6253e+00],
        [-1.8567e+00],
        [ 1.5778e+01],
        [ 5.2719e+00],
        [-2.0680e+01],
        [ 4.7550e+00],
        [-3.6947e+00],
        [-6.7576e+00],
        [-7.2122e+00],
        [-2.5980e+01],
        [-1.9328e+01],
        [-4.6906e+00],
        [ 1.3120e+0