In [2]:
# Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import seaborn as sns
import time
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from IPython import display
import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline as backend_inline

backend_inline.set_matplotlib_formats("svg")


#### Pytorch device specific configuration ###
# # Pytorch Gpu Configuration for Cuda
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# For Directml Gpu Configurations
# import torch_directml

# device = torch_directml.device()

# Set default device
torch.set_default_device(device)

# # Font update global for all plots
# plt.rcParams.update({"font.size": 18})

In [5]:
# import the data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
data = pd.read_csv(url, sep=";")
data = data[data["total sulfur dioxide"] < 200]  # drop a few outliers

In [None]:
def createUnbalancedData(threshold=5):
    # z-score all columns except for quality
    cols2zscore = data.keys()
    cols2zscore = cols2zscore.drop("quality")
    data[cols2zscore] = data[cols2zscore].apply(stats.zscore)

    # Binalizing quality field
    data["boolQuality"] = 0
    data.loc[data["quality"] >= threshold, "boolQuality"] = 1
    print(data[["quality", "boolQuality"]])

    # convert from pandas dataframe to tensor
    dataT = torch.tensor(data[cols2zscore].values).float()
    labels = torch.tensor(data["boolQuality"].values).float()

    # transform to matrix
    labels = labels.reshape(labels.shape[0], 1)
    # use scikitlearn to split the data
    train_data, test_data, train_labels, test_labels = train_test_split(
        dataT, labels, test_size=0.1
    )

    # then convert them into PyTorch Datasets (note: already converted to tensors)
    train_data = TensorDataset(train_data, train_labels)
    test_data = TensorDataset(test_data, test_labels)

    # finally, translate into dataloader objects
    batchsize = 32
    train_loader = DataLoader(
        train_data,
        batch_size=batchsize,
        shuffle=True,
        drop_last=True,
        generator=torch.Generator(device),
    )
    test_loader = DataLoader(
        test_data,
        batch_size=test_data.tensors[0].shape[0],
        generator=torch.Generator(device),
    )
    return train_loader, test_loader

In [18]:
# Test the create unbalance data function
train_loader, test_loader = createUnbalancedData(threshold=6)

      quality  boolQuality
0           5            0
1           5            0
2           5            0
3           6            1
4           5            0
...       ...          ...
1594        5            0
1595        6            1
1596        6            1
1597        5            0
1598        6            1

[1597 rows x 2 columns]


In [24]:
# Class to create a model
def createWineNet():
    # Model definition
    class ANNwine(nn.Module):
        def __init__(self):
            super().__init__()
            #### Layers
            # Input
            self.input = nn.Linear(11, 32)

            # Hidden
            self.fc1 = nn.Linear(32, 64)
            self.fc2 = nn.Linear(64, 64)

            # Output
            self.output = nn.Linear(64, 1)

            # Forward Function

        def forward(self, x):
            x = F.leaky_relu(self.input(x))
            x = F.leaky_relu(self.fc1(x))
            x = F.leaky_relu(self.fc2(x))

            return self.output(x)

    net = ANNwine()
    # Loss Function
    lossFun = nn.BCEWithLogitsLoss()

    # Optimizer
    optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

    return net, lossFun, optimizer

In [25]:
# Let's test the model
net, lossFun, optimizer = createWineNet()

net(torch.randn(10, 11)).shape

torch.Size([10, 1])

In [26]:
# A function to train the model
def trainTheModel(nEpochs):

    # Model class Instantiation
    net, lossFun, optimizer = createWineNet()

    # Initiallize losses, train and test accuricies
    losses = torch.zeros(nEpochs)
    trainAcc = []
    testAcc = []

    # The loop
    for epoch in range(nEpochs):

        # Training mode
        net.train()

        # Initialize batch accuricies and losses
        batchAcc = []
        batchLoss = []

        # The batch loop
        for X, y in train_loader:

            # Forward pass
            yHat = net(X)
            loss = lossFun(yHat, y)

            # Backprop
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Batch loss
            batchLoss.append(loss.cpu().item())

            # Batch Accuracy
            batchAcc.append(100 * torch.mean(((yHat > 0) == y).float()).cpu().item())

        # End of batch loop
        # Train Accuracy for the epoch
        trainAcc.append(np.mean(batchAcc))
        # Losses across the batches
        losses[epoch] = np.mean(batchLoss)

        # test accuracy
        net.eval()
        X, y = next(iter(test_loader))  # extract X,y from test dataloader
        with torch.no_grad():  # deactivates autograd
            yHat = net(X)
        testAcc.append(100 * torch.mean(((yHat > 0) == y).float()).cpu().item())

        return trainAcc, testAcc, losses

In [None]:
# Params for the Experiment
nEpochs = 500
therosholds = [4.5, 5.5, 6.5]
losses = []
trainAccs = []
testAccs = []
for i, th in enumerate(therosholds):
    print(f"Training started for: {th}")
    train_loader, test_loader = createUnbalancedData(threshold=th)
    trainAcc, testAcc, loss = trainTheModel(nEpochs)
    losses.append(loss)
    trainAccs.append(trainAcc)
    testAccs.append(testAcc)
    print(f"Training Completed for: {th}")

Training started for: 4.5
      quality  boolQuality
0           5            1
1           5            1
2           5            1
3           6            1
4           5            1
...       ...          ...
1594        5            1
1595        6            1
1596        6            1
1597        5            1
1598        6            1

[1597 rows x 2 columns]


RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x12 and 11x32)