In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import scipy.stats as stats
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

In [None]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

# the UCI file is semicolon-separated
wine_raw = pd.read_csv(url, sep=';')

# remove some rows with outliers
# (the result must be bound to `wine`, because every later cell reads `wine`;
#  `.copy()` gives an independent frame so later column assignments are safe)
wine = wine_raw[wine_raw['total sulfur dioxide'] < 200].copy()

# preview only the first rows instead of rendering the whole frame
wine.head()

In [None]:
# Overview figure: every column drawn as unconnected markers against the row
# index, labelled through the Axes object that pandas returns.
overview_ax = wine.plot(marker='o', linestyle='none', figsize=(12, 6))
overview_ax.set_xlabel('Sample number')
overview_ax.set_ylabel('Value')
plt.show()

In [None]:
# Feature columns = everything except the raw target and the derived label.
# Excluding 'bool_quality' as well (ignored if it does not exist yet) keeps this
# cell safe to re-run: otherwise a second run would z-score the label and feed
# it to the model as an input feature.
input_columns = wine.columns.drop(['quality', 'bool_quality'], errors='ignore')
wine[input_columns] = wine[input_columns].apply(stats.zscore)

# binarize quality to simplify the model: 1 when quality > 5, else 0
# (a single vectorized assignment; the chained form
#  `wine['bool_quality'][mask] = 1` writes to a temporary under pandas
#  copy-on-write and leaves every label at 0)
wine['bool_quality'] = (wine['quality'] > 5).astype(int)

# convert from pandas dataframe to tensor
data_tensor = torch.tensor(wine[input_columns].values).float()
labels = torch.tensor(wine['bool_quality'].values).float()
labels = labels[:, None]  # add a trailing axis so labels are a column matrix

data_tensor, labels

# Break the data into batches

In [None]:
# use scikit-learn to split the data (80% train / 20% test)
# a fixed random_state makes the split, and therefore every result below,
# reproducible under Restart & Run All
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    data_tensor, labels, test_size=.2, random_state=42)

# then convert them into PyTorch Datasets (note: already converted to tensors)
# the raw tensors keep their own names, so `train_data` / `test_data` only ever
# refer to TensorDataset objects
train_data = TensorDataset(train_inputs, train_labels)
test_data = TensorDataset(test_inputs, test_labels)


# finally, translate into dataloader objects
def load_train_data_batches_with_multipliers(batch_multipliers,
                                             dataset=None,
                                             batch_size_base=2):
    """Yield one shuffled training ``DataLoader`` per requested batch size.

    Each entry of ``batch_multipliers`` is used as an exponent: the loader's
    batch size is ``batch_size_base ** multiplier``.

    Args:
        batch_multipliers: iterable of exponents, one loader is produced per
            entry, in order.
        dataset: dataset to draw batches from. When ``None`` (the default) the
            notebook-level ``train_data`` is used, which matches the original
            behaviour.
        batch_size_base: base that is raised to each exponent (default 2).

    Yields:
        ``DataLoader`` objects created with ``shuffle=True`` and
        ``drop_last=True``, so every batch of a loader has exactly
        ``batch_size`` samples and a trailing partial batch is discarded.
    """
    if dataset is None:
        # fall back to the notebook-level training set
        dataset = train_data
    for multiplier in batch_multipliers:
        yield DataLoader(dataset,
                         batch_size=batch_size_base**multiplier,
                         shuffle=True,
                         drop_last=True)


# The test set is served as a single batch containing every test sample
# (open question from the author: how big should these batches be??)
test_loader = DataLoader(test_data, batch_size=len(test_data))

In [None]:
# check sizes of data batches
# The loaders are built with drop_last=True, so all batches of one loader have
# the same shape; report one summary line per loader instead of one line per
# batch (batch size 2 alone would otherwise print hundreds of lines).
for loader in load_train_data_batches_with_multipliers([1, 3, 5, 7, 9]):
    first_batch = next(iter(loader), None)
    if first_batch is None:
        # batch size exceeds the dataset size, so drop_last discarded everything
        print("batch_size =", loader.batch_size, "| no full batch available")
        continue
    X, y = first_batch
    print("batch_size =", loader.batch_size, "| batches per epoch =",
          len(loader), "|", X.shape, y.shape)


# Construct the model and training plans

In [None]:
class ANNMultilayer(nn.Module):
    """Fully connected network with layer widths 11 -> 16 -> 32 -> 32 -> 1.

    ReLU is applied after every layer except the last; the final layer's
    output is returned as-is (no activation).
    """

    def __init__(self):
        """Create the four linear layers."""
        super().__init__()
        # layers are created in forward order: input, two hidden, output
        self._input = nn.Linear(in_features=11, out_features=16)
        self._hidden1 = nn.Linear(in_features=16, out_features=32)
        self._hidden2 = nn.Linear(in_features=32, out_features=32)
        self._output = nn.Linear(in_features=32, out_features=1)

    def forward(self, input):
        """Run ``input`` through the network and return the output layer's values."""
        activations = input
        # every layer before the output one is followed by a ReLU
        for layer in (self._input, self._hidden1, self._hidden2):
            activations = F.relu(layer(activations))
        return self._output(activations)

In [None]:
class ANNPipeline():
    """Bundle an ``ANNMultilayer`` network with its loss function and optimizer.

    The loss is ``nn.BCEWithLogitsLoss`` and the optimizer is plain SGD, so the
    network output is treated as a logit and a prediction counts as class 1
    when the logit is greater than 0.
    """

    def __init__(self, learning_rate):
        """Create a fresh network, loss and SGD optimizer.

        Args:
            learning_rate: learning rate passed to ``torch.optim.SGD``.
        """
        self._ann = ANNMultilayer()
        # loss function
        self._lossfun = nn.BCEWithLogitsLoss()
        # optimizer
        self._optimizer = torch.optim.SGD(self._ann.parameters(),
                                          lr=learning_rate)

    @staticmethod
    def _accuracy(logits, targets):
        """Return the percentage of thresholded logits (> 0) equal to ``targets``."""
        return 100 * torch.mean(((logits > 0) == targets).float()).item()

    def train(self, train_loader, numepochs=1000, eval_loader=None):
        """Train the network and record per-epoch metrics.

        Args:
            train_loader: iterable of ``(X, y)`` training batches.
            numepochs: number of passes over ``train_loader``.
            eval_loader: loader whose first batch is used to measure test
                accuracy after each epoch. When ``None`` (the default) the
                notebook-level ``test_loader`` is used.

        Returns:
            A tuple ``(train_accuracy, test_accuracy, losses)``: two lists with
            one accuracy (in percent) per epoch, and a 1-D tensor of length
            ``numepochs`` holding the mean batch loss of each epoch.
        """
        if eval_loader is None:
            # fall back to the notebook-level test loader
            eval_loader = test_loader

        # initialize accuracies as empties
        train_accuracy = []
        test_accuracy = []
        losses = torch.zeros(numepochs)

        for epochi in range(numepochs):
            self._ann.train()
            # loop over training data batches
            batchAcc = []
            batchLoss = []
            for X, y in train_loader:
                # forward pass and loss
                yHat = self._ann(X)
                loss = self._lossfun(yHat, y)

                # backprop
                self._optimizer.zero_grad()
                loss.backward()
                self._optimizer.step()

                # training accuracy for this batch (from the pre-update forward pass)
                batchAcc.append(self._accuracy(yHat, y))
                batchLoss.append(loss.item())
            # end of batch loop...

            # average the batch metrics to get this epoch's training metrics
            train_accuracy.append(np.mean(batchAcc))
            losses[epochi] = np.mean(batchLoss)

            # test accuracy on the first batch of the evaluation loader
            self._ann.eval()
            X, y = next(iter(eval_loader))
            with torch.no_grad():
                yHat = self._ann(X)
            # record under test_accuracy; appending to train_accuracy here would
            # leave test_accuracy empty and corrupt the training curve
            test_accuracy.append(self._accuracy(yHat, y))

        # function output
        return train_accuracy, test_accuracy, losses

# Test it out

In [None]:
# Train a fresh pipeline for each batch size and store its learning curves,
# keyed by the loader's batch size.
result_data = {}
metric_names = ("train_accuracy", "test_accuracy", "losses")
for train_loader in load_train_data_batches_with_multipliers(
        batch_multipliers=[1, 2, 3, 4, 5, 6]):
    net = ANNPipeline(learning_rate=.01)
    train_accuracy, test_accuracy, losses = net.train(train_loader)
    # pair each metric name with the corresponding curve
    result_data[train_loader.batch_size] = dict(
        zip(metric_names, (train_accuracy, test_accuracy, losses)))

In [None]:
# Each curve holds one value per epoch, so displaying the raw dict prints
# thousands of numbers. Show the final value of every curve per batch size
# instead (NaN when a curve is empty).
pd.DataFrame.from_dict(
    {
        batch_size: {
            name: float(curve[-1]) if len(curve) else float('nan')
            for name, curve in results.items()
        }
        for batch_size, results in result_data.items()
    },
    orient='index',
).add_prefix('final_').rename_axis('batch_size')

In [None]:
# plot the results: one panel per recorded curve, one line per batch size
# (key in result_data, panel title, y-axis label, optional y-limits)
panels = [
    ("losses", 'Losses with minibatches', 'Loss', None),
    ("train_accuracy", 'Accuracy with minibatches in training data',
     'Accuracy (%)', [27, 103]),
    ("test_accuracy", 'Accuracy with minibatches in test data',
     'Accuracy (%)', [27, 103]),
]

fig, ax = plt.subplots(1, len(panels), figsize=(15, 5))

for panel, (key, title, ylabel, ylim) in zip(ax, panels):
    for batch_size, results in result_data.items():
        # label each line directly so the legend cannot drift out of sync
        # with the plotting order
        panel.plot(results[key], label=str(batch_size))
    panel.set_title(title)
    panel.set_xlabel('Epochs')
    panel.set_ylabel(ylabel)
    if ylim is not None:
        panel.set_ylim(ylim)
    panel.legend()

plt.show()