In [1]:
import torch
import pickle
from torch import nn
from torch.utils.data import DataLoader
from torch.autograd import Variable

# `os` is not used by the code visible in this notebook; the import is kept
# in case a later cell relies on it.
import os

# Pick the best available compute device instead of hard-coding one:
# Apple-Silicon MPS first, then CUDA, and finally the CPU so the notebook
# still runs end-to-end on a machine without an accelerator.
if torch.backends.mps.is_available():
    gpu = torch.device("mps")
elif torch.cuda.is_available():
    gpu = torch.device("cuda:0")
else:
    gpu = torch.device("cpu")

print("PyTorch version:")
print(torch.__version__)
print("GPU Detected:")
# True when an accelerator (MPS or CUDA) was selected, False on CPU fallback.
print(gpu.type != "cpu")

PyTorch version:
2.2.0
GPU Detected:
True


In [2]:
# Load the train / test datasets from their pickle files.
# NOTE: unpickling can execute arbitrary code, so only load files that come
# from a trusted source.
def _unpickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


train_text = _unpickle("./data/train_text")
test_text = _unpickle("./data/test_text")

# One example per batch; both loaders reshuffle their data on every pass.
train_text_data = DataLoader(dataset=train_text, batch_size=1, shuffle=True)
test_text_data = DataLoader(dataset=test_text, batch_size=1, shuffle=True)

In [3]:
# Sanity check: print the tensor shape of a single training batch.
batch_size = train_text_data.batch_size
first_batch = next(iter(train_text_data), None)
if first_batch is not None:
    data, label = first_batch
    print("shape: {0}".format(data.size()))

shape: torch.Size([1, 12203, 300])


In [8]:
class TextClassificationModel(nn.Module):
    """Three stacked 1-D convolutions followed by a linear classifier.

    The input is expected as ``(batch, sequence_size, embedding_dim)``: the
    sequence positions are fed to ``Conv1d`` as channels and the convolution
    slides along the embedding axis. Channel widths are
    ``sequence_size -> 512 -> 256 -> 128``; the result is flattened and
    projected to ``num_class`` log-probabilities.

    Args:
        sequence_size: Number of input channels (size of dim 1 of the input).
        kernel_size: Kernel size shared by all three convolutions.
        num_class: Number of output classes.
        dropout: Dropout probability applied after every convolution.
        activation_fn: Callable / module applied after dropout in each layer.
        embedding_dim: Size of the last input dimension. Defaults to 300,
            which together with ``kernel_size=1`` gives the 38400 flattened
            features that were previously hard-coded.

    Raises:
        ValueError: If the convolutions would shrink the embedding axis to
            less than one element.
    """

    def __init__(
        self,
        sequence_size,
        kernel_size,
        num_class,
        dropout,
        activation_fn,
        embedding_dim=300,
    ):
        super().__init__()
        channel_sizes = [sequence_size, 512, 256, 128]
        # Layers are created in the same order as before (layer1..layer3) so
        # parameter initialisation consumes the RNG identically.
        self.hidden_layers = nn.ModuleList(
            [
                nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size)
                for in_channels, out_channels in zip(
                    channel_sizes[:-1], channel_sizes[1:]
                )
            ]
        )
        self.flatten = nn.Flatten()  # Flatten layer
        self.dropout = nn.Dropout(dropout)

        # Derive the flattened feature count from the conv stack instead of
        # hard-coding 38400, which is only correct for kernel_size == 1 and
        # embedding_dim == 300.
        output_length = embedding_dim
        for conv in self.hidden_layers:
            output_length = self._conv_output_length(output_length, conv)
        if output_length < 1:
            raise ValueError(
                "kernel_size={} is too large for embedding_dim={}: the "
                "convolution stack leaves no output positions".format(
                    kernel_size, embedding_dim
                )
            )
        self.output_projection = nn.Linear(
            channel_sizes[-1] * output_length, num_class
        )
        self.nonlinearity = activation_fn

    @staticmethod
    def _conv_output_length(length, conv):
        """Output length of `conv` for an input of `length` (Conv1d formula)."""
        return (
            length
            + 2 * conv.padding[0]
            - conv.dilation[0] * (conv.kernel_size[0] - 1)
            - 1
        ) // conv.stride[0] + 1

    def forward(self, x):
        """Return log-probabilities of shape ``(batch, num_class)``."""
        for hidden_layer in self.hidden_layers:
            x = hidden_layer(x)
            # Dropout is applied before the nonlinearity, as in the original.
            x = self.dropout(x)
            x = self.nonlinearity(x)

        x = self.flatten(x)
        out = self.output_projection(x)

        # Log-softmax output pairs with nn.NLLLoss.
        out_distribution = nn.functional.log_softmax(out, dim=-1)
        return out_distribution


def _evaluate(model, dataloader, criterion, device):
    """Compute the average loss and accuracy (in percent) over `dataloader`.

    The caller is responsible for putting `model` into eval mode.

    Args:
        model: Module mapping a batch of inputs to per-class scores of shape
            ``(batch, num_classes)``.
        dataloader: Iterable yielding ``(data, labels)`` batches.
        criterion: Loss callable. Its result is multiplied by the batch size,
            which assumes it averages over the batch (the nn.NLLLoss default).
        device: Device to move each batch to, or None to leave batches as-is.

    Returns:
        ``(average_loss, accuracy_percent)`` as Python floats, or None when
        `dataloader` yields no examples.
    """
    num_correct = 0
    total_examples = 0
    total_loss = 0.0

    with torch.no_grad():
        for batch_data, batch_labels in dataloader:
            if device is not None:
                batch_data = batch_data.to(device)
                batch_labels = batch_labels.to(device)

            log_probs = model(batch_data)
            batch_count = batch_labels.size(0)

            # The loss is a batch mean, so scale it back to a per-batch total.
            total_loss += criterion(log_probs, batch_labels) * batch_count

            # Predicted class = argmax over the class dimension.
            predicted_labels = log_probs.argmax(dim=1)
            total_examples += batch_count
            num_correct += (predicted_labels == batch_labels).sum()

    if total_examples == 0:
        return None

    # Convert to Python numbers once, after the loop, rather than per batch.
    average_loss = float(total_loss) / total_examples
    accuracy = 100 * int(num_correct) / total_examples
    return average_loss, accuracy


def train(
    train_dataloader,
    test_dataloader,
    nll_criterion,
    num_epochs,
    ffnn,
    ffnn_optimizer,
    eval_every=500,
):
    """Train `ffnn` and periodically report test loss and accuracy.

    Batches are moved to the device the model's parameters live on, so the
    function does not depend on any notebook-level device variable.

    Args:
        train_dataloader: Iterable yielding ``(data, labels)`` training batches.
        test_dataloader: Iterable yielding ``(data, labels)`` test batches.
        nll_criterion: Loss callable applied to ``(model_output, labels)``.
        num_epochs: Number of passes over `train_dataloader`.
        ffnn: The model to train; left in train mode on return.
        ffnn_optimizer: Optimizer that updates the parameters of `ffnn`.
        eval_every: Evaluate on the test set every this many gradient updates
            (default 500).

    Raises:
        ValueError: If `eval_every` is smaller than 1.
    """
    if eval_every < 1:
        raise ValueError("eval_every must be a positive integer")

    # Use the model's own device; a model without parameters leaves the
    # batches where they are.
    first_param = next(ffnn.parameters(), None)
    device = first_param.device if first_param is not None else None

    # Make sure dropout is active from the very first update, not only after
    # the first evaluation round switches the model back to train mode.
    ffnn.train()

    # A counter for the number of gradient updates we've performed.
    num_iter = 0

    for epoch in range(num_epochs):
        print("Starting epoch {}".format(epoch + 1))
        for data, labels in train_dataloader:
            if device is not None:
                data = data.to(device)
                labels = labels.to(device)

            # Forward pass: predicted log distribution, then the loss.
            predicted = ffnn(data)
            batch_loss = nll_criterion(predicted, labels)

            # Clear stale gradients, backprop, and take a gradient step.
            ffnn_optimizer.zero_grad()
            batch_loss.backward()
            ffnn_optimizer.step()

            num_iter += 1
            if num_iter % eval_every != 0:
                continue

            # Eval mode turns off dropout; always restore train mode, even if
            # evaluation raises.
            ffnn.eval()
            try:
                metrics = _evaluate(ffnn, test_dataloader, nll_criterion, device)
            finally:
                ffnn.train()

            if metrics is None:
                # An empty test set used to raise ZeroDivisionError here.
                print(
                    "Iteration {}. Test set is empty; skipping evaluation.".format(
                        num_iter
                    )
                )
                continue

            average_test_loss, accuracy = metrics
            print(
                "Iteration {}. Test Loss {}. Test Accuracy {}.".format(
                    num_iter, average_test_loss, accuracy
                )
            )

In [9]:
# Build the loss, the classifier and its optimiser, then move the model to the
# selected device. sequence_size=12203 matches dim 1 of the batch shape
# printed by the shape check above.
nll_criterion = nn.NLLLoss()
model = TextClassificationModel(
    sequence_size=12203,
    kernel_size=1,
    num_class=2,
    dropout=0.5,
    activation_fn=nn.ReLU(),
)
optimiser = torch.optim.Adam(params=model.parameters(), lr=0.0005)
# Last expression of the cell: `.to()` returns the module, so its repr is shown.
model.to(gpu)

TextClassificationModel(
  (hidden_layers): ModuleList(
    (0): Conv1d(12203, 512, kernel_size=(1,), stride=(1,))
    (1): Conv1d(512, 256, kernel_size=(1,), stride=(1,))
    (2): Conv1d(256, 128, kernel_size=(1,), stride=(1,))
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (dropout): Dropout(p=0.5, inplace=False)
  (output_projection): Linear(in_features=38400, out_features=2, bias=True)
  (nonlinearity): ReLU()
)

In [10]:
# Run the training loop; arguments are passed by keyword so each role is
# explicit at the call site.
num_epochs = 10
train(
    train_dataloader=train_text_data,
    test_dataloader=test_text_data,
    nll_criterion=nll_criterion,
    num_epochs=num_epochs,
    ffnn=model,
    ffnn_optimizer=optimiser,
)


Starting epoch 1
Iteration 500. Test Loss 0.41324731707572937. Test Accuracy 86.16462707519531.
Starting epoch 2
Iteration 1000. Test Loss 0.4221436381340027. Test Accuracy 88.26619720458984.
Starting epoch 3
Iteration 1500. Test Loss 0.17694760859012604. Test Accuracy 93.5201416015625.
Starting epoch 4
Iteration 2000. Test Loss 0.2007778286933899. Test Accuracy 92.11908721923828.
Starting epoch 5
Iteration 2500. Test Loss 0.10656852275133133. Test Accuracy 96.14710998535156.
Starting epoch 6
Iteration 3000. Test Loss 0.08465412259101868. Test Accuracy 96.49737548828125.
Starting epoch 7
Iteration 3500. Test Loss 0.12515655159950256. Test Accuracy 96.14710998535156.
Starting epoch 8
Iteration 4000. Test Loss 0.1716514676809311. Test Accuracy 93.87039947509766.
Iteration 4500. Test Loss 0.12806156277656555. Test Accuracy 95.44658660888672.
Starting epoch 9
Iteration 5000. Test Loss 0.2212853580713272. Test Accuracy 95.62171936035156.
Starting epoch 10
Iteration 5500. Test Loss 0.1280970