# Building a Deep Neural Net for Sentiment Analysis on IMDb Reviews

## 1. Data collection and preprocessing
- Collect a dataset of IMDb reviews
- Preprocess the text data (tokenization, lowercasing, removing special characters, etc.)
- Split the dataset into training, validation, and test sets

## 2. **Model selection and architecture**
- Research different types of deep learning models (**RNN**, LSTM, GRU, CNN, Transformer)
- Decide on a model architecture
- Experiment with pre-trained models (BERT, GPT, RoBERTa) for fine-tuning

## 3. Model training and hyperparameter tuning
- Set up a training loop
- Use backpropagation to update the model's weights based on the loss function
- Experiment with different hyperparameters (learning rate, batch size, dropout rate, etc.) and optimization algorithms (Adam, RMSprop, etc.)
- Monitor performance on the validation set during training

## 4. Model evaluation and refinement
- Evaluate the model on the test set using relevant metrics (accuracy, F1 score, precision, recall, etc.)
- Identify areas for improvement and iterate on the model architecture, training process, or preprocessing techniques

## 5. "Extra for experts" ideas
- Handle class imbalance (oversampling, undersampling, or SMOTE)
- Experiment with different word embeddings (Word2Vec, GloVe, FastText) or contextual embeddings (ELMo, BERT)
- Explore advanced model architectures (multi-head attention, capsule networks, memory-augmented networks)
- Investigate transfer learning or multi-task learning
- Conduct error analysis to understand and address specific issues
- Develop a user interface or API for your sentiment analysis model

In [13]:
from typing import Tuple
import tokenizers
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence

class JeromeRNNInnerModule(nn.Module):
    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        activation_function=None,
    ):
        super().__init__()
        self._activation = activation_function or F.tanh
        self._input_size = input_size
        self._hidden_size = hidden_size

        # use Xavier initialisation to avoid exploding exponentials
        self._w_ax = nn.Parameter(torch.Tensor(self._input_size + self._hidden_size, self._hidden_size))
        nn.init.kaiming_uniform_(self._w_ax)

        # bias
        self._b_ax = nn.Parameter(torch.zeros(self._hidden_size))

    def forward(self, x, a):
        # single token
        # x shape is (L, B, _input_size)

        L, B, _ = x.size()
        output = torch.zeros((L, B, self._hidden_size), device=x.device)

        for t in range(L):
            x_t = x[t]
            a = self._activation(
                torch.matmul(
                    torch.concat((x_t, a), dim=-1),
                    self._w_ax
                ) + self._b_ax
            )
            output[t, :, :] = a
        
        return output


class JeromeRNNBlockModule(nn.Module):
    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        n_layers: int,
        activation_function=None,
    ):
        super().__init__()
        self._hidden_size = hidden_size
        self._input_size = input_size
        self._layers = nn.ModuleList([
            JeromeRNNInnerModule(
                input_size=input_size if i == 0 else hidden_size,
                hidden_size=hidden_size,
                activation_function=activation_function,
            )
            for i in range(n_layers)
        ])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x shape: (L, B, _input_size)
        _, batch_size, _ = x.size()

        for i, layer in enumerate(self._layers):
            a = torch.zeros((batch_size, self._hidden_size), device=x.device)
            x = layer(x, a)
        
        return x[-1]




class SentimentAnalysisModel(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        emb_dim: int = 30,
        hidden_size: int = 40,
        n_rnn_layers: int = 1,
    ):
        super().__init__()
        self.emb = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim,
        )
        # self.rnn = nn.RNN(
        #     input_size=emb_dim,
        #     hidden_size=hidden_size,
        #     num_layers=n_rnn_layers,
        # )

        # Using Jerome's RNN
        self.rnn = JeromeRNNBlockModule(
            input_size=emb_dim,
            hidden_size=hidden_size,
            n_layers=n_rnn_layers,
        )

        self.seq = nn.Sequential(
            nn.Linear(hidden_size, 500),
            nn.ReLU(),
            nn.Linear(500, 500),
            nn.ReLU(),
            nn.Linear(500, 2),
            nn.Softmax(dim=1),
        )

    def forward(self, x: torch.Tensor, lengths: torch.Tensor):
        # x shape: (B, L)
        # convert token indices to embedding values
        x = self.emb(x)
        # x shape: (B, L, Emb dim)

        x = x.transpose(0, 1)
        # x shape: (L, B, Emb dim)

        # # pack the sequence
        # x = pack_padded_sequence(x, lengths, enforce_sorted=False)

        # # run the rnn, only taking the final rnn hidden state from the last layer
        # # TODO: understand the difference between the two outputs more
        # _, x = self.rnn(x)
        # # x shape: (B, n_rnn_layers, Hidden size?)
        
        # # take only the last layer
        # x = x[-1, :, :]

        # Using Jerome's RNN
        x = self.rnn(x)

        # x shape: (B, Hidden size?)
        
        return self.seq(x)

In [14]:
# load in tokenized data
data_dict = torch.load("data/imdb_data.pt")
data = data_dict["reviews"]
labels = data_dict["labels"]
lengths = data_dict["lengths"]

# split into train and test by 80:20
training_fraction = 0.8

train_data = data[:int(len(data) * training_fraction)]
train_labels = labels[:int(len(data) * training_fraction)]
train_lengths = lengths[:int(len(data) * training_fraction)]

test_data = data[int(len(data) * training_fraction):]
test_labels = labels[int(len(data) * training_fraction):]
test_lengths = lengths[int(len(data) * training_fraction):]


# load in tokenizer
tokenizer = tokenizers.Tokenizer.from_file("models/tokenizer.json")
vocab_size = tokenizer.get_vocab_size()

# Training loop

In [15]:
def calculate_accuracy(
    model: nn.Module,
    train_data: torch.Tensor,
    train_labels: torch.Tensor,
    train_lengths: torch.Tensor,
    batch_size: int,
):
    # calculate overall loss (need batching for memory reasons)
    loss = 0
    total = 0
    correct = 0

    with torch.no_grad():
        for i in range(0, len(train_data), batch_size):
            training_data_batch = train_data[i : i + batch_size]
            training_labels_batch = train_labels[i : i + batch_size]
            training_lengths_batch = train_lengths[i : i + batch_size]

            output = model(training_data_batch, lengths=training_lengths_batch)
            ## Calculate correct predictions
            _, y_predicted = torch.max(output, 1)
            total += training_labels_batch.size(0)
            correct += (y_predicted == training_labels_batch).sum().item()
            loss += nn.functional.cross_entropy(output, training_labels_batch)

    return loss, correct, total


In [18]:
import torch.optim as optim
import json
import time

# nice file name including the date to store accuracy data
date_str = time.strftime("%Y%m%d-%H%M%S")

batch_size = 512

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# create the model
model = SentimentAnalysisModel(
    vocab_size=vocab_size,
    emb_dim=30,
    hidden_size=40,
    n_rnn_layers=3,
)
model = model.to(device)

subset = batch_size * 3

train_data = train_data.to(device)[:subset]
train_labels = train_labels.to(device)[:subset]
train_lengths = train_lengths.to(device)[:subset]

test_data = test_data.to(device)[:subset]
test_labels = test_labels.to(device)[:subset]
test_lengths = test_lengths.to(device)[:subset]

optimizer = optim.Adam(model.parameters(), lr=0.001)

eval_interval = 1

acc_best = 0
acc_data = []

# train the model
for epoch in range(500):
    if epoch % eval_interval == 0:
        train_loss, train_correct, train_total = calculate_accuracy(
            model, train_data, train_labels, train_lengths, batch_size
        )
        test_loss, test_correct, test_total = calculate_accuracy(
            model, test_data, test_labels, test_lengths, batch_size
        )
        torch.save(model.state_dict(), f"models/rnn_{date_str}_latest.pt")

        test_acc = test_correct / test_total * 100
        train_acc = train_correct / train_total * 100

        if test_acc > acc_best:
            torch.save(model.state_dict(), f"models/rnn_{date_str}_best.pt")
            acc_best = test_acc

        print(
            f"UPDATE - Epoch: {epoch}, "
            f"Train Acc: {train_acc:0.2f}% (loss: {train_loss:0.2f}), "
            f"Test Acc: {test_acc: 0.2f}% (loss: {test_loss:0.2f})"
        )
        acc_data.append(
            {
                "epoch": epoch,
                "train_acc": train_acc,
                "test_acc": test_acc,
                "train_loss": train_loss.item(),
                "test_loss": test_loss.item(),
            }
        )
        with open(f"models/rnn_{date_str}_acc.json", "w") as f:
            json.dump(acc_data, f, indent=4)

    cum_loss = 0

    for i in range(0, len(train_data), batch_size):
        training_data_batch = train_data[i : i + batch_size]
        training_labels_batch = train_labels[i : i + batch_size]
        training_lengths_batch = train_lengths[i : i + batch_size]

        # forward pass
        optimizer.zero_grad()
        output = model(training_data_batch, lengths=training_lengths_batch)

        loss = nn.functional.cross_entropy(output, training_labels_batch)
        loss.backward()
        # During training, after loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optimizer.step()
        cum_loss += loss.item()

    print(f"Epoch: {epoch}, Loss: {cum_loss:0.2f}")


UPDATE - Epoch: 0, Train Acc: 49.09% (loss: 2.08), Test Acc:  49.61% (loss: 2.08)
Epoch: 0, Loss: 2.37
UPDATE - Epoch: 1, Train Acc: 49.09% (loss: 2.47), Test Acc:  49.61% (loss: 2.45)
Epoch: 1, Loss: nan
UPDATE - Epoch: 2, Train Acc: 49.09% (loss: nan), Test Acc:  49.61% (loss: nan)
Epoch: 2, Loss: nan
UPDATE - Epoch: 3, Train Acc: 49.09% (loss: nan), Test Acc:  49.61% (loss: nan)
Epoch: 3, Loss: nan
UPDATE - Epoch: 4, Train Acc: 49.09% (loss: nan), Test Acc:  49.61% (loss: nan)
Epoch: 4, Loss: nan
UPDATE - Epoch: 5, Train Acc: 49.09% (loss: nan), Test Acc:  49.61% (loss: nan)
Epoch: 5, Loss: nan
UPDATE - Epoch: 6, Train Acc: 49.09% (loss: nan), Test Acc:  49.61% (loss: nan)
Epoch: 6, Loss: nan
UPDATE - Epoch: 7, Train Acc: 49.09% (loss: nan), Test Acc:  49.61% (loss: nan)
Epoch: 7, Loss: nan
UPDATE - Epoch: 8, Train Acc: 49.09% (loss: nan), Test Acc:  49.61% (loss: nan)
Epoch: 8, Loss: nan
UPDATE - Epoch: 9, Train Acc: 49.09% (loss: nan), Test Acc:  49.61% (loss: nan)
Epoch: 9, Loss:

KeyboardInterrupt: 

In [None]:
training_data_batch

## Loading in latest model to check accuracy

In [None]:
# load in the latest model
loaded_model = SentimentAnalysisModel(vocab_size=vocab_size)
loaded_model.load_state_dict(torch.load("models/rnn_latest.pt"))
loaded_model.to(device)

test_data = test_data.to(device)
test_labels = test_labels.to(device)

test_stats = calculate_accuracy(loaded_model, test_data, test_labels, test_lengths, batch_size)
train_stats = calculate_accuracy(loaded_model, train_data, train_labels, train_lengths, batch_size)


print(f"Test Loss: {test_stats[0]}, Test Accuracy: {test_stats[1]/test_stats[2]*100: .2f}")
print(f"Train Loss: {train_stats[0]}, Train Accuracy: {train_stats[1]/train_stats[2]*100: .2f}")