In [1]:
import torch
import torch.nn as nn
from torchtext.datasets import IMDB
from torchtext.vocab import vocab
from torch.utils.data.dataset import random_split
from torch.utils.data import DataLoader

import re
from collections import Counter, OrderedDict

  from .autonotebook import tqdm as notebook_tqdm


# Sentiment Analysis

##### Data preprocessing

In [2]:
# get data
train_dataset = IMDB(split='train')
test_dataset = IMDB(split='test')

# train val set
torch.manual_seed(1)
train_dataset, val_dataset = random_split(list(train_dataset), [20000, 5000])

In [3]:
# function to remove html tags, keep emoticons, tokenize into words
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized

# get list of unique word tokens
token_counts = Counter()
for label, line in train_dataset:
    tokens = tokenizer(line)
    token_counts.update(tokens)

print('Vocab num:', len(token_counts))

Vocab num: 69023


In [4]:
# sort the unique words
sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)

# create unique word to unique integer vocab
vocab = vocab(ordered_dict)

# add two generic vocab
vocab.insert_token("<pad>", 0)  # placeholder, for adjusting length of sequence
vocab.insert_token("<unk>", 1)  # unknown words, e.g. words that appear in the val or test set but not train set

vocab.set_default_index(1)

# vocab stores words as integer
print([vocab[token] for token in ['hello', 'world']])

[4892, 177]


In [5]:
# pipeline to transform each text data into integer vocabs
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]

# pipeline to transform label (pos, neg) into (1, 0)
label_pipeline = lambda x: 1. if x == 'pos' else 0.

# function to preprocess (transform to integer vocab) the text in a batch
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []

    # for each sample in the batch
    for _label, _text in batch:
        # convert the label to integer
        label_list.append(label_pipeline(_label))

        # convert the text to integer, and create tensor
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)

        # track all sample lengths
        lengths.append(processed_text.size(0))

    # convert to tensor
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)

    # pad samples so each has the same length
    padded_text_list = nn.utils.rnn.pad_sequence(text_list, batch_first=True)

    return padded_text_list, label_list, lengths

In [6]:
# create dataloader with the preprocessing function
batch_size = 32
train_dl = DataLoader(train_dataset, batch_size, shuffle=True, collate_fn=collate_batch)
val_dl = DataLoader(val_dataset, batch_size, shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size, shuffle=False, collate_fn=collate_batch)

# Model

In [7]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()

        # create embedding layer to transform unique integer vocabs to real value vectors
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # one LSTM layer
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)

        # one FC hidden layer
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()

        # FC layer output (binary output)
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, vocab, lengths):
        # use embedding layer to convert vocab to real value feature vectors
        out = self.embedding(vocab)

        # pad to same length
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)

        # run through the rnn layer
        out, (hidden, cell) = self.rnn(out)

        # use the last hidden state from the last hidden layer as input to FC
        out = hidden[-1, :, :]

        out = self.fc1(out)
        out = self.relu(out)

        out = self.fc2(out)
        out = self.sigmoid(out)

        return out

In [8]:
vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)

model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
model

RNN(
  (embedding): Embedding(69025, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [9]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [10]:
# function to train for one epoch
def train(dataloader):
    # switch to train mode
    model.train()

    total_acc, total_loss = 0, 0

    for vocab_batch, label_batch, lengths in dataloader:
        pred = model(vocab_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # metric
        total_acc += ((pred > 0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item() * label_batch.size(0)

    return total_acc / len(dataloader.dataset), total_loss / len(dataloader.dataset)

In [11]:
def eval(dataloader):
    # switch to eval mode
    model.eval()

    total_acc, total_loss = 0, 0

    for vocab_batch, label_batch, lengths in dataloader:
        pred = model(vocab_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)

        # metric
        total_acc += ((pred > 0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item() * label_batch.size(0)

    return total_acc / len(dataloader.dataset), total_loss / len(dataloader.dataset)

In [None]:
# train
num_epochs = 10
torch.manual_seed(1)

for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_eval, loss_eval = eval(val_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_eval:.4f}')