In [1]:
from torchtext.datasets import IMDB # type: ignore

# Load the two IMDB splits; the training split is divided into
# training/validation subsets in a later cell.
train_dataset, test_dataset = IMDB(split='train'), IMDB(split='test')

In [2]:
import torch
from torch.utils.data import random_split

# Seed the global RNG so that the random split below is reproducible.
torch.manual_seed(0)

# Split a freshly loaded training set instead of reading `train_dataset`:
# this cell rebinds `train_dataset` to the 20,000-sample subset, so using it
# as the input would make a second run of the cell fail (the subset no longer
# contains as many items as the two split sizes add up to).
train_dataset, valid_dataset = random_split(list(IMDB(split='train')), [20000, 5000])

In [3]:
import re
from collections import Counter, OrderedDict

def tokenizer(text):
    # strip html tags
    text = re.sub('<[^>]*>', '', text)
    
    # a regex to match emojis
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    
    return tokenized

# Sanity check: the HTML tags are stripped and the emoticon is kept as a token.
print(tokenizer('<div>hello</div> :)'))

['hello', ':)']


  emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
  text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')


In [4]:
# Count how often each token occurs across the training reviews, then report
# the number of distinct tokens.
token_counts = Counter(
    token
    for _label, review in train_dataset
    for token in tokenizer(review)
)
print(len(token_counts))

69241


In [5]:
# Show the five most frequent tokens together with their counts.
print(token_counts.most_common(5))

[('the', 267895), ('and', 130566), ('a', 130057), ('of', 116231), ('to', 108249)]


Create token encoding based on token counts.

In [6]:
# Import the factory under an alias so that the name `vocab` below refers only
# to the resulting vocabulary object, instead of first to the imported
# function and then to its return value.
from torchtext.vocab import vocab as build_vocab   # type: ignore

# (token, count) pairs ordered from most to least frequent, so that frequent
# tokens receive the smallest indices.
sorted_by_freq_tuples = token_counts.most_common()

ordered_dict = OrderedDict(sorted_by_freq_tuples)
vocab = build_vocab(ordered_dict)

# Reserve index 0 for padding and index 1 for unknown tokens; tokens that are
# not in the vocabulary are looked up as index 1 (`<unk>`).
vocab.insert_token('<pad>', 0)
vocab.insert_token('<unk>', 1)
vocab.set_default_index(1)


In [7]:
# Print the ten lowest-index vocabulary entries next to their indices.
for token in vocab.get_itos()[:10]:
    print(f"{token}: {vocab[token]}")

<pad>: 0
<unk>: 1
the: 2
and: 3
a: 4
of: 5
to: 6
is: 7
it: 8
in: 9


In [8]:
# Look up the vocabulary indices of a few example tokens.
print([vocab[token] for token in ['this', 'is', 'an', 'example']])

[11, 7, 35, 461]


In [33]:
def text_pipeline(x):
    """Tokenize a raw text string and map every token to its vocabulary index."""
    return [vocab[token] for token in tokenizer(x)]


def label_pipeline(x):
    """Map a raw label to a float target: 1.0 when it equals 2, otherwise 0.0."""
    # 2 is positive, 1 is negative
    if x == 2:
        return 1.0
    return 0.0

In [34]:
# Try both pipelines on toy inputs.
print(text_pipeline('this is an example'))
# NOTE(review): 'pos' is not equal to 2, the only value label_pipeline maps to
# 1.0, so this prints 0.0; label_pipeline(2) would show the positive case.
print(label_pipeline('pos'))

[11, 7, 35, 461]
0.0


In [35]:
def collate_batch(batch):
    """Turn a list of ``(label, text)`` samples into padded model inputs.

    Args:
        batch: Sequence of ``(label, text)`` pairs.

    Returns:
        A tuple ``(padded_text_list, label_list, lengths)``: the encoded texts
        stacked batch-first and padded to the longest sequence in the batch,
        the encoded labels, and the unpadded length of every sequence.
    """
    # Encode every sample once, keeping label and token ids side by side.
    encoded = [
        (label_pipeline(label), torch.tensor(text_pipeline(text)))
        for label, text in batch
    ]

    label_list = torch.tensor([target for target, _ in encoded])
    lengths = torch.tensor([token_ids.size(0) for _, token_ids in encoded])

    # Shorter sequences are filled up with pad_sequence's default padding value.
    padded_text_list = torch.nn.utils.rnn.pad_sequence(
        [token_ids for _, token_ids in encoded],
        batch_first=True,
    )

    return padded_text_list, label_list, lengths

In [36]:
from torch.utils.data import DataLoader

# Small demo loader (4 samples per batch, shuffled) that runs samples through collate_batch.
dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_batch)

In [37]:
# Pull a single batch and inspect it: shape of the padded token ids, the
# encoded labels, and the unpadded sequence lengths.
text_batch, label_batch, text_lengths = next(iter(dataloader))
print(text_batch.shape)
print(label_batch)
print(text_lengths)

torch.Size([4, 135])
tensor([1., 0., 1., 0.])
tensor([127,  78, 135,  57])


In [38]:
batch_size = 32

# All three loaders share the batch size and the collate function; only the
# training loader shuffles its samples.
train_dl, valid_dl, test_dl = (
    DataLoader(split, batch_size=batch_size, shuffle=reshuffle, collate_fn=collate_batch)
    for split, reshuffle in (
        (train_dataset, True),
        (valid_dataset, False),
        (test_dataset, False),
    )
)

In [39]:
# Inspect one batch from the training loader: shape of the padded token ids,
# the encoded labels, and the unpadded sequence lengths.
text_batch, label_batch, text_lengths = next(iter(train_dl))
print(text_batch.shape)
print(label_batch)
print(text_lengths)

torch.Size([32, 778])
tensor([0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 1., 0., 1.,
        0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0.])
tensor([142, 118, 118, 129, 202, 417, 778, 103, 159, 135, 223, 193, 281, 255,
        137, 138, 538, 391, 214, 422, 428, 118, 504, 261, 372, 134, 132, 177,
        252, 121, 141, 138])


Embedding.

The idea behind embedding is to introduce $k$ features that encode the meaning of the token. 

While we could get away with one-hot encoding, it would produce a very large, sparse `n by n` matrix when the vocabulary size $n$ is large.

In [40]:
import torch.nn as nn

# Demonstration of an embedding layer: a dictionary of 10 entries whose
# indices are mapped to 3-dimensional vectors. Index 0 is declared as the
# padding index.
embedding = nn.Embedding(10, 3, padding_idx=0)

# A batch of 2 sequences with 4 token indices each; the trailing 0 in the
# second sequence is the padding index.
text_encoded_input = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 0]])
print(embedding(text_encoded_input))

tensor([[[-0.2358, -0.6970, -1.1608],
         [ 0.6995,  0.1991,  1.3949],
         [ 0.8082,  0.5609,  1.0762],
         [ 1.1017, -0.1759,  2.0242]],

        [[-0.0865,  0.0981, -1.2150],
         [ 0.7312,  1.1718, -0.9274],
         [ 0.5451,  0.2468,  1.1843],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)


RNN Model.

This simple RNN model processes a batch of sentences. For each sentence, it outputs one value. This is a many-to-one architecture, which can be used for sentiment classification.

In [41]:
class RNN(nn.Module):
    """Many-to-one recurrent model: a two-layer RNN followed by a linear head.

    NOTE: a later cell defines another class that is also called ``RNN``; once
    that cell has run, the name refers to the later class.
    """

    def __init__(self, input_size, hidden_size):
        """Create the recurrent stack and the single-output linear layer.

        Args:
            input_size: Number of features per time step.
            hidden_size: Size of the hidden state of each recurrent layer.
        """
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers=2, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return one value per sequence of the batch-first input ``x``."""
        # The per-step outputs are discarded; only the final hidden states are used.
        final_hidden = self.rnn(x)[1]

        # Keep the final hidden state of the last recurrent layer only.
        top_layer_state = final_hidden[-1]
        return self.fc(top_layer_state)

# Smoke test: build the model, show its layers, and run a random batch of
# 3 sequences with 10 time steps of 64 features each through it.
model = RNN(input_size=64, hidden_size=32)
print(model)
model(torch.rand(3, 10, 64))

RNN(
  (rnn): RNN(64, 32, num_layers=2, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)


tensor([[-0.3266],
        [-0.2313],
        [-0.2454]], grad_fn=<AddmmBackward0>)

RNN model for sentiment analysis.

In [42]:
class RNN(nn.Module):
    """Sentiment classifier: embedding -> LSTM -> two linear layers -> sigmoid."""

    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        """Build the layers.

        Args:
            vocab_size: Number of entries in the embedding table.
            embed_dim: Size of each embedding vector.
            rnn_hidden_size: Size of the LSTM hidden state.
            fc_hidden_size: Width of the hidden fully connected layer.
        """
        super().__init__()

        # Index 0 is treated as padding by the embedding layer.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)

        # Classification head
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        """Return one sigmoid output per sequence.

        Args:
            text: Batch-first tensor of padded token indices.
            lengths: Tensor holding the unpadded length of each sequence.

        Returns:
            Tensor with a single column holding one sigmoid output per sequence.
        """
        embedded = self.embedding(text)

        # Pack the batch so the LSTM is given each sequence's true length
        # together with the padded data.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded,
            lengths.cpu().numpy(),
            batch_first=True,
            enforce_sorted=False,
        )
        _, (hidden, _) = self.rnn(packed)

        # Final hidden state of the last LSTM layer feeds the classification head.
        features = self.relu(self.fc1(hidden[-1]))
        return self.sigmoid(self.fc2(features))

In [43]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimization pass over ``dataloader`` and report average metrics.

    Args:
        dataloader: Iterable yielding ``(text_batch, label_batch, lengths)``
            tuples. It does not need a ``dataset`` attribute.
        model: Module called as ``model(text_batch, lengths)``; column 0 of
            its output is used as the prediction for each sample.
        loss_fn: Callable ``loss_fn(pred, label_batch)`` returning the mean
            loss of the batch.
        optimizer: Optimizer that updates the parameters of ``model``.

    Returns:
        A tuple ``(average_loss, accuracy)`` computed over the samples that
        were actually processed. A prediction counts as positive when it is
        greater than 0.5.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no samples.
    """
    model.train()
    total_acc, total_loss, num_samples = 0, 0.0, 0

    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()

        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()

        batch_len = label_batch.size(0)
        total_acc += ((pred > 0.5) == label_batch).sum().item()
        # loss is a per-batch mean; weight it by the batch size so that a
        # smaller final batch does not distort the overall average.
        total_loss += loss.item() * batch_len
        num_samples += batch_len

    # Normalise by the number of samples seen rather than by the dataset
    # length: the two differ when batches are dropped or sub-sampled.
    return total_loss / num_samples, total_acc / num_samples

In [44]:
def evaluate(dataloader, model, loss_fn):
    """Compute the average loss and accuracy of ``model`` without updating it.

    Args:
        dataloader: Iterable yielding ``(text_batch, label_batch, lengths)``
            tuples. It does not need a ``dataset`` attribute.
        model: Module called as ``model(text_batch, lengths)``; column 0 of
            its output is used as the prediction for each sample.
        loss_fn: Callable ``loss_fn(pred, label_batch)`` returning the mean
            loss of the batch.

    Returns:
        A tuple ``(average_loss, accuracy)`` computed over the samples that
        were actually processed. A prediction counts as positive when it is
        greater than 0.5.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no samples.
    """
    model.eval()
    total_acc, total_loss, num_samples = 0, 0.0, 0

    # Gradients are never needed here, so tracking is disabled once for the
    # whole pass instead of once per batch.
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred = model(text_batch, lengths)[:, 0].float()
            loss = loss_fn(pred, label_batch)

            batch_len = label_batch.size(0)
            total_acc += ((pred > 0.5) == label_batch).sum().item()
            # loss is a per-batch mean; weight it by the batch size.
            total_loss += loss.item() * batch_len
            num_samples += batch_len

    # Normalise by the number of samples seen rather than by the dataset
    # length: the two differ when batches are dropped or sub-sampled.
    return total_loss / num_samples, total_acc / num_samples

In [45]:
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64
model = RNN(len(vocab), embed_dim, rnn_hidden_size, fc_hidden_size)

# Binary cross-entropy on the model's outputs.
loss_fn = nn.BCELoss()

# The optimizer has to be created after the model it is meant to train: it
# keeps references to the parameter tensors it is given, so building it first
# would bind it to whatever `model` referred to before this cell and leave
# the new model's weights untouched by `optimizer.step()`.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [46]:
num_epochs = 5
# Seed the global RNG right before training starts.
torch.manual_seed(1)

for epoch in range(num_epochs):
    # One optimization pass over the training loader, followed by a
    # gradient-free pass over the validation loader.
    train_loss, train_acc = train(train_dl, model, loss_fn, optimizer)
    valid_loss, valid_acc = evaluate(valid_dl, model, loss_fn)

    print(f"Epoch {epoch+1}/{num_epochs}, train loss: {train_loss:.3f}, train acc: {train_acc:.3f}, valid loss: {valid_loss:.3f}, valid acc: {valid_acc:.3f}")

KeyboardInterrupt: 