In [10]:
import torch
import torch.nn as nn
from torchtext.datasets import IMDB
from torchtext.vocab import vocab
from torch.utils.data.dataset import random_split
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.distributions.categorical import Categorical

import numpy as np
import re
from collections import Counter, OrderedDict

# Sentiment Analysis

##### Data preprocessing

In [2]:
# get data
train_dataset = IMDB(split='train')
test_dataset = IMDB(split='test')

# train val set
torch.manual_seed(1)
train_dataset, val_dataset = random_split(list(train_dataset), [20000, 5000])

In [3]:
# function to remove html tags, keep emoticons, tokenize into words
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized

# get list of unique word tokens
token_counts = Counter()
for label, line in train_dataset:
    tokens = tokenizer(line)
    token_counts.update(tokens)

print('Vocab num:', len(token_counts))

Vocab num: 69023


In [4]:
# sort the unique words
sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)

# create unique word to unique integer vocab
vocab = vocab(ordered_dict)

# add two generic vocab
vocab.insert_token("<pad>", 0)  # placeholder, for adjusting length of sequence
vocab.insert_token("<unk>", 1)  # unknown words, e.g. words that appear in the val or test set but not train set

vocab.set_default_index(1)

# vocab stores words as integer
print([vocab[token] for token in ['hello', 'world']])

[4892, 177]


In [5]:
# pipeline to transform each text data into integer vocabs
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]

# pipeline to transform label (pos, neg) into (1, 0)
label_pipeline = lambda x: 1. if x == 'pos' else 0.

# function to preprocess (transform to integer vocab) the text in a batch
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []

    # for each sample in the batch
    for _label, _text in batch:
        # convert the label to integer
        label_list.append(label_pipeline(_label))

        # convert the text to integer, and create tensor
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)

        # track all sample lengths
        lengths.append(processed_text.size(0))

    # convert to tensor
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)

    # pad samples so each has the same length
    padded_text_list = nn.utils.rnn.pad_sequence(text_list, batch_first=True)

    return padded_text_list, label_list, lengths

In [6]:
# create dataloader with the preprocessing function
batch_size = 32
train_dl = DataLoader(train_dataset, batch_size, shuffle=True, collate_fn=collate_batch)
val_dl = DataLoader(val_dataset, batch_size, shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size, shuffle=False, collate_fn=collate_batch)

# Model

In [7]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()

        # create embedding layer to transform unique integer vocabs to real value vectors
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # one LSTM layer
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)

        # one FC hidden layer
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()

        # FC layer output (binary output)
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, vocab, lengths):
        # use embedding layer to convert vocab to real value feature vectors
        out = self.embedding(vocab)

        # pad to same length
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)

        # run through the rnn layer
        out, (hidden, cell) = self.rnn(out)

        # use the last hidden state from the last hidden layer as input to FC
        out = hidden[-1, :, :]

        out = self.fc1(out)
        out = self.relu(out)

        out = self.fc2(out)
        out = self.sigmoid(out)

        return out

In [8]:
vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)

model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
model

RNN(
  (embedding): Embedding(69025, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [9]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [10]:
# function to train for one epoch
def train(dataloader):
    # switch to train mode
    model.train()

    total_acc, total_loss = 0, 0

    for vocab_batch, label_batch, lengths in dataloader:
        pred = model(vocab_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # metric
        total_acc += ((pred > 0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item() * label_batch.size(0)

    return total_acc / len(dataloader.dataset), total_loss / len(dataloader.dataset)

In [11]:
def eval(dataloader):
    # switch to eval mode
    model.eval()

    total_acc, total_loss = 0, 0

    for vocab_batch, label_batch, lengths in dataloader:
        pred = model(vocab_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)

        # metric
        total_acc += ((pred > 0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item() * label_batch.size(0)

    return total_acc / len(dataloader.dataset), total_loss / len(dataloader.dataset)

In [None]:
# train
num_epochs = 10
torch.manual_seed(1)

for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_eval, loss_eval = eval(val_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_eval:.4f}')

# Language Modeling

##### Preprocess data

In [2]:
# read data
with open('../data/1268-0.txt', 'r', encoding="utf8") as fp:
    text = fp.read()

# remove non-book stuffs at the beginning
start_idx = text.find('THE MYSTERIOUS ISLAND')
end_idx = text.find('End of the Project Gutenberg')
text = text[start_idx:end_idx]

# get unique characters
char_set = set(text)

print('Total Length:', len(text))
print('Unique Characters:', len(char_set))

Total Length: 1112350
Unique Characters: 80


In [3]:
# create a dictionary to map unique chars to integers
chars_sorted = sorted(char_set)
char2int = {ch: i for i, ch in enumerate(chars_sorted)}

# create a numpy array to map integers back to chars
char_array = np.array(chars_sorted)

# encode the input text chars to integers
text_encoded = np.array([char2int[ch] for ch in text], dtype=np.int32)

print('Text encoded shape:', text_encoded.shape)
print(text[:15], '== Encoding ==>', text_encoded[:15])
print(text_encoded[15:21], '== Reverse ==>'.join(char_array[text_encoded[15:21]]))

Text encoded shape: (1112350,)
THE MYSTERIOUS  == Encoding ==> [44 32 29  1 37 48 43 44 29 42 33 39 45 43  1]
[33 43 36 25 38 28] I== Reverse ==>S== Reverse ==>L== Reverse ==>A== Reverse ==>N== Reverse ==>D


In [4]:
seq_length = 40  # every 40 chars as one sequence segment
chunk_size = seq_length + 1  # input and target are offset by one character, as we use inputs char i to n to predict i+1 to n+1

# sliding window through the input text and create segments
text_chunks = [text_encoded[i: i+chunk_size] for i in range(len(text_encoded) - chunk_size)]

# create dataset using the segments
class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)

    def __getitem__(self, index):
        text_chunk = self.text_chunks[index]

        # input is from index (0, n-1), target is index (1, n)
        return text_chunk[:-1].long(), text_chunk[1:].long()

seq_dataset = TextDataset(torch.tensor(text_chunks))

  seq_dataset = TextDataset(torch.tensor(text_chunks))


In [5]:
# create dataloader
batch_size = 64
torch.manual_seed(1)
seq_dl = DataLoader(seq_dataset, batch_size, shuffle=True, drop_last=True)

##### Model

In [6]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()

        self.rnn_hidden_size = rnn_hidden_size

        # create embedding layer to transform unique integer vocabs to real value vectors
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # one LSTM layer
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)

        # FC layer output (multiclass output to number of unique chars)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size) 

    def forward(self, x, hidden, cell):
        # use embedding layer to convert input to real value feature vectors
        out = self.embedding(x).unsqueeze(1)

        # run through the rnn layer
        out, (hidden, cell) = self.rnn(out, (hidden, cell))

        out = self.fc(out).reshape(out.size(0), -1)

        return out, hidden, cell

    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden, cell

In [7]:
vocab_size = len(char_array)  # vocab is just the number of unique chars
embed_dim = 256
rnn_hidden_size = 512

torch.manual_seed(1)

model = RNN(vocab_size, embed_dim, rnn_hidden_size)
model

RNN(
  (embedding): Embedding(80, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=80, bias=True)
)

In [8]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [9]:
num_epochs = 10000
torch.manual_seed(1)

for epoch in range(num_epochs):
    hidden, cell = model.init_hidden(batch_size)
    loss = 0

    # get one batch
    seq_batch, target_batch = next(iter(seq_dl))

    # predict from every char of each sample of the batch
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell)
        loss += loss_fn(pred, target_batch[:, c])

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    loss = loss.item() / seq_length

    if epoch % 500 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')

Epoch 0 loss: 4.3719
Epoch 500 loss: 1.5092
Epoch 1000 loss: 1.3639
Epoch 1500 loss: 1.3212
Epoch 2000 loss: 1.1857
Epoch 2500 loss: 1.2661
Epoch 3000 loss: 1.1597
Epoch 3500 loss: 1.1649
Epoch 4000 loss: 1.1280
Epoch 4500 loss: 1.1791
Epoch 5000 loss: 1.1377
Epoch 5500 loss: 1.0918
Epoch 6000 loss: 1.0983
Epoch 6500 loss: 1.0902
Epoch 7000 loss: 1.0425
Epoch 7500 loss: 1.0642
Epoch 8000 loss: 1.0667
Epoch 8500 loss: 1.0286
Epoch 9000 loss: 1.0535
Epoch 9500 loss: 0.9866


##### Evaluation

In [11]:
# function to generate a sequence of text based on a input text
def generateText(model, starting_str, len_generated_text=500, scale_factor=1.0):
    # encode the input text to integers
    encoded_input = torch.tensor([char2int[c] for c in starting_str])
    encoded_input = torch.reshape(encoded_input, (1, -1))

    # init the output string as the input string (append to it with generated text later)
    generated_str = starting_str

    # switch to eval mode
    model.eval()

    hidden, cell = model.init_hidden(1)

    # for every character in the input string, run through model to update hidden and cell units
    for i in range(len(starting_str) - 1):
        _, hidden, cell = model(encoded_input[:, i].view(1), hidden, cell)

    last_char = encoded_input[:, -1]

    for i in range(len_generated_text):
        # use the last character in the sequence as the input to model
        logits, hidden, cell = model(last_char.view(1), hidden, cell)

        # process the logits
        logits = torch.squeeze(logits, 0)
        scaled_logits = logits * scale_factor  # less scale = more random, vice versa

        # random sample from the logits (so we don't always predict the same character)
        m = Categorical(logits=scaled_logits)
        last_char = m.sample()  # update the last character in the sequence

        # append the last character to the generated sequence
        generated_str += str(char_array[last_char])

    return generated_str

In [12]:
torch.manual_seed(1)
print(generateText(model, 'The island'))

The island is necessary for thity. Cyrus Harding became smoke could scatched him should it
crossitious most stream, let us, although? If it threw the convicts thus should be a
fan with joy; the captain nor honess was long, sliftly among the vegetation of the forest.

There were in shortles, brought the true, took extremity of vapor. But they had to be driven to us there? How could
go the banks of January, till they sew datena sound upon unknown for a few cascade,” replied Gideon Spilett, “it is indeed by 
