In [1]:
from bpemb import BPEmb
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import numpy as np
from random import shuffle
import time
from IPython import display

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchnlp.datasets import imdb_dataset
from torch.autograd import Variable

%matplotlib inline

In [2]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Turn on cuDNN benchmark mode for the whole session.
torch.backends.cudnn.benchmark = True

Some of our own reviews to demonstrate the model

In [3]:
# Hand-written sample reviews; they are passed to predict() after each model is trained.
review1 = "Fictional Movie combines aspects of the action, adventure, and suspense genres into a delightful blend. Although the plotline is difficult to describe without spoilers, it delivers on every promise, keeping the audience on the edge of their seats!"
review2 = "Fictional Movie underwhelms on all accounts. The story is insipid and uninspired. The pacing is sluggish, with long stretches of irrelevant and unimportant segues whose import is at best an inside joke to the writers. You might as well not bother with this movie."
review3 = "Some movies have great visuals, clever writing, or heartwarming stories. Fictional Movie has none of these things. It is a pallid imitation of better movies of bygone eras. If you are going to see one movie this year, see something else."

Notice all the positive labels? We need to shuffle the data!

## Dataset

**imdb_dataset**: This is the dataset we will be working with. It involves reviews and a label of 'positive' or 'negative' associated with the review sentiment.

In [4]:
# Load both IMDB splits, then shuffle each one in place (train first, then test).
data_train, data_test = imdb_dataset(train=True), imdb_dataset(test=True)
for split in (data_train, data_test):
    shuffle(split)
print(len(data_train), len(data_test))

25000 25000


In [5]:
# Show the first five (shuffled) training examples with their labels.
for sample_idx in range(5):
    sample = data_train[sample_idx]
    print(f"Sentiment: {sample['sentiment']}")
    print(f"Text: {sample['text']}\n")

Sentiment: neg
Text: I suppose if you like endless dialogue that doesn't forward the story and flashy camera effects like the scene transitions in the television show _Angel_, you'll enjoy the film. Me? All I wanted was a nice, tight little story, and it wasn't there. The pacing was practically backward, plot points were buried under a sea of unneeded dialogue, and there was absolutely no sense of dread, or tension, or ANYTHING.<br /><br />Is it the redneck? Is it the Wendigo? No, it's a cameraman on speed. That's not scary. It doesn't generate a single note of tension or atmosphere unless you're scared by MTV. Like those reviewers before me, I too noticed that by the end the movie invokes derisive laughter from the audience.<br /><br />Terrible film.

Sentiment: neg
Text: I can't believe this is on DVD. Even less it was available at my local video store.<br /><br />Some argue this is a good movie if you take in consideration it had only a 4000$ budget. I find this funny. I would find 

Maximum length of the reviews

In [6]:
# Whitespace-token count of every training review; print the largest one.
lens = list(map(lambda d: len(d['text'].split()), data_train))
print(np.max(lens))

2470


## 1. Embedding

#### Word2Vec

Before actually getting into the classification, let's do a quick recap of one of the most important embedding methods in NLP: Word2Vec.

Links:
1. https://www.analyticsvidhya.com/blog/2020/03/pretrained-word-embeddings-nlp/
2. https://radimrehurek.com/gensim/models/word2vec.html
3. https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial/notebook
4. Word2Vec from scratch: https://towardsdatascience.com/implementing-word2vec-in-pytorch-skip-gram-model-e6bae040d2fb
5. Word2Vec with Gensim: https://towardsdatascience.com/a-beginners-guide-to-word-embedding-with-gensim-word2vec-model-5970fa56cc92

In [7]:
# Tiny toy corpus used to illustrate skip-gram Word2Vec from scratch.
corpus = [
    'he is a king',
    'she is a queen',
    'he is a man',
    'she is a woman',
    'warsaw is poland capital',
    'berlin is germany capital',
    'paris is france capital',
]

def tokenize_corpus(corpus):
    """Split every sentence in `corpus` on whitespace and return the list of token lists."""
    return [sentence.split() for sentence in corpus]

tokenized_corpus = tokenize_corpus(corpus)

# Unique tokens in first-seen order (dict keys keep insertion order).
vocabulary = list(dict.fromkeys(
    token for sentence in tokenized_corpus for token in sentence
))

# Lookup tables in both directions between a token and its vocabulary index.
word2idx = {word: position for position, word in enumerate(vocabulary)}
idx2word = dict(enumerate(vocabulary))
vocabulary_size = len(vocabulary)

In [8]:
window_size = 2
idx_pairs = []
# Build (center word index, context word index) pairs for every sentence.
for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    sentence_len = len(indices)
    # Treat each word in turn as the center word.
    for center_word_pos, center_word_idx in enumerate(indices):
        # Clamp the window so it never leaves the sentence.
        first_pos = max(0, center_word_pos - window_size)
        last_pos = min(sentence_len, center_word_pos + window_size + 1)
        for context_word_pos in range(first_pos, last_pos):
            # A word is not its own context.
            if context_word_pos != center_word_pos:
                idx_pairs.append((center_word_idx, indices[context_word_pos]))
idx_pairs = np.array(idx_pairs) # it will be useful to have this as numpy array

In [9]:
def get_input_layer(word_idx):
    """Return a float32 one-hot vector of length `vocabulary_size` with a 1 at `word_idx`."""
    one_hot = torch.zeros(vocabulary_size, dtype=torch.float32)
    one_hot[word_idx] = 1.0
    return one_hot

In [10]:
embedding_dims = 5
# W1 projects a one-hot word vector to its embedding (one column per word);
# W2 projects an embedding to one score per vocabulary word.
# Plain leaf tensors with requires_grad=True replace the deprecated Variable wrapper.
W1 = torch.randn(embedding_dims, vocabulary_size, requires_grad=True)
W2 = torch.randn(vocabulary_size, embedding_dims, requires_grad=True)
num_epochs = 100
learning_rate = 0.001

for epo in range(num_epochs):
    loss_val = 0
    for data, target in idx_pairs:
        x = get_input_layer(data)
        y_true = torch.tensor([int(target)], dtype=torch.long)

        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)

        log_softmax = F.log_softmax(z2, dim=0)

        loss = F.nll_loss(log_softmax.view(1,-1), y_true)
        loss_val += loss.item()
        loss.backward()

        # Manual SGD step; no_grad keeps the in-place update out of the autograd graph
        # instead of going around autograd through `.data`.
        with torch.no_grad():
            W1 -= learning_rate * W1.grad
            W2 -= learning_rate * W2.grad

        # Gradients accumulate across backward() calls, so reset them after every step.
        W1.grad.zero_()
        W2.grad.zero_()
    if epo % 10 == 0:
        print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')

Loss at epo 0: 4.31084063734327
Loss at epo 10: 3.9481004919324603
Loss at epo 20: 3.705983177253178
Loss at epo 30: 3.524117525986263
Loss at epo 40: 3.3792495233672004
Loss at epo 50: 3.259489371095385
Loss at epo 60: 3.1575867005756924
Loss at epo 70: 3.068761876651219
Loss at epo 80: 2.989760562351772
Loss at epo 90: 2.918317857810429


In [11]:
# Shapes of the two learned projection matrices.
for weight_matrix in (W1, W2):
    print(weight_matrix.shape)

torch.Size([5, 15])
torch.Size([15, 5])


One interesting question is: Which weight should we use?

The answer is W1. Here are some links that discuss this topic:

1. They both capture the word semantics. Not only W: sometimes W' is also used for the word vectors. In some cases (W+W')/2 has been used and has given better results on that particular task.
2. The output context matrix $W'$ encodes the meanings of words as context, different from the embedding matrix $W$. NOTE: Despite the name, $W'$ is independent of $W$, not a transpose or inverse or whatsoever. (https://lilianweng.github.io/posts/2017-10-15-word-embedding/)
3. https://stackoverflow.com/questions/29381505/why-does-word2vec-use-2-representations-for-each-word
4. Building and training was fun and all, but our end goal was not to build a neural network; we wanted to get word embeddings. As stated earlier in this post, the key behind word embeddings is that the rows of the first weight matrix is effectively a dense representation of one-hot encoded vectors each corresponding to various tokens in the text dataset (https://jaketae.github.io/study/word2vec/).

A short version of the implementation of Word2Vec with gensim would be the following one:

In [12]:
def dataset_to_tensors(dataset):
    """Turn a dataset of review dicts into tokenised sentences for gensim.

    gensim's Word2Vec expects an iterable of token lists. Passing raw strings
    makes it iterate over single characters, so each review text is split on
    whitespace here.

    Args:
        dataset: iterable of dicts that each carry a 'text' string.

    Returns:
        A list with one list of whitespace-separated tokens per review.
    """
    return [d['text'].split() for d in dataset]

# Convert both splits (train first, then test) with the helper defined in this cell.
train_data, test_data = (
    dataset_to_tensors(split) for split in (data_train, data_test)
)

In [13]:
# Train gensim Word2Vec on the training split, then wrap the learned vectors
# in a torch embedding layer built from those pretrained weights.
word2vec_kwargs = dict(vector_size=100, window=5, min_count=5, workers=4)
model = Word2Vec(train_data, **word2vec_kwargs)
weights = torch.FloatTensor(model.wv.vectors)
embedding = nn.Embedding.from_pretrained(weights)

### BPEmb

Let's use another type of embedding for the subsequent classification: BPEmb

This is a set of pre-trained embeddings that we will use to convert the text into a sequence of vectors for subsequent analysis. The idea is to extract labels from the dataset, and parse text into a set of subword vectors.

In [14]:
# Character counts of the first two training reviews...
for sample_idx in (0, 1):
    print(len(data_train[sample_idx]['text']))

# ...and the shapes of their BPEmb embeddings.
bpe = BPEmb(lang='en', dim=50)
for sample_idx in (0, 1):
    print(bpe.embed(data_train[sample_idx]['text']).shape)

740
673
(210, 50)
(194, 50)


In [15]:
def dataset_to_tensors(dataset, bpe=None):
    """Convert review dicts into binary labels and BPEmb embedding matrices.

    Args:
        dataset: iterable of dicts with a 'sentiment' string and a 'text' string.
        bpe: optional object exposing ``embed(text)``. Pass an already loaded
            BPEmb instance to avoid loading the embedding model again on every
            call. When omitted, ``BPEmb(lang='en', dim=50)`` is created, as before.

    Returns:
        (labels, embeddings): ``labels`` is a numpy array holding 1 where the
        sentiment is 'pos' and 0 for anything else; ``embeddings`` is a list
        with the result of ``bpe.embed(text)`` for each review, in input order.
    """
    if bpe is None:
        bpe = BPEmb(lang='en', dim=50)

    sentiments, embeddings = [], []
    for d in dataset:
        sentiments.append(1 if d['sentiment'] == 'pos' else 0)
        embeddings.append(bpe.embed(d['text']))
    return np.array(sentiments), embeddings

In [16]:
# Embed both splits (train first, then test) into labels and per-review matrices.
(train_labels, train_tensors), (test_labels, test_tensors) = (
    dataset_to_tensors(split) for split in (data_train, data_test)
)

In [17]:
# Class balance of each split: the distinct labels and how often each occurs.
for split_labels in (train_labels, test_labels):
    print(np.unique(split_labels, return_counts=True))

(array([0, 1]), array([12500, 12500]))
(array([0, 1]), array([12500, 12500]))


This is what one of our inputs will look like

Note that each one will have a different sequence length, which will make batching difficult

In [18]:
# Look at the first two embedded reviews and their (sequence, features) shapes.
for sample_idx in (0, 1):
    print(train_tensors[sample_idx])
    print(train_tensors[sample_idx].shape)

# Sequence-length statistics over the whole training split.
sequence_lengths = [d.shape[0] for d in train_tensors]
print("Maximum length: ",np.max(sequence_lengths))
print("Mean length: ", np.mean(sequence_lengths))

[[-0.271187  0.202276 -0.371739 ... -0.628708  0.477076  0.206391]
 [-0.341687  0.078269  0.315032 ...  0.242251 -0.149144  0.422889]
 [-0.439638 -0.303795  0.780145 ... -0.610518 -0.001575  0.187587]
 ...
 [ 0.727799  0.067326  0.168493 ...  0.307808  0.1344   -0.206341]
 [-1.273908 -0.607935  0.375018 ...  0.364744 -0.068366  0.425233]
 [-0.275946  0.033621 -0.175133 ... -0.063625  0.111334  0.193656]]
(210, 50)
[[-0.271187  0.202276 -0.371739 ... -0.628708  0.477076  0.206391]
 [-0.023115 -0.254144  0.209979 ... -0.279852 -0.692482  0.9685  ]
 [ 0.017189  0.269219  0.051473 ...  0.24861   0.451184 -0.059601]
 ...
 [-0.298372 -0.09406   0.25116  ... -0.51283  -0.16148   0.690044]
 [-0.466095  0.107843  0.310546 ... -0.009862 -0.057548  0.435935]
 [-0.275946  0.033621 -0.175133 ... -0.063625  0.111334  0.193656]]
(194, 50)
Maximum length:  3616
Mean length:  345.2042


Let's make a function to assemble a batch from a dataset, such that everything is padded with zero vectors to the longest example in the batch.
Inside the network, we're going to use a function to only take the sequence processed up to the final point for each case.
So this should only produce overhead, not change the algorithm
 
Since the reviews are very long, we'll clip them after a certain number of tokens (variable)

In [19]:
def assemble_batch(x, y, batchidx, BS = 20, clip=500):
    """Assemble one zero-padded batch of variable-length sequences.

    Each sequence is clipped to at most `clip` steps and then zero-padded to the
    longest (clipped) sequence in the batch. The feature shape is taken from
    each sequence rather than hard-coded, and a final batch with fewer than
    `BS` items is returned as a smaller batch instead of indexing past the end.

    Args:
        x: sequence of arrays shaped (length, features...).
        y: sliceable labels aligned with `x`.
        batchidx: zero-based batch number; the batch covers items
            ``BS*batchidx`` up to (not including) ``BS*(batchidx+1)``.
        BS: batch size.
        clip: maximum number of steps kept per sequence.

    Returns:
        (tensors, labels, lens): the stacked array of shape
        (batch, maxlen, features...), the matching slice of `y`, and an array
        with each sequence's length after clipping.

    Raises:
        ValueError: if the requested batch contains no items.
    """
    start, stop = BS*batchidx, BS*(batchidx+1)
    batch = x[start:stop]
    if len(batch) == 0:
        raise ValueError(f"batch {batchidx} is empty for batch size {BS} and {len(x)} items")

    lens = [np.minimum(d.shape[0], clip) for d in batch]
    maxlen = np.max(lens)

    labels = y[start:stop]

    tensors = []
    for seq in batch:
        if seq.shape[0] < maxlen:
            # Pad with zero vectors that match this sequence's own feature shape.
            padding = np.zeros((maxlen - seq.shape[0],) + tuple(seq.shape[1:]))
            d = np.concatenate([seq, padding], axis=0)
        else:
            d = seq[:maxlen]
        tensors.append(d)
    return np.array(tensors), labels, np.array(lens)

Data is in the format batchdim x sequence x features 

In [20]:
# Build the first training batch with the default batch size and clip length.
b_x, b_y, b_l = assemble_batch(train_tensors, train_labels, batchidx=0)
print(*(part.shape for part in (b_x, b_y, b_l)))

(20, 500, 50) (20,) (20,)


## 2. Classification

### CNN

First let's try just aggregating sentiment predictions for each component word-vector

In [None]:
class BagNet(nn.Module):
    """Bag-of-vectors sentiment classifier.

    Three kernel-size-1 convolutions score every token vector independently.
    The per-token features are averaged over the real (unpadded) tokens of each
    sequence and fed through a small MLP that ends in a sigmoid.
    The model owns its Adam optimizer as ``self.optim``.
    """

    def __init__(self):
        super().__init__()
        self.l1 = nn.Conv1d(50,128,1)
        self.l2 = nn.Conv1d(128,128,1)
        self.l3 = nn.Conv1d(128,128,1)
        self.drop1 = nn.Dropout(p=0.05)
        self.l4 = nn.Linear(128,128)
        self.drop2 = nn.Dropout(p=0.05)
        self.l5 = nn.Linear(128,1)
        self.optim = torch.optim.Adam(self.parameters(), lr=1e-2)

    def forward(self, x, lengths):
        """Return one probability per sequence.

        Args:
            x: float tensor of shape (batch, sequence, 50), zero-padded along
                the sequence axis.
            lengths: integer tensor of shape (batch,) with the number of real
                tokens in each sequence.

        Returns:
            Tensor of shape (batch,) with values in (0, 1).
        """
        # Conv1d expects (batch, channels, sequence).
        z = x.transpose(1,2).contiguous()
        z = F.relu(self.l1(z))
        z = F.relu(self.l2(z))
        z = self.l3(z)

        # The conv biases make padded positions non-zero, so zero them out
        # before summing; otherwise the amount of padding in a batch would
        # change the prediction.
        lengths = lengths.to(z.device)
        positions = torch.arange(z.size(2), device=z.device)
        mask = (positions.unsqueeze(0) < lengths.unsqueeze(1)).unsqueeze(1).to(z.dtype)
        z = (z * mask).sum(2)

        # Mean over real tokens; clamp guards against division by zero for empty input.
        z = z/lengths.clamp(min=1).to(z.dtype).unsqueeze(1)
        z = self.drop1(z)
        z = self.drop2(F.relu(self.l4(z)))
        z = torch.sigmoid(self.l5(z))[:,0]
        return z

def predict(text, model, bpe=None):
    """Return the model's score for a single piece of text.

    The text is embedded with BPEmb and passed to ``model(x, lengths)`` as a
    batch of one. Tensors are created on the device where the model's
    parameters live, so this works on both CPU and GPU. Inference runs in eval
    mode without gradient tracking, and the model's previous train/eval mode is
    restored afterwards.

    Args:
        text: the review text to score.
        model: an nn.Module whose forward takes (x, lengths) and returns a
            tensor with a single element for a batch of one.
        bpe: optional object exposing ``embed(text)``; defaults to
            ``BPEmb(lang='en', dim=50)``. Pass a loaded instance to avoid
            reloading the embeddings on every call.

    Returns:
        The prediction as a Python float.
    """
    if bpe is None:
        bpe = BPEmb(lang='en', dim=50)

    # Follow the model's device; a model without parameters is run on the CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    vecs = bpe.embed(text) # sequence * features
    x = torch.as_tensor(vecs, dtype=torch.float32, device=target_device).unsqueeze(0) # 1 * sequence * features
    l = torch.tensor([x.size(1)], dtype=torch.long, device=target_device)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            p = model(x, l)
    finally:
        model.train(was_training)
    return p.cpu().item()

In [None]:
# Instantiate the bag-of-vectors classifier and move its parameters to the selected device.
bagnet = BagNet().to(device)

Now train the model
 
Batch size seems to matter for this model - I lost 7% accuracy with batch size of 100 versus batch size of 200.
Somewhat unusual...

- eval() changes the bn and dropout layer’s behaviour
- torch.no_grad() deals with the autograd engine and stops it from calculating the gradients, which is the recommended way of doing validation

In [None]:
# Train BagNet and evaluate on the test split after every epoch,
# redrawing the loss/accuracy curves as training progresses.
BS = 10
train_error, test_error = [], []
train_accuracy, test_accuracy = [], []
for epoch in range(10):
    # Training loop: one optimizer step per batch, over every batch of the
    # split (a leftover `break` used to stop each epoch after the first batch).
    errs, accs = [], []
    bagnet.train()
    for i in range(train_labels.shape[0]//BS):
        bagnet.optim.zero_grad()
        x, y, l = assemble_batch(train_tensors, train_labels, i, BS, clip=500)
        x = torch.FloatTensor(x).to(device)
        y = torch.FloatTensor(y).to(device)
        l = torch.LongTensor(l).to(device)
        p = bagnet(x, l)

        # Log loss, with fudge factors to prevent NaN
        loss = -torch.mean(y*torch.log(p+1e-8) + (1-y)*torch.log(1-p+1e-8))
        # Fraction of the batch where the thresholded prediction matches the label
        acc = torch.mean(torch.ge(p,0.5).float()*y + (1-torch.ge(p,0.5).float())*(1-y))
        loss.backward()
        bagnet.optim.step()

        # Accumulate stats
        errs.append(loss.cpu().detach().item())
        accs.append(acc.cpu().detach().item())

    train_error.append(np.mean(errs))
    train_accuracy.append(np.mean(accs))

    # Testing loop: eval mode disables dropout, no_grad skips graph construction
    errs, accs = [], []
    bagnet.eval()
    with torch.no_grad():
        for i in range(test_labels.shape[0]//BS):
            x, y, l = assemble_batch(test_tensors, test_labels, i, BS)
            # Test batches must live on the same device as the model.
            x = torch.FloatTensor(x).to(device)
            y = torch.FloatTensor(y).to(device)
            l = torch.LongTensor(l).to(device)
            p = bagnet(x, l)

            # Log loss, with fudge factors to prevent NaN
            loss = -torch.mean(y*torch.log(p+1e-8) + (1-y)*torch.log(1-p+1e-8))
            acc = torch.mean(torch.ge(p,0.5).float()*y + (1-torch.ge(p,0.5).float())*(1-y))

            # Accumulate stats
            errs.append(loss.cpu().detach().item())
            accs.append(acc.cpu().detach().item())

    test_error.append(np.mean(errs))
    test_accuracy.append(np.mean(accs))

    # Redraw both panels in place so the notebook shows a live-updating figure
    plt.clf()
    plt.subplot(1,2,1)
    plt.title("Log loss")
    plt.plot(train_error, label="Train")
    plt.plot(test_error, label="Test")
    plt.legend()

    plt.subplot(1,2,2)
    plt.plot(train_accuracy, label="Train")
    plt.plot(test_accuracy, label="Test")
    plt.legend()
    plt.title("Accuracy")

    plt.gcf().set_size_inches((8,4))
    display.clear_output(wait=True)
    display.display(plt.gcf())
    time.sleep(0.01)

In [None]:
# Test accuracy after the last epoch, then BagNet's score for each hand-written review.
print("Final accuracy: ", test_accuracy[-1])

for caption, review_text in (
    ("Review 1 prediction: ", review1),
    ("Review 2 prediction: ", review2),
    ("Review 3 prediction: ", review3),
):
    print(review_text)
    print(caption, predict(review_text, bagnet))
    print("\n")

### LSTM

Make an LSTM to classify sentiment

In [None]:
class Net(nn.Module):
    """Single-layer LSTM sentiment classifier over 50-dimensional token vectors.

    The model owns its Adam optimizer as ``self.optim``.
    """

    def __init__(self):
        super().__init__()

        # This can overfit quite easily, so use dropout on the embeddings
        self.drop = nn.Dropout(0.25)

        # The output will be of the form batch x sequence x features. However, we're just going to use the final element for the classifier.
        # Using a very tiny LSTM because otherwise it overfits
        self.lstm = nn.LSTM(50, 50, batch_first=True)

        self.classify_layer = nn.Linear(50, 1)

        # For LSTMs, we want a fairly high learning rate if we can get away with it
        self.optim = torch.optim.Adam(self.parameters(), lr=1e-2)

    def forward(self, x, lengths):
        """Return one probability per sequence.

        Args:
            x: float tensor of shape (batch, sequence, 50), padded along the
                sequence axis.
            lengths: integer tensor of shape (batch,) with the number of real
                tokens per sequence. Each entry is expected to be at least 1;
                a length of 0 would index position -1, i.e. the last step.

        Returns:
            Tensor of shape (batch,) with values in (0, 1).
        """
        # x is Batch x Sequence x Features
        z = self.drop(x)

        # We don't need the hidden state or cell state outputs, so discard them
        z, _ = self.lstm(z)

        # Take the LSTM state after the last token for each sentence, ignoring parts after the end.
        # Index tensors are created on the activations' device so the model runs on CPU and GPU alike.
        idx = torch.arange(x.size(0), device=z.device)
        last_step = lengths.to(z.device) - 1
        z = z[idx, last_step]
        z = torch.sigmoid(self.classify_layer(z))

        return z[:,0]

In [None]:
# Instantiate the LSTM classifier and move its parameters to the selected device.
net = Net().to(device)

In [None]:
# Now train the model
BS = 200
train_error, test_error = [], []
train_accuracy, test_accuracy = [], []

for epoch in range(30):
    # Training loop
    errs = []
    accs = []
    net.train()
    for i in range(train_labels.shape[0]//BS):
        net.optim.zero_grad()
        x,y,l = assemble_batch(train_tensors, train_labels, i, BS)
        x = torch.cuda.FloatTensor(x)
        y = torch.cuda.FloatTensor(y)
        l = torch.cuda.LongTensor(l)

        p = net.forward(x,l)

        # Log loss, with fudge factors to prevent NaN
        loss = -torch.mean(y*torch.log(p+1e-8) + (1-y)*torch.log(1-p+1e-8))
        acc = torch.mean(torch.ge(p,0.5).float()*y + (1-torch.ge(p,0.5).float())*(1-y))
        loss.backward()
        net.optim.step()

        # Accumulate stats
        errs.append(loss.cpu().detach().item())
        accs.append(acc.cpu().detach().item())
  
    train_error.append(np.mean(errs))
    train_accuracy.append(np.mean(accs))

    # Testing loop
    errs = []
    accs = []
    net.eval()
    for i in range(test_labels.shape[0]//BS):
        x,y,l = assemble_batch(test_tensors, test_labels, i, BS)
        x = torch.cuda.FloatTensor(x)
        y = torch.cuda.FloatTensor(y)
        l = torch.cuda.LongTensor(l)

        p = net.forward(x,l)

        # Log loss, with fudge factors to prevent NaN
        loss = -torch.mean(y*torch.log(p+1e-8) + (1-y)*torch.log(1-p+1e-8))
        acc = torch.mean(torch.ge(p,0.5).float()*y + (1-torch.ge(p,0.5).float())*(1-y))

        # Accumulate stats
        errs.append(loss.cpu().detach().item())
        accs.append(acc.cpu().detach().item())
  
    test_error.append(np.mean(errs))
    test_accuracy.append(np.mean(accs))

    plt.clf()
    plt.subplot(1,2,1)
    plt.title("Log loss")
    plt.plot(train_error, label="Train")
    plt.plot(test_error, label="Test")
    plt.legend()

    plt.subplot(1,2,2)
    plt.plot(train_accuracy, label="Train")
    plt.plot(test_accuracy, label="Test")
    plt.legend()
    plt.title("Accuracy")

    plt.gcf().set_size_inches((8,4))
    display.clear_output(wait=True)
    display.display(plt.gcf())
    time.sleep(0.01)

In [None]:
# Test accuracy after the last epoch, then the LSTM's score for each hand-written review.
print("Final accuracy: ", test_accuracy[-1])

for caption, review_text in (
    ("Review 1 prediction: ", review1),
    ("Review 2 prediction: ", review2),
    ("Review 3 prediction: ", review3),
):
    print(review_text)
    print(caption, predict(review_text, net))
    print("\n")