In [1]:
%%html
<style> table {float:left} </style>

In [2]:
!pip install torch tqdm lazyme nltk gensim
!python -m nltk.downloader punkt

[nltk_data] Downloading package punkt to /home/kenny/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
import numpy as np
from tqdm import tqdm

from gensim.corpora import Dictionary

import torch
from torch import nn, optim, tensor, autograd
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

In [11]:
try: # Use the default NLTK tokenizer.
    from nltk import word_tokenize, sent_tokenize 
    # Testing whether it works. 
    # Sometimes it doesn't work on some machines because of setup issues.
    word_tokenize(sent_tokenize("This is a foobar sentence. Yes it is.")[0])
except: # Use a naive sentence tokenizer and toktok.
    import re
    from nltk.tokenize import ToktokTokenizer
    # See https://stackoverflow.com/a/25736515/610569
    sent_tokenize = lambda x: re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)
    # Use the toktok tokenizer that requires no dependencies.
    toktok = ToktokTokenizer()
    word_tokenize = word_tokenize = toktok.tokenize

# Classification 

Text Categorization (textcat) is a common task in NLP. As long as, we have labelled data and we want to assign a discrete label to every input data point, it's a classification problem. E.g. 

| Tasks | Possible Labels | 
|:-|:-|
| Sentiment analysis | Positive, Negative, Neutral | 
| Tweetstorm detection | True, False |
| Author profiling | Author1, Author2, ... | 
| Language Identification | EN, ZH, DE, JA, FR, ...|

There are various datasets for sentiment classification, previously we looked at the movie reviews dataset in `nltk`. There's also this other popular IMDB movie reviews dataset from Stanford. Lets use that.

Download the dataset from http://ai.stanford.edu/~amaas/data/sentiment/ and put it in the same directory as where you're running this jupyter notebook.

# Munge the data!

As always we have to preprocess the data.

In [28]:
from lazyme import find_files

def tokenize_data(path_to_dir, file_ext):
    for filename in tqdm(find_files(path_to_dir, file_ext)):
        with open(filename) as fin:
            yield word_tokenize(fin.read())
        
X_train_pos = list(tokenize_data('./aclImdb/train/pos/', '*.txt'))
X_train_neg = list(tokenize_data('./aclImdb/train/neg/', '*.txt'))
X_test_pos = list(tokenize_data('./aclImdb/test/pos/', '*.txt'))
X_test_neg = list(tokenize_data('./aclImdb/test/neg/', '*.txt'))

12500it [00:37, 330.62it/s]
12500it [00:38, 326.50it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


In [63]:
X_train = X_train_pos + X_train_neg
X_test = X_test_pos + X_test_neg

y_train = ['pos'] * len(X_train_pos) + ['neg'] * len(X_train_neg)
y_test = ['pos'] * len(X_test_pos) + ['neg'] * len(X_test_neg)

# Create our IMDB PyTorch Dataset 

Although we have a binary class problem, we will demonstrate a multi-class solution issue that can be also used on binary classification. 


First trick is to convert the "human" labels to a one-hot encoding.

For example, if we have 

| Text Index | Label |
|:-|:-|
|0 | pos|
|1 |neg|
|2 |pos|
|3 |neu|



If we use the 

 - first position of the label vector to represent negative  
 - second to represent positive
 - third to represent neutral


we should represent the labels as such:

| Text Index | Label | One-hot |
|:-|:-|:-|
|0 | 1|[0, 1, 0]|
|1 | 0|[1, 0, 0]|
|2 | 1|[0, 1, 0]|
|3 | 2|[0, 0, 1]|



In [50]:
# To get the one-hot encoding:
labels = [1, 0, 1, 2]
torch.eye(max(labels)+1)[labels] 

tensor([[0., 1., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])

In [51]:
# In PyTorch version 1.0.1, simply use this:
labels = [1, 0, 1, 2]
torch.one_hot([1, 0, 1, 2])

AttributeError: module 'torch' has no attribute 'one_hot'

In [30]:
class IMDBDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.vocab = Dictionary(texts)
        # Vectorize labels
        label_set = {'neg':0, 'pos':1}
        labels = [label_set[l] for l in labels]
        self.labels = torch.tensor(labels).long()
        # Keep track of how many data points.
        self._len = len(texts)
        
    def __getitem__(self, index):
        vectorized_sent = self.vectorize(self.texts[index])
        return {'x':vectorized_sent, 
                'y':self.labels[index], 
                'x_len':len(vectorized_sent)}
    
    def __len__(self):
        return self._len
    
    def vectorize(self, tokens):
        """
        :param tokens: Tokens that should be vectorized. 
        :type tokens: list(str)
        """
        # See https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.doc2idx 
        # Lets just cast list of indices into torch tensors directly =)
        return torch.tensor(self.vocab.doc2idx(tokens))
    
    def unvectorize(self, indices):
        """
        :param indices: Converts the indices back to tokens.
        :type tokens: list(int)
        """
        return [self.vocab[i] for i in indices]

In [31]:
imdb_data = IMDBDataset(X_train, y_train)

In [32]:
print(imdb_data[0]) # First data point.

{'x': tensor([ 11,  12,  49,  24,  35,  38,   6,  15,  69,  31,  93,  76,  97,  30,
         84,  62,  68,  25,  79,  53,   5,  87,  30,  23,  18,   2,   6,  16,
          8, 103,  47,  93,  90,  67,  52,  56,  98,  32,  92,  11,  12,   4,
         77,  49,  57,  37,  98,  70,  91,  49,  23,  18,   2,   6,  19,  81,
         98,  88,  44,   5,  93,  48,  86, 101,  34,  82,  74,  96,  94,  63,
         89,   1,  66,   5,  93,  64,  60,  93, 102,  83,   5,  28,  72,  56,
         60,  93,  80,  13,  51,  29,  94,  86,   6,  22,  13,  78,  93,  40,
         47, 100,  24,  85,  73,  99,  98,  33,  39,  93,  79,   5,  13,  46,
         71,   7,   7,   7,  31,   7,   7,   7,   6,  12,   6,  10,  36,  54,
          9,  14,   9,  13,   3,  45,  98,  75,  61,  60, 104,  89,   6,  17,
          9,  20,  98,  11,  12,   6,  13,  41,  92,  55,  26,  60,  58,  27,
         95,  92,  11,  12,  49,  42,  43,   6,  21,  24,  65,  92,  50,  49,
         59,   0]), 'y': tensor(1), 'x_len': 170}


# PyTorch DataLoader

The [`torch.utils.data.DataLoader` object](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader)  will help us easily create batches from the `torch.utils.data.Dataset` so that we can do mini-batch SGD and fully utilize GPU/CPU computation during gradient optimization.

The `DataLoader` requires the following function to be implemented in the `Dataset`:

 - `__getitem__`: Return the dictionary of inputs 
 - `__len__`: Return the no. of indices that `__getitem__` can fetch
 

In [33]:
batch_size = 1
dataloader = DataLoader(dataset=imdb_data, batch_size=batch_size, shuffle=True)

In [34]:
for data_dict in dataloader:
    # Sort indices of data in batch by lengths.
    sorted_indices = np.array(data_dict['x_len']).argsort()[::-1].tolist()
    data_batch = {name:_tensor[sorted_indices]
                  for name, _tensor in data_dict.items()}
    print(data_batch)
    break

{'x': tensor([[11054, 27740,    49, 24878, 33449,     5,    24,  7623,  3236,  6372,
           101,    49,  6500,   376,    60, 23114,   311,   206,     4,  6500,
            98,  3630,   209,    93,    53,    60,   209,  7005, 40299,   107,
         63186, 63187,   108,   311,   206, 54900,   299,   253,    24,   468,
             5,   168,  6713,   209,  7855,   468,    60,  4138,  5582,  5957,
            47,    61,   529,     6,   124,   204,    98,  1269,  1392,   209,
           847,  3353,    92,   206,    34,  1073,    98,   207,   821,   432,
            50,   252,    29,  1269,   209, 26302,   376,    60, 63213,     6,
           118,    93,  2987,   156,   584,   821,     5,   437,    34,   206,
           432,    50,   252,   113,   331,   308,    61,    60,    93,  1200,
         10842,    60,    24,   190,    98, 28228,    30,    93,  8639,   308,
            42,   196,    24,    36,     6,   605,   213,   322,   182,    59,
          1038,   694,    50,  1196,    57,   

# Lets try batch of size > 1

In [35]:
batch_size = 5
dataloader = DataLoader(dataset=imdb_data, batch_size=batch_size, shuffle=True)

for data_dict in dataloader:
    # Sort indices of data in batch by lengths.
    sorted_indices = np.array(data_dict['x_len']).argsort()[::-1].tolist()
    data_batch = {name:_tensor[sorted_indices]
                  for name, _tensor in data_dict.items()}
    print(data_batch)
    break

RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 189 and 454 in dimension 1 at /opt/conda/conda-bld/pytorch-cpu_1544218188686/work/aten/src/TH/generic/THTensorMoreMath.cpp:1333

# Gotcha! Everything should be a fixed-size tensor

To use the `DataLoader` to generate batches, one thing that we need to keep consistent is the size of the tensors for our inputs and outputs. 

For the outputs (`y`), it shouldn't be much of a problem since they are already in fixed size one-hot encoding.

It's the inputs (`x`), that has variable length and we need to somehow fix it. 

There are a couple of ways to accomplish the fixed-size inputs:

 - Set the size of `x` tensors to a certain size and cut-off extra words after that
 - Set the size of `x` tensors to the max length seen in the train data and pad the other data points with lower length with a special `<pad>` symbol. 
 
 
Lets do both:

 - Set a max size limit
 - For sentences that has length > max, we cut the rest of the sentence off
 - For sentences that has length < max, we pad till we reach the max

In [36]:
# Here's a clean way to pad 1-Dimensional tensors in PyTorch
a = torch.randn(10)
print(a.shape)
print(a)

torch.Size([10])
tensor([-0.5593, -0.1504, -0.1062, -0.3134,  0.9965, -0.4380, -0.7861,  1.1467,
         0.1388,  0.6459])


In [37]:
max_len = 15
pad_left = 0
pad_right = max_len - len(a)
b = F.pad(a, (pad_left, pad_right), 'constant')
print(b.shape)
print(b)

torch.Size([15])
tensor([-0.5593, -0.1504, -0.1062, -0.3134,  0.9965, -0.4380, -0.7861,  1.1467,
         0.1388,  0.6459,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000])


Now we have to rewrite the `IMDBDataset` to account for fixed-length `x` tensors. 

In [38]:
class IMDBDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        
        # Remember the `patch_with_special_tokens` from gensim?
        # Now we can put it into good use.
        special_tokens = {'<pad>': 0, '<unk>':1}
        self.vocab = Dictionary(texts)
        self.vocab.patch_with_special_tokens(special_tokens)
        # Keep track of vocab size.
        self.vocab_size = len(self.vocab)
        
        # Vectorize labels
        label_set = {'neg':0, 'pos':1}
        labels = [label_set[l] for l in labels]
        # Keep track of num of labels.
        self.num_labels = max(labels)+1
        self.labels = torch.tensor(labels).long()
        self.labels_onehot = torch.eye(self.num_labels)[labels].long()
        
        # Keep track of how many data points.
        self._len = len(texts)
        
        # Find the longest text in the data.
        self.max_len = max(len(txt) for txt in texts)
        
    def __getitem__(self, index):
        vectorized_sent = self.vectorize(self.texts[index])
        # To pad the sentence:
        # Pad left = 0; Pad right = max_len - len of sent.
        pad_dim = (0, self.max_len-len(vectorized_sent))
        vectorized_sent = F.pad(vectorized_sent, pad_dim, 'constant')
        return {'x':vectorized_sent, 
                'y':self.labels[index], 
                'x_len':len(vectorized_sent)}
    
    
    def __len__(self):
        return self._len
    
    def vectorize(self, tokens):
        """
        :param tokens: Tokens that should be vectorized. 
        :type tokens: list(str)
        """
        # See https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.doc2idx 
        # Lets just cast list of indices into torch tensors directly =)
        return torch.tensor(self.vocab.doc2idx(tokens, unknown_word_index=1))
    
    def unvectorize(self, indices):
        """
        :param indices: Converts the indices back to tokens.
        :type tokens: list(int)
        """
        return [self.vocab[i] for i in indices]

In [39]:
imdb_data = IMDBDataset(X_train, y_train)

In [40]:
imdb_data.vocab.token2id['the']

93

In [45]:
imdb_data.num_labels

2

In [41]:
imdb_data[0]

{'x': tensor([11, 12, 49,  ...,  0,  0,  0]), 'y': tensor(1), 'x_len': 2818}

In [43]:
batch_size = 5
dataloader = DataLoader(dataset=imdb_data, batch_size=batch_size, shuffle=True)

for data_dict in dataloader:
    # Sort indices of data in batch by lengths.
    sorted_indices = np.array(data_dict['x_len']).argsort()[::-5].tolist()
    data_batch = {name:_tensor[sorted_indices]
                  for name, _tensor in data_dict.items()}
    print(data_batch)
    break

{'x': tensor([[331, 373,  49,  ...,   0,   0,   0]]), 'y': tensor([1]), 'x_len': tensor([2818])}


# Training a model with Feed-Forward Net

Now that we have everything about the data in place, we can make use of all the knowledge we've gained thus far:

 - **Multi-Layered Perceptron**, aka. **Feed-Forward Network** that we've learnt from the previous XOR examples
   - *Linear* layers
   - *Activation function*, which?
   - *Criterion* which?
   - *Optimizer*, Adam vs SGD
 

In [52]:
output_dim=2
max_len=2818
class FFNet(nn.Module):
    def __init__(self, max_len, num_labels, vocab_size, embedding_size, hidden_dim):
        super(FFNet, self).__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size,
                                       embedding_dim=embedding_size, 
                                       padding_idx=0)
        # The no. of inputs to the linear layer is the 
        # no. of tokens in each input * embedding_size
        self.linear1 = nn.Linear(max_len*embedding_size, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, inputs):
        # We want to flatten the inputs so that we get the matrix of shape.
        # batch_size x no. of tokens in each input * embedding_size
        batch_size, max_len = inputs.shape
        embedded = self.embeddings(inputs).view((batch_size, -1)) # Change the size of the embedded matrix.
        hid = F.relu(self.linear1(embedded))
        out = self.linear2(hid)
        return F.sigmoid(out)
        

# The Training Routine

In [54]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

embedding_size = 100
learning_rate = 0.003
hidden_size = 100

# Initialize the dataset.
batch_size = 5
imdb_data = imdb_data = IMDBDataset(X_train, y_train)
dataloader = DataLoader(dataset=imdb_data, batch_size=batch_size, shuffle=True)

criterion = nn.CrossEntropyLoss()
# Hint: the CBOW model object you've created.
model = FFNet(imdb_data.max_len, 
              imdb_data.num_labels, 
              imdb_data.vocab_size, 
              embedding_size=embedding_size, 
              hidden_dim=hidden_size)


optimizer = optim.Adam(model.parameters(),lr=learning_rate)

#model = nn.DataParallel(model)

losses = []
num_epochs = 3
for _e in range(num_epochs):
    epoch_loss = []
    for batch in tqdm(dataloader):
        x = batch['x'].to(device)
        y = batch['y'].to(device)
        # Zero gradient.
        optimizer.zero_grad()
        # Feed forward.
        predictions = model(x)
        loss = criterion(predictions, y)
        loss.backward()
        optimizer.step()
        epoch_loss.append(float(loss))
        break
    print(sum(epoch_loss)/len(epoch_loss))
    break
    losses.append(sum(epoch_loss)/len(epoch_loss))
    


  0%|          | 0/5000 [00:00<?, ?it/s]

0.6863227486610413





In [55]:
print(predictions.shape)

torch.Size([5, 2])


In [56]:
torch.max(predictions, 1)  # Predictions of the last batch.

(tensor([0.4852, 0.4908, 0.4801, 0.4784, 0.4959], grad_fn=<MaxBackward0>),
 tensor([1, 0, 0, 1, 0]))

# Prediction with the model

In [64]:
print(X_test[1]) # First test review.

['Actor', 'turned', 'director', 'Bill', 'Paxton', 'follows', 'up', 'his', 'promising', 'debut', ',', 'the', 'Gothic-horror', '``', 'Frailty', "''", ',', 'with', 'this', 'family', 'friendly', 'sports', 'drama', 'about', 'the', '1913', 'U.S.', 'Open', 'where', 'a', 'young', 'American', 'caddy', 'rises', 'from', 'his', 'humble', 'background', 'to', 'play', 'against', 'his', 'Bristish', 'idol', 'in', 'what', 'was', 'dubbed', 'as', '``', 'The', 'Greatest', 'Game', 'Ever', 'Played', '.', "''", 'I', "'m", 'no', 'fan', 'of', 'golf', ',', 'and', 'these', 'scrappy', 'underdog', 'sports', 'flicks', 'are', 'a', 'dime', 'a', 'dozen', '(', 'most', 'recently', 'done', 'to', 'grand', 'effect', 'with', '``', 'Miracle', "''", 'and', '``', 'Cinderella', 'Man', "''", ')', ',', 'but', 'some', 'how', 'this', 'film', 'was', 'enthralling', 'all', 'the', 'same.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'The', 'film', 'starts', 'with', 'some', 'creative', 'opening', 'credits', '(', 'imagine', 'a', 'Disneyfied

In [1]:
def vectorize_test_inputs(inputs):
    # Process the input text in the same way as you did with the training data.
    vectorized_sent = imdb_data.vectorize(inputs)
    pad_dim = ???
    padded_sent = ???
    return padded_sent.unsqueeze(0)

print('Input tensor:', vectorize_test_inputs(X_test[0]))
label_set = {'neg':0, 'pos':1}
print('Label:', label_set[y_test[0]])

SyntaxError: invalid syntax (<ipython-input-1-ab8beda5162c>, line 4)

In [411]:
# Apply the model to the inputs.
with torch.no_grad():
    predictions = model(vectorize_test_inputs(X_test[0])).unsqueeze(0)
    print(predictions)
    print(F.softmax(predictions))

tensor([0.5093, 0.5479])
tensor([0.4903, 0.5097])


  """
