In [4]:
import nltk
from nltk.corpus import brown
from nltk.corpus import treebank
from nltk.corpus import conll2000
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.model_selection import train_test_split
nltk.download('treebank')
nltk.download('brown')
nltk.download('conll2000')
nltk.download('universal_tagset')
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, TensorDataset, Dataset
tokenizer = get_tokenizer('basic_english')

[nltk_data] Downloading package treebank to
[nltk_data]     /Users/zhejing/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package brown to /Users/zhejing/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     /Users/zhejing/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/zhejing/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


# PART OF SPEECH TAGGING

### Data Preparation

1. Concatenate all datasets






In [5]:
## Combining all the datasets to form one big dataset 
treebank_corpus = treebank.tagged_sents(tagset='universal')
brown_corpus = brown.tagged_sents(tagset='universal')
conll_corpus = conll2000.tagged_sents(tagset='universal')

## Concatenate the dataset to form a big corpus of type (word, label)
tag_dataset_corpus = treebank_corpus + brown_corpus + conll_corpus

## making dataset smaller for faster training 
tag_dataset_corpus = tag_dataset_corpus[:7500]


In [25]:
treebank_corpus

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')], ...]

2. Break the dataset into Data, Label arrays i.e:

| Sentence | Sequence label |
| -------- | -------------- |
|this happened that day| POS_1 POS_2... POS_N |
|this happened the other day| POS_1 POS_2... POS_N |

3. We need all our training data to exist in numerical form for it to be trained by Neural Networks.
Towards this end we encode all our labels to numeric values.

In [49]:
X, yx = [], []
result_set = set()
tbd = TreebankWordDetokenizer()

## getting all sentences in X and all labels in yx from the corpus
for instance in tag_dataset_corpus:
    x = list(map(lambda x: x[0], instance))
    y = list(map(lambda x: x[1], instance))
    X.append(tbd.detokenize(x))
    yx.append(y)
    for tag in y:
        result_set.add(tag)

# making the tag dictionary that will encode all the POS tags to numeric values
tag_encoding = {t:i+1 for i, t in enumerate(result_set)}
tag_encoding[''] = 0


In [51]:
yx

[['NOUN',
  'NOUN',
  '.',
  'NUM',
  'NOUN',
  'ADJ',
  '.',
  'VERB',
  'VERB',
  'DET',
  'NOUN',
  'ADP',
  'DET',
  'ADJ',
  'NOUN',
  'NOUN',
  'NUM',
  '.'],
 ['NOUN',
  'NOUN',
  'VERB',
  'NOUN',
  'ADP',
  'NOUN',
  'NOUN',
  '.',
  'DET',
  'NOUN',
  'VERB',
  'NOUN',
  '.'],
 ['NOUN',
  'NOUN',
  '.',
  'NUM',
  'NOUN',
  'ADJ',
  'CONJ',
  'ADJ',
  'NOUN',
  'ADP',
  'NOUN',
  'NOUN',
  'NOUN',
  'NOUN',
  '.',
  'VERB',
  'VERB',
  'X',
  'DET',
  'ADJ',
  'NOUN',
  'ADP',
  'DET',
  'ADJ',
  'ADJ',
  'NOUN',
  '.'],
 ['DET',
  'NOUN',
  'ADP',
  'NOUN',
  'ADV',
  'VERB',
  'X',
  'X',
  'PRT',
  'VERB',
  'NOUN',
  'NOUN',
  'NOUN',
  'VERB',
  'VERB',
  'DET',
  'ADJ',
  'NOUN',
  'ADP',
  'NOUN',
  'NOUN',
  'ADP',
  'DET',
  'NOUN',
  'ADP',
  'NOUN',
  'VERB',
  'X',
  'PRT',
  'PRON',
  'ADV',
  'ADP',
  'NUM',
  'NOUN',
  'ADP',
  '.',
  'NOUN',
  'VERB',
  'X',
  'X',
  '.'],
 ['DET',
  'NOUN',
  'NOUN',
  '.',
  'NOUN',
  '.',
  'VERB',
  'ADV',
  'ADJ',
  'ADP'

# Exercise 1
Use the dictionary `tag_encoding` to encode all the POS tags to numerical value.

In [47]:
# YOUR CODE GOES HERE
tag_encoding
yx_encoded = []
for corpus in yx:
    temp = []
    for c in corpus:
        c = tag_encoding[c]
        temp.append(c)
    yx_encoded.append(temp)
        

In [48]:
yx_encoded

[[8, 8, 10, 3, 8, 11, 10, 12, 12, 9, 8, 4, 9, 11, 8, 8, 3, 10],
 [8, 8, 12, 8, 4, 8, 8, 10, 9, 8, 12, 8, 10],
 [8,
  8,
  10,
  3,
  8,
  11,
  2,
  11,
  8,
  4,
  8,
  8,
  8,
  8,
  10,
  12,
  12,
  6,
  9,
  11,
  8,
  4,
  9,
  11,
  11,
  8,
  10],
 [9,
  8,
  4,
  8,
  7,
  12,
  6,
  6,
  5,
  12,
  8,
  8,
  8,
  12,
  12,
  9,
  11,
  8,
  4,
  8,
  8,
  4,
  9,
  8,
  4,
  8,
  12,
  6,
  5,
  1,
  7,
  4,
  3,
  8,
  4,
  10,
  8,
  12,
  6,
  6,
  10],
 [9,
  8,
  8,
  10,
  8,
  10,
  12,
  7,
  11,
  4,
  1,
  12,
  9,
  8,
  10,
  4,
  7,
  11,
  8,
  5,
  1,
  12,
  8,
  9,
  6,
  12,
  5,
  8,
  11,
  10,
  8,
  12,
  6,
  6,
  10],
 [8,
  8,
  10,
  9,
  8,
  4,
  11,
  11,
  8,
  8,
  9,
  6,
  12,
  8,
  8,
  10,
  12,
  12,
  8,
  4,
  1,
  8,
  8,
  8,
  4,
  3,
  10],
 [4,
  11,
  8,
  12,
  12,
  6,
  7,
  4,
  9,
  8,
  4,
  10,
  9,
  11,
  8,
  12,
  4,
  8,
  5,
  8,
  8,
  8,
  4,
  8,
  10,
  9,
  8,
  11,
  6,
  5,
  12,
  11,
  8,
  5,
  9,
  8,
  10],

# Exercise 2
Split the dataset into training and testing dataset. Use the encoded labels obtained in previous exercise instead of the string labels for POS tagging.

Split in a way that test size is 20% of the dataset.

In [53]:
# YOUR CODE GOES HERE
train_X, test_X, train_y, test_y = train_test_split(X, yx_encoded, test_size=0.20, random_state=42)

Just as we encoded the labels to have a numerical value we need to encode the words in the text too.

This possesses many challenges:
1. We want to limit the vocabulary based on the frequency of occurance.
2. The size of the vocabulary shouldn't be extraordinarily large as it would just make lookup operation very expensive.
3. All out of vocabulary words have to be recognised and labelled as such.

Therefore we use the suitable implemented funtion.

In [54]:
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_X), specials=['<unk>'], max_tokens=20_000)
vocab.set_default_index(vocab["<unk>"])

## Creating Custom dataset
We need this class since we would be feeding lots of training data in batches so certain transformations and padding/truncation is required to make the training process fast and robust.

We truncate/pad to the size of each input being 64. All inputs will have a 64 number long vector.

[Dataset & DataLoader](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html)

In [55]:
class PosDataset(Dataset):
    def __init__(self, X, y, vocab, transform=None, padding=128):
        self.X = X
        self.y = y
        self.vocab = vocab
        self.tokenizer = get_tokenizer('basic_english')
        self.padding = padding
    
    @staticmethod
    def pad_truncate_tensor(vec, pad):
        """
        args:
            vec - tensor to pad
            pad - the size to pad to
            dim - dimension to pad

        return:
            a new tensor padded to 'pad' in dimension 'dim'
        """
        # pad_size = list(vec.shape)
        # pad_size[dim] = pad - vec.size(dim)
        # return torch.cat([vec, torch.zeros(*pad_size)], dim=dim)
        inp_length = vec.shape[-1]
        if inp_length > pad:
            return vec[:pad]
        else:
            pad_length = pad - vec.shape[-1]
            return torch.cat([vec, torch.zeros(pad_length, dtype=torch.int64)])
    
    def __getitem__(self, index):
        x = self.X[index]
        y = self.y[index]
        x_encoded = self.vocab(self.tokenizer(x))
        
        return self.pad_truncate_tensor(torch.tensor(x_encoded, dtype=torch.int64), 64), self.pad_truncate_tensor(torch.tensor(y, dtype=torch.int64), 64)
    
    def __len__(self):
        return len(self.X)

train_dataset = PosDataset(train_X, train_y, vocab)
test_dataset  = PosDataset(test_X, test_y, vocab)

In [56]:
for p in train_dataset:
  print(p)
  break

(tensor([ 1391,  1335,   146,  7064,    11,     6,   301,   440,   155,   521,
           11,     2,  9623,    44,     2,  4043,     1,   397,   482, 10108,
           13,    58,   505,   156,     8,    83,    40,   578,  8544,     1,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0]), tensor([ 8,  7, 12, 11,  4,  9, 11,  8, 10,  4,  4,  9,  8,  2,  9,  8,  4,  8,
        12,  7, 11, 10,  2,  4,  1, 12, 12, 10,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0]))


## Data Loaders 
Now we create use the datasets to create dataloaders which we can directly feed to our models in batches.


In [57]:
BATCH_SIZE = 64
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Model creation

Next we create the model for POS TAGGING. It takes a sequence of tokens (words) and generates predictions of POS on each of the token.

Layers Used:

1. [Embedding](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html)

2. [LSTM](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html?highlight=lstm#torch.nn.LSTM)

3. [Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html?highlight=linear#torch.nn.Linear)

In [58]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim*2, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.hidden2tag(lstm_out)
        return tag_space

model = LSTMTagger(100, 64, 20000, 13)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

## Training Loop

1. Forward Propagation: It is defined by us when we implemented the model. During the forward pass the model generates the outputs from the input data.

2. Backpropagation: These are the series of steps which use the outputs generated during the forward propagation, compare the output with the true label to calculate error.
And update the parameters(weights) of each layer to minimize this error.

[Training](https://pytorch.org/tutorials/beginner/introyt/trainingyt.html)

In [64]:
def train(model, epochs=3):
  for epoch in range(epochs):
      epoch_loss = 0.0
      running_loss = 0.0
      total_elem = 0
      for i, (X,y) in enumerate(train_dataloader):
          optimizer.zero_grad()
          tag_scores = model(X)
          tag_scores = tag_scores
          # print('tag_scores', tag_scores.shape)
          tag_scores = tag_scores.permute(0, 2, 1)
          # print('tag_scores reshape',tag_scores.shape, y.shape)     
          loss = loss_function(tag_scores, y)
          loss.backward()
          optimizer.step()
          epoch_loss += tag_scores.shape[0] * loss.item()
          running_loss += loss.item()
          total_elem += tag_scores.shape[0]
      print(f'Epoch:{epoch+1}, Loss: {epoch_loss/total_elem}')
  return model

In [65]:
model = train(model, 10)

Epoch:1, Loss: 0.11896604575713475
Epoch:2, Loss: 0.10708744110663732
Epoch:3, Loss: 0.09738020904858907
Epoch:4, Loss: 0.07963422253727913
Epoch:5, Loss: 0.06614324568708738
Epoch:6, Loss: 0.054182251845796905
Epoch:7, Loss: 0.04486758416891098
Epoch:8, Loss: 0.03814395356178284
Epoch:9, Loss: 0.03236639848848184
Epoch:10, Loss: 0.02754582133392493


# Exercise 3
Use the `tag_encoding` dictionary created at the start to create `tag_decoder_dict` which maps tag indices to tag names

In [68]:
# YOUR CODE GOES HERE
tag_decoder_dict = {v:k for k,v in tag_encoding.items()}
# tag_encoding
tag_decoder_dict

{1: 'PRON',
 2: 'CONJ',
 3: 'NUM',
 4: 'ADP',
 5: 'PRT',
 6: 'X',
 7: 'ADV',
 8: 'NOUN',
 9: 'DET',
 10: '.',
 11: 'ADJ',
 12: 'VERB',
 0: ''}

# Exercise 4
Implement a function to use the trained model in order to generate POS Tag indices.

In that function use the `tag_decoder_dict` from the previous exercise to convert the resulting indices from the model to proper POS tags.

In [None]:
# YOUR CODE GOES HERE


# Evalutation of test data