Code from https://notebook.community/sameersingh/uci-statnlp/tutorials/rnn_examples
Dataset from https://github.com/UniversalDependencies/UD_English

In [None]:
# Mount Google Drive at /content/drive/ (Colab only); the data directory used
# by the following cells lives under this mount point.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
cd drive/MyDrive/disciplinas/nlp/data

/content/drive/MyDrive/disciplinas/nlp/data


In [None]:
# List the working directory to confirm the en_ewt-ud-*.conllu files are present.
!ls

chatbot			helena.txt		       reddit-cleanjokes.csv
checkpoint_gen		logs			       reddit-cleanjokes.txt
corona_NLP_test.csv	model5.gensim		       results
corpus.pkl		model5.gensim.expElogbeta.npy  runs
data-translate		model5.gensim.id2word	       squad
dictionary.gensim	model5.gensim.state	       tagger-udpos-model.pt
en_ewt-ud-dev.conllu	pos_tagger.final.pt	       test-squad
en_ewt-ud-test.conllu	pos_tagger.pt		       tm
en_ewt-ud-train.conllu	preprocdata


In [None]:

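# Example sentence in CoNLL-U format: one token per line, tab-separated fields
# (ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC).
# The tagger uses FORM (column 2) as input and UPOS (column 4) as target.
#
# text = What if Google Morphed Into GoogleOS?
# 1	What	what	PRON	WP	PronType=Int	0	root	0:root	_
# 2	if	if	SCONJ	IN	_	4	mark	4:mark	_
# 3	Google	Google	PROPN	NNP	Number=Sing	4	nsubj	4:nsubj	_
# 4	Morphed	morph	VERB	VBD	Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin	1	advcl	1:advcl:if	_
# 5	Into	into	ADP	IN	_	6	case	6:case	_
# 6	GoogleOS	GoogleOS	PROPN	NNP	Number=Sing	4	obl	4:obl:into	SpaceAfter=No
# 7	?	?	PUNCT	.	_	4	punct	4:punct	_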

In [None]:
# Vocab

from collections import Counter


class Vocab(object):
    """Two-way mapping between tokens and contiguous integer ids.

    Ids are assigned in this order: the padding token (always id 0), then the
    optional start/end/unknown tokens, then the observed tokens from most to
    least frequent (ties keep first-seen order).
    """

    def __init__(self, iter, max_size=None, sos_token=None, eos_token=None, unk_token=None):
        """Initialize the vocabulary.
        Args:
            iter: An iterable which produces sequences of tokens used to update
                the vocabulary.
            max_size: (Optional) Maximum number of regular (non-special)
                tokens kept; the most frequent ones are retained.
            sos_token: (Optional) Token denoting the start of a sequence.
            eos_token: (Optional) Token denoting the end of a sequence.
            unk_token: (Optional) Token denoting an unknown element in a
                sequence.
        """
        self.max_size = max_size
        self.pad_token = '<pad>'
        self.sos_token = sos_token
        self.eos_token = eos_token
        self.unk_token = unk_token

        # Special tokens take the lowest ids. Each one is added only once so
        # that two identical specials cannot occupy two different ids.
        id2word = []
        for token in (self.pad_token, sos_token, eos_token, unk_token):
            if token is not None and token not in id2word:
                id2word.append(token)

        # Count every token produced by the input sequences.
        counter = Counter()
        for sequence in iter:
            counter.update(sequence)

        # A special token that also occurs in the data must keep its reserved
        # id; otherwise it would be appended a second time and the lookup
        # table would point at the later copy (e.g. '<pad>' no longer id 0).
        for token in id2word:
            counter.pop(token, None)

        # most_common(None) returns every token sorted by descending count,
        # so one call covers both the limited and the unlimited case.
        id2word.extend(word for word, _ in counter.most_common(max_size))
        word2id = {word: index for index, word in enumerate(id2word)}

        self._id2word = id2word
        self._word2id = word2id

    def __len__(self):
        """Return the number of entries, special tokens included."""
        return len(self._id2word)

    def word2id(self, word):
        """Map a word in the vocabulary to its unique integer id.
        Args:
            word: Word to lookup.
        Returns:
            id: The integer id of the word being looked up. Unknown words map
                to the id of `unk_token` when one was configured.
        Raises:
            KeyError: If the word is unknown and no `unk_token` is set.
        """
        if word in self._word2id:
            return self._word2id[word]
        elif self.unk_token is not None:
            return self._word2id[self.unk_token]
        else:
            raise KeyError('Word "%s" not in vocabulary.' % word)

    def id2word(self, id):
        """Map an integer id to its corresponding word in the vocabulary.
        Args:
            id: Integer id of the word being looked up.
        Returns:
            word: The corresponding word.
        """
        return self._id2word[id]

In [None]:
import re
from torch.utils.data import Dataset


class Annotation(object):
    """Container for one sentence: parallel lists of tokens and POS tags."""

    def __init__(self):
        """A helper object for storing annotation data."""
        self.tokens = []
        self.pos_tags = []


class CoNLLDataset(Dataset):
    """Sentences read from a CoNLL-U file, served as (token ids, tag ids).

    The token vocabulary has an '<unk>' entry, so unseen words can be encoded;
    the POS vocabulary has none, so an unseen tag raises KeyError.
    """

    def __init__(self, fname, max_exs=None):
        """Initializes the CoNLLDataset.
        Args:
            fname: The .conllu file to load data from.
            max_exs: (Optional) Maximum number of sentences to load. None or 0
                loads the whole file.
        """
        self.fname = fname
        self.annotations = self.process_conll_file(fname, max_exs)
        self.token_vocab = Vocab([x.tokens for x in self.annotations],
                                 unk_token='<unk>')
        self.pos_vocab = Vocab([x.pos_tags for x in self.annotations])

    def __len__(self):
        """Return the number of loaded sentences."""
        return len(self.annotations)

    def __getitem__(self, idx):
        """Return sentence `idx` as a pair (token id list, POS tag id list)."""
        annotation = self.annotations[idx]
        token_ids = [self.token_vocab.word2id(x) for x in annotation.tokens]
        tag_ids = [self.pos_vocab.word2id(x) for x in annotation.pos_tags]
        return token_ids, tag_ids

    def process_conll_file(self, fname, max_exs):
        """Parse a CoNLL-U file into a list of Annotation objects.

        Sentences are separated by blank lines and lines starting with '#'
        are comments. For every word line the FORM (2nd field) and UPOS
        (4th field) are kept.

        Args:
            fname: Path of the .conllu file.
            max_exs: Maximum number of sentences to return; None or 0 means
                no limit. Reading stops as soon as the limit is reached.
        Returns:
            A list of non-empty Annotation objects in file order.
        """
        annotations = []
        annotation = Annotation()
        # Treebank files are UTF-8; do not depend on the platform default.
        with open(fname, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.rstrip('\n')
                # A blank line terminates the current sentence.
                if len(line) == 0:
                    if len(annotation.tokens) > 0:
                        annotations.append(annotation)
                        # Count only sentences actually kept and stop early
                        # instead of scanning the rest of the file.
                        if max_exs and len(annotations) >= max_exs:
                            return annotations
                    annotation = Annotation()
                    continue
                # If line is a comment ignore it.
                if line[0] == '#':
                    continue
                fields = line.split('\t')
                # Ids such as "1-2" (multiword token range) or "8.1" (empty
                # node) are not syntactic words: range lines carry '_' as
                # their tag and their parts follow on separate lines, so
                # keeping them would duplicate text and add a bogus '_' tag.
                if '-' in fields[0] or '.' in fields[0]:
                    continue
                annotation.tokens.append(fields[1])
                annotation.pos_tags.append(fields[3])
        # The last sentence may not be followed by a blank line.
        if len(annotation.tokens) > 0:
            annotations.append(annotation)
        return annotations

In [None]:
# List the working directory again before loading the .conllu files.
!ls

chatbot		       logs			      reddit-cleanjokes.txt
checkpoint_gen	       model5.gensim		      results
corona_NLP_test.csv    model5.gensim.expElogbeta.npy  runs
corpus.pkl	       model5.gensim.id2word	      squad
data-translate	       model5.gensim.state	      tagger-udpos-model.pt
dictionary.gensim      pos_tagger.final.pt	      test-squad
en_ewt-ud-dev.conllu   pos_tagger.pt		      tm
en_ewt-ud-test.conllu  preprocdata
helena.txt	       reddit-cleanjokes.csv


In [None]:
# Load every sentence of the training file; this also builds the token and
# POS vocabularies from that data.
dataset = CoNLLDataset('en_ewt-ud-train.conllu')

In [None]:
# Inspect the first example, both as integer ids and mapped back to strings.
input, target = dataset[0]
decoded_tokens = ' '.join(map(dataset.token_vocab.id2word, input))
decoded_tags = ' '.join(map(dataset.pos_vocab.id2word, target))
print('Example input: %s\n' % input)
print('Example target: %s\n' % target)
print('Translated input: %s\n' % decoded_tokens)
print('Translated target: %s\n' % decoded_tags)

Example input: [266, 16, 5249, 45, 294, 703, 1154, 4233, 10099, 595, 16, 10100, 4, 3, 6865, 35, 3, 6866, 10, 3, 498, 8, 6867, 4, 758, 3, 2224, 1605, 2]

Example target: [9, 2, 9, 2, 7, 1, 3, 9, 9, 9, 2, 9, 2, 6, 1, 5, 6, 1, 5, 6, 1, 5, 9, 2, 5, 6, 7, 1, 2]

Translated input: Al - Zaman : American forces killed Shaikh Abdullah al - Ani , the preacher at the mosque in the town of Qaim , near the Syrian border .

Translated target: PROPN PUNCT PROPN PUNCT ADJ NOUN VERB PROPN PROPN PROPN PUNCT PROPN PUNCT DET NOUN ADP DET NOUN ADP DET NOUN ADP PROPN PUNCT ADP DET ADJ NOUN PUNCT



In [None]:
import torch
from torch.autograd import Variable


def pad(sequences, max_length, pad_value=0):
    """Pads a list of sequences.
    Args:
        sequences: A list of sequences (lists or tuples) to be padded.
        max_length: The length to pad to. Sequences that are already this
            long or longer are returned unchanged (they are not truncated).
        pad_value: The value used for padding.
    Returns:
        A list of padded sequences, each a new list.
    """
    out = []
    for sequence in sequences:
        # Copy into a list so tuples are accepted and inputs are not mutated.
        padded = list(sequence)
        # Use the caller's pad_value (it used to be ignored in favour of 0).
        padded.extend([pad_value] * (max_length - len(padded)))
        out.append(padded)
    return out


def collate_annotations(batch):
    """Collate (token ids, tag ids) pairs from CoNLLDataset into tensors.

    Returns a tuple (inputs, targets, lengths). Examples are ordered from
    longest to shortest, padded with 0 to the longest length and transposed,
    so inputs and targets have one row per time step and one column per
    example. All three tensors are moved to the GPU when CUDA is available.
    """
    # Attach each example's length, then order longest-first.
    examples = [(tokens, tags, len(tokens)) for tokens, tags in batch]
    examples.sort(key=lambda example: example[2], reverse=True)
    inputs, targets, lengths = zip(*examples)

    # Pad to a common length, then flip rows/columns to get time-major lists.
    longest = max(lengths)
    inputs = [list(step) for step in zip(*pad(inputs, longest))]
    targets = [list(step) for step in zip(*pad(targets, longest))]

    # Wrap as LongTensors, on the GPU when one is present.
    use_cuda = torch.cuda.is_available()
    tensors = []
    for values in (inputs, targets, lengths):
        tensor = Variable(torch.LongTensor(values))
        if use_cuda:
            tensor = tensor.cuda()
        tensors.append(tensor)
    inputs, targets, lengths = tensors
    return inputs, targets, lengths

In [None]:
from torch.utils.data import DataLoader


# Print only the first collated batch to check the tensor layout.
first_batch_loader = DataLoader(dataset, batch_size=16,
                                collate_fn=collate_annotations)
for inputs, targets, lengths in first_batch_loader:
    print('Inputs: %s\n' % inputs.data)
    print('Targets: %s\n' % targets.data)
    print('Lengths: %s\n' % lengths.data)
    break

Inputs: tensor([[   28,  1083,   266,    28,    30,   106,    68,   266,   499,   625,
         10103,   121,  1212,    28,    28,   108],
        [10106,     3,    16,  1713,  6874,  6878, 10115,    16,  1030,   106,
            45, 10123,     8,  3581,  1081,  1606],
        [   10,  5252,  5249,  4237,    11,    11,    46,  5249,  4239,  1712,
           555,     4,    69,    60,    19,    54],
        [  180,    19,    45,     8,    10,     3,   185,    45,    51,     8,
          1849,  6874,    60,  1370,   159,    41],
        [   11,   343,   294, 10118, 10125,   759,   138,  5253, 10121,     7,
          2018,  3111,   159,    10,   450,    19],
        [ 4234,   163,   703,  3111,   180,  1031,     8,  1154,     7, 10101,
            12,     4,   450,     3,    44, 10111],
        [    5,     5,  1154,  2018,     6,    10,     3,     7, 10122, 10102,
            31,   151,    44, 10112,     3,     3],
        [    3,   408,  4233,    12,    50,     3,  2755,   807,  3112,    

In [None]:

from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class Tagger(nn.Module):
    """Sequence tagger: embedding -> (bi)directional GRU -> linear -> log-softmax."""

    def __init__(self,
                 input_dim,
                 output_dim,
                 n_layers,
                 embedding_dim=64,
                 hidden_dim=64,
                 dropout=0.5,
                 bidirectional=True,
                 pad_idx=0):
        """Initializes the tagger.

        Args:
            input_dim: Size of the input (token) vocabulary.
            output_dim: Size of the output (tag) vocabulary.
            n_layers: Number of stacked GRU layers.
            embedding_dim: Dimension of the word embeddings.
            hidden_dim: Number of units in each GRU hidden layer.
            dropout: Dropout probability between GRU layers (only used by
                the GRU when n_layers > 1).
            bidirectional: Whether or not to use a bidirectional rnn.
            pad_idx: Token id used for padding in the embedding layer.
        """
        super(Tagger, self).__init__()

        # Store parameters
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional

        # Define layers
        self.word_embeddings = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers,
                          bidirectional=bidirectional,
                          dropout=dropout if n_layers > 1 else 0)

        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.activation = nn.LogSoftmax(dim=2)

        # NOTE(review): this layer is created but never applied in forward();
        # it is kept so the module's attributes stay unchanged. Applying it
        # would change training behaviour, so decide deliberately.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, lengths=None, hidden=None):
        """Computes a forward pass of the tagger.

        Args:
            x: A LongTensor w/ dimension [seq_len, batch_size].
            lengths: (Optional) Lengths of the sequences in x, as a tensor or
                a list, sorted in decreasing order. When given, the batch is
                packed so padded steps are not fed through the GRU.
            hidden: (Optional) Initial hidden state for the GRU, with shape
                [n_layers * num_directions, batch_size, hidden_dim]. When
                omitted the GRU starts from zeros.

        Returns:
            net: Log-probabilities over tags, [seq_len, batch_size, output_dim].
            hidden: Final hidden state of the GRU.
        """
        seq_len, batch_size = x.size()

        net = self.word_embeddings(x)
        # Pack before feeding into the RNN.
        if lengths is not None:
            if torch.is_tensor(lengths):
                lengths = lengths.view(-1).tolist()
            net = pack_padded_sequence(net, lengths)
        # hidden=None makes the GRU build its own zero state with the right
        # layer/direction count on the same device as its weights. The
        # hand-built state ignored n_layers (crash for n_layers > 1) and
        # picked its device from torch.cuda.is_available().
        net, hidden = self.rnn(net, hidden)
        # Unpack after; total_length keeps the output as long as x even when
        # x is padded beyond the longest sequence.
        if lengths is not None:
            net, _ = pad_packed_sequence(net, total_length=seq_len)
        net = self.fc(net)
        net = self.activation(net)

        return net, hidden

In [None]:
import numpy as np

# Load datasets (first 4096 training and first 1024 dev sentences).
train_dataset = CoNLLDataset('en_ewt-ud-train.conllu', 4096)
dev_dataset = CoNLLDataset('en_ewt-ud-dev.conllu', 1024)

# The dev set must be encoded with the training vocabularies so that ids mean
# the same thing for both; anything decoded from the model later has to use
# these same vocabularies.
dev_dataset.token_vocab = train_dataset.token_vocab
dev_dataset.pos_vocab = train_dataset.pos_vocab

# Hyperparameters / constants.
input_vocab_size = len(train_dataset.token_vocab)
output_vocab_size = len(train_dataset.pos_vocab)
batch_size = 16
epochs = 6
n_layers = 1

# Initialize the model.
model = Tagger(input_vocab_size, output_vocab_size, n_layers)
if torch.cuda.is_available():
    model = model.cuda()

# Loss function weights: class 0 is padding and must not contribute.
weight = torch.ones(output_vocab_size)
weight[0] = 0
if torch.cuda.is_available():
    weight = weight.cuda()

# Initialize loss function and optimizer.
loss_function = torch.nn.NLLLoss(weight)
optimizer = torch.optim.Adam(model.parameters())

# Main training loop.
data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                         collate_fn=collate_annotations)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False,
                        collate_fn=collate_annotations)
losses = []
i = 0
model.train()
for epoch in range(epochs):
    for inputs, targets, lengths in data_loader:
        optimizer.zero_grad()
        outputs, _ = model(inputs, lengths=lengths)

        # Flatten [seq_len, batch, tags] / [seq_len, batch] for NLLLoss.
        loss = loss_function(outputs.view(-1, output_vocab_size),
                             targets.view(-1))
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        if (i % 10) == 0:
            # Compute dev loss over entire dev set.
            # NOTE: This is expensive. You may want to only use a
            # subset of the dev set.
            # Evaluate in eval mode and without autograd so no graph is
            # built, and use separate names so the training batch and loss
            # are not overwritten.
            model.eval()
            dev_losses = []
            with torch.no_grad():
                for dev_inputs, dev_targets, dev_lengths in dev_loader:
                    dev_outputs, _ = model(dev_inputs, lengths=dev_lengths)
                    dev_loss = loss_function(
                        dev_outputs.view(-1, output_vocab_size),
                        dev_targets.view(-1))
                    dev_losses.append(dev_loss.item())
            model.train()

            # Train loss is averaged over the iterations since the last report.
            avg_train_loss = np.mean(losses)
            avg_dev_loss = np.mean(dev_losses)
            losses = []
            print('Epoch %i Iteration %i - Train Loss: %0.6f - Dev Loss: %0.6f' % (epoch, i, avg_train_loss, avg_dev_loss))
            # Checkpoint: pickles the whole module, not only its weights.
            torch.save(model, 'pos_tagger.pt')
        i += 1

torch.save(model, 'pos_tagger.final.pt')

Epoch 0 Iteration 0 - Train Loss: 2.944906 - Dev Loss: 2.940190
Epoch 0 Iteration 10 - Train Loss: 2.845249 - Dev Loss: 2.734218
Epoch 0 Iteration 20 - Train Loss: 2.640595 - Dev Loss: 2.531120
Epoch 0 Iteration 30 - Train Loss: 2.448569 - Dev Loss: 2.325695
Epoch 0 Iteration 40 - Train Loss: 2.232043 - Dev Loss: 2.138989
Epoch 0 Iteration 50 - Train Loss: 2.071410 - Dev Loss: 1.997733
Epoch 0 Iteration 60 - Train Loss: 1.869212 - Dev Loss: 1.873686
Epoch 0 Iteration 70 - Train Loss: 1.820451 - Dev Loss: 1.767294
Epoch 0 Iteration 80 - Train Loss: 1.650056 - Dev Loss: 1.681365
Epoch 0 Iteration 90 - Train Loss: 1.591510 - Dev Loss: 1.595364
Epoch 0 Iteration 100 - Train Loss: 1.475191 - Dev Loss: 1.525031
Epoch 0 Iteration 110 - Train Loss: 1.425099 - Dev Loss: 1.475127
Epoch 0 Iteration 120 - Train Loss: 1.367612 - Dev Loss: 1.422311
Epoch 0 Iteration 130 - Train Loss: 1.361713 - Dev Loss: 1.376067
Epoch 0 Iteration 140 - Train Loss: 1.284507 - Dev Loss: 1.335651
Epoch 0 Iteration 150

In [None]:
# Collect the predictions and targets over the dev set.
y_true = []
y_pred = []

model.eval()
with torch.no_grad():
    for inputs, targets, lengths in dev_loader:
        outputs, _ = model(inputs, lengths=lengths)
        _, preds = torch.max(outputs, dim=2)
        y_true.append(targets.view(-1).cpu().numpy())
        y_pred.append(preds.view(-1).cpu().numpy())

# Stack into numpy arrays
y_true = np.concatenate(y_true)
y_pred = np.concatenate(y_pred)

# Keep real tokens only. Gold id 0 marks padding; whatever the model predicts
# there is meaningless and, if left in, counts as false positives for the
# predicted tag and drags its F1 down.
not_padding = y_true != 0
y_true = y_true[not_padding]
y_pred = y_pred[not_padding]

# Compute accuracy
acc = np.mean(y_true == y_pred)
print('Accuracy - %0.6f\n' % acc)

# Evaluate f1-score
from sklearn.metrics import f1_score
# Pass the tag ids explicitly so every score lines up with its tag name even
# when some tag never occurs in the dev data or in the predictions.
tag_ids = list(range(1, len(dev_dataset.pos_vocab)))
f1_scores = f1_score(y_true, y_pred, labels=tag_ids, average=None)
print('F1-scores:\n')
for tag_id, tag_f1 in zip(tag_ids, f1_scores):
    print('%s - %0.6f' % (dev_dataset.pos_vocab.id2word(tag_id), tag_f1))

Accuracy - 0.816063

F1-scores:

NOUN - 0.765083
PUNCT - 0.984068
VERB - 0.781020
ADP - 0.750791
PROPN - 0.532544
DET - 0.976991
PRON - 0.960609
ADJ - 0.684080
AUX - 0.943653
ADV - 0.703019
CCONJ - 0.980344
PART - 0.909847
NUM - 0.661509
SCONJ - 0.706927
_ - 0.879121
X - 0.000598
INTJ - 0.765432
SYM - 0.311111


In [None]:
# Reload the final checkpoint. It is a whole pickled module (saved with
# torch.save(model, ...)), so only load files you trust.
# NOTE(review): recent PyTorch releases may require weights_only=False to
# unpickle a full module - confirm against the installed version.
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
model = torch.load('pos_tagger.final.pt',
                   map_location=None if torch.cuda.is_available() else 'cpu')
# Inference only from here on.
model.eval()

def inference(sentence, tagger=None, token_vocab=None, pos_vocab=None):
    """Tag a tokenized sentence and print one "word - tag" line per token.

    Args:
        sentence: A sequence of token strings.
        tagger: (Optional) Model to use; defaults to the global `model`.
        token_vocab: (Optional) Vocabulary mapping tokens to ids; defaults to
            `train_dataset.token_vocab`.
        pos_vocab: (Optional) Vocabulary mapping ids to tags; defaults to
            `train_dataset.pos_vocab`.

    The defaults are the vocabularies the model was trained with. A
    vocabulary rebuilt from a different amount of data assigns different ids
    to the same words and tags, which silently yields wrong tags.
    """
    if tagger is None:
        tagger = model
    if token_vocab is None:
        token_vocab = train_dataset.token_vocab
    if pos_vocab is None:
        pos_vocab = train_dataset.pos_vocab

    sentence = list(sentence)
    if not sentence:
        # Nothing to tag; an empty tensor cannot be fed to the model.
        return

    # Convert words to an id tensor of shape [seq_len, 1] (batch of one).
    ids = torch.LongTensor([[token_vocab.word2id(word)] for word in sentence])
    # Put the input wherever the model's weights live instead of guessing
    # from torch.cuda.is_available().
    first_param = next(tagger.parameters(), None)
    if first_param is not None:
        ids = ids.to(first_param.device)

    # Get model output; no gradients are needed for prediction.
    with torch.no_grad():
        output, _ = tagger(ids)
    _, preds = torch.max(output, dim=2)
    pos_tags = [pos_vocab.id2word(tag_id) for tag_id in preds.view(-1).cpu().tolist()]
    for word, tag in zip(sentence, pos_tags):
        print('%s - %s' % (word, tag))

In [None]:
def inference_with_labels(sentence, labels, tagger=None, token_vocab=None, pos_vocab=None):
    """Tag a sentence and print "word - predicted tag - gold label" lines.

    The token ids fed to the model are printed first.

    Args:
        sentence: A sequence of token strings.
        labels: Gold tag strings, one per token; printed next to the
            predictions (extra items on either side are ignored by zip).
        tagger: (Optional) Model to use; defaults to the global `model`.
        token_vocab: (Optional) Vocabulary mapping tokens to ids; defaults to
            `train_dataset.token_vocab`.
        pos_vocab: (Optional) Vocabulary mapping ids to tags; defaults to
            `train_dataset.pos_vocab`.

    The defaults are the vocabularies the model was trained with. A
    vocabulary rebuilt from a different amount of data assigns different ids
    to the same words and tags, which silently yields wrong tags.
    """
    if tagger is None:
        tagger = model
    if token_vocab is None:
        token_vocab = train_dataset.token_vocab
    if pos_vocab is None:
        pos_vocab = train_dataset.pos_vocab

    sentence = list(sentence)
    # Convert words to ids, one single-element row per token ([seq_len, 1]).
    ids = [[token_vocab.word2id(word)] for word in sentence]
    print(ids)
    if not sentence:
        # Nothing to tag; an empty tensor cannot be fed to the model.
        return

    ids = torch.LongTensor(ids)
    # Put the input wherever the model's weights live instead of guessing
    # from torch.cuda.is_available().
    first_param = next(tagger.parameters(), None)
    if first_param is not None:
        ids = ids.to(first_param.device)

    # Get model output; no gradients are needed for prediction.
    with torch.no_grad():
        output, _ = tagger(ids)
    _, preds = torch.max(output, dim=2)
    pos_tags = [pos_vocab.id2word(tag_id) for tag_id in preds.view(-1).cpu().tolist()]
    for word, tag, label in zip(sentence, pos_tags, labels):
        print('%s - %s - %s' % (word, tag, label))

In [None]:
# Tag one held-out sentence and compare the predictions with its gold tags.
test_dataset = CoNLLDataset('en_ewt-ud-test.conllu')
# The tagger was trained with train_dataset's vocabularies, so words and tags
# must be mapped with exactly those. Rebuilding a vocabulary from the full
# training file orders tokens and tags differently (ids are frequency-ranked)
# and makes the predictions come out as the wrong tags.
dataset = train_dataset

# Read the tokens and gold tags directly; no need to encode and decode them.
example = test_dataset.annotations[10]
sentence = list(example.tokens)
labels = list(example.pos_tags)
print(sentence)
inference_with_labels(sentence, labels)



['I', 'doubt', 'the', 'very', 'few', 'who', 'actually', 'read', 'my', 'blog', 'have', 'not', 'come', 'across', 'this', 'yet', ',', 'but', 'I', 'figured', 'I', 'would', 'put', 'it', 'out', 'there', 'anyways', '.']
[[9], [1751], [3], [78], [205], [72], [336], [374], [34], [2549], [17], [24], [216], [796], [25], [384], [4], [43], [9], [3755], [9], [48], [197], [15], [55], [58], [3534], [2]]
I - DET - PRON
doubt - NOUN - VERB
the - PUNCT - DET
very - NOUN - ADV
few - NOUN - ADJ
who - PRON - PRON
actually - NOUN - ADV
read - NOUN - VERB
my - PRON - PRON
blog - NOUN - NOUN
have - PRON - AUX
not - PROPN - PART
come - ADV - VERB
across - AUX - ADP
this - PUNCT - PRON
yet - NOUN - ADV
, - PUNCT - PUNCT
but - ADJ - CCONJ
I - DET - PRON
figured - PRON - VERB
I - DET - PRON
would - PART - AUX
put - SCONJ - VERB
it - ADJ - PRON
out - VERB - ADV
there - ADJ - ADV
anyways - VERB - ADV
. - DET - PUNCT
