**Neural models of sequence prediction**

In [None]:
import torch 
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torchtext.datasets import CoNLL2000Chunking
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab  import build_vocab_from_iterator 


**Loading data**

In [4]:
# Read the CoNLL-2000 training split once, keeping column 0 (tokens) and
# column 2 (chunk tags) of every sample; column 1 is not used here.
train_iter = CoNLL2000Chunking(split='train')
seq_texts, seq_tags = [], []
for sample in train_iter:
    tokens, chunk_tags = sample[0], sample[2]
    seq_texts.append(tokens)
    seq_tags.append(chunk_tags)

# Show the first sentence next to its tags as a quick sanity check.
print(seq_texts[0])
print(seq_tags[0])

['Confidence', 'in', 'the', 'pound', 'is', 'widely', 'expected', 'to', 'take', 'another', 'sharp', 'dive', 'if', 'trade', 'figures', 'for', 'September', ',', 'due', 'for', 'release', 'tomorrow', ',', 'fail', 'to', 'show', 'a', 'substantial', 'improvement', 'from', 'July', 'and', 'August', "'s", 'near-record', 'deficits', '.']
['B-NP', 'B-PP', 'B-NP', 'I-NP', 'B-VP', 'I-VP', 'I-VP', 'I-VP', 'I-VP', 'B-NP', 'I-NP', 'I-NP', 'B-SBAR', 'B-NP', 'I-NP', 'B-PP', 'B-NP', 'O', 'B-ADJP', 'B-PP', 'B-NP', 'B-NP', 'O', 'B-VP', 'I-VP', 'I-VP', 'B-NP', 'I-NP', 'I-NP', 'B-PP', 'B-NP', 'I-NP', 'I-NP', 'B-NP', 'I-NP', 'I-NP', 'O']


In [1]:

class CoNLLDataset(Dataset):
    """Dataset pairing tokenised sentences with their per-token tags.

    Two vocabularies are built from the data passed in:

    * ``vocab`` for tokens, with specials ``"<pad>"`` and ``"<unk>"``, keeping
      tokens seen at least twice; its default index is set to 1.
    * ``vocab_labels`` for tags, with the single special ``"<pad>"``.
    """

    def __init__(self, seq_texts, seq_tags):
        """Store the raw sequences and build both vocabularies.

        Args:
            seq_texts: iterable of token lists, one per sentence.
            seq_tags: iterable of tag lists, parallel to ``seq_texts``.
        """
        self.texts, self.labels = seq_texts, seq_tags

        token_vocab = build_vocab_from_iterator(
            seq_texts, specials=["<pad>", "<unk>"], min_freq=2
        )
        # Out-of-vocabulary tokens fall back to index 1.
        token_vocab.set_default_index(1)
        self.vocab = token_vocab

        self.vocab_labels = build_vocab_from_iterator(
            seq_tags, specials=["<pad>"], min_freq=1
        )

    def __len__(self):
        """Number of sentences (taken from the tag sequences)."""
        return len(self.labels)

    def __getitem__(self, item):
        """Return ``{"text": token ids, "label": tag ids}`` for sentence ``item``."""
        token_ids = self.vocab(self.texts[item])
        tag_ids = self.vocab_labels(self.labels[item])
        return {"text": token_ids, "label": tag_ids}

    def get_vocab_size(self):
        """Size of the token vocabulary."""
        return len(self.vocab)

    def get_tagset_size(self):
        """Size of the tag vocabulary."""
        return len(self.vocab_labels)

In [3]:

def collate_fn(batch):
    """Turn a list of ``{"text", "label"}`` samples into padded tensors.

    Samples are ordered from longest to shortest text before padding, and both
    texts and labels are right-padded with 0 to the longest sequence.

    Returns:
        Tuple ``(pad_texts, pad_labels, lengths)``; the first two are
        batch-first tensors, ``lengths`` holds the unpadded text lengths in
        the same (descending) order.
    """
    ordered = sorted(batch, key=lambda sample: len(sample['text']), reverse=True)

    text_tensors = [torch.tensor(sample["text"]) for sample in ordered]
    label_tensors = [torch.tensor(sample["label"]) for sample in ordered]
    lengths = torch.tensor(list(map(len, text_tensors)))

    pad_texts = pad_sequence(text_tensors, batch_first=True, padding_value=0)
    pad_labels = pad_sequence(label_tensors, batch_first=True, padding_value=0)
    return pad_texts, pad_labels, lengths

**Tagger Net**

In [6]:
# Dimensionality of the word embeddings used by the taggers in this notebook.
WORD_EMBEDDING = 20


class TaggerNet(nn.Module):
    """Embedding -> single-layer bidirectional LSTM -> linear tag scorer.

    Works on batch-first padded input whose rows are sorted by decreasing
    length (``pack_padded_sequence`` is called with its default
    ``enforce_sorted=True``).
    """

    def __init__(self, vocab_size, hidden_size, n_tags):
        """
        Args:
            vocab_size: number of rows in the embedding table.
            hidden_size: LSTM hidden size per direction.
            n_tags: number of output scores per token.
        """
        super().__init__()
        self.n_tags = n_tags
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=WORD_EMBEDDING
        )
        self.lstm = nn.LSTM(
            input_size=WORD_EMBEDDING,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )
        # Both LSTM directions are concatenated, hence 2 * hidden_size inputs.
        self.linear = nn.Linear(in_features=2 * hidden_size, out_features=n_tags)

    def forward(self, sentence, seq_lengths):
        """Score every tag for every token.

        Args:
            sentence: batch-first tensor of token ids.
            seq_lengths: unpadded length of each row of ``sentence``.

        Returns:
            Tensor of tag scores with the tag axis moved to dimension 1
            (``permute(0, 2, 1)`` of the per-token scores).
        """
        packed_input = pack_padded_sequence(
            self.embedding(sentence), seq_lengths, batch_first=True
        )
        packed_output, _final_state = self.lstm(packed_input)
        padded_output, _out_lengths = pad_packed_sequence(packed_output, batch_first=True)
        tag_scores = self.linear(padded_output)
        return tag_scores.permute(0, 2, 1)
        



In [12]:
# Build the dataset, a BiLSTM tagger with hidden size 10, and a loader that
# pads batches of 20 sentences through collate_fn.
dataset = CoNLLDataset(seq_texts, seq_tags)
model = TaggerNet(dataset.get_vocab_size(), 10, dataset.get_tagset_size())
dataloader = DataLoader(dataset, batch_size=20, collate_fn=collate_fn)
import numpy as np
# Index 0 is the padding value written by collate_fn (and the "<pad>" special
# of the label vocabulary), so padded positions are excluded from the loss.
loss_function = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters())

# Train for 5 epochs, reporting the running mean of the batch losses.
for epoch in range(5):  
    loss_sum = torch.tensor(0.)
    for i, (sentences, tags, lengths) in enumerate(dataloader):
        # The model returns scores with the tag axis at dimension 1, next to
        # the batch axis, which is the layout the loss is called with here.
        pred_tags = model(sentences, lengths)
       
        loss = loss_function(pred_tags, tags)
        loss.backward()
        # Clip the global gradient norm to 1 before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        # Gradients are cleared after the step, ready for the next batch.
        optimizer.zero_grad()
        with torch.no_grad():
            # Accumulate without tracking gradients; the printed value is the
            # mean loss over the batches seen so far in this epoch. '\r'
            # rewrites the same output line on every batch.
            loss_sum += loss
            print(f'epoch {epoch+1} | batch {i+1}/{len(dataloader)} loss {loss_sum/(i+1):.8f}', end='\r', flush=True)
    # Move to a fresh line so the next epoch does not overwrite this one.
    print("")
    


epoch 1 | batch 447/447 loss 1.81244445
epoch 2 | batch 447/447 loss 1.02728891
epoch 3 | batch 447/447 loss 0.72468919
epoch 4 | batch 447/447 loss 0.58431208
epoch 5 | batch 447/447 loss 0.50161326


**CRF**

In [52]:
def log_sum_exp(vec):
    """Return ``log(sum(exp(vec)))`` over all elements of ``vec`` as a 0-dim tensor.

    Delegates to ``torch.logsumexp``, which is numerically stabilised. The
    previous hand-written max-shift produced NaN when every entry was ``-inf``
    (``-inf - -inf``); this version returns ``-inf`` in that case.

    Args:
        vec: tensor of log-space scores; it is flattened, so any shape reduces
            to a single scalar.
    """
    return torch.logsumexp(vec.reshape(-1), dim=0)

In [53]:
class TaggerCRFNet(nn.Module):
    """BiLSTM tagger with a CRF output layer, scoring one sentence per call.

    Emission scores come from embedding -> bidirectional LSTM -> linear layer.
    ``transitions[i, j]`` is the learned score of moving *from* tag ``j`` *to*
    tag ``i``. Two extra indices, ``START_TAG`` and ``STOP_TAG``, stand for
    the sentence boundaries.

    NOTE(review): ``_get_features`` flattens the scorer output to
    ``(-1, n_tags)``, so the CRF routines see one row per token only when the
    input holds a single sentence; confirm before using larger batches.
    """

    def __init__(self, vocab_size, hidden_size, n_tags):
        """
        Args:
            vocab_size: number of rows in the embedding table.
            hidden_size: LSTM hidden size per direction.
            n_tags: number of real tags (START/STOP are added internally).
        """
        super(TaggerCRFNet, self).__init__()
        self.embedding = nn.Embedding(vocab_size, WORD_EMBEDDING)
        # No batch_first: this LSTM consumes sequence-first input.
        self.lstm = nn.LSTM(WORD_EMBEDDING, hidden_size, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, n_tags)
        self.n_tags = n_tags
        self.START_TAG = self.n_tags
        self.STOP_TAG = self.n_tags + 1

        self.transitions = nn.Parameter(torch.randn((n_tags + 2, n_tags + 2)))

    def _get_features(self, sentence, seq_lengths):
        """Return emission scores reshaped to ``(-1, n_tags)``."""
        embedded_words = self.embedding(sentence)
        packed_words = pack_padded_sequence(embedded_words, seq_lengths)
        out, _ = self.lstm(packed_words)
        unpacked, _ = pad_packed_sequence(out)
        out = self.fc(unpacked)
        return out.view(-1, self.n_tags)

    def _get_numerator(self, features, tags):
        """Unnormalised score of the tag path ``tags`` (emissions + transitions).

        Returns a tensor of shape ``(1,)``.
        """
        # Allocate next to the features so device and dtype always match.
        score = features.new_zeros(1)
        for i, tag in enumerate(tags):
            score = score + features[i, tag]
        # START -> first tag, tag -> tag, last tag -> STOP.
        score = score + self.transitions[tags[0], self.START_TAG]
        for i in range(len(tags) - 1):
            score = score + self.transitions[tags[i + 1], tags[i]]
        score = score + self.transitions[self.STOP_TAG, tags[-1]]
        return score

    def _forward_alg(self, features, seq_lengths):
        """Log partition function: log-sum-exp of the scores of all tag paths.

        ``seq_lengths`` is accepted for interface compatibility and not used.
        """
        alpha = features[0] + self.transitions[:self.n_tags, self.START_TAG]
        for feature in features[1:]:
            # z[i, j] = alpha[j] + emission[i] + transition(j -> i); reducing
            # over j in one vectorised call replaces an explicit loop.
            z = alpha + feature.unsqueeze(1) + self.transitions[:self.n_tags, :self.n_tags]
            # torch.logsumexp is already stabilised; no manual max-shift needed.
            alpha = z.logsumexp(1)
        return torch.logsumexp(alpha + self.transitions[self.STOP_TAG, :self.n_tags], dim=0)

    def seq_log_probability(self, sentence, tags, seq_lengths):
        """Log-probability of ``tags`` given ``sentence`` under the CRF."""
        features = self._get_features(sentence, seq_lengths)
        numerator = self._get_numerator(features, tags)
        partition_function = self._forward_alg(features, seq_lengths)
        return numerator - partition_function

    def forward(self, sentence, seq_lengths):
        """Viterbi-decode the highest-scoring tag path as a list of ints."""
        features = self._get_features(sentence, seq_lengths)
        beta = self._initialize_beta(features)
        # The backtrack must start from the scores of the LAST step. They
        # used to be lost inside _calculate_DP, so decoding started from the
        # step-0 scores for any sentence longer than one token.
        beta, DP = self._viterbi_pass(features, beta)
        return self._backtrack_sequence(beta, DP)

    def _initialize_beta(self, features):
        """Best score of each tag at step 0: emission + transition from START."""
        beta = features[0] + self.transitions[:self.n_tags, self.START_TAG].view(-1)
        return beta

    def _viterbi_pass(self, features, beta, DP=None):
        """Run the max-product recursion over steps 1..end.

        Returns:
            ``(final_beta, DP)``: the per-tag best scores after adding the
            transition to STOP, and the list of back-pointer tensors (one per
            step after the first).
        """
        if DP is None:
            DP = []
        for feature in features[1:]:
            beta, tags = self._update_beta_tags(feature, beta)
            DP.append(tags)
        # Out-of-place so the caller's initial scores are never mutated.
        beta = beta + self.transitions[self.STOP_TAG, :self.n_tags]
        return beta, DP

    def _calculate_DP(self, features, beta, DP):
        """Append the Viterbi back-pointers to ``DP`` and return it.

        Kept for backward compatibility; use ``_viterbi_pass`` when the final
        scores are needed as well.
        """
        _, DP = self._viterbi_pass(features, beta, DP)
        return DP

    def _update_beta_tags(self, feature, beta):
        """One Viterbi step: best score per current tag and its best predecessor."""
        beta, tags = (beta + feature.unsqueeze(1) + self.transitions[:self.n_tags, :self.n_tags]).max(1)
        return beta, tags

    def _backtrack_sequence(self, beta, DP):
        """Follow back-pointers from the best final tag to rebuild the path."""
        tag = int(beta.max(0).indices)
        tag_sequence = [tag]
        for next_tag in reversed(DP):
            tag = int(next_tag[tag])
            tag_sequence.append(tag)
        return tag_sequence[::-1]

    def score_seq(self, sentence, tags, seq_lengths):
        """Unnormalised score of ``tags`` for ``sentence``."""
        features = self._get_features(sentence, seq_lengths)
        numerator = self._get_numerator(features, tags)
        return numerator

    def partition_function(self, sentence, tags, seq_lengths):
        """Log partition function for ``sentence``.

        ``tags`` is accepted for interface compatibility and not used.
        """
        features = self._get_features(sentence, seq_lengths)
        log_partition = self._forward_alg(features, seq_lengths)
        return log_partition
    


In [55]:
# Maximum global gradient norm allowed before each optimiser step.
CLIP_TRESHOLD = 5. 

# The CRF model is fed one sentence at a time, hence batch_size=1.
model = TaggerCRFNet(dataset.get_vocab_size(), 10, dataset.get_tagset_size())
dataloader = DataLoader(dataset, batch_size=1, collate_fn=collate_fn )
optimizer = torch.optim.Adam(model.parameters())

for epoch in range(1):  
    loss_sum = torch.tensor([0.])
    for i, (sentences, tags, lengths) in enumerate(dataloader):
       
        # collate_fn pads batch-first, while the LSTM inside TaggerCRFNet is
        # built without batch_first, so both tensors are transposed here.
        # The loss is the negative log-likelihood of the gold tag sequence.
        loss = -model.seq_log_probability(sentences.T, tags.T, lengths)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), CLIP_TRESHOLD)
        optimizer.step()
        optimizer.zero_grad()
        with torch.no_grad():
            loss_sum += loss
            # Every 10th sentence: report the mean loss of the last 10 and
            # start a new window.
            if i% 10 == 9:
                print("Loss function: ",loss_sum/10)
                
                # NOTE: this rebinds the accumulator to the int 0; the next
                # `+=` with a tensor loss turns it back into a tensor.
                loss_sum = 0



Loss function:  tensor([126.3436])
Loss function:  tensor([92.0474])
Loss function:  tensor([96.8491])
Loss function:  tensor([88.6408])
Loss function:  tensor([69.5507])
Loss function:  tensor([104.2779])
Loss function:  tensor([80.1747])
Loss function:  tensor([80.7277])
Loss function:  tensor([86.6891])
Loss function:  tensor([62.9221])
Loss function:  tensor([61.8926])
Loss function:  tensor([50.2917])
Loss function:  tensor([57.0268])
Loss function:  tensor([52.3214])
Loss function:  tensor([45.2001])
Loss function:  tensor([57.6656])
Loss function:  tensor([22.0903])
Loss function:  tensor([53.6082])
Loss function:  tensor([41.1568])
Loss function:  tensor([36.2602])
Loss function:  tensor([56.9530])
Loss function:  tensor([54.5503])
Loss function:  tensor([55.0141])
Loss function:  tensor([37.4595])
Loss function:  tensor([30.6402])
Loss function:  tensor([32.4584])
Loss function:  tensor([37.7661])
Loss function:  tensor([49.5753])
Loss function:  tensor([45.2832])
Loss functio