# L2: Language modelling

## Import libraries

In [3]:
import torch
import torch.nn as nn
import numpy as np
import torch.optim as optim
import torch.nn.functional as F
from time import time
import math
import queue

## Dataset

In [4]:
class Dataset():
    """Iterable over the sentences of a tab-separated CoNLL-U file.

    Every sentence is yielded as a list of ``[form, tag, head]`` triples
    taken from columns 1, 3 and 6 of each token line (FORM, UPOS and HEAD
    in the CoNLL-U layout), preceded by a copy of the pseudo-root ``ROOT``.
    Comment lines (starting with ``#``) and lines whose first column is not
    a plain integer (e.g. multi-word range tokens such as ``1-2``) are
    skipped.
    """

    ROOT = ['<root>', '<root>', 0]  # Pseudo-root

    def __init__(self, filename):
        # Path of the file; it is re-opened on every iteration.
        self.filename = filename

    def __iter__(self):
        with open(self.filename, 'rt', encoding='utf-8') as lines:
            # Each sentence gets its own copy of the pseudo-root so that a
            # consumer mutating one sentence cannot corrupt the others.
            tmp = [list(Dataset.ROOT)]
            for line in lines:
                if line.startswith('#'):  # Skip lines with comments
                    continue
                line = line.rstrip()
                if line:
                    columns = line.split('\t')
                    if columns[0].isdigit():  # Skip range tokens
                        tmp.append([columns[1], columns[3], int(columns[6])])
                else:
                    # A blank line terminates the current sentence.
                    yield tmp
                    tmp = [list(Dataset.ROOT)]
            # The file may end without a trailing blank line; do not lose
            # the tokens collected for the last sentence.
            if len(tmp) > 1:
                yield tmp

In [5]:
class Dataset_conllu():
    """Iterable over the sentences of a tab-separated CoNLL-U file.

    Unlike a reader that keeps only selected fields, every token is yielded
    as the full list of its tab-separated columns (all strings), and no
    pseudo-root is added.  Comment lines (starting with ``#``) and lines
    whose first column is not a plain integer are skipped.
    """

    def __init__(self, filename):
        # Path of the file; it is re-opened on every iteration.
        self.filename = filename

    def __iter__(self):
        with open(self.filename, 'rt', encoding='utf-8') as lines:
            tmp = []
            for line in lines:
                if line.startswith('#'):  # Skip lines with comments
                    continue
                line = line.rstrip()
                if line:
                    columns = line.split('\t')
                    if columns[0].isdigit():  # Skip range tokens
                        tmp.append(columns)
                else:
                    # A blank line terminates the current sentence.
                    yield tmp
                    tmp = []
            # The file may end without a trailing blank line; do not lose
            # the tokens collected for the last sentence.
            if tmp:
                yield tmp


## Vocabularies

In [6]:
PAD = '<pad>'
UNK = '<unk>'


def make_vocabs(gold_data):
    """Build the word and tag vocabularies of a data set.

    Args:
        gold_data: iterable of sentences, each an iterable of
            ``(word, tag, head)`` triples.

    Returns:
        A pair ``(word_vocab, tag_vocab)`` of dicts mapping strings to
        consecutive integer ids in order of first occurrence.  The word
        vocabulary reserves id 0 for ``PAD`` and id 1 for ``UNK``; the tag
        vocabulary reserves id 0 for ``PAD``.
    """
    words = {PAD: 0, UNK: 1}
    tags = {PAD: 0}
    for sentence in gold_data:
        for form, pos, _ in sentence:
            # setdefault only inserts unseen keys, so an id is assigned
            # the first time a word or tag is encountered.
            words.setdefault(form, len(words))
            tags.setdefault(pos, len(tags))
    return words, tags

## Tagger

In [7]:
class Tagger:
    """Interface for part-of-speech taggers."""

    def predict(self, sentence):
        """Tag *sentence*; concrete taggers must override this method."""
        raise NotImplementedError()

### Fixed window model

In [8]:
class FixedWindowModel(nn.Module):
    """Feed-forward classifier over concatenated fixed-window embeddings.

    Args:
        embedding_specs: sequence of ``(window_size, vocab_size, embed_dim)``
            triples, one per embedding table.  The columns of the input are
            consumed left to right, ``window_size`` columns per table.
        hidden_dim: size of the single hidden layer.
        output_dim: number of output classes (the model returns raw scores).
        word_pretrained: optional 2-D float tensor used to initialise the
            FIRST embedding table (it stays trainable).  The remaining
            tables are created with PyTorch's default initialisation.
    """

    def __init__(self, embedding_specs, hidden_dim, output_dim, word_pretrained=None):
        super().__init__()
        # Extracting specs
        self.window_sizes = [m for (m, n, e) in embedding_specs]
        self.num_sources = [n for (m, n, e) in embedding_specs]
        self.embed_dims = [e for (m, n, e) in embedding_specs]

        # Embedding layers are stored in an nn.ModuleList so that their
        # parameters are registered with the module.
        self.embeddings = nn.ModuleList()
        # `is not None` rather than `!= None`: comparing a tensor with
        # `!=` is an element-wise operation, not an identity check.
        if word_pretrained is not None:
            # First table comes from the pretrained matrix (fine-tuned).
            self.embeddings.append(nn.Embedding.from_pretrained(word_pretrained, freeze=False))
            # Every remaining table still has to be learned from scratch.
            for num_source, embed_dim in zip(self.num_sources[1:], self.embed_dims[1:]):
                self.embeddings.append(nn.Embedding(num_source, embed_dim))
        else:
            for _, num_source, embed_dim in embedding_specs:
                embedding = nn.Embedding(num_source, embed_dim)
                # Initialise weights from a normal distribution with
                # mean 0 and standard deviation 0.01.
                embedding.weight.data.normal_(0, 1e-2)
                self.embeddings.append(embedding)

        # Use the real width of each table: a pretrained matrix may have a
        # different dimensionality than the one requested in the spec.
        self.embed_dims = [embedding.embedding_dim for embedding in self.embeddings]

        # Linear layers
        self.concat_len = sum(m * e for m, e in zip(self.window_sizes, self.embed_dims))
        self.linear_1 = nn.Linear(self.concat_len, hidden_dim)
        self.relu = nn.ReLU()
        self.linear_2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, features):
        """Return class scores of shape ``(batch, output_dim)``.

        Args:
            features: integer tensor of shape ``(batch, sum(window_sizes))``
                holding the ids for each embedding table side by side.
        """
        batch_size = features.shape[0]
        pieces = []
        start = 0
        for embedding, m in zip(self.embeddings, self.window_sizes):
            # Embed the m columns belonging to this table and flatten the
            # (m, embed_dim) block of every example into one row.
            window = features[:, start:start + m]
            pieces.append(embedding(window).reshape(batch_size, m * embedding.embedding_dim))
            start += m
        embedded = torch.cat(pieces, dim=1)
        # Feed forward
        hidden = self.relu(self.linear_1(embedded))
        return self.linear_2(hidden)

### Actual tagger class

In [9]:
class FixedWindowTagger(Tagger):
    """Greedy left-to-right tagger built on a FixedWindowModel.

    The feature vector of a token consists of the word ids in a window of
    ``window_size`` tokens centred on it, followed by the id of the tag
    predicted for the previous token.

    Args:
        vocab_words: dict mapping word strings to integer ids.
        vocab_tags: dict mapping tag strings to integer ids.
        output_dim: number of classes produced by the model.
        word_dim, tag_dim, hidden_dim: layer sizes passed to the model.
        word_pretrained: optional pretrained word-embedding tensor.
    """

    def __init__(self, vocab_words, vocab_tags, output_dim, word_dim=50, tag_dim=10, hidden_dim=100, word_pretrained=None):
        self.window_size = 3
        self.vocab_tags = vocab_tags
        self.vocab_words = vocab_words
        # Embedding specs: a window of words plus one previous tag.
        embedding_specs = [(self.window_size, len(vocab_words), word_dim), (1, len(vocab_tags), tag_dim)]
        # `word_pretrained=None` is the model's own default, so it can be
        # forwarded unconditionally.
        self.model = FixedWindowModel(embedding_specs, hidden_dim, output_dim, word_pretrained=word_pretrained)

    def featurize(self, words, i, pred_tags):
        """Return the feature tensor for position *i*.

        Args:
            words: list of word ids of the sentence.
            i: index of the token to featurize.
            pred_tags: list with the id(s) of the previously predicted tag.

        Returns:
            1-D tensor: the window's word ids (0 where the window reaches
            beyond the sentence) followed by ``pred_tags``.
        """
        half = (self.window_size - 1) // 2
        features = [
            words[j] if 0 <= j < len(words) else 0  # 0 pads head and tail
            for j in range(i - half, i + half + 1)
        ]
        features.extend(pred_tags)
        return torch.tensor(features)

    def predict(self, words):
        """Return the list of predicted tag strings for *words*."""
        # Words missing from the vocabulary are mapped to id 1.
        word_ids = [self.vocab_words.get(w, 1) for w in words]
        # Position in the dict == tag id; this relies on vocab_tags having
        # been filled in id order.  Built once instead of once per token.
        id_to_tag = list(self.vocab_tags)

        predictions = []
        pred_tags = [0]  # no previous tag before the first token
        # Inference only: no need to record the autograd graph.
        with torch.no_grad():
            for i in range(len(word_ids)):
                # Batch of one example, whatever the feature width is.
                features = self.featurize(word_ids, i, pred_tags).unsqueeze(0)
                output = self.model(features)
                tag_id = torch.argmax(output).item()
                predictions.append(id_to_tag[tag_id])
                # The prediction becomes the history feature of the next token.
                pred_tags = [tag_id]

        return predictions

### Helper function to calculate accuracy

In [10]:
def tag_accuracy(tagger, gold_data):
    """Return the fraction of tokens for which *tagger* predicts the gold tag.

    Args:
        tagger: object with a ``predict(words)`` method returning one tag
            per word.
        gold_data: iterable of sentences, each an iterable of
            ``(word, tag, head)`` triples.

    Returns:
        ``correct / total`` over all tokens of all sentences, or ``0.0``
        when the data contains no tokens at all.
    """
    total = 0
    correct = 0
    for sentence in gold_data:
        words = [word for word, _, _ in sentence]
        gold_tags = [tag for _, tag, _ in sentence]
        total += len(gold_tags)
        predictions = tagger.predict(words)
        correct += sum(pred == gold for pred, gold in zip(predictions, gold_tags))

    # Avoid a ZeroDivisionError on empty input.
    if total == 0:
        return 0.0
    return correct / total

## Parser

### Interface

In [11]:
class Parser:
    """Interface for dependency parsers."""

    def predict(self, words, tags):
        """Parse a tagged sentence; concrete parsers must override this."""
        raise NotImplementedError()

### For Arc Standard Algorithm

In [12]:
class ArcStandardParser(Parser):
    """Transition system operating on ``(i, stack, heads)`` configurations.

    ``i`` is the index of the next word in the buffer, ``stack`` is a list
    of word indices and ``heads`` holds the head index assigned to every
    word so far (0 initially).
    """

    MOVES = tuple(range(4))

    SH, LA, RA, ES = MOVES  # Parser moves are specified as integers.

    @staticmethod
    def initial_config(num_words):
        """Return the start configuration for a sentence of *num_words* words."""
        return (0, [], [0] * num_words)

    @staticmethod
    def valid_moves(config):
        """Return the list of moves (SH, LA, RA) applicable in *config*."""
        position, stack, heads = config
        moves = []
        # Shifting needs at least one word left in the buffer.
        if position < len(heads):
            moves.append(ArcStandardParser.SH)
        # Both arc moves need two words on the stack.
        if len(stack) >= 2:
            moves.extend((ArcStandardParser.LA, ArcStandardParser.RA))
        return moves

    @staticmethod
    def next_config(config, move):
        """Return the configuration reached by applying *move* to *config*.

        The input configuration is left untouched; stack and heads are
        copied before being modified.  An unrecognised move yields an
        unchanged copy.
        """
        position, old_stack, old_heads = config
        stack = old_stack.copy()
        heads = old_heads.copy()

        if move == ArcStandardParser.SH:
            # Move the next buffer word onto the stack.
            stack.append(position)
            position += 1
        elif move == ArcStandardParser.LA:
            # Second-topmost word becomes a dependent of the topmost word
            # and leaves the stack.
            dependent = stack.pop(-2)
            heads[dependent] = stack[-1]
        elif move == ArcStandardParser.RA:
            # Topmost word becomes a dependent of the word below it and
            # leaves the stack.
            dependent = stack.pop()
            heads[dependent] = stack[-1]

        return (position, stack, heads)

    @staticmethod
    def is_final_config(config):
        """Return True when the buffer is empty and one word is left on the stack."""
        position, stack, heads = config
        return position == len(heads) and len(stack) == 1

### Generator that yields oracle moves during training

In [13]:
def oracle_moves(gold_heads):
    """Yield ``(config, move)`` pairs that rebuild *gold_heads* step by step.

    Args:
        gold_heads: list with the gold head index of every word.

    Yields:
        The current ``(i, stack, heads)`` configuration together with the
        move (SH, LA or RA) the oracle takes from it.

    Raises:
        ValueError: if the gold tree cannot be reached, i.e. the oracle
            would have to shift although the buffer is empty.  Without
            this check the generator would never terminate on such input.
    """
    SH, LA, RA = ArcStandardParser.SH, ArcStandardParser.LA, ArcStandardParser.RA

    # An empty sentence needs no moves (and has no final configuration).
    if not gold_heads:
        return

    # Number of times each index occurs as a head in the gold data
    # (loop-invariant, so computed once).
    gold_counts = {}
    for head in gold_heads:
        gold_counts[head] = gold_counts.get(head, 0) + 1

    def check_used_up(node, our_heads):
        # True when *node* is already the head of as many words in our
        # partial analysis as it is in the gold data.
        return gold_counts.get(node, 0) == our_heads.count(node)

    config = ArcStandardParser.initial_config(len(gold_heads))
    while not ArcStandardParser.is_final_config(config):
        i, stack, our_heads = config
        # Shift unless one of the arc moves below is licensed.
        move = SH
        if len(stack) >= 2:
            top, second = stack[-1], stack[-2]
            # LA if the arc is in the gold heads and the second-topmost
            # word has collected all of its dependents.
            if gold_heads[second] == top and check_used_up(second, our_heads):
                move = LA
            # RA under the mirrored condition for the topmost word.
            elif gold_heads[top] == second and check_used_up(top, our_heads):
                move = RA
        if move == SH and i >= len(gold_heads):
            raise ValueError('no oracle move available: the gold heads cannot be '
                             'derived by this transition system')
        yield config, move
        config = ArcStandardParser.next_config(config, move)




In [14]:
def oracle_moves_with_error_state(gold_heads):
    """Yield oracle training pairs plus "error state" examples.

    For every configuration on the gold path this yields the pair
    ``(config, gold_move)`` and then, for each other valid move, the
    configuration that move would lead to, labelled with the error class ES.

    Args:
        gold_heads: list with the gold head index of every word.

    Raises:
        ValueError: if the gold tree cannot be reached, i.e. the oracle
            would have to shift although the buffer is empty.  Without
            this check the generator would never terminate on such input.
    """
    SH, LA, RA, ES = (ArcStandardParser.SH, ArcStandardParser.LA,
                      ArcStandardParser.RA, ArcStandardParser.ES)

    # An empty sentence needs no moves (and has no final configuration).
    if not gold_heads:
        return

    # Number of times each index occurs as a head in the gold data
    # (loop-invariant, so computed once).
    gold_counts = {}
    for head in gold_heads:
        gold_counts[head] = gold_counts.get(head, 0) + 1

    def check_used_up(node, our_heads):
        # True when *node* is already the head of as many words in our
        # partial analysis as it is in the gold data.
        return gold_counts.get(node, 0) == our_heads.count(node)

    config = ArcStandardParser.initial_config(len(gold_heads))
    while not ArcStandardParser.is_final_config(config):
        i, stack, our_heads = config
        # Shift unless one of the arc moves below is licensed.
        move = SH
        if len(stack) >= 2:
            top, second = stack[-1], stack[-2]
            # LA if the arc is in the gold heads and the second-topmost
            # word has collected all of its dependents.
            if gold_heads[second] == top and check_used_up(second, our_heads):
                move = LA
            # RA under the mirrored condition for the topmost word.
            elif gold_heads[top] == second and check_used_up(top, our_heads):
                move = RA
        if move == SH and i >= len(gold_heads):
            raise ValueError('no oracle move available: the gold heads cannot be '
                             'derived by this transition system')

        yield config, move
        # Every valid move other than the gold one leads off the gold
        # path; the state it reaches is emitted as an error example.
        for wrong_move in ArcStandardParser.valid_moves(config):
            if wrong_move != move:
                yield ArcStandardParser.next_config(config, wrong_move), ES
        config = ArcStandardParser.next_config(config, move)




### Test the oracle with error state 

In [15]:
# Readers for the projectivized training file and the development file.
train_data = Dataset('en_ewt-ud-train-projectivized.conllu')
dev_data = Dataset('en_ewt-ud-dev.conllu')
# Reads all training sentences into a list and keeps sentence 531 as a small example.
example_sentence = list(train_data)[531]

In [16]:
# Gold head index of every token in the example (position 0 is the pseudo-root).
gold_heads = [h for w, t, h in example_sentence]
# Expected oracle move sequence; only referenced by the disabled assert below.
gold_moves = [0, 0, 0, 1, 0, 0, 1, 2, 0, 2, 2]

print(gold_heads)
# Moves of the oracle that also emits error-state examples (label 3), then the plain oracle.
print(list(m for _, m in oracle_moves_with_error_state(gold_heads))) 
print(list(m for _, m in oracle_moves(gold_heads))) 
#assert list(m for _, m in oracle_moves(gold_heads)) == gold_moves

[0, 2, 0, 4, 2, 2]
[0, 0, 0, 3, 3, 1, 3, 3, 0, 3, 3, 0, 3, 3, 1, 3, 3, 2, 3, 3, 0, 3, 3, 2, 3, 2, 3]
[0, 0, 0, 1, 0, 0, 1, 2, 0, 2, 2]


### Actual parser class 

In [109]:
class FixedWindowParser(ArcStandardParser):
    """Dependency parser that scores transitions with a FixedWindowModel.

    The model sees six features per configuration (three word ids and three
    tag ids, see ``featurize``) and produces one score per move in
    ``MOVES``.  ``predict`` explores configurations best-first using a
    priority queue of path scores.

    Args:
        vocab_words: dict mapping word strings to integer ids.
        vocab_tags: dict mapping tag strings to integer ids.
        word_dim, tag_dim, hidden_dim: layer sizes passed to the model.
        word_pretrained: optional pretrained word-embedding tensor.
    """

    # Factor multiplied into the path score at every expansion.
    # NOTE(review): presumably compensates for scores shrinking with depth
    # so that the search keeps extending deep states - confirm the intent
    # before tuning this value.
    SCORE_BOOST = 5

    def __init__(self, vocab_words, vocab_tags, word_dim=50, tag_dim=10, hidden_dim=180, word_pretrained=None):
        self.vocab_tags = vocab_tags
        self.vocab_words = vocab_words
        output_dim = len(self.MOVES)  # SH, LA, RA, ES
        # Embedding specs: three word features and three tag features.
        embedding_specs = [(3, len(vocab_words), word_dim), (3, len(vocab_tags), tag_dim)]
        # `word_pretrained=None` is the model's own default, so it can be
        # forwarded unconditionally.
        self.model = FixedWindowModel(embedding_specs, hidden_dim, output_dim, word_pretrained=word_pretrained)

    @staticmethod
    def _item_or_pad(sequence, position, pad_id):
        """Return ``sequence[position]``, or *pad_id* if there is no such item."""
        if position is None:
            return pad_id
        try:
            return sequence[position]
        except IndexError:
            return pad_id

    def featurize(self, words, tags, config):
        """Return the six-element int32 feature tensor for *config*.

        Layout: word id of (0) the next buffer word, (1) the topmost and
        (2) the second-topmost stack word, followed by the tag ids of the
        same three positions (3-5).  Missing positions use the ``<pad>`` id.

        Args:
            words: list of word ids of the sentence.
            tags: list of tag ids of the sentence.
            config: ``(i, stack, heads)`` configuration.
        """
        i, stack, _ = config
        positions = (
            i,
            stack[-1] if len(stack) >= 1 else None,
            stack[-2] if len(stack) >= 2 else None,
        )
        pad_word = self.vocab_words['<pad>']
        pad_tag = self.vocab_tags['<pad>']
        values = [self._item_or_pad(words, p, pad_word) for p in positions]
        values += [self._item_or_pad(tags, p, pad_tag) for p in positions]
        return torch.tensor(values, dtype=torch.int32)

    def predict(self, words, tags, verbose=True):
        """Return the predicted head index for every word.

        Args:
            words: list of word strings (including the pseudo-root, if the
                data uses one).
            tags: list of tag strings, parallel to *words*.
            verbose: when True (default) print how many states were
                expanded and generated for this sentence.

        Returns:
            The ``heads`` list of the first final configuration reached, or
            an empty list for an empty sentence.
        """
        # An empty sentence has no final configuration and no valid move;
        # waiting on the empty queue below would block forever.
        if len(words) == 0:
            return []

        # Encode words and tags; anything not in the vocabulary gets id 1.
        # NOTE(review): the tag vocabulary built by make_vocabs reserves
        # only <pad>, so id 1 there is a regular tag, not an "unknown" entry.
        word_ids = []
        tag_ids = []
        for w, t in zip(words, tags):
            word_ids.append(self.vocab_words.get(w, 1))
            tag_ids.append(self.vocab_tags.get(t, 1))

        config = self.initial_config(len(words))
        current_prob = 1.0
        score_queue = queue.PriorityQueue()
        curr_count = 0   # states expanded
        state_count = 0  # successor states put on the queue

        # Inference only: no autograd graph, and plain floats as priorities.
        with torch.no_grad():
            while not self.is_final_config(config):
                curr_count += 1
                features = self.featurize(word_ids, tag_ids, config).unsqueeze(0)
                output = self.model(features)
                # Path score of each successor: score of the current state
                # times the move probability, times the constant boost.
                probs = current_prob * torch.softmax(output, dim=1) * self.SCORE_BOOST
                # Only valid moves are enqueued; the error class ES is never
                # returned by valid_moves, so its score is ignored here.
                # Scores are negated because the queue pops the smallest item.
                for move in self.valid_moves(config):
                    score_queue.put((-probs[0, move].item(), self.next_config(config, move)))
                    state_count += 1
                if score_queue.empty():
                    # Dead end with nothing left to explore; never block.
                    break
                # Continue from the best-scoring state seen so far.
                negated_prob, config = score_queue.get()
                current_prob = -negated_prob

        if verbose:
            print(f'This sentence go through {curr_count} states out of {state_count} total states to reach final config')
        predicted_heads = config[-1]

        return predicted_heads

In [110]:
# Build the vocabularies from the training data and create a parser with untrained weights.
word_vocab, tag_vocab = make_vocabs(train_data)
example_sentence = list(train_data)[531]
my_parser = FixedWindowParser(word_vocab, tag_vocab, word_dim=50, tag_dim=10, hidden_dim=180, word_pretrained=None)
print(my_parser.model)
# test prediction
# Split the example into parallel lists of word forms, tags and gold heads.
words = [w for w, t, h in example_sentence]
tags = [t for w, t, h in example_sentence]
heads = [h for w, t, h in example_sentence]
print(words)
print(f'predicted heads: {my_parser.predict(words, tags)}')
print(f'gold heads: {heads}')

FixedWindowModel(
  (embeddings): ModuleList(
    (0): Embedding(19675, 50)
    (1): Embedding(19, 10)
  )
  (linear_1): Linear(in_features=180, out_features=180, bias=True)
  (relu): ReLU()
  (linear_2): Linear(in_features=180, out_features=4, bias=True)
)
['<root>', 'I', 'like', 'yuor', 'blog', '.']
This sentence go through 11 states out of 24 total states to reach final config
predicted heads: [5, 5, 5, 5, 5, 0]
gold heads: [0, 2, 0, 4, 2, 2]


## Training (tagger)

### Generator that yields training examples for the tagger

In [66]:
def training_examples_tagger(vocab_words, vocab_tags, gold_data, tagger, batch_size=100):
    """Yield mini-batches of tagger training examples.

    Args:
        vocab_words: unused; the ids are taken from ``tagger.vocab_words``.
        vocab_tags: unused; the ids are taken from ``tagger.vocab_tags``.
        gold_data: iterable of sentences, each an iterable of
            ``(word, tag, head)`` triples.
        tagger: object providing ``vocab_words``, ``vocab_tags`` and
            ``featurize(word_ids, i, pred_tags)``.
        batch_size: maximum number of examples per batch.

    Yields:
        ``(features, labels)`` pairs: an int32 tensor with one feature row
        per example and an int64 tensor with the gold tag ids.  Every batch
        is a freshly created tensor.  The last batch may hold fewer than
        *batch_size* examples, so no training example is dropped.

    Raises:
        KeyError: if a word or tag is missing from the tagger's vocabularies.
    """
    rows = []
    labels = []

    def make_batch():
        # Stacking the rows adapts to whatever width featurize() produces.
        return torch.stack(rows).to(torch.int), torch.tensor(labels, dtype=torch.long)

    for sentence in gold_data:
        # Encode the sentence once, not once per token.
        word_ids = [tagger.vocab_words[w] for w, _, _ in sentence]
        # The first word of a sentence has no previous tag.
        pred_tags = [0]
        for i, (_, tag, _) in enumerate(sentence):
            tag_id = tagger.vocab_tags[tag]
            rows.append(tagger.featurize(word_ids, i, pred_tags))
            labels.append(tag_id)
            # Training uses the gold tag as the history of the next token.
            pred_tags = [tag_id]
            if len(rows) == batch_size:
                yield make_batch()
                rows = []
                labels = []

    # Flush the examples that did not fill a complete batch.
    if rows:
        yield make_batch()


### training loop for tagger

In [67]:
def train_fixed_window_tagger(train_data, n_epochs=2, batch_size=100, lr=1e-3, word_pretrained=None):
    """Train a FixedWindowTagger on *train_data* and return it.

    Args:
        train_data: re-iterable collection of sentences of
            ``(word, tag, head)`` triples; it is traversed once to build
            the vocabularies and once per epoch.
        n_epochs: number of passes over the training data.
        batch_size: number of examples per mini-batch.
        lr: learning rate of the Adam optimizer.
        word_pretrained: optional pretrained word-embedding tensor.

    Returns:
        The trained tagger.
    """
    # Generate vocab for words and tags
    vocab_words, vocab_tags = make_vocabs(train_data)
    output_dim = len(vocab_tags)

    # `word_pretrained=None` is the tagger's own default, so it can be
    # forwarded unconditionally.
    tagger = FixedWindowTagger(vocab_words, vocab_tags, output_dim, word_pretrained=word_pretrained)

    # Initialize the optimizer. Here we use Adam rather than plain SGD
    optimizer = optim.Adam(tagger.model.parameters(), lr=lr)
    print("Initialization done")

    # Training loop
    for e in range(n_epochs):
        start = time()
        loss = None
        for X, y in training_examples_tagger(vocab_words, vocab_tags, train_data, tagger, batch_size):
            # Reset the accumulated gradients
            optimizer.zero_grad()

            # Forward pass (calling the module, not .forward, is the
            # documented way to run it)
            output = tagger.model(X)

            # Loss
            loss = F.cross_entropy(output, y)

            # Backward pass; propagates the loss and computes the gradients
            loss.backward()

            # Update the parameters of the model
            optimizer.step()

        # The reported value is the loss of the LAST batch of the epoch.
        if loss is None:
            print(f'Epoch {e}: no training batch was produced')
        else:
            print(f'Epoch {e} loss (train): {loss}')
        end = time()
        print(f'time for an epoch {end - start} s')

    return tagger

### Get data from the Universal Dependencies project

In [21]:
# Readers for the projectivized training file and the development file.
train_data = Dataset('en_ewt-ud-train-projectivized.conllu')
dev_data = Dataset('en_ewt-ud-dev.conllu')

### Initialize tagger model and train

In [22]:
# Train the tagger for five epochs, then report its tagging accuracy on the development data.
tagger = train_fixed_window_tagger(train_data, n_epochs=5)
print('Calculating accuracy ...')
print('Result: {:.4f}'.format(tag_accuracy(tagger, dev_data)))

Initialization done


KeyboardInterrupt: 

### Produce a new 'retagged' version of the data that uses our tagger's predictions

In [None]:
# Write a copy of the training file in which column 3 of every token holds
# the tag predicted by our tagger.  Only token lines are copied; comment
# lines and range tokens are dropped by Dataset_conllu.
with open('en_ewt-ud-train-projectivized-retagged.conllu', 'wt', encoding="utf-8") as target:
    for sentence in Dataset_conllu('en_ewt-ud-train-projectivized.conllu'):
        words = [columns[1] for columns in sentence]
        # Overwrite the tag column in place with the prediction.
        for columns, predicted_tag in zip(sentence, tagger.predict(words)):
            columns[3] = predicted_tag
        for columns in sentence:
            print('\t'.join(columns), file=target)
        # Blank line = sentence separator.
        print(file=target)

In [None]:
# Write a copy of the development file in which column 3 of every token
# holds the tag predicted by our tagger.  Only token lines are copied;
# comment lines and range tokens are dropped by Dataset_conllu.
with open('en_ewt-ud-dev-retagged.conllu', 'wt', encoding="utf-8") as target:
    for sentence in Dataset_conllu('en_ewt-ud-dev.conllu'):
        words = [columns[1] for columns in sentence]
        # Overwrite the tag column in place with the prediction.
        for columns, predicted_tag in zip(sentence, tagger.predict(words)):
            columns[3] = predicted_tag
        for columns in sentence:
            print('\t'.join(columns), file=target)
        # Blank line = sentence separator.
        print(file=target)

## Training (parser)

In [111]:
def training_examples_parser(vocab_words, vocab_tags, gold_data, parser, batch_size=100):
    """Yield mini-batches of parser training examples.

    Args:
        vocab_words: unused; the ids are taken from ``parser.vocab_words``.
        vocab_tags: unused; the ids are taken from ``parser.vocab_tags``.
        gold_data: iterable of sentences, each an iterable of
            ``(word, tag, head)`` triples.
        parser: object providing ``vocab_words``, ``vocab_tags`` and
            ``featurize(word_ids, tag_ids, config)``.
        batch_size: maximum number of examples per batch.

    Yields:
        ``(features, labels)`` pairs: an int32 tensor with one feature row
        per example and an int64 tensor with the move labels produced by
        ``oracle_moves_with_error_state``.  Every batch is a freshly
        created tensor.  The last batch may hold fewer than *batch_size*
        examples, so no training example is dropped.
    """
    rows = []
    labels = []

    def make_batch():
        return torch.stack(rows).to(torch.int32), torch.tensor(labels, dtype=torch.long)

    for sentence in gold_data:
        # Encode to ids; anything missing from a vocabulary gets id 1.
        word_ids = [parser.vocab_words.get(w, 1) for w, _, _ in sentence]
        tag_ids = [parser.vocab_tags.get(t, 1) for _, t, _ in sentence]
        gold_heads = [h for _, _, h in sentence]
        # Static oracle (gold moves plus error-state examples)
        for config, gold_move in oracle_moves_with_error_state(gold_heads):
            rows.append(parser.featurize(word_ids, tag_ids, config))
            labels.append(gold_move)
            if len(rows) == batch_size:
                yield make_batch()
                rows = []
                labels = []

    # Flush the examples that did not fill a complete batch.
    if rows:
        yield make_batch()

In [112]:
def train_fixed_window_parser(train_data, n_epochs=1, batch_size=100, lr=1e-2):
    """Train a FixedWindowParser on *train_data* and return it.

    Args:
        train_data: re-iterable collection of sentences of
            ``(word, tag, head)`` triples; it is traversed once to build
            the vocabularies and once per epoch.
        n_epochs: number of passes over the training data.
        batch_size: number of examples per mini-batch.
        lr: learning rate of the Adam optimizer.

    Returns:
        The trained parser.
    """
    # Generate vocab for words and tags
    vocab_words, vocab_tags = make_vocabs(train_data)

    # Initialize the parser (its output size is fixed by the move set).
    parser = FixedWindowParser(vocab_words, vocab_tags, word_dim=50, tag_dim=10, hidden_dim=180, word_pretrained=None)

    # Initialize the optimizer. Here we use Adam rather than plain SGD
    optimizer = optim.Adam(parser.model.parameters(), lr=lr)
    print("Initialization done")

    # Training loop
    for e in range(n_epochs):
        start = time()
        loss = None
        for X, y in training_examples_parser(vocab_words, vocab_tags, train_data, parser, batch_size):
            # Reset the accumulated gradients
            optimizer.zero_grad()

            # Forward pass (calling the module, not .forward, is the
            # documented way to run it)
            output = parser.model(X)

            # Loss
            loss = F.cross_entropy(output, y)

            # Backward pass; propagates the loss and computes the gradients
            loss.backward()

            # Update the parameters of the model
            optimizer.step()

        # The reported value is the loss of the LAST batch of the epoch.
        if loss is None:
            print(f'Epoch {e}: no training batch was produced')
        else:
            print(f'Epoch {e} loss (train): {loss}')
        end = time()
        print(f'time for an epoch {end - start} s')

    return parser

### Unlabelled attachment score 

In [113]:
def uas(parser, gold_data):
    """Return the unlabelled attachment score of *parser* on *gold_data*.

    Args:
        parser: object with a ``predict(words, tags)`` method returning one
            head index per word.
        gold_data: iterable of sentences, each an iterable of
            ``(word, tag, head)`` triples whose first entry is the
            pseudo-root.

    Returns:
        The fraction of tokens (the first entry of every sentence excluded)
        whose predicted head equals the gold head, or ``0.0`` when there
        are no tokens to score.
    """
    token_num = 0
    correct_num = 0
    for sentence in gold_data:
        words = [w for w, _, _ in sentence]
        tags = [t for _, t, _ in sentence]
        gold_heads = [h for _, _, h in sentence]
        predicted_heads = parser.predict(words, tags)
        # Skip position 0, which is the pseudo-root.
        for pred, gold in zip(predicted_heads[1:], gold_heads[1:]):
            token_num += 1
            if pred == gold:
                correct_num += 1

    # Avoid a ZeroDivisionError on empty input.
    if token_num == 0:
        return 0.0
    return correct_num / token_num

### Train and validate the model with the data produced by *our own tagger*

In [114]:
# Switch to the retagged files, i.e. the versions written with our tagger's predicted tags.
train_data = Dataset('en_ewt-ud-train-projectivized-retagged.conllu')
dev_data = Dataset('en_ewt-ud-dev-retagged.conllu')

In [115]:
# Train the parser for a single epoch on the retagged training data.
parser = train_fixed_window_parser(train_data, n_epochs=1)

Initialization done
Epoch 0 loss (train): 0.5169500112533569
time for an epoch 140.21152997016907 s


In [116]:
# Parse training sentence 531 with the trained parser and print the result next to the gold heads.
example_sentence = list(train_data)[531]
words = [w for w, t, h in example_sentence]
tags = [t for w, t, h in example_sentence]
heads = [h for w, t, h in example_sentence]
print(words)
print(parser.predict(words, tags))
print(heads)

['<root>', 'I', 'like', 'yuor', 'blog', '.']
This sentence go through 11 states out of 27 total states to reach final config
[0, 2, 0, 4, 2, 2]
[0, 2, 0, 4, 2, 2]


In [117]:
# Unlabelled attachment score of the trained parser on the development data, to four decimals.
print('{:.4f}'.format(uas(parser, dev_data)))

This sentence go through 15 states out of 39 total states to reach final config
This sentence go through 44 states out of 126 total states to reach final config
This sentence go through 71 states out of 207 total states to reach final config
This sentence go through 3 states out of 4 total states to reach final config
This sentence go through 66 states out of 192 total states to reach final config
This sentence go through 41 states out of 117 total states to reach final config
This sentence go through 72 states out of 210 total states to reach final config
This sentence go through 35 states out of 99 total states to reach final config
This sentence go through 37 states out of 105 total states to reach final config
This sentence go through 19 states out of 51 total states to reach final config
This sentence go through 44 states out of 126 total states to reach final config
This sentence go through 49 states out of 141 total states to reach final config
This sentence go through 41 states