# Dependency parsing baseline


Dependency parsing is the task of mapping a sentence to a formal representation of its syntactic structure in the form of a dependency tree, which consists of directed arcs between individual words (tokens). Here we will implement a dependency parser baseline based on the arc-standard algorithm and the fixed-window model that we implemented in Lab L3.

### Download necessary Python files

In [None]:
# Fetch the project's helper modules from GitHub into the working directory
# so that the imports in the next cell can resolve them.
!wget https://raw.githubusercontent.com/hugocedervall/nlp-project/main/batchify.py
!wget https://raw.githubusercontent.com/hugocedervall/nlp-project/main/create_vocab.py
!wget https://raw.githubusercontent.com/hugocedervall/nlp-project/main/data_handling.py
!wget https://raw.githubusercontent.com/hugocedervall/nlp-project/main/syntax_parser.py
!wget https://raw.githubusercontent.com/hugocedervall/nlp-project/main/projectivize.py
!wget https://raw.githubusercontent.com/hugocedervall/nlp-project/main/taggers.py
!wget https://raw.githubusercontent.com/hugocedervall/nlp-project/main/uas.py
!wget https://raw.githubusercontent.com/hugocedervall/nlp-project/main/window_models.py


## Imports

In [32]:
from batchify import *
from create_vocab import *
from data_handling import *
import syntax_parser as parser 
from projectivize import *
from uas import *
from window_models import *
from taggers import *
import importlib


In [11]:
# Import torch explicitly instead of relying on one of the wildcard imports
# above to leak the name into this namespace.
import torch

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
device

device(type='cuda')

## Prepare data set

### Download data files 

In [None]:
# Download the projectivized dev/train/test CoNLL-U files (en_gum-ud) used below.
!wget https://raw.githubusercontent.com/hugocedervall/nlp-project/main/data/en_gum-ud-dev-projectivized.conllu
!wget https://raw.githubusercontent.com/hugocedervall/nlp-project/main/data/en_gum-ud-train-projectivized.conllu
!wget https://raw.githubusercontent.com/hugocedervall/nlp-project/main/data/en_gum-ud-test-projectivized.conllu

In [33]:
# Wrap each downloaded CoNLL-U split in a Dataset, in train/dev/test order.
train_data, dev_data, test_data = [
    Dataset(conllu_path)
    for conllu_path in (
        './en_gum-ud-train-projectivized.conllu',
        './en_gum-ud-dev-projectivized.conllu',
        './en_gum-ud-test-projectivized.conllu',
    )
]

# Tagger

In [15]:
import torch.optim as optim
import torch
import torch.nn.functional as F

def train_fixed_window(train_data, n_epochs=1, batch_size=100, lr=1e-2):
    """Train a FixedWindowTagger on `train_data` and return it.

    Vocabularies are built from `train_data`, the tagger's model is optimised
    with Adam against a cross-entropy loss, and the running mean loss of the
    current epoch is printed on batch 1, 101, 201, ...

    Args:
        train_data: Training sentences, passed to `make_vocabs` and
            `training_examples_tagger`.
        n_epochs: Number of passes over the training batches.
        batch_size: Accepted for interface compatibility; this function does
            not forward it to `training_examples_tagger`.
        lr: Learning rate for the Adam optimizer.

    Returns:
        The trained tagger object.
    """
    vocab_words, vocab_tags = make_vocabs(train_data)
    tagger = FixedWindowTagger(vocab_words, vocab_tags, len(vocab_tags))
    optimizer = optim.Adam(tagger.model.parameters(), lr=lr)

    for _ in range(n_epochs):
        running_loss = 0
        batches = training_examples_tagger(vocab_words, vocab_tags, train_data, tagger)
        for step, (x, y) in enumerate(batches, start=1):
            optimizer.zero_grad()
            logits = tagger.model.forward(x)

            batch_loss = F.cross_entropy(logits, y)
            batch_loss.backward()
            running_loss += batch_loss.item()
            optimizer.step()

            # Progress report: mean loss over the batches seen so far.
            if step % 100 == 1:
                print(running_loss / step)

    return tagger


## Train tagger

In [16]:
# Train with train_fixed_window's defaults (one epoch, lr=1e-2); the numbers
# printed below are the running mean loss reported every 100 batches.
tagger = train_fixed_window(train_data)

2.941803216934204
1.0744318774726132
0.7767540175671601
0.6463600118393914
0.5697921334285094
0.5255260007556327
0.4930651445381852
0.47285652569841896
0.4522376473848888


## Eval tagger on dev

In [17]:
# Score the trained tagger on the development set (accuracy() comes from the
# downloaded project modules).
accuracy(tagger, dev_data)

0.8845684287632768

# Parser

In [38]:
import torch.optim as optim
import torch
import torch.nn.functional as F
import tqdm as tqdm
import time

# When true, train_fixed_parser creates ./models and stores the state dict of
# the best-scoring epoch there.
SAVE = True 

# Default arguments of train_fixed_parser (lr, batch_size, n_epochs).
LR = 1e-3
BATCH_SIZE = 100
EPOCHS = 6

# Passed positionally to parser.FixedWindowParser in the order
# WORD_DIM, TAG_DIM, LSTM_DIM, LINEAR_HIDDEN_DIM, DROPOUT_VALUE.
# NOTE(review): presumably embedding sizes, hidden sizes and dropout
# probability -- confirm against syntax_parser.py.
LSTM_DIM = 180
LINEAR_HIDDEN_DIM = 180
WORD_DIM = 100
TAG_DIM = 25
DROPOUT_VALUE = 0.3

def train_fixed_parser(train_data, n_epochs=EPOCHS, batch_size=BATCH_SIZE, lr=LR):
    """Train a FixedWindowParser on `train_data` and return it.

    After every epoch the mean training loss and elapsed time are printed,
    the parser is scored with `uas` on the global `dev_data`, and -- when the
    global `SAVE` flag is set -- the model's state dict is written to
    ``./models/lstm_parser`` if the score beats the best one seen so far.

    Args:
        train_data: Training sentences, passed to `make_vocabs` and
            `training_examples_parser`.
        n_epochs: Number of passes over the training batches.
        batch_size: Accepted for interface compatibility; this function does
            not forward it to `training_examples_parser`.
        lr: Learning rate for the Adam optimizer.

    Returns:
        The parser object as it is after the final epoch. This is not
        necessarily the best-scoring checkpoint; that one is on disk.
    """
    # Local import so the function does not depend on a later notebook cell
    # having imported `os` before it is called.
    import os

    # Create folder for saving model
    if SAVE:
        os.makedirs("models", exist_ok=True)

    vocab_words, vocab_tags = make_vocabs(train_data)
    myparser = parser.FixedWindowParser(
        vocab_words, vocab_tags, WORD_DIM, TAG_DIM, LSTM_DIM, LINEAR_HIDDEN_DIM, DROPOUT_VALUE
    )
    myparser.model.train()
    optimizer = optim.Adam(myparser.model.parameters(), lr=lr)

    start_time = time.time()
    best_acc = 0
    for _epoch in tqdm.tqdm(range(n_epochs)):
        total_loss = 0
        batch_nr = 0

        # `idx` used to be named `i`, which silently overwrote the epoch
        # counter of the enclosing loop.
        for words, tags, idx, x, y in training_examples_parser(vocab_words, vocab_tags, train_data, myparser):
            words = words.to(device)
            tags = tags.to(device)
            x = x.to(device)
            y = y.to(device)
            idx = idx.to(device)

            batch_nr += 1

            optimizer.zero_grad()
            y_pred = myparser.model.forward(words[idx], tags[idx], x)

            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            total_loss += loss.item()
            optimizer.step()

        # max(..., 1) avoids a ZeroDivisionError when no batch was produced.
        print("loss: ", total_loss / max(batch_nr, 1), "time was: ", time.time() - start_time)
        acc = uas(myparser, dev_data)
        print("", acc)
        if SAVE and best_acc < acc:
            best_acc = acc
            torch.save(myparser.model.state_dict(), "./models/lstm_parser")
        # Switch back to training mode after the evaluation above.
        myparser.model.train()

    return myparser

## Train parser

In [39]:
# train_fixed_parser uses `os` to create the ./models checkpoint directory,
# so the module has to be importable before the call below.
import os
# Train with the module-level defaults (EPOCHS, BATCH_SIZE, LR).
myparser = train_fixed_parser(train_data)



  0%|          | 0/1 [00:00<?, ?it/s][A[A

loss:  0.3845988561939882 time was:  538.7620625495911


  pred = torch.nn.functional.log_softmax(pred)


100%|██████████| 1/1 [14:19<00:00, 859.56s/it]

 0.8024105654571099





## Load trained model

In [43]:
# Rebuild the parser with the same vocabularies and hyperparameters that
# train_fixed_parser used; otherwise the saved tensors may not match the
# shapes of the freshly constructed model and load_state_dict would fail.
vocab_words, vocab_tags = make_vocabs(train_data)
loaded_parser = parser.FixedWindowParser(
    vocab_words, vocab_tags, WORD_DIM, TAG_DIM, LSTM_DIM, LINEAR_HIDDEN_DIM, DROPOUT_VALUE
)
# map_location lets a checkpoint written on a GPU be restored on a CPU-only host.
state_dict = torch.load("./models/lstm_parser", map_location=device)
loaded_parser.model.load_state_dict(state_dict)
loaded_parser.model = loaded_parser.model.to(device)
# A freshly constructed module starts in training mode; switch to eval mode
# so dropout is disabled for the evaluations below.
loaded_parser.model.eval()

## Eval on dev with gold tags

In [None]:
# Score the reloaded parser with uas() on the development set, using the tags
# stored in the data.
uas(loaded_parser, dev_data)

  pred = torch.nn.functional.log_softmax(pred)


## Eval on test with gold tags

In [None]:
# Score the reloaded parser with uas() on the test set, using the tags stored
# in the data.
uas(loaded_parser, test_data)

## Eval on dev with predicted tags 

In [None]:
def calc_uas_with_tagger_preds(tagger, parser, data):
    """Score `parser` with `uas` on `data` after swapping in predicted tags.

    For every sentence the tag of each token is replaced by the one returned
    by `tagger.predict`, keeping the first and third field of the token tuple
    unchanged. The sentences in `data` are left untouched; the replacement is
    done on copies, so gold tags remain available to later evaluations.

    Args:
        tagger: Object whose `predict(sentence)` returns a sequence of
            (token, tag) pairs aligned with the sentence.
        parser: Parser object handed to `uas`. Note that inside this function
            the name shadows the `syntax_parser` module alias.
        data: Iterable of sentences, each an indexable sequence of tuples
            with at least three fields, the second being the tag.

    Returns:
        Whatever `uas(parser, retagged_sentences)` returns.
    """
    new_data = []
    for sent in data:
        pred_tags = tagger.predict(sent)

        # Replace gold tags with predicted ones on a copy so the caller's
        # data set is not modified.
        retagged = list(sent)
        for i, (_, tag) in enumerate(pred_tags):
            retagged[i] = (sent[i][0], tag, sent[i][2])
        new_data.append(retagged)

    return uas(parser, new_data)

In [None]:
# Pass the parser object: `parser` on its own is the syntax_parser module
# (see `import syntax_parser as parser`), not a trained model.
calc_uas_with_tagger_preds(tagger, loaded_parser, dev_data)