# Dependency parsing baseline


Dependency parsing is the task of mapping a sentence to a formal representation of its syntactic structure in the form of a dependency tree, which consists of directed arcs between individual words (tokens). Here we will implement a dependency parser baseline based on the arc-standard algorithm and the fixed-window model that we implemented in Lab L3.

### Download necessary Python files

In [None]:
# Fetch the project's helper modules into the working directory so that the
# import cell below can load them.
# NOTE(review): tagger_lstm (imported further down in this notebook) is not
# fetched here - confirm it is available before running the last section.
!wget https://raw.githubusercontent.com/hugocedervall/nlp-project/main/batchify.py
!wget https://raw.githubusercontent.com/hugocedervall/nlp-project/main/create_vocab.py
!wget https://raw.githubusercontent.com/hugocedervall/nlp-project/main/data_handling.py
!wget https://raw.githubusercontent.com/hugocedervall/nlp-project/main/syntax_parser.py
!wget https://raw.githubusercontent.com/hugocedervall/nlp-project/main/projectivize.py
!wget https://raw.githubusercontent.com/hugocedervall/nlp-project/main/taggers.py
!wget https://raw.githubusercontent.com/hugocedervall/nlp-project/main/uas.py
!wget https://raw.githubusercontent.com/hugocedervall/nlp-project/main/window_models.py


## Imports

In [1]:
from batchify import *
from create_vocab import *
from data_handling import *
import syntax_parser as parser 
from projectivize import *
from uas import *
from window_models import *
from taggers import *
import importlib


In [2]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): `torch` is not imported explicitly before this cell; it
# presumably arrives through one of the star imports above - confirm.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
device

device(type='cuda')

## Prepare data set

### Download data files 

In [None]:
!wget https://raw.githubusercontent.com/hugocedervall/nlp-project/main/data/en_gum-ud-dev-projectivized.conllu
!wget https://raw.githubusercontent.com/hugocedervall/nlp-project/main/data/en_gum-ud-train-projectivized.conllu
!wget https://raw.githubusercontent.com/hugocedervall/nlp-project/main/data/en_gum-ud-test-projectivized.conllu

In [4]:
# Load the projectivized train/dev/test splits; the paths are relative to a
# local data/ directory.
train_data = Dataset('data/en_gum-ud-train-projectivized.conllu')
dev_data = Dataset('data/en_gum-ud-dev-projectivized.conllu')
test_data = Dataset('data/en_gum-ud-test-projectivized.conllu')

# Tagger

In [5]:
import torch.optim as optim
import torch
import torch.nn.functional as F

def train_fixed_window(train_data, n_epochs=1, batch_size=100, lr=1e-2):
    """Train a FixedWindowTagger on ``train_data`` and return it.

    The vocabularies are built with ``make_vocabs``. The tagger's model is
    optimised with Adam on the cross-entropy loss of every batch yielded by
    ``training_examples_tagger``. The running mean loss of the current epoch
    is printed at batch 1, 101, 201, ...

    Args:
        train_data: Training sentences; passed to ``make_vocabs`` and
            ``training_examples_tagger``.
        n_epochs: Number of passes over the training examples.
        batch_size: Kept for interface compatibility. It is not forwarded to
            ``training_examples_tagger``, so it has no effect here.
            NOTE(review): confirm whether the batching helper should get it.
        lr: Learning rate for the Adam optimizer.

    Returns:
        The trained ``FixedWindowTagger``.
    """
    vocab_words, vocab_tags = make_vocabs(train_data)
    tagger = FixedWindowTagger(vocab_words, vocab_tags, len(vocab_tags))

    optimizer = optim.Adam(tagger.model.parameters(), lr=lr)
    for _ in range(n_epochs):
        total_loss = 0
        batch_nr = 0
        for x, y in training_examples_tagger(vocab_words, vocab_tags, train_data, tagger):
            batch_nr += 1

            optimizer.zero_grad()
            # Call the module itself rather than .forward() so that any
            # registered hooks run as well.
            y_pred = tagger.model(x)

            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            total_loss += loss.item()
            optimizer.step()

            # Report the running mean loss once every 100 batches.
            if batch_nr % 100 == 1:
                print(total_loss / batch_nr)
    return tagger


## Train tagger

In [9]:
tagger = train_fixed_window(train_data)

2.9646172523498535
1.073240417212543
0.7750730240967736
0.6449551091736734
0.570190991063665
0.5248511874985077
0.4928285411486213
0.47443602890966624
0.4532936280325855


## Eval tagger on dev

In [10]:
accuracy(tagger, dev_data)

0.8816994261994873

# Parser

In [8]:
import torch.optim as optim
import torch
import torch.nn.functional as F
import tqdm as tqdm
import time

import os  # needed by train_fixed_parser to create the checkpoint folder

# When True, the best checkpoint (by dev UAS) is written to ./models/lstm_parser.
SAVE = True

# Optimisation settings; used as the defaults of train_fixed_parser.
LR = 1e-3
BATCH_SIZE = 100
EPOCHS = 6

# Model hyperparameters; forwarded positionally to parser.FixedWindowParser.
LSTM_DIM = 180
LINEAR_HIDDEN_DIM = 180
WORD_DIM = 100
TAG_DIM = 25
DROPOUT_VALUE = 0.3

def train_fixed_parser(train_data, n_epochs=EPOCHS, batch_size=BATCH_SIZE, lr=LR):
    """Train a ``parser.FixedWindowParser`` on ``train_data`` and return it.

    After every epoch the mean training loss and the elapsed time are
    printed, and the parser is scored with ``uas`` on the notebook-level
    ``dev_data``. When ``SAVE`` is true, the state dict of the model is
    written to ``./models/lstm_parser`` whenever the dev score improves.

    Args:
        train_data: Training sentences; passed to ``make_vocabs`` and
            ``training_examples_parser``.
        n_epochs: Number of passes over the training examples.
        batch_size: Kept for interface compatibility. It is not forwarded to
            ``training_examples_parser``, so it has no effect here.
            NOTE(review): confirm whether the batching helper should get it.
        lr: Learning rate for the Adam optimizer.

    Returns:
        The parser as it is after the final epoch. This is not necessarily
        the best-scoring checkpoint; that one is only stored on disk.
    """
    # Create the checkpoint folder up front (a no-op if it already exists).
    if SAVE:
        os.makedirs("models", exist_ok=True)

    vocab_words, vocab_tags = make_vocabs(train_data)
    myparser = parser.FixedWindowParser(vocab_words, vocab_tags, WORD_DIM, TAG_DIM, LSTM_DIM, LINEAR_HIDDEN_DIM, DROPOUT_VALUE)
    # Every batch is moved to `device` below, so the model has to live there
    # too. Module.to() leaves it untouched if it is already on that device.
    myparser.model.to(device)
    myparser.model.train()
    optimizer = optim.Adam(myparser.model.parameters(), lr=lr)

    start_time = time.time()
    best_acc = 0
    for _ in tqdm.tqdm(range(n_epochs)):
        total_loss = 0
        batch_nr = 0

        # `idx` indexes the first dimension of `words` and `tags`; it gets its
        # own name so that it does not shadow the epoch loop variable.
        for words, tags, idx, x, y in training_examples_parser(vocab_words, vocab_tags, train_data, myparser):
            words = words.to(device)
            tags = tags.to(device)
            x = x.to(device)
            y = y.to(device)
            idx = idx.to(device)

            batch_nr += 1

            optimizer.zero_grad()
            # Call the module itself rather than .forward() so that any
            # registered hooks run as well.
            y_pred = myparser.model(words[idx], tags[idx], x)

            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            total_loss += loss.item()
            optimizer.step()

        # max(..., 1) avoids a ZeroDivisionError when no batch was produced.
        print("loss: ", total_loss / max(batch_nr, 1), "time was: ", time.time() - start_time)
        acc = uas(myparser, dev_data)
        print("", acc)
        if SAVE and best_acc < acc:
            best_acc = acc
            torch.save(myparser.model.state_dict(), "./models/lstm_parser")
        # Switch back to training mode before the next epoch.
        myparser.model.train()

    return myparser

## Train parser

In [12]:
import os
myparser = train_fixed_parser(train_data)

  0%|          | 0/6 [00:00<?, ?it/s]

loss:  0.3870662623013759 time was:  135.02782893180847


  pred = torch.nn.functional.log_softmax(pred)
 17%|█▋        | 1/6 [03:49<19:07, 229.47s/it]

 0.8006154635209642
loss:  0.23430996226512732 time was:  361.3182520866394


 33%|███▎      | 2/6 [07:35<15:13, 228.47s/it]

 0.8300423131170663
loss:  0.16593136837601055 time was:  591.2211616039276


 50%|█████     | 3/6 [11:25<11:27, 229.04s/it]

 0.8331196307218874
loss:  0.12016238521166418 time was:  820.3819451332092


 67%|██████▋   | 4/6 [15:16<07:38, 229.48s/it]

 0.8377997179125529
loss:  0.09332666659199107 time was:  1049.4799551963806


 83%|████████▎ | 5/6 [19:05<03:49, 229.22s/it]

 0.837222720861649
loss:  0.07550786692880988 time was:  1278.283739566803


100%|██████████| 6/6 [22:52<00:00, 228.75s/it]

 0.8481215540453905





## Load trained model

In [9]:
# Rebuild the parser with the same hyperparameters that train_fixed_parser
# passes to FixedWindowParser, so the architecture matches the saved state dict.
vocab_words, vocab_tags = make_vocabs(train_data)
loaded_parser = parser.FixedWindowParser(vocab_words, vocab_tags, WORD_DIM, TAG_DIM, LSTM_DIM, LINEAR_HIDDEN_DIM, DROPOUT_VALUE)
# map_location lets a checkpoint that was saved on the GPU load on a
# CPU-only machine as well.
loaded_parser.model.load_state_dict(torch.load("./models/lstm_parser", map_location=device))
loaded_parser.model = loaded_parser.model.to(device)
# The loaded model is only used for evaluation: put it in eval mode so that
# training-only layers such as dropout are disabled.
loaded_parser.model.eval()

## Eval on dev with gold tags

In [12]:
uas(loaded_parser, dev_data)

0.8492114373637646

## Eval on test with gold tags

In [16]:
uas(loaded_parser, test_data)

0.8410146929549165

## Eval on dev with predicted tags 

In [10]:
from tagger_lstm import *
# NOTE: this rebinds `tagger`, replacing the fixed-window tagger that was
# trained earlier in this notebook.
# NOTE(review): get_saved_tagger presumably comes from tagger_lstm - confirm.
tagger = get_saved_tagger().to(device)

#test_loss, test_acc = evaluate(tagger, criterion, TAG_PAD_IDX, test_data)
#print(f"{test_acc*100:.3f} %")

# Rebuild the vocabularies; calc_uas_with_tagger_preds below reads
# `vocab_tags` as a global.
vocab_words, vocab_tags = make_vocabs(train_data)
def calc_uas_with_tagger_preds(tagger, parser, data):
    """Score ``parser`` with ``uas`` on ``data`` re-tagged by ``tagger``.

    For every sentence the tags are predicted with ``predict_tags`` (using
    the notebook-level ``vocab_tags``) and written into position 1 of each
    token tuple; positions 0 and 2 are kept as they are. The substitution is
    done on a copy of each sentence, so the tags stored in ``data`` are left
    untouched and ``data`` can still be used for gold-tag evaluation later.

    Args:
        tagger: Tagger handed to ``predict_tags``.
        parser: Parser handed to ``uas``. Inside this function the name
            shadows the ``parser`` module alias used elsewhere in the notebook.
        data: Iterable of sentences; each sentence is a sequence of
            3-tuples whose second element is the tag.

    Returns:
        Whatever ``uas`` returns for the re-tagged sentences.
    """
    retagged_data = []
    for sent in data:
        words = sent2id(sent)
        pred_tags = predict_tags(tagger, words, vocab_tags)

        # Copy the sentence before substituting, so the caller's data is not
        # modified.
        retagged = list(sent)
        for i, tag in enumerate(pred_tags):
            retagged[i] = (retagged[i][0], tag, retagged[i][2])
        retagged_data.append(retagged)

    return uas(parser, retagged_data)

There are 1 GPU(s) available.
We will use the GPU: GeForce RTX 3070


In [11]:
calc_uas_with_tagger_preds(tagger, loaded_parser, dev_data)

  pred = torch.nn.functional.log_softmax(pred)


0.7809975637902296