# BERT part of speech tagger

In [1]:
import torch
# If GPU available
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: GeForce GTX 1060 3GB


In [2]:
from batchify import *
from create_vocab import *
from data_handling import *
from parser import *
from projectivize import *
from uas import *
from window_models import *
from taggers import *

## Import data

In [3]:
train_data = Dataset('data/en_gum-ud-train-projectivized.conllu')
dev_data = Dataset('data/en_gum-ud-dev-projectivized.conllu')
test_data = Dataset('data/en_gum-ud-test-projectivized.conllu')

In [4]:
len(list([word for elem in train_data for word in elem]))

86148

In [5]:
word_vocab, label_vocab = make_vocabs(train_data)

In [6]:
words = [[word[0] for word in sent] for sent in train_data]
tags = [word[1] for sent in train_data for word in sent]

In [142]:
#words[:10]

## Load embeddings

In [113]:
import torch

from gensim.models import Word2Vec

word_model = Word2Vec(words, size=200, min_count=1, workers=4)

weights = torch.FloatTensor(word_model.wv.vectors)
embedding = nn.Embedding.from_pretrained(weights)

In [51]:
embedding

Embedding(170, 200)

In [7]:
from gensim.models import KeyedVectors
filename='GoogleNews-vectors-negative300.bin'
word_model2 = KeyedVectors.load_word2vec_format(filename, binary=True)

In [8]:
weights = torch.FloatTensor(word_model2.wv.vectors)
embedding = nn.Embedding.from_pretrained(weights)

  weights = torch.FloatTensor(word_model2.wv.vectors)


In [9]:
word_model2.wv["I", "hey"].shape

  word_model2.wv["I", "hey"].shape


(2, 300)

In [12]:
len(word_model2.wv.vectors)

  len(word_model2.wv.vectors)


3000000

In [138]:
embedding

Embedding(3000000, 300)

## Train tagger

In [10]:
import torch.optim as optim
import torch
import torch.nn.functional as F

from batchify import *
from create_vocab import *
from data_handling import *
from parser import *
from projectivize import *
from uas import *
from window_models import *
from taggers import *



# If GPU available
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

    
    
train_data = Dataset('data/en_gum-ud-train-projectivized.conllu')
dev_data = Dataset('data/en_gum-ud-dev-projectivized.conllu')
test_data = Dataset('data/en_gum-ud-test-projectivized.conllu')

    
def train_fixed_window(train_data, n_epochs=2, batch_size=1, lr=1e-3):
    vocab_words, vocab_tags = make_vocabs(train_data)
    tagger = FixedWindowTagger(vocab_words, vocab_tags, len(vocab_tags))
    tagger.model.to(device)
    
    optimizer = optim.Adam(tagger.model.parameters(), lr=lr)
    for i in range(n_epochs):
        total_loss = 0
        batch_nr = 0
        for x, y in training_examples_tagger2(vocab_words, vocab_tags, train_data, tagger):
            x = x.to(device)
            y = y.to(device)
            batch_nr += 1
            
            optimizer.zero_grad()
            # shape + (batch, seq_len, classes)
            y_pred = tagger.model.forward(x)
            # Cross entropy wants (batch, classes, seq_len)
            print(y_pred.shape)
            #y_pred = y_pred.reshape((y_pred.shape[0], y_pred.shape[2], y_pred.shape[1]))
            if(i==14449):
                print(".....")
                print(torch.argmax(y_pred, dim=1).tolist()[0])
                print(x.tolist()[0])
                print(y.tolist()[0])
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            total_loss += loss.item()
            optimizer.step()
            #if batch_nr % 500 == 1:
                #print(total_loss/batch_nr)
                #pass
        print(f"Epoch {i}, loss: {total_loss/batch_nr:.4f}, val_acc: {accuracy_sentences(tagger, train_data):.4f}")
    return tagger


There are 1 GPU(s) available.
We will use the GPU: GeForce RTX 3070


In [2]:
x = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
y = [1, 17, 10, 11, 3, 6, 3, 2, 5, 7, 10, 11, 3, 5, 15, 5, 5, 10, 8, 6]
#res = tagger.model.forward(torch.tensor(x).unsqueeze(0).to(device))
#res = res.reshape((res.shape[0], res.shape[2], res.shape[1]))
#print(torch.argmax(res, dim=1))

id_to_tag = list(tagger.vocab_tags.keys())
id_to_word = list(tagger.vocab_words.keys())

print(list(map(lambda tag : id_to_word[tag], x)))

input_sent = torch.tensor(x).unsqueeze(0).to(device)
pred = tagger.model.forward(input_sent)
pred = pred.reshape((pred.shape[0], pred.shape[2], pred.shape[1]))
pred_tags = torch.argmax(pred, dim=1).squeeze(0)
print(pred_tags)
pred_tags_list = pred_tags.tolist()
print(list(map(lambda tag : id_to_tag[tag], pred_tags_list)))




NameError: name 'tagger' is not defined

In [11]:
tagger = train_fixed_window(train_data, n_epochs=100)

Epoch 0, loss: -12.6118, val_acc: 0.0606
Epoch 1, loss: -40.0902, val_acc: 0.0590
Epoch 2, loss: -66.3024, val_acc: 0.0599
Epoch 3, loss: -91.9623, val_acc: 0.0607
Epoch 4, loss: -117.3589, val_acc: 0.0603
Epoch 5, loss: -142.6423, val_acc: 0.0615
Epoch 6, loss: -167.7978, val_acc: 0.0615
Epoch 7, loss: -192.9043, val_acc: 0.0611
Epoch 8, loss: -218.0611, val_acc: 0.0612
Epoch 9, loss: -243.1523, val_acc: 0.0629
Epoch 10, loss: -268.1913, val_acc: 0.0627
Epoch 11, loss: -293.1948, val_acc: 0.0630
Epoch 12, loss: -318.1716, val_acc: 0.0630
Epoch 13, loss: -343.1265, val_acc: 0.0634
Epoch 14, loss: -368.0781, val_acc: 0.0634
Epoch 15, loss: -393.1441, val_acc: 0.0631
Epoch 16, loss: -418.1827, val_acc: 0.0631
Epoch 17, loss: -443.1799, val_acc: 0.0631
Epoch 18, loss: -468.1530, val_acc: 0.0631
Epoch 19, loss: -493.1088, val_acc: 0.0636
Epoch 20, loss: -518.0505, val_acc: 0.0636
Epoch 21, loss: -542.9814, val_acc: 0.0634
Epoch 22, loss: -567.9027, val_acc: 0.0634
Epoch 23, loss: -592.8156

In [8]:
accuracy_sentences(tagger, train_data)

0.08094209964247574

In [5]:
for sentence in train_data:
    pred = tagger.predict_sentence(sentence)
    for i in range(len(sentence)):
        print(sentence[i][1], pred[i])

<root> PROPN
ADJ AUX
NOUN ADP
CCONJ PROPN
ADJ ADV
NOUN ADP
PUNCT ADJ
<root> PUNCT
NOUN X
ADP DET
NOUN X
<root> ADJ
PROPN NUM
PROPN INTJ
PROPN NUM
PROPN SYM
ADP PRON
PROPN NUM
PUNCT VERB
PROPN NUM
PROPN NUM
<root> SYM
PROPN SYM
PROPN ADJ
PROPN SYM
PROPN SYM
PROPN SYM
PUNCT X
PROPN X
PROPN X
<root> SYM
PROPN SYM
PROPN ADJ
PROPN SYM
PROPN SYM
PROPN SYM
PUNCT X
PROPN X
PROPN X
<root> SYM
PROPN SYM
PROPN ADJ
PROPN SYM
PROPN SYM
PROPN SYM
PUNCT X
PROPN DET
PROPN X
<root> ADJ
ADV PROPN
AUX PUNCT
NOUN ADP
VERB DET
ADP PRON
CCONJ ADP
VERB CCONJ
NOUN PROPN
PUNCT VERB
<root> CCONJ
DET PROPN
NOUN DET
ADP ADP
ADJ VERB
NOUN PUNCT
AUX DET
PRON PUNCT
VERB DET
ADP <root>
PUNCT PUNCT
<root> PUNCT
AUX AUX
NOUN PUNCT
NOUN AUX
VERB CCONJ
DET AUX
NOUN <root>
ADP CCONJ
SCONJ AUX
NOUN PUNCT
VERB CCONJ
ADP X
NOUN NUM
PUNCT CCONJ
<root> NOUN
DET AUX
NOUN CCONJ
NOUN PUNCT
ADP X
NOUN ADV
AUX ADJ
DET CCONJ
ADJ ADJ
NUM ADJ
PUNCT ADJ
VERB ADV
NOUN CCONJ
ADP PROPN
NOUN VERB
PUNCT ADP
NOUN CCONJ
PUNCT DET
NOUN ADJ
PUN

In [None]:
accuracy(tagger, dev_data)

In [7]:
accuracy(tagger, test_data)

NameError: name 'tagger' is not defined

In [13]:
accuracy(tagger, train_data)

0.9864651529925245