In [2]:
import os
from tqdm import tqdm_notebook as tqdm
import numpy as np
import torch
import torch.nn as nn
from torch.utils import data
import torch.optim as optim
import nltk
from pytorch_pretrained_bert import BertTokenizer

In [3]:
#Data preparation

In [4]:
training_file = open("corpus", "r")
all_lines = training_file.readlines()
training_file.close()

In [5]:
def split_words():
    for words in all_lines:
        split_words=words.split(' ')
    return split_words

In [6]:
tagged=list(split_words())
tagged

['Tene/JJ',
 'yĩla/RB',
 'Ĩsilaeli/NP',
 "yatongoew'e/VB",
 'nĩ/PRE',
 'Asili/NNS',
 ',/COMMA',
 'nĩkweethĩiwe/VB',
 'na/CONJ',
 'yũa/NN',
 'nthĩ/NN',
 'ĩsu/JJ',
 './.',
 'Kwoou/JJ',
 'mũndũ/NN',
 'ũmwe/JJ',
 'kuma/PRE',
 'Mbetheleemu/NP',
 'nthĩ/NN',
 'ya/PRE',
 'Yuta/NP',
 'nĩwaendie/VB',
 'e/RB',
 'na/CONJ',
 'mũka/NN',
 'na/CONJ',
 'ana/NNS',
 'make/PP$',
 'elĩ/NUM',
 'kwĩkala/VB',
 'kwa/RB',
 'kavinda/NN',
 'ũeninĩ/JJ',
 'nthĩ/NN',
 'ya/PRE',
 'Moavi/NP',
 './.',
 'Mũndũ/NN',
 'ũsu/JJ',
 'eetawa/VB',
 'Elimeleki/NP',
 ',/COMMA',
 'na/CONJ',
 'mũka/NN',
 'eetawa/VB',
 'Naũmi/NP',
 './.',
 'Ana/NNS',
 'make/PP$',
 'ũmwe/NUM',
 'eetawa/VB',
 'Maloni/NP',
 'na/CONJ',
 'ũla/DET',
 'ũngĩ/JJ',
 'eetawa/VB',
 'Kilioni/NP',
 './.',
 'Andũ/NNS',
 'asu/JJ',
 'maĩ/VB',
 'ma/PP$',
 'mũsyĩ/NN',
 'wa/PRE',
 'Aevilathi/NPS',
 'ala/JJ',
 'matwĩe/VB',
 'Mbetheleemu/NP',
 'nthĩ/NN',
 'ya/PRE',
 'Yuta/NP',
 './.',
 'Nĩmaendie/VB',
 'Moavi/NP',
 'matũa/VB',
 "kw'o/RB",
 './.',
 'Elimeleki/NP',
 'mũũme

In [8]:
#categorize and tag words using str2tupple
tagged_sents=[nltk.tag.str2tuple(t) for t in tagged]
len(tagged_sents)

134

In [9]:
tagged_sents[0]

('Tene', 'JJ')

In [16]:
#tags = list(set(word_pos[1] for sent in tagged_tokens for word_pos in sent))
tags=list(set([pair[1] for pair in tagged_sents]))
tags

['PP$',
 'NPS',
 'NN',
 'PRE',
 '.',
 'NP',
 'RB',
 'VB',
 'CONJ',
 'NUM',
 'NNS',
 'COMMA',
 '.\n',
 'DET',
 'JJ']

In [17]:
",".join(tags)

'PP$,NPS,NN,PRE,.,NP,RB,VB,CONJ,NUM,NNS,COMMA,.\n,DET,JJ'

In [18]:
# By convention, the 0'th slot is reserved for padding.
tags = ["<pad>"] + tags

In [19]:
tags

['<pad>',
 'PP$',
 'NPS',
 'NN',
 'PRE',
 '.',
 'NP',
 'RB',
 'VB',
 'CONJ',
 'NUM',
 'NNS',
 'COMMA',
 '.\n',
 'DET',
 'JJ']

In [20]:
tag2idx = {tag:idx for idx, tag in enumerate(tags)}
idx2tag = {idx:tag for idx, tag in enumerate(tags)}

In [21]:
tag2idx

{'<pad>': 0,
 'PP$': 1,
 'NPS': 2,
 'NN': 3,
 'PRE': 4,
 '.': 5,
 'NP': 6,
 'RB': 7,
 'VB': 8,
 'CONJ': 9,
 'NUM': 10,
 'NNS': 11,
 'COMMA': 12,
 '.\n': 13,
 'DET': 14,
 'JJ': 15}

In [22]:
idx2tag

{0: '<pad>',
 1: 'PP$',
 2: 'NPS',
 3: 'NN',
 4: 'PRE',
 5: '.',
 6: 'NP',
 7: 'RB',
 8: 'VB',
 9: 'CONJ',
 10: 'NUM',
 11: 'NNS',
 12: 'COMMA',
 13: '.\n',
 14: 'DET',
 15: 'JJ'}

In [23]:
# Let's split the data into train and test (or eval)
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(tagged_sents, test_size=.1)
len(train_data), len(test_data)

(120, 14)

In [24]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [39]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

In [40]:
# sents,tags_li=[],[]
# words = [word_pos[0] for word_pos in tagged_sents]
# tags = [word_pos[1] for word_pos in tagged_sents]
# sents.append(["[CLS]"] + words + ["[SEP]"])
# tags_li.append(["<pad>"] + tags + ["<pad>"])

In [51]:
class PosDataset(data.Dataset):
    def __init__(self, tagged_sents):
        sents, tags_li = [], [] # list of lists
        words = [word_pos[0] for word_pos in tagged_sents]
        tags = [word_pos[1] for word_pos in tagged_sents]
        sents.append(["[CLS]"] + words + ["[SEP]"])
        tags_li.append(["<pad>"] + tags + ["<pad>"])
        self.sents, self.tags_li = sents, tags_li
#         for sent in tagged_sents:
#             words = [word_pos[0] for word_pos in sent]
#             tags = [word_pos[1] for word_pos in sent]
#             sents.append(["[CLS]"] + words + ["[SEP]"])
#             tags_li.append(["<pad>"] + tags + ["<pad>"])
#         self.sents, self.tags_li = sents, tags_li

    def __len__(self):
        return len(self.sents)

    def __getitem__(self, idx):
        words, tags = self.sents[idx], self.tags_li[idx] # words, tags: string list

        # We give credits only to the first piece.
        x, y = [], [] # list of ids
        is_heads = [] # list. 1: the token is the first piece of a word
        for w, t in zip(words, tags):
            tokens = tokenizer.tokenize(w) if w not in ("[CLS]", "[SEP]") else [w]
            xx = tokenizer.convert_tokens_to_ids(tokens)

            is_head = [1] + [0]*(len(tokens) - 1)

            t = [t] + ["<pad>"] * (len(tokens) - 1)  # <PAD>: no decision
            yy = [tag2idx[each] for each in t]  # (T,)

            x.extend(xx)
            is_heads.extend(is_head)
            y.extend(yy)
        
        assert len(x)==len(y)==len(is_heads), "len(x)={}, len(y)={}, len(is_heads)={}".format(len(x), len(y), len(is_heads))

        # seqlen
        seqlen = len(y)

        # to string
        words = " ".join(words)
        tags = " ".join(tags)
        return words, x, is_heads, tags, y, seqlen

In [52]:
def pad(batch):
    '''Pads to the longest sample'''
    f = lambda x: [sample[x] for sample in batch]
    words = f(0)
    is_heads = f(2)
    tags = f(3)
    seqlens = f(-1)
    maxlen = np.array(seqlens).max()

    f = lambda x, seqlen: [sample[x] + [0] * (seqlen - len(sample[x])) for sample in batch] # 0: <pad>
    x = f(1, maxlen)
    y = f(-2, maxlen)


    f = torch.LongTensor

    return words, f(x), is_heads, tags, f(y), seqlens

In [53]:
from pytorch_pretrained_bert import BertModel

In [54]:
class Net(nn.Module):
    def __init__(self, vocab_size=None):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')

        self.fc = nn.Linear(768, vocab_size)
        self.device = device

    def forward(self, x, y):
        '''
        x: (N, T). int64
        y: (N, T). int64
        '''
        x = x.to(device)
        y = y.to(device)
        
        if self.training:
            self.bert.train()
            encoded_layers, _ = self.bert(x)
            enc = encoded_layers[-1]
        else:
            self.bert.eval()
            with torch.no_grad():
                encoded_layers, _ = self.bert(x)
                enc = encoded_layers[-1]
        
        logits = self.fc(enc)
        y_hat = logits.argmax(-1)
        return logits, y, y_hat

In [55]:
def train(model, iterator, optimizer, criterion):
    model.train()
    for i, batch in enumerate(iterator):
        words, x, is_heads, tags, y, seqlens = batch
        _y = y # for monitoring
        optimizer.zero_grad()
        logits, y, _ = model(x, y) # logits: (N, T, VOCAB), y: (N, T)

        logits = logits.view(-1, logits.shape[-1]) # (N*T, VOCAB)
        y = y.view(-1)  # (N*T,)

        loss = criterion(logits, y)
        loss.backward()

        optimizer.step()

        if i%10==0: # monitoring
            print("step: {}, loss: {}".format(i, loss.item()))

In [56]:
def eval(model, iterator):
    model.eval()

    Words, Is_heads, Tags, Y, Y_hat = [], [], [], [], []
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            words, x, is_heads, tags, y, seqlens = batch

            _, _, y_hat = model(x, y)  # y_hat: (N, T)

            Words.extend(words)
            Is_heads.extend(is_heads)
            Tags.extend(tags)
            Y.extend(y.numpy().tolist())
            Y_hat.extend(y_hat.cpu().numpy().tolist())

    ## gets results and save
    with open("result", 'w') as fout:
        for words, is_heads, tags, y_hat in zip(Words, Is_heads, Tags, Y_hat):
            y_hat = [hat for head, hat in zip(is_heads, y_hat) if head == 1]
            preds = [idx2tag[hat] for hat in y_hat]
            assert len(preds)==len(words.split())==len(tags.split())
            for w, t, p in zip(words.split()[1:-1], tags.split()[1:-1], preds[1:-1]):
                fout.write("{} {} {}\n".format(w, t, p))
            fout.write("\n")
            
    ## calc metric
    y_true =  np.array([tag2idx[line.split()[1]] for line in open('result', 'r').read().splitlines() if len(line) > 0])
    y_pred =  np.array([tag2idx[line.split()[2]] for line in open('result', 'r').read().splitlines() if len(line) > 0])

    acc = (y_true==y_pred).astype(np.int32).sum() / len(y_true)

    print("acc=%.2f"%acc)

In [57]:
model = Net(vocab_size=len(tag2idx))
model.to(device)
model = nn.DataParallel(model)

In [58]:
train_dataset = PosDataset(train_data)
eval_dataset = PosDataset(test_data)

train_iter = data.DataLoader(dataset=train_dataset,
                             batch_size=8,
                             shuffle=True,
                             num_workers=1,
                             collate_fn=pad)
test_iter = data.DataLoader(dataset=eval_dataset,
                             batch_size=8,
                             shuffle=False,
                             num_workers=1,
                             collate_fn=pad)

optimizer = optim.Adam(model.parameters(), lr = 0.0001)

criterion = nn.CrossEntropyLoss(ignore_index=0)

In [59]:
train(model, train_iter, optimizer, criterion)

step: 0, loss: 2.8034958839416504


In [60]:
eval(model, test_iter)

acc=0.43


In [61]:
open('result', 'r').read().splitlines()[:100]

['nĩkweethĩiwe VB VB',
 'elĩ NUM NN',
 ', COMMA COMMA',
 'make PP$ RB',
 'ĩkũmi NUM NP',
 'mũndũ NN NN',
 'Moavi NP NP',
 'ũla DET JJ',
 'Mbetheleemu NP NP',
 'na CONJ VB',
 'nthĩ NN NN',
 'nĩ PRE VB',
 'ana NNS NP',
 'Kwoou JJ VB',
 '']