In [19]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
import sys
import numpy as np

sys.path.append("../")

warnings.filterwarnings("ignore")

In [20]:
import os

data_dir = "/data/aaemeljanov/training_data_v10/processed/"
train_path = data_dir + "brexit_parsed.csv"
valid_path = data_dir + "asia_bibi_parsed.csv"
model_dir = "/home/aaemeljanov/models/multi_cased_L-12_H-768_A-12/"
init_checkpoint_pt = os.path.join(model_dir, "pytorch_model.bin")
bert_config_file = os.path.join(model_dir, "bert_config.json")
vocab_file = os.path.join(model_dir, "vocab.txt")

In [3]:
import pandas as pd

In [4]:
tr = pd.read_csv(train_path)

In [5]:
vl = pd.read_csv(valid_path)

In [6]:
len(tr), len(vl)

(22621, 5457)

In [7]:
tr = tr.append(vl)

In [8]:
len(tr)

28078

In [9]:
tr.to_csv(data_dir + "all.csv", index=False)

In [12]:
from modules import BertNerData as NerData
from modules.models.bert_models import BertBiLSTMAttnNMTJoint, BertBiLSTMAttnNCRFJoint
from modules import NerLearner

In [5]:
data = NerData.create(train_path, valid_path, vocab_file, is_cls=True, batch_size=16)

                                                       

In [6]:
import torch

In [7]:
torch.cuda.set_device(1)

In [8]:
model = BertBiLSTMAttnNCRFJoint.create(
        len(data.label2idx), len(data.cls2idx), bert_config_file, init_checkpoint_pt,
        enc_hidden_dim=1024, rnn_layers=1, num_heads=6, input_dropout=0.5, nbest=11)

build CRF...


In [9]:
num_epochs = 150
learner = NerLearner(model, data,
                     best_model_path="/home/aaemeljanov/models/AGRR-2019/slavic-clf.cpt",
                     lr=0.0001, clip=1.0, sup_labels=data.id2label[1:],
                     t_total=num_epochs * len(data.train_dl))

In [10]:
learner.load_model()

In [11]:
from modules.data.bert_data import get_bert_data_loader_for_predict

In [12]:
dl = get_bert_data_loader_for_predict(valid_path, learner)

                                                     

In [39]:
preds_res, preds_cls = learner.predict(dl)

                                                 

In [40]:
from sklearn_crfsuite.metrics import flat_classification_report

In [41]:
from modules.utils.utils import bert_labels2tokens, first_choicer

In [69]:
pred_tokens, pred_labels = bert_labels2tokens(dl, preds_res)
true_tokens, true_labels = bert_labels2tokens(dl, [x.labels for x in dl.dataset])

In [70]:
len(true_tokens)

5457

In [71]:
assert pred_tokens == true_tokens
tokens_report = flat_classification_report(true_labels, pred_labels)

In [72]:
print(tokens_report)

              precision    recall  f1-score   support

       I_EVT       0.53      0.32      0.40        53
       I_LOC       0.69      0.91      0.78      1432
         I_O       0.99      0.99      0.99     72129
       I_ORG       0.71      0.70      0.71      1869
       I_PER       0.88      0.82      0.85      4107
       I_PRO       0.45      0.53      0.49       244

   micro avg       0.97      0.97      0.97     79834
   macro avg       0.71      0.71      0.70     79834
weighted avg       0.97      0.97      0.97     79834



In [45]:
from sklearn.metrics import accuracy_score, f1_score

pl = []
for line in pred_labels:
    for p in line:
        pl.append(p)
        
tl = []
for line in true_labels:
    for p in line:
        tl.append(p)
accuracy_score(tl, pl), f1_score(tl, pl, average="macro")

(0.971378109577373, 0.7029411114912586)

In [49]:
true_cls = [x.cls for x in dl.dataset]

In [54]:
print(flat_classification_report([true_cls], [preds_cls]))

              precision    recall  f1-score   support

       False       0.83      0.99      0.90      1727
        True       1.00      0.91      0.95      3730

   micro avg       0.93      0.93      0.93      5457
   macro avg       0.91      0.95      0.93      5457
weighted avg       0.94      0.93      0.93      5457



### Fix with cls result

In [57]:
len(preds_cls) - np.array(preds_cls).sum()

2068

In [47]:
fixed_labels = []
for cls, label in zip(preds_cls, pred_tokens):
    if not cls:
        label = ["I_O"] * len(label)
    fixed_labels.append(label)

In [48]:
from sklearn.metrics import accuracy_score, f1_score

pl = []
for line in fixed_labels:
    for p in line:
        pl.append(p)

tl = []
for line in true_labels:
    for p in line:
        tl.append(p)
accuracy_score(tl, pl), f1_score(tl, pl, average="macro")

(0.3545732394719042, 4.998277433302695e-05)

In [58]:
from modules.utils.utils import tokens2spans

In [59]:
pred_spans = tokens2spans(pred_tokens, pred_labels)

In [60]:
true_spans = tokens2spans(true_tokens, true_labels)

In [61]:
from modules.utils.plot_metrics import get_bert_span_report

In [62]:
from modules.utils.utils import voting_choicer

In [74]:
print(get_bert_span_report(dl, preds_res, fn=first_choicer))

              precision    recall  f1-score   support

         PER      0.733     0.725     0.729      2851
         PRO      0.384     0.547     0.451       170
         EVT      0.385     0.370     0.377        27
         LOC      0.648     0.872     0.744      1472
         ORG      0.550     0.630     0.587       986

   micro avg      0.658     0.740     0.696      5506
   macro avg      0.540     0.629     0.578      5506
weighted avg      0.665     0.740     0.697      5506



In [64]:
max([len(data.train_dl.dataset[idx].bert_tokens) for idx in range(len(data.train_dl.dataset))])

144

### Predict test

In [21]:
test_path = data_dir + "test.csv"

In [None]:
from modules import BertNerData as NerData
from modules.models.bert_models import BertBiLSTMAttnNMTJoint, BertBiLSTMAttnNCRFJoint
from modules import NerLearner

data = NerData.create(train_path, test_path, vocab_file, is_cls=True, batch_size=16)

In [17]:
data.cls2idx

{False: 1, 'O': 2, True: 0}

In [18]:
import torch

torch.cuda.set_device(4)

model = BertBiLSTMAttnNCRFJoint.create(
        len(data.label2idx), 2, bert_config_file, init_checkpoint_pt,
        enc_hidden_dim=1024, rnn_layers=1, num_heads=6, input_dropout=0.5, nbest=11)
# model = BertBiLSTMAttnNMTJoint.create(
#    len(data.label2idx), len(data.cls2idx), bert_config_file, init_checkpoint_pt,
#    enc_hidden_dim=128, rnn_layers=1, dec_embedding_dim=32, dec_hidden_dim=128, input_dropout=0.5, nbest=11)
# model = torch.nn.DataParallel(model, [2, 3])
num_epochs = 150
learner = NerLearner(model, data,
                     best_model_path="/home/aaemeljanov/models/AGRR-2019/slavic-all.cpt",
                     lr=0.0001, clip=1.0, sup_labels=data.id2label[1:],
                     t_total=num_epochs * len(data.train_dl))


build CRF...


In [20]:
learner.load_model()

In [21]:
from modules.data.bert_data import get_bert_data_loader_for_predict

dl = get_bert_data_loader_for_predict(test_path, learner)

                                                       

In [23]:
preds_res, preds_cls = learner.predict(dl)

                                                 

In [52]:
from modules.utils.utils import bert_labels2tokens, first_choicer

pred_tokens, pred_labels = bert_labels2tokens(dl, preds_res, fn=first_choicer)

In [53]:
from modules.utils.utils import tokens2spans

In [54]:
pred_spans = tokens2spans(pred_tokens, pred_labels)

#### To annotation df

In [16]:
import pandas as pd

In [22]:
tdf = pd.read_csv(test_path)

In [58]:
tdf["spans"] = pred_spans

In [70]:
paths = []
annotations = []

In [71]:
for path, group in tdf.groupby("path"):
    path = path.split("|")[1].replace("/home/aaemeljanov/data/", "")
    l = [r for _, r in group.iterrows()]
    spans = {}
    for r in l:
        for s in r.spans:
            if s[1] != "O":
                spans[s[0]] = s[1]
    annotation = "\n".join(["{}\tNormalForm\t{}".format(k, spans[k]) for k in spans])
    paths.append(path)
    annotations.append(annotation)

In [72]:
paths[0]

'raw/nord_stream/bg/nord_stream_bg.txt_file_1003.txt'

In [73]:
print(annotations[0])

Германия	NoralForm	LOC
Украйна	NoralForm	LOC
Швеция	NoralForm	LOC
Русия	NoralForm	LOC
Витренко	NoralForm	PER
Европа“	NoralForm	LOC
Европа	NoralForm	LOC
Дания	NoralForm	LOC
Юрий Витренко	NoralForm	PER


In [74]:
res_df = pd.DataFrame({"paths": paths, "annotations": annotations})

In [75]:
res_df.head()

Unnamed: 0,annotations,paths
0,Германия\tNoralForm\tLOC\nУкрайна\tNoralForm\t...,raw/nord_stream/bg/nord_stream_bg.txt_file_100...
1,Германия\tNoralForm\tLOC\nШвеция\tNoralForm\tL...,raw/nord_stream/bg/nord_stream_bg.txt_file_100...
2,Куртика\tNoralForm\tPER\nпоток\tNoralForm\tLOC...,raw/nord_stream/bg/nord_stream_bg.txt_file_100...
3,Варшава\tNoralForm\tLOC\nЕС\tNoralForm\tORG\nП...,raw/nord_stream/bg/nord_stream_bg.txt_file_100...
4,Германия\tNoralForm\tLOC\nУкрайна\tNoralForm\t...,raw/nord_stream/bg/nord_stream_bg.txt_file_100...


In [76]:
res_df.to_csv("/home/aaemeljanov/data/test_pred_slavic.csv", index=False)

### Add index

In [1]:
import pandas as pd

In [39]:
tdf = pd.read_csv("test_df.csv")

In [45]:
tdf.head(10)

Unnamed: 0.1,Unnamed: 0,tokens,tags,lang,path,domain,cltags
0,0,германия,LOC,bg,raw/nord_stream/bg/nord_stream_bg.txt_file_100...,nord_stream,LOC-0
1,1,украйна,LOC,bg,raw/nord_stream/bg/nord_stream_bg.txt_file_100...,nord_stream,LOC-1
2,2,швеция,LOC,bg,raw/nord_stream/bg/nord_stream_bg.txt_file_100...,nord_stream,LOC-2
3,3,русия,LOC,bg,raw/nord_stream/bg/nord_stream_bg.txt_file_100...,nord_stream,LOC-3
4,4,витренко,PER,bg,raw/nord_stream/bg/nord_stream_bg.txt_file_100...,nord_stream,PER-4
5,5,европа,LOC,bg,raw/nord_stream/bg/nord_stream_bg.txt_file_100...,nord_stream,LOC-5
6,7,дания,LOC,bg,raw/nord_stream/bg/nord_stream_bg.txt_file_100...,nord_stream,LOC-6
7,8,юрий_витренко,PER,bg,raw/nord_stream/bg/nord_stream_bg.txt_file_100...,nord_stream,PER-7
8,11,бисер_петков,PER,bg,raw/nord_stream/bg/nord_stream_bg.txt_file_100...,nord_stream,PER-8
9,12,георге_иванова,PER,bg,raw/nord_stream/bg/nord_stream_bg.txt_file_100...,nord_stream,PER-9


In [7]:
res_df = pd.read_csv("/home/aaemeljanov/data/test_pred_slavic.csv")

In [8]:
res_df.head()

Unnamed: 0,annotations,paths
0,Германия\tNoralForm\tLOC\nУкрайна\tNoralForm\t...,raw/nord_stream/bg/nord_stream_bg.txt_file_100...
1,Германия\tNoralForm\tLOC\nШвеция\tNoralForm\tL...,raw/nord_stream/bg/nord_stream_bg.txt_file_100...
2,Куртика\tNoralForm\tPER\nпоток\tNoralForm\tLOC...,raw/nord_stream/bg/nord_stream_bg.txt_file_100...
3,Варшава\tNoralForm\tLOC\nЕС\tNoralForm\tORG\nП...,raw/nord_stream/bg/nord_stream_bg.txt_file_100...
4,Германия\tNoralForm\tLOC\nУкрайна\tNoralForm\t...,raw/nord_stream/bg/nord_stream_bg.txt_file_100...


In [105]:
paths = []
annotations = []

In [109]:
for path, group in tdf.groupby("path"):
    l = [r for _, r in group.iterrows()]
    text = "{}-{}".format(path.split("/")[2], path.split(".")[-2].split("_")[-1])
    for r in l:
        if isinstance(r.tokens, str):
            tokens = r.tokens.replace("_", " ")
            tag = r.tags
            cltags = r.cltags
            normal_form = "NormalForm"
            line = "\n{}\t{}\t{}\t{}".format(tokens, normal_form, tag, cltags)
            text+=line
    path = "/home/aaemeljanov/data/annotated/" + path[4:-3] + "out"
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)
    paths.append(path)
    annotations.append(text)

In [113]:
len(annotations)

2394

In [115]:
len(paths)

2394

In [None]:
!