In [None]:
import copy

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils import data
# import tensorflow as tf
from tqdm import tqdm,trange
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
from transformers import *

In [None]:
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import warnings
import pickle
import random

warnings.filterwarnings('ignore')

# Load the training and development token tables.
# The separator is the regular expression `\s` (one whitespace character).  It
# is written as a raw string so Python does not parse `\s` as an (invalid)
# string escape, and the python parser engine is requested explicitly because
# regex separators are not handled by the default C engine.
# NOTE: this rebinds `data`, shadowing `from torch.utils import data` above.
data = pd.read_csv("train_ex40.txt", sep=r'\s', skip_blank_lines=True,
                   encoding='utf-8', engine='python')

dev_data = pd.read_csv("sample2.data", sep=r'\s', encoding='utf-8', engine='python')
data

In [None]:
# Locate the rows whose 'Sentence#' value is NaN.
np.where(np.isnan(data['Sentence#']))

In [None]:
# SEED = 1234

# random.seed(SEED)
# np.random.seed(SEED)
# torch.manual_seed(SEED)
# torch.backends.cudnn.deterministic = True

In [None]:
# Run `nvidia-smi` in a shell to show the GPU status.
!nvidia-smi

In [None]:
class SentenceGetter(object):
    """Group a token-per-row DataFrame into sentences of (word, tag) pairs.

    The frame must provide the columns "Sentence#" (grouping key), "0" (word)
    and "1" (tag).

    Attributes:
        data: the DataFrame passed to the constructor.
        grouped: Series indexed by "Sentence#" holding one list of
            (word, tag) tuples per sentence.
        sentences: the same lists as a plain Python list, in group order.
        n_sent: "Sentence#" label looked up by the next get_next() call.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # Pair every word of the group with its tag.
            return list(zip(s["0"].values.tolist(), s["1"].values.tolist()))

        self.grouped = self.data.groupby("Sentence#").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the sentence labelled `n_sent` and advance the counter.

        Returns None (without advancing) when no such sentence exists.  Only
        lookup failures are treated as "no more sentences"; any other error
        propagates instead of being swallowed.
        """
        try:
            s = self.grouped[self.n_sent]
        except (KeyError, IndexError):
            return None
        self.n_sent += 1
        return s

In [None]:
# Group the training and development tables into per-sentence (word, tag) lists.
getter = SentenceGetter(data)
dev_getter = SentenceGetter(dev_data)

In [None]:
# Display the grouped training sentences (lists of (word, tag) tuples).
getter.sentences

In [None]:
# Keep only the word of every (word, tag) pair, sentence by sentence.
sentences = [[token for token, _tag in pairs] for pairs in getter.sentences]
dev_sentences = [[token for token, _tag in pairs] for pairs in dev_getter.sentences]
sentences

In [None]:
# Keep only the tag of every (word, tag) pair, sentence by sentence.
labels = [[tag for _token, tag in pairs] for pairs in getter.sentences]
dev_labels = [[tag for _token, tag in pairs] for pairs in dev_getter.sentences]

In [None]:
# Display the per-sentence tag lists of the training set.
labels

In [None]:
# Build the tag vocabulary from the training tags, with "PAD" appended last.
# The tags are sorted so that the tag -> index mapping derived from this list
# is the same on every run; iterating a bare set of strings yields an order
# that can differ between interpreter sessions, which would silently change
# the meaning of the model's output indices.
tag_values = sorted(set(data["1"].dropna().values))
tag_values.append("PAD")
tag_values
# tag2idx = {t: i for i, t in enumerate(tag_values)}
# tag2idx

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from keras.preprocessing.sequence import pad_sequences

In [None]:
# MAX_LEN: length every id / tag sequence is padded or truncated to.
# bs: batch size handed to the DataLoaders.
MAX_LEN = 500
bs = 4

In [None]:
# Use the GPU when CUDA is available, otherwise the CPU; then show the GPU count.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
n_gpu

In [None]:
# Show the name of CUDA device 0.
torch.cuda.get_device_name(0)

In [None]:
# Tokenizer matching the 'bert-base-chinese' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

In [None]:
# Sanity check on the first training sentence: compare tokenize() applied to
# each word with encode() applied to the whole word list.
print(sentences[0])
a = [tokenizer.tokenize(i) for i in sentences[0]]
a1 = tokenizer.encode(sentences[0])
print(a)
print(a1)
# Ids of the per-word tokens, for comparison with a1.
b = [tokenizer.convert_tokens_to_ids(i) for i in a] 
print(b)

In [None]:
def tokenize_and_preserve_labels(sentence, text_labels):
    """Tokenize each word with the global `tokenizer` and expand its label.

    A word that yields several sub-tokens and whose label contains 'B-' keeps
    that label on the first sub-token and gets the matching 'I-' label on the
    rest; every other word repeats its label once per sub-token.

    Side effect: an 'I-' label missing from the global `tag_values` is
    appended to it and printed.

    Returns:
        (tokens, labels): two flat lists of equal length.
    """
    pieces, piece_labels = [], []

    for word, label in zip(sentence, text_labels):
        word_pieces = tokenizer.tokenize(word)
        pieces.extend(word_pieces)
        extra = len(word_pieces) - 1

        if extra >= 1 and 'B-' in label:
            inside_label = 'I-' + label.split('B-')[1]
            piece_labels.append(label)
            if inside_label not in tag_values:
                tag_values.append(inside_label)
                print(inside_label)
            piece_labels.extend([inside_label] * extra)
        else:
            piece_labels.extend([label] * len(word_pieces))
    return pieces, piece_labels

In [None]:
# tokenized_texts_and_labels = [
#     tokenize_and_preserve_labels(sent, labs)
#     for sent, labs in zip(sentences, labels)
# ]
# print('done')
# dev_tokenized_texts_and_labels = [
#     tokenize_and_preserve_labels(sent, labs)
#     for sent, labs in zip(dev_sentences, dev_labels)
# ]

In [None]:
def encode_sent_labels(sentence,label):
    """Encode a word list to ids and pad its tag list with 'O' at both ends.

    Returns:
        (ids, tags): the output of `tokenizer.encode(sentence)` and the list
        ['O'] + label + ['O'].  The two extra 'O' tags presumably line up with
        special tokens the tokenizer adds at the ends -- TODO confirm.
    """
    token_ids = tokenizer.encode(sentence)
    padded_labels = ['O'] + label + ['O']
    return token_ids, padded_labels

In [None]:
# Encode every training / development sentence into an (ids, tags) pair.
tokenized_texts_and_labels = list(map(encode_sent_labels, sentences, labels))
print('done')
dev_tokenized_texts_and_labels = list(map(encode_sent_labels, dev_sentences, dev_labels))

In [None]:
# Number of tags attached to the first encoded training sentence.
len(tokenized_texts_and_labels[0][1])

In [None]:
# Map every tag to its position in tag_values.
tag2idx = dict((tag, position) for position, tag in enumerate(tag_values))
tag2idx

In [None]:
# Split the (ids, tags) pairs into parallel id and tag lists.
tokenized_texts = [ids for ids, _tags in tokenized_texts_and_labels]
tokenized_labels = [tags_ for _ids, tags_ in tokenized_texts_and_labels]

dev_tokenized_texts = [ids for ids, _tags in dev_tokenized_texts_and_labels]
dev_tokenized_labels = [tags_ for _ids, tags_ in dev_tokenized_texts_and_labels]

In [None]:
# for i,j in zip(dev_tokenized_texts,dev_tokenized_labels):
#     if len(i) != len(j):
#         print('asd')
# Print the 1-based position of every dev sentence whose ids contain 100
# (presumably the tokenizer's [UNK] id -- TODO confirm).
# Each entry is an (ids, tags) tuple, so the membership test must look at the
# ids list; testing the tuple itself compares 100 with the two lists as wholes
# and can never match.
for idx, (ids, _tags) in enumerate(dev_tokenized_texts_and_labels):
    if 100 in ids:
        print(idx+1)

In [None]:
# Show the encoded ids and the tags of one development sentence.
idx = 4
print(dev_tokenized_texts[idx])
print(dev_tokenized_labels[idx])

In [None]:
# input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
#                           maxlen=MAX_LEN, dtype="long", value=0.0,
#                           truncating="post", padding="post")

# dev_input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in dev_tokenized_texts],
#                           maxlen=MAX_LEN, dtype="long", value=0.0,
#                           truncating="post", padding="post")

# Pad with 0 / truncate (both at the end) every id sequence to MAX_LEN.
input_ids = pad_sequences(
    list(tokenized_texts),
    maxlen=MAX_LEN, dtype="long", value=0.0, truncating="post", padding="post",
)

dev_input_ids = pad_sequences(
    list(dev_tokenized_texts),
    maxlen=MAX_LEN, dtype="long", value=0.0, truncating="post", padding="post",
)

In [None]:
# Display the padded training id matrix.
input_ids

In [None]:
# Shape of the padded training id matrix.
input_ids.shape

In [None]:
# Convert tags to indices and pad with the "PAD" index / truncate (both at the
# end) to MAX_LEN, mirroring the treatment of the id sequences.
tags = pad_sequences(
    [list(map(tag2idx.get, sentence_tags)) for sentence_tags in tokenized_labels],
    maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
    dtype="long", truncating="post",
)
dev_tags = pad_sequences(
    [list(map(tag2idx.get, sentence_tags)) for sentence_tags in dev_tokenized_labels],
    maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
    dtype="long", truncating="post",
)

In [None]:
# Attention masks as nested lists of floats: 1.0 where the id is non-zero,
# 0.0 on the zero padding.
attention_masks = (input_ids != 0.0).astype(float).tolist()
dev_attention_masks = (dev_input_ids != 0.0).astype(float).tolist()

In [None]:
# Convert the padded ids, tag indices and attention masks to tensors
# (tr_* from the training data, val_* from the development data).
tr_inputs = torch.LongTensor(input_ids)
val_inputs = torch.LongTensor(dev_input_ids)
tr_tags = torch.LongTensor(tags)
val_tags = torch.LongTensor(dev_tags)
tr_masks = torch.tensor(attention_masks)
val_masks = torch.tensor(dev_attention_masks)

In [None]:
# Wrap (ids, mask, tags) triples in datasets and batch them with size `bs`.
# No sampler or shuffle argument is passed to either DataLoader.
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
# train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data,batch_size=bs)

valid_data = TensorDataset(val_inputs,val_masks,val_tags)
# valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data,batch_size=bs)

In [None]:
# Token-classification head on top of 'bert-base-chinese', with one output
# class per entry of tag2idx (this includes the "PAD" entry).
model = BertForTokenClassification.from_pretrained(
    "bert-base-chinese",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False,
)
# model = AutoModelForTokenClassification.from_pretrained("ckiplab/bert-base-chinese-ner" )

In [None]:
# Build the optimizer.  With FULL_FINETUNING every model parameter is trained;
# otherwise only the classification head is.
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    # Parameters whose name contains one of these substrings get no weight
    # decay.  'LayerNorm.weight' covers the layer-norm scale under the current
    # parameter naming; 'gamma' / 'beta' are kept for the older naming.
    no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']
    # The per-group option must be spelled 'weight_decay': that is the key
    # AdamW reads.  An unrecognised key such as 'weight_decay_rate' is simply
    # carried along in the group and the decay falls back to the default.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=1e-5,
    eps=1e-8
)

In [None]:
from transformers import get_linear_schedule_with_warmup

# epochs: number of passes over train_dataloader the schedule below is sized for.
# max_grad_norm: threshold for gradient-norm clipping.
epochs = 30
max_grad_norm = 2.9

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler (no warm-up steps).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [None]:
# Move the model to the same `device` the batches are sent to, instead of
# assuming CUDA unconditionally.
model.to(device)

In [None]:
## Store the average loss after each epoch so we can plot them.
from tqdm import tqdm_notebook as tqdm
# t_dataloader = tqdm(enumerate(train_dataloader), total=len(train_dataloader))
loss_all = []
# Train for exactly `epochs` epochs: the scheduler was built with
# num_training_steps = len(train_dataloader) * epochs, so any other epoch count
# leaves the linear learning-rate decay out of step with the training run.
for _ in trange(epochs, desc="Epoch"):
    model.train()
    total_loss = 0

    # Training loop
    predictions , true_labels = [], []
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch
        model.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # outputs[0] is used as the loss, outputs[1] as the per-token logits.
        loss = outputs[0]
        loss.backward()
        # Clip the gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)
    print(total_loss)
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))
    # Flatten predictions and gold tags, dropping positions whose gold tag is "PAD".
    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                 for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
    valid_tags = [tag_values[l_i] for l in true_labels
                                  for l_i in l if tag_values[l_i] != "PAD"]
    if 'PAD' not in pred_tags:
        print("Training F1-Score: {}".format(f1_score([valid_tags], [pred_tags])))
        print(classification_report([valid_tags],[pred_tags] ))
    else:
        # Say so instead of silently producing no metrics for this epoch.
        print("Training metrics skipped: 'PAD' was predicted at {} non-padding positions"
              .format(pred_tags.count('PAD')))
    loss_all.append(avg_train_loss)
plt.plot(loss_all , 'r-o', label="training loss")


#     model.eval()
#     # Reset the validation loss for this epoch.
#     eval_loss, eval_accuracy = 0, 0
#     predictions , true_labels = [], []
#     for batch in valid_dataloader:
#         batch = tuple(t.to(device) for t in batch)
#         b_input_ids, b_input_mask, b_labels = batch

#         # Telling the model not to compute or store gradients,
#         # saving memory and speeding up validation
#         with torch.no_grad():
#             outputs = model(b_input_ids, token_type_ids=None,
#                             attention_mask=b_input_mask, labels=b_labels)
#         # Move logits and labels to CPU
#         logits = outputs[1].detach().cpu().numpy()
#         label_ids = b_labels.to('cpu').numpy()

#         # Calculate the accuracy for this batch of test sentences.
#         eval_loss += outputs[0].mean().item()
#         predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
#         true_labels.extend(label_ids)

#     eval_loss = eval_loss / len(valid_dataloader)
#     validation_loss_values.append(eval_loss)
#     print("Validation loss: {}".format(eval_loss))
#     pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
#                                  for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
#     valid_tags = [tag_values[l_i] for l in true_labels
#                                   for l_i in l if tag_values[l_i] != "PAD"]
#     print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))
#     print()
#     print(classification_report(valid_tags,pred_tags ))

In [None]:
# Plot the per-epoch training loss again, leaving out its first 20 entries.
plt.plot(loss_all[20:] , 'b-o', label="training loss")

In [None]:
# Count how many entries of pred_tags are exactly 'PAD'.
c = sum(1 for tag in pred_tags if tag == 'PAD')
c

In [None]:
# Run the model in evaluation mode over train_dataloader and collect the ids,
# predictions and gold tags per sentence for the report and inspection below.
model.eval()
eval_loss = 0
predictions , true_labels ,x_list = [], [],[]
for batch in train_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
    logits = outputs[1].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    # Deliberately not named `input_ids`: that would overwrite the padded
    # training id matrix of that name built earlier in the notebook.
    batch_ids = b_input_ids.to('cpu').numpy()
    eval_loss += outputs[0].mean().item()
    x_list.extend(batch_ids)
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    true_labels.extend(label_ids)
eval_loss = eval_loss / len(train_dataloader)
# validation_loss_values.append(eval_loss)
# NOTE: the message says "Validation", but the loss is averaged over train_dataloader.
print("Validation loss: {}".format(eval_loss))

# Tokens of every sentence, skipping id 0 (the padding value).
X = [[tokenizer.convert_ids_to_tokens(int(token_id)) for token_id in ids if token_id != 0]
     for ids in x_list]
# Predicted / gold tags per sentence, keeping positions whose gold tag is not "PAD".
pred_tags = [[tag_values[p_i] for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
             for p, l in zip(predictions, true_labels)]
valid_tags = [[tag_values[l_i] for l_i in l if tag_values[l_i] != "PAD"]
              for l in true_labels]
print(classification_report(valid_tags,pred_tags))

In [None]:
# Number of id rows in x_list that contain the id 100.
c = sum(1 for ids in x_list if 100 in ids)
c

In [None]:
# len(valid_tags)
# Number of sentences for which predictions were collected.
len(predictions)

In [None]:
# Collect every sentence with at least one position whose gold tag contains
# 'time' and differs from the prediction.  `fn` counts those positions;
# `tp` and `fp` are initialised but never incremented here.
tp = fp = fn = 0
inpec = []
for idx, (x, y_pr, y_tr) in enumerate(zip(X, pred_tags, valid_tags)):
    missed = sum(1 for _tok, pred, gold in zip(x, y_pr, y_tr)
                 if 'time' in gold and gold != pred)
    fn += missed
    if missed:
        # The last element repeats the 1-based sentence number once per token.
        inpec.append((x, y_pr, y_tr, [idx+1]*len(x)))
res = False
print(tp,fp,fn)
len(inpec)

In [None]:
# Dump the collected sentences: a header line, then one tab-terminated row per
# token (word, prediction, gold tag, sentence number), then a blank line.
with open("./inspect_roberta_3.txt","w+",encoding="utf-8") as f:
    header = 'word' + '\t' + 'pred' + '\t' + 'true' + '\t' + '#' + '\t' + '\n'
    for words, predicted, gold, sentence_no in inpec:
        f.write(header)
        for row in zip(words, predicted, gold, sentence_no):
            f.write(''.join(str(cell) + '\t' for cell in row) + '\n')
        f.write('\n')

### Test

In [None]:
# Load the test token table.  The separator is the regex `\s`, written as a
# raw string so Python does not parse `\s` as an (invalid) string escape; the
# python engine is requested explicitly because regex separators need it.
test_data = pd.read_csv("test_ex40.txt", sep=r'\s', encoding='utf-8', engine='python')
test_data

In [None]:
# w = 0
# for i in test_data.index:
#     if test_data['0'][i] == '…':
#         w+=1
#         test_data['0'][i] = 'ˋ'

In [None]:
class testGetter(object):
    """Group an unlabelled token-per-row DataFrame into word lists.

    The frame must provide the columns 'Sentence#' (grouping key) and '0'
    (word).

    Attributes:
        data: the DataFrame passed to the constructor.
        grouped: Series indexed by 'Sentence#' holding one word list per group.
        sentences: the same word lists as a plain Python list, in group order.
        n_sent: 'Sentence#' label looked up by the next get_next() call.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # All words of the group, in row order.
            return list(s['0'].values.tolist())

        self.grouped = self.data.groupby('Sentence#').apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the word list labelled `n_sent` and advance the counter.

        Returns None (without advancing) when no such group exists.  Only
        lookup failures are treated as "no more sentences"; any other error
        propagates instead of being swallowed.
        """
        try:
            s = self.grouped[self.n_sent]
        except (KeyError, IndexError):
            return None
        self.n_sent += 1
        return s

In [None]:
# Group the test table into per-'Sentence#' word lists.
test_getter = testGetter(test_data)

In [None]:
# test_sentences = [[word for word in sentence] for sentence in test_getter.sentences]
# Split over-long test sentences into consecutive chunks.
# MAX_SENTENCE_WORDS: encode() output is two ids longer than the word list
# (the training code pads the tags with one 'O' at each end for them), and a
# BERT-base model accepts at most 512 positions, so 510 words is the longest
# input that still fits.  The previous bound of 511 let a 511-word sentence
# through unsplit.
# CHUNK_WORDS: chunk size used for sentences above that bound.
MAX_SENTENCE_WORDS = 510
CHUNK_WORDS = 458
test_sentences = []
for sentence in test_getter.sentences:
    if len(sentence) <= MAX_SENTENCE_WORDS:
        test_sentences.append(list(sentence))
        continue
    # Stepping through the sentence never yields an empty trailing chunk, even
    # when its length is an exact multiple of CHUNK_WORDS.
    for start in range(0, len(sentence), CHUNK_WORDS):
        test_sentences.append(sentence[start:start + CHUNK_WORDS])

In [None]:
# Test sentences that contain the character '燒'.
s = 0
sent_b = [sentence for sentence in test_sentences if '燒' in sentence]
sent_b

In [None]:
# Length of the longest entry of test_sentences.
len(max(test_sentences,key = len))

In [None]:
# Encode every test sentence into a (1, n_ids) id tensor on `device`.
pred_valid = []
for c, i in enumerate(test_sentences, start=1):
    if len(i) > 500:
        # Report unusually long inputs (1-based position, word count).
        print(c,len(i))
    try:
        tokenized_sentence = tokenizer.encode(i)
    except Exception:
        # Show the offending sentence, then stop: carrying on would append the
        # previous sentence's ids again and misalign every later label.
        print(i)
        raise
    pred_valid.append(torch.tensor([tokenized_sentence]).to(device))

In [None]:
# Total number of ids equal to 100 over all encoded test sentences.
c = sum(int((ids[0] == 100).sum()) for ids in pred_valid)
c

In [None]:
# Predict a tag index for every id of every encoded test sentence.
all_label_indices = []
model.eval()  # make sure dropout is off even if this cell is run on its own
for c, i in enumerate(pred_valid):
    with torch.no_grad():
        try:
            output = model(i)
        except Exception:
            # Report input length and position, then stop: continuing would
            # reuse the previous sentence's output for this one.
            print(len(i[0]))
            print(c)
            raise
    # output[0] holds the logits; take the arg-max over the class axis.
    label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)
    all_label_indices.append(label_indices)
all_label_indices

In [None]:
# Turn ids back into tokens and tag indices into tag names.  A '##' token is
# glued onto the previous token and contributes no tag of its own.  The first
# and last tag of every sentence are dropped before the tags are collected
# into one flat list.
all_new_tokens = []
all_new_labels = []
for i, ids_tensor in enumerate(pred_valid):
    tokens = tokenizer.convert_ids_to_tokens(ids_tensor.to('cpu').numpy()[0])
    merged_tokens = []
    merged_labels = []
    for token, label_idx in zip(tokens, all_label_indices[i][0]):
        if not token.startswith("##"):
            merged_labels.append(tag_values[label_idx])
            merged_tokens.append(token)
            continue
        print(token)
        merged_tokens[-1] = merged_tokens[-1] + token[2:]
    all_new_tokens.append(merged_tokens)
    all_new_labels.extend(merged_labels[1:-1])

In [None]:
# Total number of predicted tags collected for the test set.
len(all_new_labels)

In [None]:
# Write the predicted tags into column '1' of test_data, row by row.
# The column is rebuilt and assigned in one step: element-wise chained
# assignment (test_data['1'][j] = i) can end up writing to a temporary copy
# and leave test_data unchanged.
n_assign = min(len(all_new_labels), len(test_data))
if len(all_new_labels) != len(test_data):
    # A mismatch means tags and rows are no longer aligned one-to-one.
    print("WARNING: {} predicted labels for {} rows; only the first {} rows are updated"
          .format(len(all_new_labels), len(test_data), n_assign))
label_column = test_data['1'].astype(object).to_numpy(copy=True)
label_column[:n_assign] = all_new_labels[:n_assign]
test_data['1'] = label_column

In [None]:
# Save the labelled test table as a tab-separated file without the index.
# NOTE: the bare `test_data` below is not the last expression of the cell, so
# it displays nothing.
test_data
test_data.to_csv('final_roberta_4.txt',sep='\t',index=None,encoding= 'utf-8')

In [None]:
class Upload_f(object):
    """Group a labelled token table by the value of its column '2'.

    The frame must provide the columns '0' (token), '1' (tag) and '2'
    (grouping key).

    Attributes:
        data: the DataFrame passed to the constructor.
        grouped: Series indexed by the values of column '2', holding one list
            of (token, tag, key) tuples per group.
        sentences: the same lists as a plain Python list, in group order.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        # Iterate over the groups instead of using groupby().apply(): every
        # record needs column '2', which is also the grouping key, and apply()
        # does not hand the grouping column to the function in every pandas
        # version.  Iteration always yields the complete sub-frame.
        per_group = {
            key: list(zip(frame['0'].values.tolist(),
                          frame['1'].values.tolist(),
                          frame['2'].values.tolist()))
            for key, frame in self.data.groupby('2')
        }
        self.grouped = pd.Series(list(per_group.values()),
                                 index=pd.Index(list(per_group), name='2'),
                                 dtype=object)
        self.sentences = list(per_group.values())

In [None]:
# Rebinds `getter` (previously the training SentenceGetter) to the grouping of
# the labelled test table by column '2'; then show the number of groups.
getter = Upload_f(test_data)
len(getter.sentences)

In [None]:
def art_append(art_id,s_id,e_id,text,ner_type):
    """Return one submission record as the list [art_id, s_id, e_id, text, ner_type]."""
    return [art_id, s_id, e_id, text, ner_type]

# Turn the per-token tags of every group into entity records
# [article_id, start, end, text, type].  Offsets count the characters of the
# tokens seen so far within the group; `end` is exclusive.
upload = []
for sentence in getter.sentences:
    offset = 0            # character offset of the current token
    in_entity = False     # True while an entity opened by a B- tag is running
    entity_text = ''
    entity_type = ''
    start = 0
    article_id = None
    for token, tag, article_id in sentence:
        if tag.startswith('B-'):
            # A new entity starts here; close the running one first.
            if in_entity:
                upload.append(art_append(article_id, start, offset, entity_text, entity_type))
            in_entity = True
            entity_text = token
            entity_type = tag.split('B-')[1]
            start = offset
        elif tag.startswith('I-'):
            # Continue the running entity; an I- tag without one is ignored.
            if in_entity:
                entity_text += token
        elif in_entity:
            # 'O' or any other tag (e.g. a predicted 'PAD') ends the entity.
            upload.append(art_append(article_id, start, offset, entity_text, entity_type))
            in_entity = False
            entity_text = ''
            entity_type = ''
        # Every token advances the offset, whatever its tag; skipping tokens
        # with unexpected tags would shift all later positions.
        offset += len(token)
    # Flush an entity that runs up to the end of the group, including one made
    # of a single B- token.
    if in_entity:
        upload.append(art_append(article_id, start, offset, entity_text, entity_type))

In [None]:
# Write the entity records as a tab-separated file with a header line.
with open("./final_roberta_3.tsv","w+",encoding="utf-8") as f: 
    column_names = ['article_id', 'start_position', 'end_position',
                    'entity_text', 'entity_type']
    f.write('\t'.join(column_names) + '\n')
    for q in upload:
        leading = ''.join(str(field) + '\t' for field in q[:-1])
        f.write(leading + str(q[-1]) + '\n')

In [None]:
# Spot check: tag one hand-written sentence with the fine-tuned model.
a = ['醫', '師', '：', '因', '為', '你', '之', '前', '打', '針', '，', '假', '如', '效', '果', '有', '效', '有', '時', '候', '2', '0', '1', '1', '3', '月', '4', '號', '就', '見', '效', '了', '。']
# a = inpec[2][0]
print(a,len(a))
b = tokenizer.encode(a)
# Send the input to the same `device` the rest of the notebook uses.
test_i = torch.tensor([b]).to(device)
# Inference only: no gradient bookkeeping is needed.
with torch.no_grad():
    test_o = model(test_i)
# test_o[0] holds the logits; take the arg-max over the class axis.
test_l = np.argmax(test_o[0].to('cpu').numpy(), axis=2)
# Drop the first and last position before mapping indices to tag names.
test_tag = [tag_values[i] for i in test_l[0][1:-1]]
print(test_tag ,len(test_tag))
# tokenizer.convert_tokens_to_ids('[UNK]')