In [1]:
import warnings
warnings.filterwarnings("ignore")

import re
import os
import jieba
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

from ark_nlp.model.ner.w2ner_bert import W2NERBert
from ark_nlp.model.ner.w2ner_bert import W2NERBertConfig
from ark_nlp.model.ner.w2ner_bert import Dataset
from ark_nlp.model.ner.w2ner_bert import Task
from ark_nlp.model.ner.w2ner_bert import get_default_w2ner_optimizer
from ark_nlp.factory.lr_scheduler import get_default_linear_schedule_with_warmup, get_default_cosine_schedule_with_warmup
from ark_nlp.model.ner.w2ner_bert import Tokenizer
from ark_nlp.factory.utils.seed import set_seed

set_seed(2022)
tqdm.pandas(desc="inference")

In [2]:
def E_trans_to_C(string):
    E_pun = u',.!?[]()<>"\''
    C_pun = u'，。！？【】（）《》“‘'
    table= {ord(f):ord(t) for f,t in zip(E_pun,C_pun)}
    return string.translate(table)

In [3]:
# test = pd.read_csv("data/test.csv", sep="\t")
train = pd.read_csv("data/train.csv", sep="\t")

In [4]:
# test["text"] = test["text"].apply(lambda line: E_trans_to_C(re.sub("[\(《：；→，。、\-”]+$", "", line.strip())))
train["text"] = train["text"].apply(lambda line: E_trans_to_C(re.sub("[\(《：→；，。、\-”]+$", "", line.strip())))
train["tag"] = train["tag"].apply(lambda x: [E_trans_to_C(i) for i in eval(str(x))])

In [5]:
train["entities"] = train.progress_apply(lambda row: [["LOC", *i.span()] for tag in row["tag"] for i in re.finditer(tag, row["text"])], axis=1)

inference: 100%|██████████| 6000/6000 [00:00<00:00, 19502.83it/s]


In [6]:
datalist = []

for _, row in train.iterrows():
    entity_labels = []
    for _type, _start_idx, _end_idx in row["entities"]:
        entity_labels.append({
            'start_idx': _start_idx,
            'end_idx': _end_idx,
            'type': _type,
            'entity': row["text"][_start_idx: _end_idx]
    })

    datalist.append({
        'text': row["text"],
        'entities': entity_labels
    })

In [7]:
data = pd.DataFrame(datalist)
train_data_df, dev_data_df = train_test_split(data, test_size=0.3)

In [8]:
pseudo = pd.read_csv("submits/data/pseudo_best.csv", sep="\t")
pseudo["text"] = pseudo["text"].apply(lambda line: E_trans_to_C(re.sub("[\(《：→；，。、\-”]+$", "", line.strip())))
pseudo["tag"] = pseudo["tag"].apply(lambda x: [E_trans_to_C(i) for i in eval(str(x))])
pseudo = pseudo[pseudo["tag"].apply(len) > 0]
pseudo["entities"] = pseudo.progress_apply(lambda row: [["LOC", *i.span()] for tag in row["tag"] for i in re.finditer(tag, row["text"])], axis=1)

pseudo_datalist = []

for _, row in train.iterrows():
    entity_labels = []
    for _type, _start_idx, _end_idx in row["entities"]:
        entity_labels.append({
            'start_idx': _start_idx,
            'end_idx': _end_idx,
            'type': _type,
            'entity': row["text"][_start_idx: _end_idx]
    })

    pseudo_datalist.append({
        'text': row["text"],
        'entities': entity_labels
    })

pseudo_data = pd.DataFrame(pseudo_datalist)
train_data_df = pd.concat([train_data_df, pseudo_data]).reset_index(drop=True)

inference: 100%|██████████| 2628/2628 [00:00<00:00, 17030.31it/s]


In [9]:
def get_label(x):
    
    entities = []
    for entity in x:
        entity_ = {}
        idx = list(range(entity['start_idx'], entity['end_idx']))
        entity_['idx'] = idx
        entity_['type'] = entity['type']
        entity_['entity'] = entity['entity']
        entities.append(entity_)
    
    return entities

In [10]:
train_data_df['label'] = train_data_df['entities'].apply(lambda x: get_label(x))
dev_data_df['label'] = dev_data_df['entities'].apply(lambda x: get_label(x))

In [11]:
train_data_df = train_data_df.loc[:,['text', 'label']]
train_data_df['label'] = train_data_df['label'].apply(lambda x: str(x))
dev_data_df = dev_data_df.loc[:,['text', 'label']]
dev_data_df['label'] = dev_data_df['label'].apply(lambda x: str(x))

In [12]:
ner_train_dataset = Dataset(train_data_df)
ner_dev_dataset = Dataset(dev_data_df, categories=ner_train_dataset.categories)

In [13]:
prompt = None
tokenizer = Tokenizer(vocab='chinese-roberta-large-upos', max_seq_len=52)

In [14]:
ner_train_dataset.convert_to_ids(tokenizer, prompt=prompt)
ner_dev_dataset.convert_to_ids(tokenizer, prompt=prompt)

In [None]:
# prompt = "提取上述句子中的所有命名实体信息"
# tokenizer = Tokenizer(vocab='roberta-base-finetuned-cluener2020-chinese', max_seq_len=52 + len(prompt) + 1)

In [None]:
# ner_train_dataset.convert_to_ids(tokenizer, prompt=prompt)
# ner_dev_dataset.convert_to_ids(tokenizer, prompt=prompt)

### 模型构建

In [15]:
config = W2NERBertConfig.from_pretrained('chinese-roberta-large-upos', num_labels=len(ner_train_dataset.cat2id))

In [16]:
torch.cuda.empty_cache()

In [17]:
dl_module = W2NERBert.from_pretrained('chinese-roberta-large-upos', config=config)

Some weights of the model checkpoint at chinese-roberta-large-upos were not used when initializing W2NERBert: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing W2NERBert from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing W2NERBert from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of W2NERBert were not initialized from the model checkpoint at chinese-roberta-large-upos and are newly initialized: ['convLayer.convs.0.bias', 'bert.pooler.dense.bias', 'predictor.mlp1.linear.bias', 'predictor.mlp2.linear.bias', 'predictor.mlp_rel.linear.bias', 'convLayer.convs.1.weight', 'encoder.bias_ih_l0_reverse', 'cln.gamma', 'predictor.linear.bias', 'convLayer.convs.2.bias'

In [18]:
# 设置运行次数
num_epoches, batch_size = 10, 16 # 0.91
# num_epoches, batch_size = 15, 16
# num_epoches, batch_size = 40, 256 # 0.91

In [20]:
# optimizer = get_default_w2ner_optimizer(dl_module) # 0.91
# # optimizer = get_default_w2ner_optimizer(dl_module, lr=5e-4, bert_lr=1e-5, weight_decay=0.01)
# # optimizer = get_default_w2ner_optimizer(dl_module, lr=5e-2, bert_lr=5e-5, weight_decay=0.01)
optimizer = get_default_w2ner_optimizer(dl_module, lr=1e-2, bert_lr=5e-5, weight_decay=0.01) # 0.91143

In [22]:
# 注意lr衰减轮次的设定
show_step = len(ner_train_dataset) // batch_size + 2
t_total = len(ner_train_dataset) // batch_size * num_epoches
scheduler = get_default_cosine_schedule_with_warmup(optimizer, t_total, warmup_ratio=0.2)

In [23]:
# model = Task(dl_module, optimizer, 'ce', cude_device=2, scheduler=scheduler, grad_clip=5.0, ema_decay=0.995, save_path="outputs/roberta-finetuned-large") # 0.91
model = Task(dl_module, optimizer, 'ce', cude_device=2, scheduler=scheduler, grad_clip=5.0, ema_decay=0.995, fgm_attack=True, save_path="outputs/roberta-finetuned-prompt", )

In [24]:
# model.fit(ner_train_dataset, ner_dev_dataset, lr=5e-5, epochs=num_epoches, batch_size=batch_size, show_step=show_step)
model.fit(ner_train_dataset, ner_dev_dataset, epochs=num_epoches, batch_size=batch_size, show_step=show_step)

100%|██████████| 638/638 [04:32<00:00,  2.34it/s]


epoch:[0],train loss is:0.041481 

eval loss is 0.003307, precision is:0.8037578288100209, recall is:0.8621241202815099, f1_score is:0.831918505942275
current best metric: 0.8621241202815099


100%|██████████| 638/638 [04:32<00:00,  2.34it/s]


epoch:[1],train loss is:0.003001 

save best model to outputs/roberta-finetuned-promot_best.
eval loss is 0.001973, precision is:0.8744257274119449, recall is:0.9133077415227128, f1_score is:0.8934439054920984
current best metric: 0.9133077415227128


100%|██████████| 638/638 [04:32<00:00,  2.34it/s]


epoch:[2],train loss is:0.001862 

save best model to outputs/roberta-finetuned-promot_best.
eval loss is 0.001271, precision is:0.9225, recall is:0.944337811900192, f1_score is:0.9332911792601961
current best metric: 0.944337811900192


100%|██████████| 638/638 [04:33<00:00,  2.33it/s]


epoch:[3],train loss is:0.001280 

save best model to outputs/roberta-finetuned-promot_best.
eval loss is 0.000902, precision is:0.94976, recall is:0.9494561740243123, f1_score is:0.9496080627099664
current best metric: 0.9494561740243123


100%|██████████| 638/638 [04:32<00:00,  2.34it/s]


epoch:[4],train loss is:0.000878 

save best model to outputs/roberta-finetuned-promot_best.
eval loss is 0.000681, precision is:0.9689846555664381, recall is:0.9494561740243123, f1_score is:0.9591210211665859
current best metric: 0.9494561740243123


 42%|████▏     | 270/638 [01:55<02:38,  2.32it/s]

In [None]:
from ark_nlp.model.ner.w2ner_bert import Predictor

In [None]:
class IFW2NERPredictor(Predictor):
    def E_trans_to_C(self, string):
        E_pun = u',.!?[]()<>"\''
        C_pun = u'，。！？【】（）《》“‘'
        table= {ord(f):ord(t) for f,t in zip(E_pun,C_pun)}

        return string.translate(table)

    def predict_one_sample(self, text='', prompt=None, cv=False):
        text = text.strip()
        
        features = self._get_input_ids(E_trans_to_C(re.sub("[\(《：；→，。、\-”]+$", "", text)), prompt=prompt)
        self.module.eval()

        with torch.no_grad():
            inputs = self._get_module_one_sample_inputs(features)
            logit = self.module(**inputs)

        preds = torch.argmax(logit, -1)

        instance, l = preds.cpu().numpy()[0], int(inputs['input_lengths'].cpu().numpy()[0])

        forward_dict = {}
        head_dict = {}
        ht_type_dict = {}
        for i in range(l):
            for j in range(i + 1, l):
                if instance[i, j] == 1:
                    if i not in forward_dict:
                        forward_dict[i] = [j]
                    else:
                        forward_dict[i].append(j)
        for i in range(l):
            for j in range(i, l):
                if instance[j, i] > 1:
                    ht_type_dict[(i, j)] = instance[j, i]
                    if i not in head_dict:
                        head_dict[i] = {j}
                    else:
                        head_dict[i].add(j)

        predicts = []

        def find_entity(key, entity, tails):
            entity.append(key)
            if key not in forward_dict:
                if key in tails:
                    predicts.append(entity.copy())
                entity.pop()
                return
            else:
                if key in tails:
                    predicts.append(entity.copy())
            for k in forward_dict[key]:
                find_entity(k, entity, tails)
            entity.pop()

        for head in head_dict:
            find_entity(head, [], head_dict[head])

        entities = []
        for entity_ in predicts:
            entities.append({
                "idx": entity_,
                "entity": ''.join([text[i] for i in entity_]),
                "type": self.id2cat[ht_type_dict[(entity_[0], entity_[-1])]]
            })

        if cv:
            return text, int(inputs['input_lengths'].cpu().numpy()[0]), logit.cpu().numpy()

        return entities

    def get_result(self, text, text_len, logit):
        preds = np.argmax(logit, -1)

        instance, l = preds[0], text_len

        forward_dict = {}
        head_dict = {}
        ht_type_dict = {}
        for i in range(l):
            for j in range(i + 1, l):
                if instance[i, j] == 1:
                    if i not in forward_dict:
                        forward_dict[i] = [j]
                    else:
                        forward_dict[i].append(j)
        for i in range(l):
            for j in range(i, l):
                if instance[j, i] > 1:
                    ht_type_dict[(i, j)] = instance[j, i]
                    if i not in head_dict:
                        head_dict[i] = {j}
                    else:
                        head_dict[i].add(j)

        predicts = []

        def find_entity(key, entity, tails):
            entity.append(key)
            if key not in forward_dict:
                if key in tails:
                    predicts.append(entity.copy())
                entity.pop()
                return
            else:
                if key in tails:
                    predicts.append(entity.copy())
            for k in forward_dict[key]:
                find_entity(k, entity, tails)
            entity.pop()

        for head in head_dict:
            find_entity(head, [], head_dict[head])

        entities = []
        for entity_ in predicts:
            entities.append({
                "idx": entity_,
                "entity": ''.join([text[i] for i in entity_]),
                "type": self.id2cat[ht_type_dict[(entity_[0], entity_[-1])]]
            })

        return entities

In [None]:
ner_predictor_instance = IFW2NERPredictor(model.module, tokenizer, ner_train_dataset.cat2id)
ner_predictor_instance_best = IFW2NERPredictor(torch.load("outputs/roberta-finetuned-prompt_best.pkl"), tokenizer, ner_train_dataset.cat2id)

In [None]:
test = pd.read_csv("data/test.csv", sep="\t")

In [None]:
predict_results = []
tta_data = []

for _line in tqdm(test["text"].tolist()):
    label = set()

    text, text_len, logit = ner_predictor_instance.predict_one_sample(_line, prompt=prompt, cv=True)
    _, _, logit_best = ner_predictor_instance_best.predict_one_sample(_line, prompt=prompt, cv=True)

    logit = np.sum(np.array([logit_best, logit]), axis=0)

    for _preditc in ner_predictor_instance.get_result(text, text_len, logit):
        label.add(_preditc["entity"])
    
    label = list(label)
    if len(label) > 0:
        tta_data.append([_line, label])

    predict_results.append(label)

In [None]:
with open('w2ner_submit_prompt_cv.txt', 'w', encoding='utf-8') as f:
    f.write("tag\n")
    for _result in predict_results:
       f.write(f"{str(_result)}\n")

In [None]:
ner_predictor_instance = IFW2NERPredictor(torch.load("outputs/roberta-finetuned.pkl"), tokenizer, ner_train_dataset.cat2id)

In [None]:
predict_results = []

for _line in tqdm(test["text"].tolist()):
    label = set()
    for _preditc in ner_predictor_instance.predict_one_sample(_line, prompt=prompt):
        label.add(_preditc["entity"])
    label = list(label)

    predict_results.append(label)

with open('w2ner_submit_prompt.txt', 'w', encoding='utf-8') as f:
    f.write("tag\n")
    for _result in predict_results:
       f.write(f"{str(_result)}\n")

In [None]:
# pseudo_data = pd.DataFrame(pseudo_data, columns=["text", "tag"])
# pseudo_data.to_csv("data/pseudo.csv", index=False, encoding="utf-8", sep="\t")

In [None]:
# from transformers import AutoTokenizer
# model.module.save_pretrained("outputs/roberta-finetuned")
# AutoTokenizer.from_pretrained('roberta-base-finetuned-cluener2020-chinese').save_pretrained("outputs/roberta-finetuned-cosine")