In [None]:
import warnings
# Suppress every warning for the rest of the session. Note that this also hides
# deprecation notices coming from the libraries imported in the next cell.
warnings.filterwarnings("ignore")

In [None]:
import re
import os
import jieba
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer
from ark_nlp.model.ner.w2ner_bert import W2NERBertConfig
from ark_nlp.model.ner.w2ner_bert import Tokenizer
from ark_nlp.model.ner.w2ner_bert import W2NERBert
from ark_nlp.model.ner.w2ner_bert import Dataset
from ark_nlp.model.ner.w2ner_bert import Task
from ark_nlp.model.ner.w2ner_bert import get_default_w2ner_optimizer
from ark_nlp.factory.lr_scheduler import get_default_linear_schedule_with_warmup, get_default_cosine_schedule_with_warmup
from ark_nlp.factory.utils.seed import set_seed

In [None]:
# Fix the random seed through ark_nlp's helper so that runs are repeatable
# (presumably seeds python/numpy/torch — confirm in ark_nlp.factory.utils.seed).
set_seed(42)
# Register tqdm's `progress_apply` on pandas objects; the entity-span cells rely on it.
tqdm.pandas(desc="inference")

In [None]:
def E_trans_to_C(string):
    """Return `string` with ASCII punctuation swapped for full-width Chinese punctuation.

    The two mark strings are paired position by position (`,` -> `，`, `.` -> `。`,
    ..., `"` -> `“`, `'` -> `‘`); every other character is left unchanged.
    """
    ascii_marks = u',.!?[]()<>"\''
    chinese_marks = u'，。！？【】（）《》“‘'
    return string.translate(str.maketrans(ascii_marks, chinese_marks))

In [None]:
# Load the tab-separated data files. The following cells read a `text` column from both
# frames and a `tag` column from the training frame.
test = pd.read_csv("data/test.csv", sep="\t")
train = pd.read_csv("data/train.csv", sep="\t")

In [None]:
# Normalise both splits the same way: trim surrounding whitespace, drop any run of trailing
# punctuation, then convert ASCII punctuation to its full-width Chinese form.
# The pattern is a raw string: written as a plain string it relied on the invalid escape
# sequences "\(" and "\-", which recent Python versions warn about and plan to reject.
trailing_punct = re.compile(r"[\(《：；→，。、\-”]+$")

test["text"] = test["text"].apply(lambda line: E_trans_to_C(trailing_punct.sub("", line.strip())))
train["text"] = train["text"].apply(lambda line: E_trans_to_C(trailing_punct.sub("", line.strip())))
# NOTE(security): eval() executes whatever the `tag` column contains. This is only acceptable
# for trusted files; ast.literal_eval would be the safe way to parse list literals.
train["tag"] = train["tag"].apply(lambda x: [E_trans_to_C(i) for i in eval(str(x))])

In [None]:
# Scratch check of the trailing-punctuation pattern. `re.finditer` needs the string to
# search as its second argument; without it the call raises TypeError and stops a
# "Restart & Run All". The pattern is applied to a small sample so the cell shows the span
# that would be stripped.
[m.span() for m in re.finditer(r"[\(《：；→，。、\-”]+$", "示例文本，。")]

In [None]:
# Locate every occurrence of each gold tag inside its sentence and store it as
# ["LOC", start, end] (end exclusive, as returned by Match.span()).
# Tags are escaped so they are matched literally: used as raw patterns, a tag containing a
# regex metacharacter (e.g. "*", "+", "|") would match the wrong text or raise re.error.
# Empty tags are skipped because an empty pattern matches at every position and would
# produce zero-length entities.
train["entities"] = train.progress_apply(
    lambda row: [
        ["LOC", *match.span()]
        for tag in row["tag"]
        if tag
        for match in re.finditer(re.escape(tag), row["text"])
    ],
    axis=1,
)

In [None]:
def _entity_records(text, spans):
    """Turn [type, start, end] spans into the entity dicts stored alongside `text`."""
    return [
        {
            'start_idx': start,
            'end_idx': end,
            'type': kind,
            'entity': text[start:end],
        }
        for kind, start, end in spans
    ]


# One record per training row: the sentence plus its located entities.
datalist = [
    {'text': row["text"], 'entities': _entity_records(row["text"], row["entities"])}
    for _, row in train.iterrows()
]

In [None]:
# Collect the records into a frame with the columns `text` and `entities`.
train_data_df = pd.DataFrame(datalist)

In [None]:
# Load the pseudo-labelled sentences and normalise them the same way as the training split
# (raw-string pattern: the plain-string form relied on the invalid escapes "\(" and "\-").
pseudo = pd.read_csv("data/pseudo_best.csv", sep="\t")
pseudo["text"] = pseudo["text"].apply(lambda line: E_trans_to_C(re.sub(r"[\(《：→；，。、\-”]+$", "", line.strip())))
# NOTE(security): eval() executes whatever the `tag` column contains. This is only acceptable
# for trusted files; ast.literal_eval would be the safe way to parse list literals.
pseudo["tag"] = pseudo["tag"].apply(lambda x: [E_trans_to_C(i) for i in eval(str(x))])
# Tags are escaped so they match literally (regex metacharacters inside a tag would otherwise
# mis-match or raise re.error); empty tags are skipped because they match at every position.
pseudo["entities"] = pseudo.progress_apply(
    lambda row: [
        ["LOC", *i.span()]
        for tag in row["tag"]
        if tag
        for i in re.finditer(re.escape(tag), row["text"])
    ],
    axis=1,
)

pseudo_datalist = []

# Iterate over `pseudo`, not `train`: looping over `train` here only appended a second copy
# of the training set and silently discarded the pseudo labels loaded above.
for _, row in pseudo.iterrows():
    entity_labels = []
    for _type, _start_idx, _end_idx in row["entities"]:
        entity_labels.append({
            'start_idx': _start_idx,
            'end_idx': _end_idx,
            'type': _type,
            'entity': row["text"][_start_idx: _end_idx]
        })

    pseudo_datalist.append({
        'text': row["text"],
        'entities': entity_labels
    })

pseudo_data = pd.DataFrame(pseudo_datalist)
# Append the pseudo-labelled samples to the gold training samples.
# Re-running this cell appends them again, so run it only once per kernel session.
train_data_df = pd.concat([train_data_df, pseudo_data]).reset_index(drop=True)

In [None]:
def get_label(x):
    """Convert span-style entity dicts into index-list labels.

    Args:
        x: iterable of dicts providing 'start_idx', 'end_idx', 'type' and 'entity'.

    Returns:
        A list with one dict per input entity holding 'idx' (every position from
        start_idx up to, but excluding, end_idx), 'type' and 'entity'.
    """
    return [
        {
            'idx': list(range(item['start_idx'], item['end_idx'])),
            'type': item['type'],
            'entity': item['entity'],
        }
        for item in x
    ]

In [None]:
# Derive the index-list labels from the span-style entities, row by row.
train_data_df['label'] = train_data_df['entities'].apply(get_label)

In [None]:
# Keep only `text` and `label`, with every label list serialised to its string form.
train_data_df = (
    train_data_df
    .loc[:, ['text', 'label']]
    .assign(label=lambda frame: frame['label'].apply(str))
)

In [None]:
# Wrap the (text, label) frame in ark_nlp's W2NER Dataset. Later cells read its `cat2id`
# mapping and its length.
ner_train_dataset = Dataset(train_data_df)

In [None]:
# Tokenizer built from the same checkpoint name the model is loaded from further down.
# max_seq_len=52 — presumably sized to the longest sentence in the data; TODO confirm.
tokenizer = Tokenizer(vocab='roberta-base-finetuned-cluener2020-chinese', max_seq_len=52)

In [None]:
# Encode the dataset with the tokenizer (presumably in place — the return value is not used).
ner_train_dataset.convert_to_ids(tokenizer)

### 模型构建

In [None]:
# Model configuration loaded from the pretrained checkpoint; the number of output labels is
# taken from the size of the dataset's category-to-id mapping.
config = W2NERBertConfig.from_pretrained('roberta-base-finetuned-cluener2020-chinese', num_labels=len(ner_train_dataset.cat2id))

In [None]:
# Release cached, unused GPU memory held by PyTorch's allocator before loading the model.
torch.cuda.empty_cache()

In [None]:
# Instantiate the W2NER network from the pretrained checkpoint with the config built above.
dl_module = W2NERBert.from_pretrained('roberta-base-finetuned-cluener2020-chinese', config=config)

In [None]:
# Set the number of training epochs and the batch size.
# The trailing numbers are presumably the scores reached with each setting — confirm.
# num_epoches, batch_size = 10, 16 # 0.91
# num_epoches, batch_size = 15, 16
num_epoches, batch_size = 40, 256 # 0.91

In [None]:
# Optimizer with separate learning rates: `bert_lr` presumably applies to the pretrained
# encoder and `lr` to the remaining layers — confirm in get_default_w2ner_optimizer.
# The commented lines are earlier settings that were tried.
# optimizer = get_default_w2ner_optimizer(dl_module) # 0.91
# optimizer = get_default_w2ner_optimizer(dl_module, lr=5e-4, bert_lr=1e-5, weight_decay=0.01)
# optimizer = get_default_w2ner_optimizer(dl_module, lr=5e-2, bert_lr=5e-5, weight_decay=0.01)
optimizer = get_default_w2ner_optimizer(dl_module, lr=1e-2, bert_lr=5e-5, weight_decay=0.01) # 0.91143

In [None]:
# Mind how the number of LR-decay steps is set.
# show_step is two more than the number of full batches per epoch.
show_step = len(ner_train_dataset) // batch_size + 2
# Total scheduler steps = full batches per epoch * epochs. A trailing partial batch is not
# counted — NOTE(review): confirm whether the training loop drops or keeps it.
t_total = len(ner_train_dataset) // batch_size * num_epoches
# Cosine schedule with warmup; warmup_ratio=0.2 presumably means the first 20% of t_total.
scheduler = get_default_cosine_schedule_with_warmup(optimizer, t_total, warmup_ratio=0.2)

In [None]:
# Training task: cross-entropy loss ('ce'), gradient clipping, EMA of the weights, FGM
# adversarial training, and an output directory for the trained model.
# NOTE(review): `cude_device` looks like a typo of `cuda_device`. If Task accepts **kwargs
# the argument is silently ignored and the default GPU is used instead of GPU 2 — confirm
# against the Task signature before changing it.
model = Task(dl_module, optimizer, 'ce', cude_device=2, scheduler=scheduler, grad_clip=5.0, ema_decay=0.995, fgm_attack=True, save_path="outputs/roberta-finetuned-allpseudo", )

In [None]:
# Train on the full training set; no validation set is passed to fit here.
# model.fit(ner_train_dataset, epochs=num_epoches, batch_size=batch_size, show_step=show_step, lr=1e-5)
model.fit(ner_train_dataset, epochs=num_epoches, batch_size=batch_size, show_step=show_step)

In [None]:
from ark_nlp.model.ner.w2ner_bert import Predictor

In [None]:
class IFW2NERPredictor(Predictor):
    """W2NER predictor that normalises the input text before running the model.

    Only the model input is normalised (trailing punctuation removed, ASCII punctuation
    converted to full-width Chinese punctuation). The returned entity strings are taken
    from the whitespace-stripped original text, so they keep the original punctuation.
    """

    def E_trans_to_C(self, string):
        """Return `string` with ASCII punctuation replaced by full-width Chinese punctuation."""
        E_pun = u',.!?[]()<>"\''
        C_pun = u'，。！？【】（）《》“‘'
        table = {ord(f): ord(t) for f, t in zip(E_pun, C_pun)}

        return string.translate(table)

    def predict_one_sample(self, text=''):
        """Predict the entities of a single sentence.

        Args:
            text: raw input sentence.

        Returns:
            A list of dicts, one per decoded entity, with the keys
            "idx" (list of positions forming the entity),
            "entity" (characters of the stripped input at those positions) and
            "type" (label name looked up in `self.id2cat`).
        """
        text = text.strip()

        # Normalise with the class's own method rather than a module-level function of the
        # same name, so the predictor does not depend on notebook globals.
        # Raw-string pattern: avoids the invalid "\(" and "\-" escapes of a plain string.
        normalized = self.E_trans_to_C(re.sub(r"[\(《：；→，。、\-”]+$", "", text))
        features = self._get_input_ids(normalized)
        self.module.eval()

        with torch.no_grad():
            inputs = self._get_module_one_sample_inputs(features)
            logit = self.module(**inputs)

        preds = torch.argmax(logit, -1)

        # Prediction grid of the single sample and its effective input length.
        instance, l = preds.cpu().numpy()[0], int(inputs['input_lengths'].cpu().numpy()[0])

        forward_dict = {}   # position -> later positions linked to it (cells above the diagonal equal to 1)
        head_dict = {}      # head position -> set of tail positions
        ht_type_dict = {}   # (head, tail) -> predicted label id
        for i in range(l):
            for j in range(i + 1, l):
                if instance[i, j] == 1:
                    forward_dict.setdefault(i, []).append(j)
        for i in range(l):
            for j in range(i, l):
                # Cells on or below the diagonal with a value > 1 mark a (head=i, tail=j)
                # pair; the value itself is the label id.
                if instance[j, i] > 1:
                    ht_type_dict[(i, j)] = instance[j, i]
                    head_dict.setdefault(i, set()).add(j)

        predicts = []

        def find_entity(key, entity, tails):
            """Depth-first walk along forward links; record the path whenever it reaches a tail."""
            entity.append(key)
            if key in tails:
                predicts.append(entity.copy())
            for k in forward_dict.get(key, ()):
                find_entity(k, entity, tails)
            entity.pop()

        for head in head_dict:
            find_entity(head, [], head_dict[head])

        entities = []
        for entity_ in predicts:
            entities.append({
                "idx": entity_,
                # Positions index the stripped original text directly — assumes one model
                # position per character; TODO confirm against the tokenizer.
                "entity": ''.join([text[i] for i in entity_]),
                "type": self.id2cat[ht_type_dict[(entity_[0], entity_[-1])]]
            })

        return entities

In [None]:
# Build the predictor from the trained module, the tokenizer and the category-to-id mapping.
ner_predictor_instance = IFW2NERPredictor(model.module, tokenizer, ner_train_dataset.cat2id)

In [None]:
# Re-read the raw test file, replacing the `test` frame normalised earlier in the notebook.
# The predictor applies the same normalisation itself before encoding each sentence.
test = pd.read_csv("data/test.csv", sep="\t")

In [None]:
predict_results = []   # one list of entity strings per test sentence (may be empty)
pseudo_data = []       # [sentence, labels] pairs for sentences with at least one entity

for _line in tqdm(test["text"].tolist()):
    # De-duplicate the predicted entity strings while keeping first-seen order. Going through
    # a plain set made the order of the written labels vary between runs (string hashing is
    # randomised per process), so the output files were not reproducible.
    label = list(dict.fromkeys(
        _predict["entity"]
        for _predict in ner_predictor_instance.predict_one_sample(_line)
    ))

    # Only sentences with at least one predicted entity are kept as pseudo-labelled samples.
    if len(label) > 0:
        pseudo_data.append([_line, label])

    predict_results.append(label)

In [None]:
# Write the submission file: a "tag" header line, then one list literal per test sentence.
with open('w2ner_submit_all_tta.txt', 'w', encoding='utf-8') as submit_file:
    submit_file.write("tag\n")
    submit_file.writelines(f"{labels}\n" for labels in predict_results)

In [None]:
# Turn the collected [sentence, labels] pairs into a frame (the name `pseudo_data` is reused
# for it) and save it tab-separated, in the same text/tag layout the pseudo-label loader reads.
pseudo_data = pd.DataFrame(pseudo_data, columns=["text", "tag"])
pseudo_data.to_csv("data/pseudo_all_pseudo.csv", index=False, encoding="utf-8", sep="\t")

In [None]:
# AutoTokenizer.from_pretrained('roberta-base-finetuned-cluener2020-chinese').save_pretrained("outputs/roberta-finetuned-allpseudo")