In [1]:
import warnings
import gc
import re
import os
import jieba
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold

from ark_nlp.model.ner.w2ner_bert import W2NERBert
from ark_nlp.model.ner.w2ner_bert import W2NERBertConfig
from ark_nlp.model.ner.w2ner_bert import Dataset
from ark_nlp.model.ner.w2ner_bert import Task
from ark_nlp.model.ner.w2ner_bert import get_default_w2ner_optimizer
from ark_nlp.factory.lr_scheduler import get_default_linear_schedule_with_warmup, get_default_cosine_schedule_with_warmup
from ark_nlp.model.ner.w2ner_bert import Tokenizer
from ark_nlp.factory.utils.seed import set_seed

from transformers import AutoTokenizer

In [2]:
# Fix the random seed through ark_nlp's helper (presumably seeds the Python,
# NumPy and torch RNGs — confirm in ark_nlp.factory.utils.seed).
set_seed(42)
# Enable DataFrame.progress_apply; its progress bars are labelled "inference".
tqdm.pandas(desc="inference")
# Suppress all warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

In [3]:
def E_trans_to_C(string):
    """Swap ASCII punctuation in ``string`` for full-width Chinese marks.

    Each character of ``,.!?[]()<>"'`` is replaced by the character at the
    same position in ``，。！？【】（）《》“‘``. All other characters are left
    alone, so the result has the same length as the input.
    """
    ascii_marks = u',.!?[]()<>"\''
    chinese_marks = u'，。！？【】（）《》“‘'
    return string.translate(str.maketrans(ascii_marks, chinese_marks))

In [4]:
# test = pd.read_csv("data/test.csv", sep="\t")
# Load the tab-separated training file; later cells read its "text" and "tag" columns.
train = pd.read_csv("data/train.csv", sep="\t")

In [5]:
# Normalise the raw sentences: strip surrounding whitespace, drop trailing
# punctuation, then map ASCII punctuation to its full-width equivalent.
# The pattern is a raw string so "\(" and "\-" reach the regex engine without
# relying on Python's deprecated handling of invalid escape sequences.
train["text"] = train["text"].apply(lambda line: E_trans_to_C(re.sub(r"[\(《：→；，。、\-”]+$", "", line.strip())))
# NOTE(review): eval() executes whatever is stored in the "tag" column, so this
# is only safe while data/train.csv is trusted; ast.literal_eval would be a
# safer parser if the column holds plain list literals.
train["tag"] = train["tag"].apply(lambda x: [E_trans_to_C(i) for i in eval(str(x))])

In [6]:
# Locate every occurrence of each gold tag inside its sentence and record it as
# ["LOC", start, end], with start/end taken from Match.span() (end exclusive).
# Tags are literal strings, so they are escaped before being compiled as a
# pattern (characters such as "*", "+" or "|" would otherwise be treated as
# regex syntax). Empty tags are skipped because an empty pattern matches at
# every position of the text.
train["entities"] = train.progress_apply(
    lambda row: [
        ["LOC", *match.span()]
        for tag in row["tag"]
        if tag
        for match in re.finditer(re.escape(tag), row["text"])
    ],
    axis=1,
)

inference: 100%|██████████| 6000/6000 [00:00<00:00, 19458.15it/s]


In [7]:
# Turn each training row into a {'text', 'entities'} record, where every
# entity span becomes a dict holding its offsets, type and surface string.
datalist = [
    {
        'text': sentence,
        'entities': [
            {
                'start_idx': begin,
                'end_idx': finish,
                'type': label,
                'entity': sentence[begin: finish],
            }
            for label, begin, finish in spans
        ],
    }
    for sentence, spans in zip(train["text"], train["entities"])
]

In [8]:
# Tabular view of the training records: one row per sentence with "text" and "entities" columns.
data = pd.DataFrame(datalist)

In [12]:
# Pseudo-labelled samples, normalised the same way as the training text.
pseudo = pd.read_csv("submits/data/pseudo_best.csv", sep="\t")
# Raw-string pattern: "\(" and "\-" are regex escapes, not Python string escapes.
pseudo["text"] = pseudo["text"].apply(lambda line: E_trans_to_C(re.sub(r"[\(《：→；，。、\-”]+$", "", line.strip())))
# NOTE(review): eval() executes the cell contents; only safe for a trusted file.
pseudo["tag"] = pseudo["tag"].apply(lambda x: [E_trans_to_C(i) for i in eval(str(x))])
# Keep rows with at least one tag. .copy() makes the filtered frame independent
# so the column assignment below does not write into a slice of the original.
pseudo = pseudo[pseudo["tag"].apply(len) > 0].copy()
# Tags are literal strings: escape them before regex search and skip empty
# ones (an empty pattern would match at every position).
pseudo["entities"] = pseudo.progress_apply(
    lambda row: [
        ["LOC", *match.span()]
        for tag in row["tag"]
        if tag
        for match in re.finditer(re.escape(tag), row["text"])
    ],
    axis=1,
)

pseudo_datalist = []

# Must iterate over `pseudo`, not `train`: looping over `train` here would only
# append a second copy of the training rows and ignore the pseudo labels.
for _, row in pseudo.iterrows():
    entity_labels = []
    for _type, _start_idx, _end_idx in row["entities"]:
        entity_labels.append({
            'start_idx': _start_idx,
            'end_idx': _end_idx,
            'type': _type,
            'entity': row["text"][_start_idx: _end_idx]
        })

    pseudo_datalist.append({
        'text': row["text"],
        'entities': entity_labels
    })

pseudo_data = pd.DataFrame(pseudo_datalist)
# Training rows first, pseudo-labelled rows after, with a fresh 0..n-1 index.
dataset = pd.concat([data, pseudo_data]).reset_index(drop=True)

inference: 100%|██████████| 2628/2628 [00:00<00:00, 20023.93it/s]


In [13]:
def get_label(x):
    """Convert span-style entity dicts into the index-list label format.

    Args:
        x: iterable of dicts carrying 'start_idx', 'end_idx', 'type' and
            'entity' keys.

    Returns:
        A list with one dict per entity whose 'entity' text is not blank.
        Each dict holds 'idx' (``list(range(start_idx, end_idx))``), 'type'
        and 'entity', inserted in that key order.
    """
    return [
        {
            'idx': list(range(item['start_idx'], item['end_idx'])),
            'type': item['type'],
            'entity': item['entity'],
        }
        for item in x
        if item['entity'].strip()
    ]

In [14]:
# Re-express each row's span dicts as index-list labels via get_label.
dataset['label'] = dataset['entities'].apply(lambda x: get_label(x))

In [15]:
# Keep only the two columns that are handed to the ark_nlp Dataset later on.
dataset = dataset.loc[:,['text', 'label']]
# Store each label list as its string repr — presumably the form ark_nlp's
# Dataset parses back; TODO confirm against the library.
dataset['label'] = dataset['label'].apply(lambda x: str(x))

In [16]:
cv = 5  # number of KFold splits
seed = 42  # random_state of the KFold shuffle
device = 1  # handed to Task(..., cude_device=device) by the training functions
max_len = 52  # max_seq_len given to every Tokenizer
batch_size = 128  # batch size passed to Task.fit
num_epoches = 20  # epochs per fit() call; also scales the scheduler's total step count
model_name = "roberta-base-finetuned-cluener2020-chinese"  # vocab / weights identifier for the RoBERTa runs
categories = ['<none>', '<suc>', 'LOC']  # label set passed to Dataset(categories=...)
kf = KFold(n_splits=cv, shuffle=True, random_state=seed)

In [17]:
from transformers import BertPreTrainedModel, BertModel, BertTokenizer
from ark_nlp.nn.layer.nezha_block import NeZhaPreTrainedModel, NeZhaModel
from ark_nlp.nn.configuration.configuration_nezha import NeZhaConfig
from ark_nlp.nn.layer.roformer_block import RoFormerPreTrainedModel
from ark_nlp.nn.configuration.configuration_roformer import RoFormerConfig
from ark_nlp.model.ner.w2ner_bert.w2ner_bert import ConvolutionLayer, CoPredictor, LayerNorm, pack_padded_sequence, pad_packed_sequence


In [18]:
class W2NERBert(BertPreTrainedModel):
    """W2NER-style word-pair grid tagger built on a BERT encoder.

    The forward pass encodes the pieces with BERT, pools them into word
    representations, runs a BiLSTM, builds a word-pair grid from conditional
    layer normalisation plus distance/region embeddings, applies the
    convolution layer and scores the grid with ``CoPredictor``.

    NOTE(review): this definition rebinds the name ``W2NERBert`` that the
    notebook's import cell already pulled in from ``ark_nlp.model.ner.w2ner_bert``.
    """

    def __init__(
        self,
        config,
        use_bert_last_4_layers=True,

        dist_emb_size=20,
        type_emb_size=20,
        lstm_hid_size=512,
        conv_hid_size=96,

        biaffine_size=512,
        ffnn_hid_size=288,
        dilation=[1, 2, 3],

        conv_dropout=0.5,
        emb_dropout=0.5,
        out_dropout=0.33,
        ** kwargs
    ):
        """Build the encoder, embeddings, BiLSTM, convolution layer and predictor.

        Args:
            config: model configuration; ``num_labels`` and ``hidden_size`` are read from it.
            use_bert_last_4_layers: average the last four hidden states instead
                of using the first element of the encoder output.
            dist_emb_size: width of the distance embedding.
            type_emb_size: width of the region embedding.
            lstm_hid_size: total BiLSTM output width (each direction gets half).
            conv_hid_size: channel width given to ``ConvolutionLayer``.
            biaffine_size: size forwarded to ``CoPredictor``.
            ffnn_hid_size: size forwarded to ``CoPredictor``.
            dilation: dilation rates given to ``ConvolutionLayer``.
                NOTE(review): mutable default list shared between calls; it is
                not mutated in this class.
            conv_dropout: dropout rate given to ``ConvolutionLayer``.
            emb_dropout: dropout rate applied to the pooled word representations.
            out_dropout: dropout rate given to ``CoPredictor``.
            **kwargs: accepted and ignored.
        """
        super(W2NERBert, self).__init__(config)
        self.num_labels = config.num_labels

        self.use_bert_last_4_layers = use_bert_last_4_layers

        self.lstm_hid_size = lstm_hid_size
        self.conv_hid_size = conv_hid_size

        lstm_input_size = 0

        self.bert = BertModel(config)
        lstm_input_size += config.hidden_size

        # 20 distance ids and 3 region ids are hard-coded here; presumably they
        # match the bucketing done by the dataset code — TODO confirm.
        self.dis_embs = nn.Embedding(20, dist_emb_size)
        self.reg_embs = nn.Embedding(3, type_emb_size)

        self.encoder = nn.LSTM(lstm_input_size, lstm_hid_size // 2, num_layers=1, batch_first=True,
                               bidirectional=True)

        conv_input_size = lstm_hid_size + dist_emb_size + type_emb_size

        self.convLayer = ConvolutionLayer(conv_input_size, conv_hid_size, dilation, conv_dropout)
        self.dropout = nn.Dropout(emb_dropout)
        self.predictor = CoPredictor(self.num_labels, lstm_hid_size, biaffine_size,
                                     conv_hid_size * len(dilation), ffnn_hid_size,
                                     out_dropout)

        self.cln = LayerNorm(lstm_hid_size, lstm_hid_size, conditional=True)

    def forward(
            self,
            input_ids,
            attention_mask,
            token_type_ids,
            grid_mask2d,
            dist_inputs,
            pieces2word,
            input_lengths,
            **kwargs
    ):
        """Score every word pair of the batch.

        Args:
            input_ids, attention_mask, token_type_ids: encoder inputs.
            grid_mask2d: mask over the word-pair grid; cells equal to 0 are zeroed out.
            dist_inputs: ids looked up in the distance embedding.
            pieces2word: piece-to-word membership mask; its second dimension is
                used as the padded word length.
            input_lengths: per-sample word counts used to pack the LSTM input.
            **kwargs: accepted and ignored.

        Returns:
            The output of ``CoPredictor`` for the batch.
        """
        bert_embs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, output_hidden_states=True,)

        if self.use_bert_last_4_layers:
            # Element 2 of the encoder output holds the per-layer hidden states
            # (requested above); average the last four of them.
            bert_embs = torch.stack(bert_embs[2][-4:], dim=-1).mean(-1)
        else:
            bert_embs = bert_embs[0]

        length = pieces2word.size(1)

        # Smallest activation in the batch; used as the fill value so masked
        # pieces can never win the max below.
        min_value = torch.min(bert_embs).item()

        # Max pooling word representations from pieces
        _bert_embs = bert_embs.unsqueeze(1).expand(-1, length, -1, -1)
        _bert_embs = torch.masked_fill(_bert_embs, pieces2word.eq(0).unsqueeze(-1), min_value)
        word_reps, _ = torch.max(_bert_embs, dim=2)

        word_reps = self.dropout(word_reps)
        packed_embs = pack_padded_sequence(word_reps, input_lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_outs, (hidden, _) = self.encoder(packed_embs)
        # In the upstream source each batch is padded to input_lengths.max();
        # here it is padded back to the pieces2word length instead.
        word_reps, _ = pad_packed_sequence(packed_outs, batch_first=True, total_length=length)

        # Conditional layer norm of the word representations against themselves;
        # presumably yields one vector per word pair — TODO confirm shape.
        cln = self.cln(word_reps.unsqueeze(2), word_reps)

        dis_emb = self.dis_embs(dist_inputs)
        # Lower triangle plus the full mask: assuming grid_mask2d is 0/1, the
        # region id is 2 on/below the diagonal, 1 above it and 0 on padding.
        tril_mask = torch.tril(grid_mask2d.clone().long())
        reg_inputs = tril_mask + grid_mask2d.clone().long()
        reg_emb = self.reg_embs(reg_inputs)

        conv_inputs = torch.cat([dis_emb, reg_emb, cln], dim=-1)
        # Zero out padded grid cells before and after the convolution.
        conv_inputs = torch.masked_fill(conv_inputs, grid_mask2d.eq(0).unsqueeze(-1), 0.0)
        conv_outputs = self.convLayer(conv_inputs)
        conv_outputs = torch.masked_fill(conv_outputs, grid_mask2d.eq(0).unsqueeze(-1), 0.0)
        outputs = self.predictor(word_reps, word_reps, conv_outputs)

        return outputs

In [19]:
class W2NERNezha(NeZhaPreTrainedModel):
    """W2NER-style word-pair grid tagger built on a NeZha encoder.

    Unlike a typical pretrained-model subclass, ``__init__`` takes no
    ``config`` argument: it builds the config itself from "nezha-cn-base"
    with three labels and ``output_hidden_states=True``.

    NOTE(review): if ``from_pretrained`` instantiates the class as
    ``cls(config, ...)``, the config object would bind to
    ``use_bert_last_4_layers`` (and be truthy) — confirm against
    ``NeZhaPreTrainedModel.from_pretrained``.
    """

    def __init__(self,use_bert_last_4_layers=True,dist_emb_size=20,type_emb_size=20,lstm_hid_size=512,conv_hid_size=96,biaffine_size=512,ffnn_hid_size=288,dilation=[1, 2, 3],conv_dropout=0.5,emb_dropout=0.5,out_dropout=0.33,**kwargs):
        """Build the NeZha encoder, embeddings, BiLSTM, convolution layer and predictor.

        Args:
            use_bert_last_4_layers: average the last four hidden states instead
                of using the first element of the encoder output.
            dist_emb_size: width of the distance embedding.
            type_emb_size: width of the region embedding.
            lstm_hid_size: total BiLSTM output width (each direction gets half).
            conv_hid_size: channel width given to ``ConvolutionLayer``.
            biaffine_size: size forwarded to ``CoPredictor``.
            ffnn_hid_size: size forwarded to ``CoPredictor``.
            dilation: dilation rates given to ``ConvolutionLayer``.
                NOTE(review): mutable default list; it is not mutated in this class.
            conv_dropout: dropout rate given to ``ConvolutionLayer``.
            emb_dropout: dropout rate applied to the pooled word representations.
            out_dropout: dropout rate given to ``CoPredictor``.
            **kwargs: accepted and ignored.
        """
        # The label count is fixed to the three categories of this task.
        config = NeZhaConfig.from_pretrained("nezha-cn-base", num_labels=len({'<none>': 0, '<suc>': 1, 'LOC': 2}), output_hidden_states=True)
        super(W2NERNezha, self).__init__(config)
        self.num_labels = config.num_labels

        self.use_bert_last_4_layers = use_bert_last_4_layers

        self.lstm_hid_size = lstm_hid_size
        self.conv_hid_size = conv_hid_size

        lstm_input_size = 0

        # The encoder is loaded from the "nezha-cn-base" checkpoint at construction time.
        self.bert = NeZhaModel.from_pretrained("nezha-cn-base", config=config)
        lstm_input_size += config.hidden_size

        # 20 distance ids and 3 region ids are hard-coded here; presumably they
        # match the bucketing done by the dataset code — TODO confirm.
        self.dis_embs = nn.Embedding(20, dist_emb_size)
        self.reg_embs = nn.Embedding(3, type_emb_size)

        self.encoder = nn.LSTM(lstm_input_size, lstm_hid_size // 2, num_layers=1, batch_first=True, bidirectional=True)

        conv_input_size = lstm_hid_size + dist_emb_size + type_emb_size

        self.convLayer = ConvolutionLayer(conv_input_size, conv_hid_size, dilation, conv_dropout)
        self.dropout = nn.Dropout(emb_dropout)
        self.predictor = CoPredictor(self.num_labels, lstm_hid_size, biaffine_size, conv_hid_size * len(dilation), ffnn_hid_size, out_dropout)

        self.cln = LayerNorm(lstm_hid_size, lstm_hid_size, conditional=True)

    def forward( self, input_ids, attention_mask, token_type_ids, grid_mask2d, dist_inputs, pieces2word, input_lengths, **kwargs):
        """Score every word pair of the batch.

        Args:
            input_ids, attention_mask, token_type_ids: encoder inputs.
            grid_mask2d: mask over the word-pair grid; cells equal to 0 are zeroed out.
            dist_inputs: ids looked up in the distance embedding.
            pieces2word: piece-to-word membership mask; its second dimension is
                used as the padded word length.
            input_lengths: per-sample word counts used to pack the LSTM input.
            **kwargs: accepted and ignored.

        Returns:
            The output of ``CoPredictor`` for the batch.
        """
        # No output_hidden_states argument here: the config built in __init__
        # sets output_hidden_states=True.
        bert_embs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        if self.use_bert_last_4_layers:
            # Presumably element 2 of the encoder output holds the per-layer
            # hidden states — TODO confirm for NeZhaModel; average the last four.
            bert_embs = torch.stack(bert_embs[2][-4:], dim=-1).mean(-1)
        else:
            bert_embs = bert_embs[0]

        length = pieces2word.size(1)

        # Smallest activation in the batch; used as the fill value so masked
        # pieces can never win the max below.
        min_value = torch.min(bert_embs).item()

        # Max pooling word representations from pieces
        _bert_embs = bert_embs.unsqueeze(1).expand(-1, length, -1, -1)
        _bert_embs = torch.masked_fill(_bert_embs, pieces2word.eq(0).unsqueeze(-1), min_value)
        word_reps, _ = torch.max(_bert_embs, dim=2)

        word_reps = self.dropout(word_reps)
        packed_embs = pack_padded_sequence(word_reps, input_lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_outs, (hidden, _) = self.encoder(packed_embs)
        # In the upstream source each batch is padded to input_lengths.max();
        # here it is padded back to the pieces2word length instead.
        word_reps, _ = pad_packed_sequence(packed_outs, batch_first=True, total_length=length)

        # Conditional layer norm of the word representations against themselves;
        # presumably yields one vector per word pair — TODO confirm shape.
        cln = self.cln(word_reps.unsqueeze(2), word_reps)

        dis_emb = self.dis_embs(dist_inputs)
        # Lower triangle plus the full mask: assuming grid_mask2d is 0/1, the
        # region id is 2 on/below the diagonal, 1 above it and 0 on padding.
        tril_mask = torch.tril(grid_mask2d.clone().long())
        reg_inputs = tril_mask + grid_mask2d.clone().long()
        reg_emb = self.reg_embs(reg_inputs)

        conv_inputs = torch.cat([dis_emb, reg_emb, cln], dim=-1)
        # Zero out padded grid cells before and after the convolution.
        conv_inputs = torch.masked_fill(conv_inputs, grid_mask2d.eq(0).unsqueeze(-1), 0.0)
        conv_outputs = self.convLayer(conv_inputs)
        conv_outputs = torch.masked_fill(conv_outputs, grid_mask2d.eq(0).unsqueeze(-1), 0.0)
        outputs = self.predictor(word_reps, word_reps, conv_outputs)

        return outputs

In [None]:
def train_nezha():
    """Fine-tune the NeZha W2NER model on a 70/30 split of ``dataset``.

    Reads the notebook-level ``dataset``, ``categories``, ``max_len``,
    ``batch_size``, ``num_epoches``, ``device`` and ``seed``.

    Returns:
        The fitted ark_nlp ``Task`` object.
    """
    print("/" * 50, " finetuning nezha ", "/" * 50)
    tokenizer = Tokenizer(vocab="nezha-cn-base", max_seq_len=max_len)
    # Pin the split to `seed` so it does not depend on how much of the global
    # RNG stream earlier cells have consumed.
    train_data_df, dev_data_df = train_test_split(dataset, test_size=0.3, random_state=seed)
    ner_train_dataset = Dataset(train_data_df, categories=categories)
    ner_dev_dataset = Dataset(dev_data_df, categories=categories)
    ner_train_dataset.convert_to_ids(tokenizer)
    ner_dev_dataset.convert_to_ids(tokenizer)

    # Log once per epoch (steps per epoch plus a small margin).
    show_step = len(ner_train_dataset) // batch_size + 2
    # NOTE(review): floor division ignores a final partial batch; if fit() does
    # not drop it, the scheduler is stepped slightly more than t_total times.
    t_total = len(ner_train_dataset) // batch_size * num_epoches

    torch.cuda.empty_cache()

    dl_module = W2NERNezha.from_pretrained("nezha-cn-base")

    optimizer = get_default_w2ner_optimizer(dl_module, lr=1e-2, bert_lr=5e-5, weight_decay=0.01)
    scheduler = get_default_cosine_schedule_with_warmup(optimizer, t_total, warmup_ratio=0.2)

    # NOTE(review): "cude_device" looks like a typo of "cuda_device"; if Task
    # swallows unknown keywords, `device` is silently ignored — confirm against
    # the ark_nlp Task signature before renaming.
    model = Task(dl_module, optimizer, 'ce', cude_device=device, scheduler=scheduler, grad_clip=5.0, ema_decay=0.995, fgm_attack=True, save_path=f"outputs/nezha-cn-base-finetuned", )
    model.fit(ner_train_dataset, ner_dev_dataset, epochs=num_epoches, batch_size=batch_size, show_step=show_step)

    # Drop the local references first and only then collect; collecting while
    # the names are still bound cannot free anything they keep alive.
    del train_data_df, dev_data_df, ner_train_dataset, ner_dev_dataset, dl_module
    gc.collect()
    print("/" * 110)

    return model

In [None]:
# model = train_nezha()

In [None]:
def train_roberta_cv():
    """Train one RoBERTa W2NER model per KFold split of ``dataset``.

    Reads the notebook-level ``kf``, ``dataset``, ``categories``,
    ``model_name``, ``max_len``, ``batch_size``, ``num_epoches`` and
    ``device``. Fold ``i`` (1-based) is trained with
    ``save_path="outputs/roberta-kflod-{i}"``. Nothing is returned.
    """
    for i, (train_index, dev_index) in enumerate(kf.split(dataset), start=1):
        print("/" * 50, f" cv {i} ", "/" * 50)
        tokenizer = Tokenizer(vocab=model_name, max_seq_len=max_len)

        # KFold.split yields positional indices, so select rows by position;
        # label-based .loc only agrees while the index is exactly 0..n-1.
        train_data_df, dev_data_df = dataset.iloc[train_index], dataset.iloc[dev_index]
        ner_train_dataset = Dataset(train_data_df, categories=categories)
        ner_dev_dataset = Dataset(dev_data_df, categories=categories)
        ner_train_dataset.convert_to_ids(tokenizer)
        ner_dev_dataset.convert_to_ids(tokenizer)

        # Log once per epoch (steps per epoch plus a small margin).
        show_step = len(ner_train_dataset) // batch_size + 2
        # NOTE(review): floor division ignores a final partial batch; if fit()
        # does not drop it, the scheduler is stepped more than t_total times.
        t_total = len(ner_train_dataset) // batch_size * num_epoches

        torch.cuda.empty_cache()

        config = W2NERBertConfig.from_pretrained(model_name, num_labels=len(ner_train_dataset.cat2id))
        dl_module = W2NERBert.from_pretrained(model_name, config=config)

        optimizer = get_default_w2ner_optimizer(dl_module, lr=1e-2, bert_lr=5e-5, weight_decay=0.01)
        scheduler = get_default_cosine_schedule_with_warmup(optimizer, t_total, warmup_ratio=0.1)

        # NOTE(review): "cude_device" looks like a typo of "cuda_device"; if
        # Task swallows unknown keywords, `device` is silently ignored —
        # confirm against the ark_nlp Task signature before renaming.
        model = Task(dl_module, optimizer, 'ce', cude_device=device, scheduler=scheduler, grad_clip=5.0, ema_decay=0.995, fgm_attack=True, save_path=f"outputs/roberta-kflod-{i}", )
        model.fit(ner_train_dataset, ner_dev_dataset, epochs=num_epoches, batch_size=batch_size, show_step=show_step)

        # Release this fold's objects before the next fold allocates its own:
        # unbind the names first, then collect (the reverse order frees nothing).
        del train_data_df, dev_data_df, ner_train_dataset, ner_dev_dataset, model, dl_module
        gc.collect()

        print("/" * 108)

In [None]:
# Train one RoBERTa model per KFold split (long-running).
train_roberta_cv()

### 模型预测

In [20]:
from collections import Counter
from ark_nlp.model.ner.w2ner_bert import Predictor

In [21]:
class IFW2NERPredictor(Predictor):
    """W2NER predictor that normalises text first and can expose raw logits.

    ``predict_one_sample`` either decodes entities directly or, with
    ``cv=True``, returns the logits so several models can be combined and
    decoded afterwards with ``get_result``.
    """

    def E_trans_to_C(self, string):
        """Replace ASCII punctuation in ``string`` with full-width Chinese marks."""
        E_pun = u',.!?[]()<>"\''
        C_pun = u'，。！？【】（）《》“‘'
        table = {ord(f): ord(t) for f, t in zip(E_pun, C_pun)}

        return string.translate(table)

    def predict_one_sample(self, text='', prompt=None, cv=False):
        """Run the model on one sentence.

        Args:
            text: raw sentence; surrounding whitespace is stripped.
            prompt: forwarded to ``_get_input_ids``.
            cv: when true, skip decoding and return the raw logits.

        Returns:
            If ``cv`` is false, a list of entity dicts with keys 'idx',
            'entity' and 'type'. If ``cv`` is true, the tuple
            ``(stripped text, input length, logits as a NumPy array)``.
        """
        text = text.strip()

        # The model sees the text without trailing punctuation and with
        # full-width marks. Both steps keep character positions unchanged, so
        # predicted indices still address `text`.
        cleaned = self.E_trans_to_C(re.sub(r"[\(《：；→，。、\-”]+$", "", text))
        features = self._get_input_ids(cleaned, prompt=prompt)
        self.module.eval()

        with torch.no_grad():
            inputs = self._get_module_one_sample_inputs(features)
            logit = self.module(**inputs)

        text_len = int(inputs['input_lengths'].cpu().numpy()[0])

        if cv:
            # Ensembling decodes the combined logits later, so decoding this
            # single model's grid here would be wasted work.
            return text, text_len, logit.cpu().numpy()

        instance = torch.argmax(logit, -1).cpu().numpy()[0]
        return self._decode_entities(text, text_len, instance)

    def get_result(self, text, text_len, logit):
        """Decode entities from (possibly ensembled) logits.

        Args:
            text: the stripped sentence returned by ``predict_one_sample(cv=True)``.
            text_len: the input length returned alongside it.
            logit: NumPy array of logits with a leading batch dimension.

        Returns:
            A list of entity dicts with keys 'idx', 'entity' and 'type'.
        """
        instance = np.argmax(logit, -1)[0]
        return self._decode_entities(text, text_len, instance)

    def _decode_entities(self, text, text_len, instance):
        """Turn one predicted label grid into entity dicts.

        Cells above the diagonal equal to 1 link word ``i`` to a following
        word ``j``. Cells ``instance[j, i]`` with ``j >= i`` and a value above
        1 mark ``i`` as an entity head, ``j`` as its tail, and carry the type id.
        """
        forward_dict = {}
        head_dict = {}
        ht_type_dict = {}
        for i in range(text_len):
            for j in range(i + 1, text_len):
                if instance[i, j] == 1:
                    forward_dict.setdefault(i, []).append(j)
        for i in range(text_len):
            for j in range(i, text_len):
                if instance[j, i] > 1:
                    ht_type_dict[(i, j)] = instance[j, i]
                    head_dict.setdefault(i, set()).add(j)

        predicts = []

        def find_entity(key, entity, tails):
            # Depth-first walk along the successor links, emitting the current
            # path whenever it ends on one of the head's tails.
            entity.append(key)
            if key in tails:
                predicts.append(entity.copy())
            for k in forward_dict.get(key, ()):
                find_entity(k, entity, tails)
            entity.pop()

        for head in head_dict:
            find_entity(head, [], head_dict[head])

        return [
            {
                "idx": entity_,
                "entity": ''.join([text[i] for i in entity_]),
                "type": self.id2cat[ht_type_dict[(entity_[0], entity_[-1])]]
            }
            for entity_ in predicts
        ]

In [22]:
# Load the test sentences and normalise them like the training text.
test = pd.read_csv("data/test.csv", sep="\t")
# Raw-string pattern: "\(" and "\-" are regex escapes, not Python string
# escapes, so the literal no longer depends on invalid-escape handling.
test["text"] = test["text"].apply(lambda line: E_trans_to_C(re.sub(r"[\(《：；→，。、\-”]+$", "", line.strip())))
test_dateset = test["text"].tolist()

In [23]:
# def predict_cv():
#     predict_results_cv = [[] for i in range(len(test))]

#     for i in range(1, cv + 1):
#         model_name = f"./outputs/roberta-kflod-{i}.pkl"
        
#         tokenizer = Tokenizer(vocab=model_name, max_seq_len=max_len)
#         config = W2NERBertConfig.from_pretrained(model_name, num_labels=len({'<none>': 0, '<suc>': 1, 'LOC': 2}))
#         module = W2NERBert.from_pretrained(model_name, config=config).to(torch.device(f"cuda:{device}"))

#         ner_predictor_instance = Predictor(module, tokenizer, {'<none>': 0, '<suc>': 1, 'LOC': 2})
        
#         for index, _line in tqdm(enumerate(test_dateset), desc=f"{model_name} inference: ", total=len(test_dateset)):
#             predict_results_cv[index].extend([_preditc["entity"] for _preditc in ner_predictor_instance.predict_one_sample(_line)])

#     predict_results = []
#     for predict_ in predict_results_cv:
#         predict_results.append([k for k, v in dict(Counter(predict_)).items() if v > 3] if predict_ else [])
    
#     return predict_results, predict_results_cv

In [24]:
# Ensemble inference: every checkpoint scores each test sentence, the logits
# are combined, and the combined grid is decoded once per combination method.
predict_results = {"avg": [], "sum": []}
# NOTE(review): this rebinds `pseudo_data`, which an earlier cell uses for the
# pseudo-label DataFrame.
pseudo_data = []

models_path = ['roberta-kflod-1_best.pkl', 'roberta-kflod-2.pkl', 'roberta-kflod-2_best.pkl', 'roberta-kflod-3_best.pkl', 'roberta-kflod-3.pkl', 'roberta-kflod-5.pkl', 'roberta-kflod-4_best.pkl', 'roberta-kflod-1.pkl', 'roberta-kflod-5_best.pkl', 'roberta-kflod-4.pkl']
tokenizer = Tokenizer(vocab=model_name, max_seq_len=max_len)
# NOTE(review): torch.load unpickles arbitrary objects — only load checkpoints
# from a trusted source. All listed models are held in memory at once.
ner_predictor_instances = [IFW2NERPredictor(torch.load(f"./outputs/{f}"), tokenizer, {'<none>': 0, '<suc>': 1, 'LOC': 2}) for f in models_path]

for _line in tqdm(test_dateset):
    logits = []
    for ner_predictor_instance in ner_predictor_instances:
        # cv=True returns (text, length, raw logits) instead of decoded entities.
        text, text_len, logit = ner_predictor_instance.predict_one_sample(_line, cv=True)
        logits.append(logit)

    # NOTE(review): the mean is the sum divided by the number of models, so the
    # argmax taken in get_result should pick the same labels for both —
    # confirm whether two separate outputs are needed.
    logit_sum = np.sum(np.array(logits), axis=0)
    logit_avg = np.mean(np.array(logits), axis=0)

    label_sum = set()
    label_avg = set()
    # Any instance can decode: get_result only uses the passed logits and id2cat.
    for _preditc in ner_predictor_instances[0].get_result(text, text_len, logit_sum):
        label_sum.add(_preditc["entity"])

    for _preditc in ner_predictor_instances[0].get_result(text, text_len, logit_avg):
        label_avg.add(_preditc["entity"])

    label_sum = list(label_sum)
    label_avg = list(label_avg)

    predict_results["sum"].append(label_sum)
    predict_results["avg"].append(label_avg)

100%|██████████| 2657/2657 [06:44<00:00,  6.57it/s]


In [None]:
def predict_nezha(model=None):
    """Predict entities for every test sentence with the NeZha model.

    Args:
        model: optional trained task object whose ``module`` attribute is used
            for inference. When ``None``, the module is loaded with
            ``torch.load`` from ``./outputs/nezha-cn-base-finetuned.pkl``.

    Returns:
        A list with one entry per row of ``test``; each entry is the list of
        distinct entity strings predicted for that sentence.
    """
    collected = [[] for _ in range(len(test))]
    model_name = "./outputs/nezha-cn-base-finetuned.pkl"

    tokenizer = Tokenizer(vocab="nezha-cn-base", max_seq_len=max_len)

    # module = W2NERNezha.from_pretrained(model_name)
    module = torch.load(model_name) if model is None else model.module

    predictor = IFW2NERPredictor(module, tokenizer, {'<none>': 0, '<suc>': 1, 'LOC': 2})

    progress = tqdm(enumerate(test_dateset), desc=f"{model_name} inference: ", total=len(test_dateset))
    for position, sentence in progress:
        found = predictor.predict_one_sample(sentence)
        collected[position].extend([item["entity"] for item in found])

    # De-duplicate the entity strings of each sentence.
    return [list(set(entities)) for entities in collected]

In [None]:
# NOTE(review): predict_cv exists only as commented-out code in this notebook,
# so this call raises NameError on a fresh kernel unless that cell is restored.
# It would also rebind predict_results, which the ensemble cell fills as a
# {"avg": ..., "sum": ...} dict that the final export cell iterates with .items().
predict_results, predict_results_cv = predict_cv()
# predict_results = predict_nezha(model=None)

In [None]:
# Write a "tag" header followed by one line per element of predict_results.
# NOTE(review): this yields one tag list per line only when predict_results is
# a list (as returned by predict_nezha); if it is still the {"avg", "sum"} dict
# from the ensemble cell, the dict keys are written instead.
with open('w2ner_nezha_submit.txt', 'w', encoding='utf-8') as f:
    f.write("tag\n")
    for _result in predict_results:
       f.write(f"{str(_result)}\n")

In [None]:
# pseudo_data = pd.DataFrame(pseudo_data, columns=["text", "tag"])
# pseudo_data.to_csv("data/pseudo.csv", index=False, encoding="utf-8", sep="\t")

In [26]:
# Write one submission file per combination method: a "tag" header line
# followed by the string form of each sentence's predicted entity list.
for method, predict_result in predict_results.items():
    submit_path = f'w2ner_submit_cv_{method}.txt'
    with open(submit_path, 'w', encoding='utf-8') as f:
        f.write("tag\n")
        f.writelines(f"{str(_result)}\n" for _result in predict_result)