In [None]:
import warnings
import re
import os
import jieba
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split

from ark_nlp.model.ner.biaffine_bert import BiaffineBert
from ark_nlp.model.ner.biaffine_bert import BiaffineBertConfig
from ark_nlp.model.ner.biaffine_bert import Dataset
from ark_nlp.model.ner.biaffine_bert import Task
from ark_nlp.model.ner.biaffine_bert import get_default_model_optimizer
from ark_nlp.factory.optimizer import get_w2ner_model_optimizer as get_biaffine_model_optimizer
from ark_nlp.factory.lr_scheduler import get_default_cosine_schedule_with_warmup
from ark_nlp.model.ner.biaffine_bert import Tokenizer
from ark_nlp.factory.utils.seed import set_seed
from ark_nlp.nn.layer.biaffine_block import Biaffine
from transformers import AutoModel, AutoModelForPreTraining, AutoTokenizer, BertPreTrainedModel

In [None]:
# Seed through ark_nlp's helper (presumably covers random/numpy/torch -- confirm).
set_seed(42)
# Suppress every warning for the remainder of the session.
warnings.filterwarnings(action="ignore")
# Register `progress_apply` on pandas objects; bars are labelled "inference".
tqdm.pandas(desc="inference")

In [None]:
def E_trans_to_C(string):
    """Replace ASCII punctuation in ``string`` with its Chinese counterpart.

    The i-th character of the ASCII set is mapped to the i-th character of
    the Chinese set; every other character is returned unchanged.
    """
    ascii_marks = u',.!?[]()<>"\''
    chinese_marks = u'，。！？【】（）《》“‘'
    # str.maketrans pairs the two strings position by position.
    return string.translate(str.maketrans(ascii_marks, chinese_marks))

In [None]:
# Both splits are tab-separated files under data/.
test, train = [
    pd.read_csv(csv_path, sep="\t")
    for csv_path in ("data/test.csv", "data/train.csv")
]

In [None]:
# Punctuation run to drop from the end of each line.  Written as a raw string:
# in a normal string literal `\(` and `\-` are invalid escape sequences, which
# newer Python versions report as a SyntaxWarning.
_TRAILING_PUNCT = re.compile(r"[\(《：→；，。、\-”]+$")


def _normalize_line(line):
    """Strip whitespace, drop trailing punctuation, then convert ASCII punctuation to Chinese."""
    return E_trans_to_C(_TRAILING_PUNCT.sub("", line.strip()))


test["text"] = test["text"].apply(_normalize_line)
train["text"] = train["text"].apply(_normalize_line)
# NOTE(security): eval() executes whatever expression is stored in the CSV cell.
# If the column only ever holds list literals, ast.literal_eval is the safe
# drop-in; left unchanged here because the file format is not visible.
train["tag"] = train["tag"].apply(lambda x: [E_trans_to_C(i) for i in eval(str(x))])

In [None]:
# For every tag of a row, record ["LOC", start, end] for each occurrence of the
# tag in the row's text (end is the exclusive offset returned by Match.span()).
# Tags are literal strings, so they are escaped before being used as a pattern;
# otherwise characters such as `+`, `*`, `|` or `\` would be interpreted as
# regex syntax.  Empty tags are skipped because an empty pattern matches at
# every position and would yield zero-length spans.
train["entities"] = train.progress_apply(
    lambda row: [
        ["LOC", *match.span()]
        for tag in row["tag"]
        if tag
        for match in re.finditer(re.escape(tag), row["text"])
    ],
    axis=1,
)

In [None]:
# One record per training row: the text plus a list with one dict per entity
# span stored in the row's "entities" column.
datalist = [
    {
        'text': row["text"],
        'label': [
            {
                'start_idx': span_start,
                'end_idx': span_end,
                'type': span_type,
                'entity': row["text"][span_start: span_end],
            }
            for span_type, span_start, span_end in row["entities"]
        ],
    }
    for _, row in train.iterrows()
]

In [None]:
# Wrap the records in a frame, then hold out 30% of the rows as the dev split.
data = pd.DataFrame(data=datalist)
(train_data_df, dev_data_df) = train_test_split(
    data,
    test_size=0.3,
)

In [None]:
# Load the pseudo-labelled samples and put them through the same
# normalisation as the training data.
pseudo = pd.read_csv("data/pseudo.csv", sep="\t")
# Raw string: `\(` and `\-` are invalid escapes in a normal string literal.
pseudo["text"] = pseudo["text"].apply(
    lambda line: E_trans_to_C(re.sub(r"[\(《：→；，。、\-”]+$", "", line.strip()))
)
# NOTE(security): eval() executes whatever expression is stored in the CSV cell;
# ast.literal_eval is the safe drop-in if the column only holds list literals.
pseudo["tag"] = pseudo["tag"].apply(lambda x: [E_trans_to_C(i) for i in eval(str(x))])
# Tags are literal strings: escape them before using them as a pattern and skip
# empty ones (an empty pattern matches at every position).
pseudo["entities"] = pseudo.progress_apply(
    lambda row: [
        ["LOC", *match.span()]
        for tag in row["tag"]
        if tag
        for match in re.finditer(re.escape(tag), row["text"])
    ],
    axis=1,
)

pseudo_datalist = []

# Must iterate `pseudo`, not `train`: looping over `train` would append every
# training row (including the rows held out for dev) to the training split and
# leave the pseudo labels unused.
for _, row in pseudo.iterrows():
    entity_labels = []
    for _type, _start_idx, _end_idx in row["entities"]:
        entity_labels.append({
            'start_idx': _start_idx,
            'end_idx': _end_idx,
            'type': _type,
            'entity': row["text"][_start_idx: _end_idx]
        })

    pseudo_datalist.append({
        'text': row["text"],
        'label': entity_labels
    })

pseudo_data = pd.DataFrame(pseudo_datalist)
# NOTE: re-running this cell appends the pseudo rows to train_data_df again.
train_data_df = pd.concat([train_data_df, pseudo_data]).reset_index(drop=True)

In [None]:
# Keep only the two columns used downstream and store every label list as its
# string representation.
train_data_df = train_data_df[['text', 'label']].assign(
    label=lambda frame: frame['label'].apply(str)
)
dev_data_df = dev_data_df[['text', 'label']].assign(
    label=lambda frame: frame['label'].apply(str)
)

In [None]:
# Wrap both frames in ark_nlp Dataset objects (train first, then dev).
ner_train_dataset, ner_dev_dataset = (
    Dataset(frame) for frame in (train_data_df, dev_data_df)
)

In [None]:
# Tokenizer built from the named vocabulary, with max_seq_len set to 52.
tokenizer = Tokenizer(
    vocab='roberta-base-finetuned-cluener2020-chinese',
    max_seq_len=52,
)

In [None]:
# Convert both datasets to ids with the shared tokenizer (train first, then dev).
for _dataset in (ner_train_dataset, ner_dev_dataset):
    _dataset.convert_to_ids(tokenizer)

In [None]:
class BiaffineBert(BertPreTrainedModel):
    """NER model: pretrained encoder -> BiLSTM -> start/end projections -> biaffine scorer.

    Defining this class rebinds the name ``BiaffineBert`` that the import cell
    pulled from ``ark_nlp.model.ner.biaffine_bert``, so later cells use this
    local version.

    Args:
        config: Model configuration; ``num_labels`` and ``hidden_size`` are
            read from it.
        encoder_trained: Value assigned to ``requires_grad`` of every encoder
            parameter (``False`` freezes the encoder).
        biaffine_size: Output width of the start/end projection layers and the
            first argument given to ``Biaffine``.
        lstm_dropout: Passed as ``dropout`` to ``torch.nn.LSTM``.
        select_bert_layer: Index into the encoder's ``hidden_states`` that
            selects the layer fed to the LSTM (``-1`` is the last entry).
    """
    
    def __init__(
        self,
        config,
        encoder_trained=True,
        biaffine_size=128,
        lstm_dropout=0.4,
        select_bert_layer=-1
    ):
        super(BiaffineBert, self).__init__(config)

        self.num_labels = config.num_labels
        self.select_bert_layer = select_bert_layer

        # The encoder is loaded from a hard-coded checkpoint directory.
        # NOTE(review): the class is later instantiated via from_pretrained on
        # the same directory -- confirm which load determines the final
        # encoder weights.
        self.bert = AutoModel.from_pretrained("./outputs/roberta-finetuned-cosine")

        # Freeze or unfreeze the whole encoder in one go.
        for param in self.bert.parameters():
            param.requires_grad = encoder_trained

        # NOTE(review): PyTorch documents LSTM `dropout` as acting between
        # stacked layers, so with num_layers=1 it presumably has no effect --
        # confirm.
        self.lstm = torch.nn.LSTM(
            input_size=config.hidden_size,
            hidden_size=config.hidden_size,
            num_layers=1,
            batch_first=True,
            dropout=lstm_dropout,
            bidirectional=True
        )

        # The LSTM is bidirectional, hence the 2 * hidden_size input width of
        # both projection heads below.
        self.start_encoder = torch.nn.Sequential(
            torch.nn.Linear(
                in_features=2*config.hidden_size,
                out_features=biaffine_size),
            torch.nn.ReLU()
        )

        self.end_encoder = torch.nn.Sequential(
            torch.nn.Linear(
                in_features=2*config.hidden_size,
                out_features=biaffine_size),
            torch.nn.ReLU()
        )

        # Attribute name is spelled `biaffne` (sic); forward() refers to it by
        # the same spelling, so the two must be kept in sync.
        self.biaffne = Biaffine(biaffine_size, self.num_labels)

        self.reset_params()

    def reset_params(self):
        """Xavier-uniform initialise the weights of both projection layers.

        Only the ``Linear`` weights are touched; biases are not modified here.
        """
        nn.init.xavier_uniform_(self.start_encoder[0].weight)
        nn.init.xavier_uniform_(self.end_encoder[0].weight)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        **kwargs
    ):
        """Compute span logits for a batch.

        Args:
            input_ids: Forwarded to the encoder as its first positional argument.
            attention_mask: Forwarded to the encoder.
            token_type_ids: Forwarded to the encoder.
            **kwargs: Accepted but not used by this method.

        Returns:
            The contiguous output of the biaffine layer.
            NOTE(review): presumably shaped (batch, seq, seq, num_labels) --
            confirm against ``Biaffine``.
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=True,
            output_hidden_states=True
        )

        # Pick the configured encoder layer rather than always the final output.
        sequence_output = outputs.hidden_states[self.select_bert_layer]

        # LSTM encoding
        sequence_output, _ = self.lstm(sequence_output)

        # Separate projections for the span-start and span-end roles.
        start_logits = self.start_encoder(sequence_output)
        end_logits = self.end_encoder(sequence_output)

        span_logits = self.biaffne(start_logits, end_logits)
        span_logits = span_logits.contiguous()

        return span_logits

In [None]:
# Load the config from the fine-tuned checkpoint directory; the number of
# labels is the size of the training dataset's category-to-id mapping.
config = BiaffineBertConfig.from_pretrained(
    './outputs/roberta-finetuned-cosine',
    num_labels=len(ner_train_dataset.cat2id),
)

In [None]:
# Release cached, currently unused GPU memory held by PyTorch's allocator
# before the model is instantiated.
torch.cuda.empty_cache()

In [None]:
# Instantiate the model from the fine-tuned checkpoint directory using the
# config prepared earlier.
dl_module = BiaffineBert.from_pretrained(
    './outputs/roberta-finetuned-cosine',
    config=config,
)

In [None]:
# 设置运行次数
# Number of training epochs and the batch size.
num_epoches, batch_size = 30, 256
# Mind how the LR-decay horizon is set: both values below are derived from the
# number of full batches in the training set.
show_step = 2 + len(ner_train_dataset) // batch_size
t_total = num_epoches * (len(ner_train_dataset) // batch_size)

In [None]:
# Alternative optimizer with separate learning rates, kept for reference:
# optimizer = get_biaffine_model_optimizer(dl_module, lr=5e-4, bert_lr=1e-5, weight_decay=0.01)
optimizer = get_default_model_optimizer(dl_module)
# Cosine schedule over t_total steps with a warmup ratio of 0.1.
scheduler = get_default_cosine_schedule_with_warmup(
    optimizer,
    t_total,
    warmup_ratio=0.1,
)

In [None]:
# Bundle module, optimizer and an unreduced cross-entropy loss into the
# training task.
# NOTE(review): `scheduler=None` means the `scheduler` object built in the
# optimizer cell is never handed to the task -- confirm this is intentional.
# NOTE(review): `cude_device` looks like a misspelling of `cuda_device`; if
# Task accepts arbitrary **kwargs the value may be silently ignored -- confirm
# against the installed ark_nlp version before renaming.
model = Task(
    dl_module,
    optimizer,
    torch.nn.CrossEntropyLoss(reduction="none"),
    scheduler=None,
    cude_device=2,
    grad_clip=10.0,
    ema_decay=0.995,
    fgm_attack=True,
    save_path="outputs/roberta-finetuned-biaffine",
)

In [None]:
# Train on the training dataset, passing the dev dataset as the second
# argument, with the hyper-parameters configured earlier.
model.fit(
    ner_train_dataset, ner_dev_dataset,
    lr=2e-4, epochs=num_epoches,
    batch_size=batch_size, show_step=show_step,
)

In [None]:
import importlib

import ark_nlp.model.ner.biaffine_bert as biaffine

# Re-import the module so edits made to it on disk are picked up without
# restarting the kernel.  `importlib.reload` replaces `imp.reload`: the `imp`
# module is deprecated and was removed in Python 3.12.
importlib.reload(biaffine)

In [None]:
# Predictor built from the trained module, the tokenizer and the training
# dataset's category-to-id mapping.
ner_predictor_instance = biaffine.Predictor(
    model.module,
    tokenizer,
    ner_train_dataset.cat2id,
)

In [None]:
predict_results, pseudo_data = [], []

for _line in tqdm(test["text"].tolist()):
    # Unique entity strings for this line, each with its final character dropped.
    # NOTE(review): the [:-1] presumably compensates for end_idx being stored
    # as an exclusive offset in the training labels -- confirm.
    label = list({
        _preditc["entity"][:-1]
        for _preditc in ner_predictor_instance.predict_one_sample(_line)
    })

    # Every line gets a (possibly empty) result; only lines with at least one
    # entity are kept as pseudo-label candidates.
    predict_results.append(label)
    if label:
        pseudo_data.append([_line, label])

In [None]:
# Write the submission: a "tag" header line, then one stringified result list
# per line.
with open('biaffine_submit.txt', 'w', encoding='utf-8') as f:
    f.write("tag\n")
    f.writelines(f"{str(_result)}\n" for _result in predict_results)

In [None]:
# pseudo_data = pd.DataFrame(pseudo_data, columns=["text", "tag"])
# pseudo_data.to_csv("data/pseudo.csv", index=False, encoding="utf-8", sep="\t")