In [None]:
import warnings
# Install a blanket "ignore" filter so that warnings are not shown;
# note that this also hides deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
import re
import os
import jieba
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer
from ark_nlp.model.ner.w2ner_bert import W2NERBert
from ark_nlp.model.ner.w2ner_bert import W2NERBertConfig
from ark_nlp.model.ner.w2ner_bert import Dataset
from ark_nlp.model.ner.w2ner_bert import Task
from ark_nlp.model.ner.w2ner_bert import get_default_w2ner_optimizer
from ark_nlp.factory.lr_scheduler import get_default_linear_schedule_with_warmup, get_default_cosine_schedule_with_warmup
from ark_nlp.model.ner.w2ner_bert import Tokenizer
from ark_nlp.factory.utils.seed import set_seed

In [None]:
# Fix the random seed through the ark_nlp helper
# (presumably seeds the RNGs used during training — verify in ark_nlp).
set_seed(42)
# Register tqdm's pandas integration so `progress_apply` is available; bars are labelled "inference".
tqdm.pandas(desc="inference")

In [None]:
def E_trans_to_C(string):
    """Replace ASCII punctuation in ``string`` with full-width Chinese punctuation.

    Each character of ``ascii_marks`` is mapped to the character at the same
    position in ``chinese_marks``; every other character is left untouched.

    Args:
        string: Text to convert.

    Returns:
        A new string with the mapped punctuation substituted.
    """
    ascii_marks = u',.!?[]()<>"\''
    chinese_marks = u'，。！？【】（）《》“‘'
    # Both strings have the same length, so maketrans pairs them position by position.
    return string.translate(str.maketrans(ascii_marks, chinese_marks))

In [None]:
# Load the tab-separated test and train tables from the local data/ directory.
test = pd.read_csv("data/test.csv", sep="\t")
train = pd.read_csv("data/train.csv", sep="\t")

In [None]:
# Trailing punctuation stripped from every line before normalisation.
# Written as a raw string so `\(` and `\-` are regex escapes rather than
# invalid Python string escapes (which emit warnings on recent interpreters).
_TRAILING_PUNCT_RE = re.compile(r"[\(《：；→，。、\-”]+$")


def _normalize_text(line):
    """Strip surrounding whitespace and trailing punctuation, then convert ASCII punctuation to full-width."""
    return E_trans_to_C(_TRAILING_PUNCT_RE.sub("", line.strip()))


test["text"] = test["text"].apply(_normalize_text)
train["text"] = train["text"].apply(_normalize_text)
# NOTE(review): `eval` executes whatever is stored in the "tag" column; this is only
# safe while data/train.csv is trusted. Consider `ast.literal_eval` if it may not be.
train["tag"] = train["tag"].apply(lambda x: [E_trans_to_C(i) for i in eval(str(x))])

In [None]:
# Locate every occurrence of each gold tag inside its text and record it as a
# ["LOC", start, end] span (end exclusive, as returned by Match.span()).
# Tags are literal strings, so they are escaped before being used as a pattern:
# otherwise characters such as `+`, `*`, `|` or `\` would be interpreted as regex
# operators and either mis-match or raise. Empty tags are skipped because an
# empty pattern matches (with zero length) at every position of the text.
train["entities"] = train.progress_apply(
    lambda row: [
        ["LOC", *match.span()]
        for tag in row["tag"] if tag
        for match in re.finditer(re.escape(tag), row["text"])
    ],
    axis=1,
)

In [None]:
# Turn every training row into a {'text', 'entities'} record, where each entity
# carries its span boundaries, its type and the matching slice of the text.
datalist = [
    {
        'text': row["text"],
        'entities': [
            {
                'start_idx': _start_idx,
                'end_idx': _end_idx,
                'type': _type,
                'entity': row["text"][_start_idx: _end_idx],
            }
            for _type, _start_idx, _end_idx in row["entities"]
        ],
    }
    for _, row in train.iterrows()
]

In [None]:
# Build a frame from the records and hold out 30% of it as the dev set.
# NOTE(review): no random_state is passed, so the split is only reproducible if the
# global NumPy RNG was seeded beforehand — confirm that set_seed covers it.
data = pd.DataFrame(datalist)
train_data_df, dev_data_df = train_test_split(data, test_size=0.3)

In [None]:
# Load the extra labelled data from data/tta.csv and preprocess it the same way
# as the training table (trailing punctuation stripped, punctuation normalised).
tta = pd.read_csv("data/tta.csv", sep="\t")
# Raw string: `\(` and `\-` are regex escapes, not (invalid) Python string escapes.
tta["text"] = tta["text"].apply(lambda line: E_trans_to_C(re.sub(r"[\(《：→；，。、\-”]+$", "", line.strip())))
# NOTE(review): `eval` executes whatever is stored in the "tag" column; this is only
# safe while data/tta.csv is trusted. Consider `ast.literal_eval` if it may not be.
tta["tag"] = tta["tag"].apply(lambda x: [E_trans_to_C(i) for i in eval(str(x))])
# Tags are literal strings: escape them so regex metacharacters are matched verbatim,
# and skip empty tags, which would otherwise match at every position.
tta["entities"] = tta.progress_apply(
    lambda row: [
        ["LOC", *i.span()]
        for tag in row["tag"] if tag
        for i in re.finditer(re.escape(tag), row["text"])
    ],
    axis=1,
)

tta_datalist = []

# Iterate over `tta`, not `train`: looping over `train` here would ignore the tta
# rows entirely and append the full training table (including rows held out in
# dev_data_df) to the training split, leaking the dev set into training.
for _, row in tta.iterrows():
    entity_labels = []
    for _type, _start_idx, _end_idx in row["entities"]:
        entity_labels.append({
            'start_idx': _start_idx,
            'end_idx': _end_idx,
            'type': _type,
            'entity': row["text"][_start_idx: _end_idx]
        })

    tta_datalist.append({
        'text': row["text"],
        'entities': entity_labels
    })

# Append the tta records to the training split only; the dev split is left untouched.
tta_data = pd.DataFrame(tta_datalist)
train_data_df = pd.concat([train_data_df, tta_data]).reset_index(drop=True)

In [None]:
def get_label(x):
    """Convert span-style entities into index-list style labels.

    Args:
        x: Iterable of dicts, each with 'start_idx', 'end_idx', 'type'
            and 'entity' keys.

    Returns:
        A list with one dict per input entity, holding 'idx' (every integer
        from start_idx up to but excluding end_idx), 'type' and 'entity',
        in that key order.
    """
    return [
        {
            'idx': list(range(item['start_idx'], item['end_idx'])),
            'type': item['type'],
            'entity': item['entity'],
        }
        for item in x
    ]

In [None]:
# Derive the index-list label representation for both splits.
train_data_df['label'] = train_data_df['entities'].apply(get_label)
dev_data_df['label'] = dev_data_df['entities'].apply(get_label)

In [None]:
# Keep only the text/label columns of each split, then serialise every label
# list to its string representation.
train_data_df = train_data_df.loc[:, ['text', 'label']]
dev_data_df = dev_data_df.loc[:, ['text', 'label']]
train_data_df['label'] = train_data_df['label'].apply(str)
dev_data_df['label'] = dev_data_df['label'].apply(str)

In [None]:
# Wrap both splits in ark_nlp Dataset objects; the dev set reuses the
# categories collected from the training set so both share one label space.
ner_train_dataset = Dataset(train_data_df)
ner_dev_dataset = Dataset(dev_data_df, categories=ner_train_dataset.categories)

In [None]:
# Build the W2NER tokenizer from the vocabulary in the checkpoint directory, with max_seq_len set to 52.
tokenizer = Tokenizer(vocab='./outputs/roberta-finetuned-cosine', max_seq_len=52)

In [None]:
# Run the tokenizer over both datasets; the return value is discarded, so this
# presumably converts the samples to model inputs in place — confirm in ark_nlp.
ner_train_dataset.convert_to_ids(tokenizer)
ner_dev_dataset.convert_to_ids(tokenizer)

### 模型构建

In [None]:
# Load the model config from the checkpoint directory; the number of labels is
# taken from the size of the training set's category-to-id map.
config = W2NERBertConfig.from_pretrained('./outputs/roberta-finetuned-cosine', num_labels=len(ner_train_dataset.cat2id))

In [None]:
# Release cached, currently unused GPU memory held by PyTorch before the model is loaded.
torch.cuda.empty_cache()

In [None]:
# Instantiate the W2NER model from the checkpoint directory using the config built above.
dl_module = W2NERBert.from_pretrained('./outputs/roberta-finetuned-cosine', config=config)

In [None]:
# Set the number of training epochs and the batch size.
num_epoches, batch_size = 25, 256

In [None]:
# Optimizer from ark_nlp's W2NER helper. Two learning rates are passed: `lr` and a
# much smaller `bert_lr` (presumably applied to the BERT encoder parameters — confirm in ark_nlp).
optimizer = get_default_w2ner_optimizer(dl_module, lr=5e-3, bert_lr=2.5e-5, weight_decay=0.01)

In [None]:
# Mind how the number of learning-rate decay steps is configured.
# show_step is set above the number of full batches per epoch.
# t_total counts only full batches (floor division) over all epochs.
# NOTE(review): if the data loader also yields a final partial batch, the real
# number of optimizer steps exceeds t_total — confirm against ark_nlp.
show_step = len(ner_train_dataset) // batch_size + 2
t_total = len(ner_train_dataset) // batch_size * num_epoches
# Cosine schedule with warmup_ratio=0.6 (presumably the fraction of t_total spent warming up — confirm).
scheduler = get_default_cosine_schedule_with_warmup(optimizer, t_total, warmup_ratio=0.6)

In [None]:
# Assemble the training task from the module, optimizer and scheduler, with the
# 'ce' loss, gradient clipping, EMA and FGM options, and an output directory.
# NOTE(review): `cude_device` looks like a typo for `cuda_device`; if Task swallows
# unknown keyword arguments, the requested GPU index may be silently ignored —
# confirm against the Task signature of the ark_nlp version in use.
model = Task(dl_module, optimizer, 'ce', cude_device=2, scheduler=scheduler, grad_clip=10.0, ema_decay=0.995, fgm_attack=True, save_path="outputs/roberta-finetuned-tta", )

In [None]:
# Train for num_epoches epochs, passing the dev set alongside the training set.
model.fit(ner_train_dataset, ner_dev_dataset, epochs=num_epoches, batch_size=batch_size, show_step=show_step)

In [None]:
from ark_nlp.model.ner.w2ner_bert import Predictor

In [None]:
# Build a predictor around the task's module, reusing the tokenizer and the
# training set's category-to-id map.
ner_predictor_instance = Predictor(model.module, tokenizer, ner_train_dataset.cat2id)

In [None]:
# Predict entities for every test text.
# predict_results gets one (possibly empty) list of unique entity strings per text;
# tta_data keeps only the [text, entities] pairs where something was predicted.
predict_results = []
tta_data = []

for text in tqdm(test["text"].tolist()):
    # A set removes duplicate entity strings predicted for the same text.
    entities = list({pred["entity"] for pred in ner_predictor_instance.predict_one_sample(text)})

    if entities:
        tta_data.append([text, entities])

    predict_results.append(entities)

In [None]:
# Write the submission file: a "tag" header line followed by the string form of
# each test row's predicted entity list, one per line.
with open('w2ner_submit_tta.txt', 'w', encoding='utf-8') as submit_file:
    submit_file.write("tag\n")
    submit_file.writelines(f"{str(entities)}\n" for entities in predict_results)

In [None]:
# Persist the texts that received at least one prediction as a tab-separated (text, tag) table.
# Note: `tta_data` is rebound here from a list of [text, tags] pairs to a DataFrame.
tta_data = pd.DataFrame(tta_data, columns=["text", "tag"])
tta_data.to_csv("data/tta_finetune.csv", index=False, encoding="utf-8", sep="\t")

In [None]:
# Copy the tokenizer from the source checkpoint into the new output directory
# (the same path that was passed as save_path when the task was created).
AutoTokenizer.from_pretrained('./outputs/roberta-finetuned-cosine').save_pretrained("outputs/roberta-finetuned-tta")