In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import re
import os
import jieba
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer
from ark_nlp.model.ner.w2ner_bert import W2NERBertConfig
from ark_nlp.model.ner.w2ner_bert import Tokenizer
from ark_nlp.model.ner.w2ner_bert import W2NERBert
from ark_nlp.model.ner.w2ner_bert import Dataset
from ark_nlp.model.ner.w2ner_bert import Task
from ark_nlp.model.ner.w2ner_bert import get_default_w2ner_optimizer
from ark_nlp.factory.lr_scheduler import get_default_linear_schedule_with_warmup, get_default_cosine_schedule_with_warmup
from ark_nlp.factory.utils.seed import set_seed

In [3]:
set_seed(42)
tqdm.pandas(desc="inference")

In [4]:
def E_trans_to_C(string):
    E_pun = u',.!?[]()<>"\''
    C_pun = u'，。！？【】（）《》“‘'
    table= {ord(f):ord(t) for f,t in zip(E_pun,C_pun)}
    return string.translate(table)

In [5]:
test = pd.read_csv("data/test.csv", sep="\t")
train = pd.read_csv("data/train.csv", sep="\t")

In [6]:
test["text"] = test["text"].apply(lambda line: E_trans_to_C(re.sub("[\(《：；→，。、\-”]+$", "", line.strip())))
train["text"] = train["text"].apply(lambda line: E_trans_to_C(re.sub("[\(《：→；，。、\-”]+$", "", line.strip())))
train["tag"] = train["tag"].apply(lambda x: [E_trans_to_C(i) for i in eval(str(x))])

In [7]:
train["entities"] = train.progress_apply(lambda row: [["LOC", *i.span()] for tag in row["tag"] for i in re.finditer(tag, row["text"])], axis=1)

inference: 100%|██████████| 6000/6000 [00:00<00:00, 19709.33it/s]


In [8]:
datalist = []

for _, row in train.iterrows():
    entity_labels = []
    for _type, _start_idx, _end_idx in row["entities"]:
        entity_labels.append({
            'start_idx': _start_idx,
            'end_idx': _end_idx,
            'type': _type,
            'entity': row["text"][_start_idx: _end_idx]
    })

    datalist.append({
        'text': row["text"],
        'entities': entity_labels
    })

In [9]:
train_data_df = pd.DataFrame(datalist)

In [10]:
def get_label(x):
    
    entities = []
    for entity in x:
        entity_ = {}
        idx = list(range(entity['start_idx'], entity['end_idx']))
        entity_['idx'] = idx
        entity_['type'] = entity['type']
        entity_['entity'] = entity['entity']
        entities.append(entity_)
    
    return entities

In [11]:
train_data_df['label'] = train_data_df['entities'].apply(lambda x: get_label(x))

In [12]:
train_data_df = train_data_df.loc[:,['text', 'label']]
train_data_df['label'] = train_data_df['label'].apply(lambda x: str(x))

In [13]:
ner_train_dataset = Dataset(train_data_df)

In [14]:
tokenizer = Tokenizer(vocab='roberta-base-finetuned-cluener2020-chinese', max_seq_len=52)

In [15]:
ner_train_dataset.convert_to_ids(tokenizer)

### 模型构建

In [16]:
config = W2NERBertConfig.from_pretrained('roberta-base-finetuned-cluener2020-chinese', num_labels=len(ner_train_dataset.cat2id))

In [17]:
torch.cuda.empty_cache()

In [18]:
dl_module = W2NERBert.from_pretrained('roberta-base-finetuned-cluener2020-chinese', config=config)

Some weights of the model checkpoint at roberta-base-finetuned-cluener2020-chinese were not used when initializing W2NERBert: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing W2NERBert from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing W2NERBert from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of W2NERBert were not initialized from the model checkpoint at roberta-base-finetuned-cluener2020-chinese and are newly initialized: ['convLayer.base.1.bias', 'predictor.biaffine.weight', 'predictor.linear.bias', 'encoder.bias_hh_l0', 'convLayer.base.1.weight', 'predictor.mlp2.linear.weight', 'encoder.weight_hh_l0_reverse', 'cln.beta', 'predictor.mlp1.linear.bias',

In [19]:
# 设置运行次数
# num_epoches, batch_size = 10, 16 # 0.91
# num_epoches, batch_size = 15, 16
num_epoches, batch_size = 40, 256 # 0.91

In [20]:
# optimizer = get_default_w2ner_optimizer(dl_module) # 0.91
# optimizer = get_default_w2ner_optimizer(dl_module, lr=5e-4, bert_lr=1e-5, weight_decay=0.01)
# optimizer = get_default_w2ner_optimizer(dl_module, lr=5e-2, bert_lr=5e-5, weight_decay=0.01)
optimizer = get_default_w2ner_optimizer(dl_module, lr=1e-2, bert_lr=5e-5, weight_decay=0.01) # 0.91143

In [21]:
# 注意lr衰减轮次的设定
show_step = len(ner_train_dataset) // batch_size + 2
t_total = len(ner_train_dataset) // batch_size * num_epoches
scheduler = get_default_cosine_schedule_with_warmup(optimizer, t_total, warmup_ratio=0.2)

In [22]:
model = Task(dl_module, optimizer, 'ce', cude_device=2, scheduler=scheduler, grad_clip=5.0, ema_decay=0.995, fgm_attack=True, save_path="outputs/roberta-finetuned-all", )

In [23]:
model.fit(ner_train_dataset, epochs=num_epoches, batch_size=batch_size, show_step=show_step)

100%|██████████| 24/24 [00:44<00:00,  1.84s/it]


epoch:[0],train loss is:0.324292 



100%|██████████| 24/24 [00:44<00:00,  1.86s/it]


epoch:[1],train loss is:0.024199 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[2],train loss is:0.019560 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[3],train loss is:0.012175 



100%|██████████| 24/24 [00:44<00:00,  1.86s/it]


epoch:[4],train loss is:0.004632 



100%|██████████| 24/24 [00:44<00:00,  1.85s/it]


epoch:[5],train loss is:0.003418 



100%|██████████| 24/24 [00:44<00:00,  1.85s/it]


epoch:[6],train loss is:0.003075 



100%|██████████| 24/24 [00:44<00:00,  1.86s/it]


epoch:[7],train loss is:0.002798 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[8],train loss is:0.002524 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[9],train loss is:0.002366 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[10],train loss is:0.002226 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[11],train loss is:0.002126 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[12],train loss is:0.002021 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[13],train loss is:0.001882 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[14],train loss is:0.001790 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[15],train loss is:0.001757 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[16],train loss is:0.001610 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[17],train loss is:0.001563 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[18],train loss is:0.001467 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[19],train loss is:0.001423 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[20],train loss is:0.001344 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[21],train loss is:0.001309 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[22],train loss is:0.001223 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[23],train loss is:0.001161 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[24],train loss is:0.001170 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[25],train loss is:0.001122 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[26],train loss is:0.001093 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[27],train loss is:0.001057 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[28],train loss is:0.001027 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[29],train loss is:0.001004 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[30],train loss is:0.000962 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[31],train loss is:0.000937 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[32],train loss is:0.000953 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[33],train loss is:0.000918 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[34],train loss is:0.000931 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[35],train loss is:0.000920 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[36],train loss is:0.000905 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[37],train loss is:0.000906 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[38],train loss is:0.000942 



100%|██████████| 24/24 [00:44<00:00,  1.87s/it]


epoch:[39],train loss is:0.000914 

save finetuned model to outputs/roberta-finetuned-all.


In [24]:
from ark_nlp.model.ner.w2ner_bert import Predictor

In [25]:
ner_predictor_instance = Predictor(model.module, tokenizer, ner_train_dataset.cat2id)

In [26]:
predict_results = []
pseudo_data = []

for _line in tqdm(test["text"].tolist()):
    label = set()
    for _preditc in ner_predictor_instance.predict_one_sample(_line):
        label.add(_preditc["entity"])
    
    label = list(label)
    if len(label) > 0:
        pseudo_data.append([_line, label])

    predict_results.append(label)

100%|██████████| 2657/2657 [00:35<00:00, 74.68it/s]


In [27]:
with open('w2ner_submit_all.txt', 'w', encoding='utf-8') as f:
    f.write("tag\n")
    for _result in predict_results:
       f.write(f"{str(_result)}\n")

In [28]:
pseudo_data = pd.DataFrame(pseudo_data, columns=["text", "tag"])
pseudo_data.to_csv("data/pseudo_all.csv", index=False, encoding="utf-8", sep="\t")

In [29]:
# AutoTokenizer.from_pretrained('roberta-base-finetuned-cluener2020-chinese').save_pretrained("outputs/roberta-finetuned-all")

('outputs/roberta-finetuned-all/tokenizer_config.json',
 'outputs/roberta-finetuned-all/special_tokens_map.json',
 'outputs/roberta-finetuned-all/vocab.txt',
 'outputs/roberta-finetuned-all/added_tokens.json',
 'outputs/roberta-finetuned-all/tokenizer.json')