In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to the local project packages
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from data_preprocess.generate_mrc_dataset import generate_query_ner_dataset
from data_preprocess.file_utils import load_conll

In [3]:
# Path to one test protocol file in CoNLL format.
# NOTE(review): no other cell visible in this notebook reads this variable;
# the conversion cell spells the same path out as a literal.
input_test_path = "../wlp_wnut/WNUT_2020_NER/data/test_data/Conll_Format/protocol_103_conll.txt"

In [3]:
# Conversion of the full train/dev/test CoNLL splits, disabled and kept for reference:
# generate_query_ner_dataset("../wlp_wnut/data/ner/wlp_conll/train.txt", "../wlp_wnut/data/ner/wlp_mrc_query/train_mrc_query.json", "flat", "en_wnut_20_wlp")
# generate_query_ner_dataset("../wlp_wnut/data/ner/wlp_conll/dev.txt", "../wlp_wnut/data/ner/wlp_mrc_query/dev_mrc_query.json", "flat", "en_wnut_20_wlp")
# generate_query_ner_dataset("../wlp_wnut/data/ner/wlp_conll/test.txt", "../wlp_wnut/data/ner/wlp_mrc_query/test_mrc_query.json", "flat", "en_wnut_20_wlp")

import os

# Output directory for the small one-protocol-per-split dataset built below.
os.makedirs("../wlp_wnut/data/ner/wlp_mrc_query_test", exist_ok=True)

# (CoNLL input, MRC-query output) pairs, converted in train / dev / test order.
for conll_path, mrc_path in (
    ("../wlp_wnut/WNUT_2020_NER/data/train_data/Conll_Format/protocol_100_conll.txt", "../wlp_wnut/data/ner/wlp_mrc_query_test/mrc-ner.train"),
    ("../wlp_wnut/WNUT_2020_NER/data/dev_data/Conll_Format/protocol_102_conll.txt", "../wlp_wnut/data/ner/wlp_mrc_query_test/mrc-ner.dev"),
    ("../wlp_wnut/WNUT_2020_NER/data/test_data/Conll_Format/protocol_103_conll.txt", "../wlp_wnut/data/ner/wlp_mrc_query_test/mrc-ner.test"),
):
    generate_query_ner_dataset(conll_path, mrc_path, "flat", "en_wnut_20_wlp")

In [4]:
import os 
import argparse 
import numpy as np 
import random
from transformers import AutoTokenizer
import torch

from data_loader.model_config import Config 
from data_loader.mrc_data_loader import MRCNERDataLoader
from data_loader.mrc_data_processor import Conll03Processor, MSRAProcessor, Onto4ZhProcessor, Onto5EngProcessor, GeniaProcessor, ACE2004Processor, ACE2005Processor, ResumeZhProcessor, QueryNERProcessor
from data_loader.mrc_utils import convert_examples_to_features
from layer.optim import AdamW, lr_linear_decay, BertAdam
from model.bert_mrc import BertQueryNER
from data_loader.bert_tokenizer import BertTokenizer4Tagger 
from metric.mrc_ner_evaluate  import flat_ner_performance, nested_ner_performance


export REPO_PATH=.
export PYTHONPATH="$PYTHONPATH:$REPO_PATH"
export CONFIG_PATH=${REPO_PATH}/config/en_bert_base_cased.json  # previously ${FOLDER_PATH}, which is never defined in this snippet
export DATA_PATH=~/Documents/wlp_wnut/data/ner/wlp_mrc_query
export BERT_PATH=bert-base-cased
export OUTPUT_PATH=./wlp_wnut_output
export DATA_SIGN=en_wnut_20_wlp
export ENTITY_SIGN=flat
export N_GPU=1  # previously unset, so "--n_gpu $N_GPU" expanded to a flag with no value

CUDA_VISIBLE_DEVICES=1 python3 $REPO_PATH/run/train_bert_mrc.py \
--data_dir $DATA_PATH \
--n_gpu $N_GPU \
--entity_sign $ENTITY_SIGN \
--data_sign $DATA_SIGN \
--bert_model $BERT_PATH \
--config_path $CONFIG_PATH \
--output_dir $OUTPUT_PATH \
--train_batch_size 4 \
--dev_batch_size 4 \
--test_batch_size 4 \
--num_train_epochs 5

In [5]:
class WlpWnut20Processor(QueryNERProcessor):
    """Query-NER processor used for the ``en_wnut_20_wlp`` data sign.

    Only the label inventory is defined here; everything else is inherited
    from ``QueryNERProcessor``.
    """

    def get_labels(self):
        """Return a new list holding every entity label followed by ``"O"``."""
        entity_labels = (
            "Action", "Reagent", "Modifier", "Location", "Amount", "Time",
            "Method", "Concentration", "Temperature", "Device", "Measure-Type",
            "Numerical", "Speed", "Generic-Measure", "Size", "Seal", "pH",
            "Mention",
        )
        return list(entity_labels) + ["O"]

In [6]:
def load_data(config):
    """Build the train/dev/test dataloaders for the dataset named by ``config.data_sign``.

    Args:
        config: merged run configuration. This function reads ``data_sign``,
            ``bert_model`` and ``num_data_processor`` and hands the whole
            object to ``MRCNERDataLoader``.

    Returns:
        Tuple ``(train_dataloader, dev_dataloader, test_dataloader,
        num_train_steps, label_list)``.

    Raises:
        ValueError: if ``config.data_sign`` is not a supported dataset name.
    """
    print("-*-"*10)
    print("current data_sign: {}".format(config.data_sign))

    # Each entry is a zero-argument factory so that only the selected
    # processor class is looked up and instantiated.
    processor_factories = {
        "conll03": lambda: Conll03Processor(),
        "zh_msra": lambda: MSRAProcessor(),
        "zh_onto": lambda: Onto4ZhProcessor(),
        "en_onto": lambda: Onto5EngProcessor(),
        "genia": lambda: GeniaProcessor(),
        "ace2004": lambda: ACE2004Processor(),
        "ace2005": lambda: ACE2005Processor(),
        "resume": lambda: ResumeZhProcessor(),
        "en_wnut_20_wlp": lambda: WlpWnut20Processor(),
    }
    if config.data_sign not in processor_factories:
        # Name the offending value and the accepted ones so a typo is easy to spot.
        raise ValueError(
            "Unknown data_sign {!r}; expected one of: {}".format(
                config.data_sign, ", ".join(sorted(processor_factories))
            )
        )
    data_processor = processor_factories[config.data_sign]()

    label_list = data_processor.get_labels()
    tokenizer = AutoTokenizer.from_pretrained(config.bert_model)

    print(tokenizer)

    dataset_loaders = MRCNERDataLoader(config, data_processor, label_list, tokenizer, mode="train", allow_impossible=True)
    train_dataloader = dataset_loaders.get_dataloader(data_sign="train", num_data_processor=config.num_data_processor)
    dev_dataloader = dataset_loaders.get_dataloader(data_sign="dev", num_data_processor=config.num_data_processor)
    test_dataloader = dataset_loaders.get_dataloader(data_sign="test", num_data_processor=config.num_data_processor)
    # NOTE(review): the value is obtained from get_num_train_epochs() but is
    # named and returned as a step count -- confirm which one it really is.
    num_train_steps = dataset_loaders.get_num_train_epochs()

    return train_dataloader, dev_dataloader, test_dataloader, num_train_steps, label_list


def load_model(config, num_train_steps, label_list):
    """Instantiate ``BertQueryNER`` and its optimizer for training.

    Args:
        config: merged run configuration. Reads ``n_gpu``, ``learning_rate``,
            ``fp16``, ``amp_level`` and ``local_rank``; the whole object is
            also passed to ``BertQueryNER``.
        num_train_steps: not used by this function (only the commented-out
            ``BertAdam`` alternative referenced it); kept for the call signature.
        label_list: not used by this function; kept for the call signature.

    Returns:
        Tuple ``(model, optimizer, scheduler, device, n_gpu)`` where
        ``scheduler`` is always ``None``.

    Raises:
        ImportError: if ``config.fp16`` is set and apex cannot be imported.
    """
    # Use the GPU when one is present, otherwise fall back to the CPU instead
    # of failing on the first ``.to(device)`` call.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = config.n_gpu
    model = BertQueryNER(config, )
    model.to(device)
    if config.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare the optimizer: parameters whose name contains one of these
    # fragments get no weight decay, everything else gets 0.01.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight", 'gamma', 'beta']
    decay_params, no_decay_params = [], []
    for name, param in model.named_parameters():
        if any(nd in name for nd in no_decay):
            no_decay_params.append(param)
        else:
            decay_params.append(param)
    optimizer_grouped_parameters = [
        {"params": decay_params, "weight_decay": 0.01},
        {"params": no_decay_params, "weight_decay": 0.0},
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate, betas=(0.9, 0.98), eps=1e-6, weight_decay=0.01)
    # Alternative optimizer, kept for reference:
    # optimizer = BertAdam(optimizer_grouped_parameters, lr=config.learning_rate, warmup=config.warmup_proportion,
    #                     t_total=num_train_steps, max_grad_norm=config.clip_grad)
    scheduler = None

    if config.fp16:
        try:
            from apex import amp
        except ImportError as err:
            # Chain the original error so the underlying import failure stays visible.
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") from err
        model, optimizer = amp.initialize(model, optimizer, opt_level=config.amp_level)

    # Distributed training (should be after apex fp16 initialization)
    if config.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[config.local_rank], output_device=config.local_rank, find_unused_parameters=True
            )

    return model, optimizer, scheduler, device, n_gpu


def train(model, optimizer, sheduler,  train_dataloader, dev_dataloader, test_dataloader, config, \
    device, n_gpu, label_list):
    """Fine-tune ``model`` and report dev/test metrics at regular checkpoints.

    Every ``config.checkpoint`` steps within an epoch the model is scored on
    ``dev_dataloader``. Whenever the dev F1 strictly improves, the model is
    optionally saved (``config.export_model``) under ``config.output_dir`` and
    scored on ``test_dataloader``. A summary of the best dev scores and the
    matching test scores is printed at the end.

    Args:
        model: network called with the batch tensors; must return the loss.
        optimizer: optimizer stepped once per batch.
        sheduler: not used by this function; kept for the call signature.
        train_dataloader, dev_dataloader, test_dataloader: iterables yielding
            8-tuples ``(input_ids, input_mask, segment_ids, start_pos,
            end_pos, span_pos, span_label_mask, ner_cate)``.
        config: reads ``num_train_epochs``, ``n_gpu``, ``fp16``,
            ``max_grad_norm``, ``checkpoint``, ``export_model`` and
            ``output_dir``; also forwarded to ``eval_checkpoint``.
        device: device each batch is moved to.
        n_gpu, label_list: forwarded to ``eval_checkpoint``.

    Returns:
        None. Results are printed.
    """
    dev_best_acc = 0
    dev_best_precision = 0
    dev_best_recall = 0
    dev_best_f1 = 0
    dev_best_loss = 10000000000000

    test_acc_when_dev_best = 0
    test_pre_when_dev_best = 0
    test_rec_when_dev_best = 0
    test_f1_when_dev_best = 0
    test_loss_when_dev_best = 1000000000000000

    if config.fp16:
        # Imported once, and only when mixed precision is requested, rather
        # than on every training step.
        from apex import amp

    model.train()
    for idx in range(int(config.num_train_epochs)):
        # Step counter restarts every epoch, so checkpoints are per-epoch.
        nb_tr_steps = 0
        print("#######"*10)
        print("EPOCH: ", str(idx))
        if idx != 0:
            # Project helper applied to the optimizer at the start of every epoch after the first.
            lr_linear_decay(optimizer)
        for batch in train_dataloader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, start_pos, end_pos, span_pos, span_label_mask, ner_cate = batch

            loss = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, \
                start_positions=start_pos, end_positions=end_pos, span_positions=span_pos, span_label_mask=span_label_mask)

            if config.n_gpu > 1:
                # One loss per replica under DataParallel; reduce to a scalar.
                loss = loss.mean()

            if config.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)

            optimizer.step()
            model.zero_grad()
            nb_tr_steps += 1

            if nb_tr_steps % config.checkpoint != 0:
                continue

            print("-*-"*15)
            print("current training loss is : ")
            print(loss.item())
            model, tmp_dev_loss, tmp_dev_acc, tmp_dev_prec, tmp_dev_rec, tmp_dev_f1 = eval_checkpoint(model, dev_dataloader, config, device, n_gpu, label_list, eval_sign="dev")
            print("......"*10)
            print("DEV: loss, acc, precision, recall, f1")
            print(tmp_dev_loss, tmp_dev_acc, tmp_dev_prec, tmp_dev_rec, tmp_dev_f1)

            if tmp_dev_f1 > dev_best_f1 :
                dev_best_acc = tmp_dev_acc
                dev_best_loss = tmp_dev_loss
                dev_best_precision = tmp_dev_prec
                dev_best_recall = tmp_dev_rec
                dev_best_f1 = tmp_dev_f1

                # export model
                if config.export_model:
                    # Unwrap DataParallel/DistributedDataParallel before saving.
                    model_to_save = model.module if hasattr(model, "module") else model
                    output_model_file = os.path.join(config.output_dir, "bert_finetune_model_{}_{}.bin".format(str(idx),str(nb_tr_steps)))
                    torch.save(model_to_save.state_dict(), output_model_file)
                    print("SAVED model path is :")
                    print(output_model_file)

                # The model is already on ``device`` and eval_checkpoint hands
                # back the same object, so no device round-trip is needed.
                model, tmp_test_loss, tmp_test_acc, tmp_test_prec, tmp_test_rec, tmp_test_f1 = eval_checkpoint(model, test_dataloader, config, device, n_gpu, label_list, eval_sign="test")
                print("......"*10)
                print("TEST: loss, acc, precision, recall, f1")
                print(tmp_test_loss, tmp_test_acc, tmp_test_prec, tmp_test_rec, tmp_test_f1)

                test_acc_when_dev_best = tmp_test_acc
                test_pre_when_dev_best = tmp_test_prec
                test_rec_when_dev_best = tmp_test_rec
                test_f1_when_dev_best = tmp_test_f1
                test_loss_when_dev_best = tmp_test_loss

            print("-*-"*15)

    print("=&="*15)
    print("Best DEV : overall best loss, acc, precision, recall, f1 ")
    print(dev_best_loss, dev_best_acc, dev_best_precision, dev_best_recall, dev_best_f1)
    print("scores on TEST when Best DEV:loss, acc, precision, recall, f1 ")
    print(test_loss_when_dev_best, test_acc_when_dev_best, test_pre_when_dev_best, test_rec_when_dev_best, test_f1_when_dev_best)
    print("=&="*15)


def eval_checkpoint(model_object, eval_dataloader, config, device, n_gpu, label_list, eval_sign="dev"):
    """Score ``model_object`` on one evaluation split.

    For every batch the model is called twice: once with the gold positions,
    which yields the loss, and once without them, which yields
    ``(start_labels, end_labels, span_scores)``. The predictions and gold
    labels of all batches are then passed to ``flat_ner_performance`` when
    ``config.entity_sign == "flat"`` and to ``nested_ner_performance``
    otherwise.

    Args:
        model_object: model to evaluate; it is back in training mode when
            this function returns or raises after the loop.
        eval_dataloader: iterable yielding 8-tuples ``(input_ids, input_mask,
            segment_ids, start_pos, end_pos, span_pos, span_label_mask,
            ner_cate)``; intended for the dev or test loader.
        config: reads ``entity_sign`` and ``entity_threshold``.
        device: device the batch tensors are moved to.
        n_gpu: not used by this function; kept for the call signature.
        label_list: forwarded to the metric function.
        eval_sign: name of the split, used only in the error message.

    Returns:
        Tuple ``(model_object, average_loss, accuracy, precision, recall, f1)``
        with every number rounded to 4 decimals.

    Raises:
        ValueError: if ``eval_dataloader`` yields no batches.
    """
    eval_loss = 0
    eval_steps = 0
    start_pred_lst, end_pred_lst, span_pred_lst = [], [], []
    start_gold_lst, end_gold_lst, span_gold_lst = [], [], []
    ner_cate_lst = []

    # Switch to eval mode once for the whole pass instead of once per batch.
    model_object.eval()
    with torch.no_grad():
        for input_ids, input_mask, segment_ids, start_pos, end_pos, span_pos, span_label_mask, ner_cate in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            start_pos = start_pos.to(device)
            end_pos = end_pos.to(device)
            span_pos = span_pos.to(device)
            span_label_mask = span_label_mask.to(device)

            # With gold labels the model returns the loss; without them, the predictions.
            tmp_eval_loss = model_object(input_ids, segment_ids, input_mask, start_pos, end_pos, span_pos, span_label_mask)
            start_labels, end_labels, span_scores = model_object(input_ids, segment_ids, input_mask)

            eval_loss += tmp_eval_loss.mean().item()
            eval_steps += 1

            # ner_cate is never moved to ``device``, so it is converted directly.
            ner_cate_lst += ner_cate.numpy().tolist()

            start_pred_lst += start_labels.detach().cpu().numpy().tolist()
            end_pred_lst += end_labels.detach().cpu().numpy().tolist()
            # Raw span scores are collected; the metric function receives the threshold.
            span_pred_lst += span_scores.detach().cpu().numpy().tolist()

            start_gold_lst += start_pos.to("cpu").numpy().tolist()
            end_gold_lst += end_pos.to("cpu").numpy().tolist()
            span_gold_lst += span_pos.to("cpu").numpy().tolist()

    # Restore training mode before anything below can raise.
    model_object.train()

    if eval_steps == 0:
        # Fail with a clear message instead of a bare division by zero.
        raise ValueError("eval_dataloader for '{}' produced no batches; cannot compute metrics".format(eval_sign))

    performance_fn = flat_ner_performance if config.entity_sign == "flat" else nested_ner_performance
    eval_accuracy, eval_precision, eval_recall, eval_f1 = performance_fn(start_pred_lst, end_pred_lst, span_pred_lst, start_gold_lst, end_gold_lst, span_gold_lst, ner_cate_lst, label_list, threshold=config.entity_threshold, dims=2)

    average_loss = round(eval_loss / eval_steps, 4)
    eval_f1 = round(eval_f1, 4)
    eval_precision = round(eval_precision, 4)
    eval_recall = round(eval_recall, 4)
    eval_accuracy = round(eval_accuracy, 4)

    return model_object, average_loss, eval_accuracy, eval_precision, eval_recall, eval_f1


def merge_config(args_config):
    """Combine the JSON model config with the run arguments.

    The JSON file at ``args_config.config_path`` is loaded into a ``Config``,
    ``args_config`` is applied to it through ``update_args``, and the result
    is printed and returned.
    """
    merged = Config.from_json_file(args_config.config_path)
    merged.update_args(args_config)
    merged.print_config()
    return merged


In [7]:
# Run arguments, mirroring what the command-line training script would parse.
args = argparse.Namespace(
    config_path="./config/en_bert_base_cased.json",
    data_dir="../wlp_wnut/data/ner/wlp_mrc_query/",
    bert_model="bert-base-cased",
    task_name=None,
    max_seq_length=128,
    train_batch_size=4,
    dev_batch_size=4,
    test_batch_size=4,
    checkpoint=100,
    learning_rate=5e-5,
    num_train_epochs=5,
    warmup_proportion=0.1,
    max_grad_norm=1.0,
    gradient_accumulation_steps=1,
    seed=3006,
    output_dir="./wlp_wnut_output",
    data_sign="en_wnut_20_wlp",
    weight_start=1.0,
    weight_end=1.0,
    weight_span=1.0,
    entity_sign="flat",
    n_gpu=1,
    dropout=0.2,
    entity_threshold=0.5,
    num_data_processor=1,
    data_cache=True,
    export_model=True,
    do_lower_case=False,
    fp16=False,
    # apex optimisation levels are spelled with the letter O ("O0".."O3");
    # the previous value "02" used the digit zero.
    amp_level="O2",
    local_rank=-1,
)

# Batch size per optimisation step when gradients are accumulated over several steps.
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

# Seed every random number generator used by the run.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)

# Checkpoints are written here by the training loop.
os.makedirs(args.output_dir, exist_ok=True)

In [8]:
# Overlay the notebook arguments on the JSON model config.
# merge_config also prints the merged configuration.
args_config = args

config = merge_config(args_config)

Please notice that merge the args_dict and json_config ... ...
{
  "bert_frozen": "false",
  "hidden_size": 768,
  "hidden_dropout_prob": 0.2,
  "classifier_sign": "multi_nonlinear",
  "clip_grad": 1,
  "bert_config": {
    "attention_probs_dropout_prob": 0.1,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "max_position_embeddings": 512,
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "type_vocab_size": 2,
    "vocab_size": 28996
  },
  "config_path": "./config/en_bert_base_cased.json",
  "data_dir": "../wlp_wnut/data/ner/wlp_mrc_query/",
  "bert_model": "bert-base-cased",
  "task_name": null,
  "max_seq_length": 128,
  "train_batch_size": 4,
  "dev_batch_size": 4,
  "test_batch_size": 4,
  "checkpoint": 100,
  "learning_rate": 5e-05,
  "num_train_epochs": 5,
  "warmup_proportion": 0.1,
  "max_grad_norm": 1.0,
  "gradient_accumulation_steps": 1,
  "seed": 3006,
  "output

In [9]:
# Builds a 100x10 tensor of zeros and displays it as the cell result.
# NOTE(review): nothing else visible in the notebook uses this value -- it looks
# like a leftover scratch check; consider removing the cell.
import torch


torch.zeros((100, 10))

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],


In [10]:
# Build the three dataloaders and the label list for the configured data_sign.
train_loader, dev_loader, test_loader, num_train_steps, label_list = load_data(config)

-*--*--*--*--*--*--*--*--*--*-
current data_sign: en_wnut_20_wlp
<transformers.tokenization_bert.BertTokenizer object at 0x7f4b8049e290>
=*==*==*==*==*==*==*==*==*==*=
loading train data ... ...
Before Input Data: 145332
After Input Data slice: 27000
FEATURES SIZE 128
EXAMPLES LENGTH 27000
27000 train data loaded
=*==*==*==*==*==*==*==*==*==*=
loading dev data ... ...
FEATURES SIZE 128
EXAMPLES LENGTH 9000
9000 dev data loaded
=*==*==*==*==*==*==*==*==*==*=
loading test data ... ...
FEATURES SIZE 128
EXAMPLES LENGTH 9000
9000 test data loaded


In [11]:
# Shape of the 4th tensor held by the training dataset.
# Presumably the start-position labels, since the training loop unpacks
# start_pos as the 4th batch element -- TODO confirm against the data loader.
train_loader.dataset.tensors[3].shape

torch.Size([27000, 128])

In [12]:
# Instantiate the model and optimizer; load_model always returns None for the scheduler.
model, optimizer, sheduler, device, n_gpu = load_model(config, num_train_steps, label_list)

In [None]:
# Run fine-tuning; dev metrics are printed every config.checkpoint steps within an epoch.
train(model, optimizer, sheduler, train_loader, dev_loader, test_loader, config, device, n_gpu, label_list)

######################################################################
EPOCH:  0


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)


-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
current training loss is : 
0.48953989148139954
............................................................
DEV: loss, acc, precision, recall, f1
0.1515 0.791 0.0 0.0 0.0
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
current training loss is : 
0.021484261378645897
............................................................
DEV: loss, acc, precision, recall, f1
0.1381 0.791 0.0 0.0 0.0
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
current training loss is : 
0.27062806487083435
............................................................
DEV: loss, acc, precision, recall, f1
0.1451 0.791 0.0 0.0 0.0
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
current training loss is : 
0.019999917596578598
............................................................
DEV: loss, acc, precision, recall, f1
0.1395 0.791 