In [1]:
# On Google Colab, switch to the project's working directory.
# The notebook file itself is used as the marker for "already in the right place".
import os
if "Nested.ipynb" not in os.listdir():
  try:
    # Change directory in a single step so that a failure leaves the current
    # directory untouched instead of stranding the kernel in drive/MyDrive.
    os.chdir(os.path.join("drive/MyDrive", "Nested"))
  except OSError:
    # Only filesystem errors are expected here (missing or unmounted folder);
    # a bare `except:` would also swallow KeyboardInterrupt/SystemExit.
    print("无法找到正确工作目录！")

# Install dependencies.
# These are IPython shell escapes (`!`), so they only run inside a notebook kernel.
# The grep filter drops pip's 'Requirement already satisfied' lines from the output.
!sudo pip install -r requirements.txt | grep -v 'Requirement already satisfied'
# Print the NVIDIA driver/GPU status of the runtime.
!nvidia-smi

# Load all required packages
import copy, time, sys
from datetime import datetime
from random import shuffle
from collections import defaultdict
import numpy as np
import pickle
from typing import Optional, Tuple, List, Dict
import torch
import torch.nn
import torch.cuda
from torch import Tensor

from model.sequence_labeling import BiRecurrentConvCRF4NestedNER
from training.logger import get_logger
from training.utils import adjust_learning_rate, clip_model_grad, create_opt, pack_target, unpack_prediction, Optimizer
from util.evaluate import evaluate
from util.utils import Alphabet, save_dynamic_config, load_dynamic_config
from reader.reader import Reader

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [2]:
# Fix every global random seed so that runs are reproducible.
def set_random_seed(seed):
    """Seed all random number generators used by this notebook.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy and
            PyTorch (CPU and CUDA), and exported as ``PYTHONHASHSEED``.
    """
    # Imported locally because the notebook only does `from random import shuffle`.
    # The training loop shuffles batches with that generator, so it has to be
    # seeded as well; otherwise the batch order differs from run to run.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Trade cuDNN autotuning speed for deterministic kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # NOTE(review): CPython reads PYTHONHASHSEED at interpreter start-up, so this
    # assignment is only inherited by child processes; it does not change
    # hashing in the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)

set_random_seed(2021)


# Global configuration
class Config:
    """Hyper-parameters and file locations shared by every cell of the notebook.

    All values are plain instance attributes. The four vocabulary/label sizes
    and ``token_embed`` start as ``None`` and are filled in once the data has
    been loaded.
    """

    def __init__(self, dataset) -> None:
        self.root_path: str = "."

        # ---- data loader ----
        self.data_set: str = dataset
        self.lowercase: Optional[bool] = None
        self.batch_size: int = 32
        self.if_shuffle: bool = True

        # ---- sizes, overridden when the data is loaded ----
        self.voc_iv_size: Optional[int] = None
        self.voc_ooev_size: Optional[int] = None
        self.char_size: Optional[int] = None
        self.label_size: Optional[int] = None

        # ---- embeddings ----
        self.token_embed: Optional[int] = None
        self.char_embed: int = 128
        self.word_dropout: float = 0.05
        self.char_dropout: float = 0.00

        # ---- character CNN ----
        self.num_filters: int = 256
        self.kernel_size: int = 3

        # ---- LSTM ----
        self.hidden_size: int = 256
        self.layers: int = 2
        self.lstm_dropout: float = 0.20

        # ---- training ----
        self.embed_path: str = f"{self.root_path}/data/word_vec_{self.data_set}.pkl"

        self.epoch: int = 150  # TODO: set the number of training epochs as needed
        # The GPU is used only when it is both requested and actually available.
        gpu_requested = True
        self.if_gpu: bool = gpu_requested and torch.cuda.is_available()

        self.opt: Optimizer = Optimizer.AdaBound
        # SGD gets a larger learning rate than the other optimizers.
        self.lr: float = 0.1 if self.opt == Optimizer.SGD else 0.001
        if self.opt == Optimizer.AdaBound:
            self.final_lr: float = 0.1
        else:
            self.final_lr = None
        self.l2: float = 0.
        self.check_every: int = 1
        self.clip_norm: int = 5

        # ---- early stopping ----
        self.lr_patience: int = 5 if self.opt == Optimizer.SGD else 3

        # ---- file locations ----
        self.data_path: str = f"{self.root_path}/data/{self.data_set}"
        self.train_data_path: str = f"{self.data_path}_train.pkl"
        self.dev_data_path: str = f"{self.data_path}_dev.pkl"
        self.test_data_path: str = f"{self.data_path}_test.pkl"
        self.idx_data_path: str = f"{self.data_path}_idx.pkl"
        self.config_data_path: str = f"{self.data_path}_config.pkl"
        self.model_root_path: str = f"{self.root_path}/dumps"
        self.model_path: str = f"{self.model_root_path}/{self.data_set}_model"

    def __repr__(self) -> str:
        """Return the attribute dictionary rendered as a string."""
        return repr(self.__dict__)

config = Config('NLPHW')

In [3]:
# gen_data.py
def batch_stat(batches: Tuple[List[List[List[int]]],
                List[List[List[int]]],
                List[List[List[List[int]]]],
                List[List[List[Tuple[int, int, int]]]],
                List[List[List[bool]]]]) -> None:
    """Print mention counts for one data split.

    ``batches`` is a 5-tuple of parallel per-batch lists; only the fourth
    element (the label batches) is inspected. For every sentence the mentions
    are grouped once by ``(ent[0], ent[2])`` and once by ``(ent[1], ent[2])``.
    A mention is counted under "start" / "end" when its group holds more than
    one mention.

    Prints ``All <total>, start <n>, end <m>`` and returns ``None``.
    """
    total_mentions = 0
    shared_start = 0
    shared_end = 0
    for _, _, _, sentence_labels, _ in zip(*batches):
        for mentions in sentence_labels:
            # Group sizes keyed by (boundary position, third field of the mention).
            per_start = defaultdict(int)
            per_end = defaultdict(int)
            for mention in mentions:
                per_start[(mention[0], mention[2])] += 1
                per_end[(mention[1], mention[2])] += 1
                total_mentions += 1
            # Only groups with at least two members contribute, with all their members.
            shared_start += sum(size for size in per_start.values() if size > 1)
            shared_end += sum(size for size in per_end.values() if size > 1)

    print("All {}, start {}, end {}".format(total_mentions, shared_start, shared_end))


# Read the embeddings and the raw train/dev/test files through the project Reader.
reader = Reader()
reader.read_and_gen_vectors_pubmed_word2vec(config.embed_path)
reader.read_all_data("./data/" + config.data_set + '/', "train.txt", "dev.txt", "test.txt")

# Batch the three splits; `index` is stored next to them and reloaded later as `test_index`.
(train_batches, dev_batches, test_batches), index = reader.to_batch(config.batch_size)

# Persist every artefact so that the training cell can reload it from disk.
for dump_path, payload in ((config.train_data_path, train_batches),
                           (config.dev_data_path, dev_batches),
                           (config.test_data_path, test_batches),
                           (config.idx_data_path, index)):
  with open(dump_path, 'wb') as f:
    pickle.dump(payload, f)

# Report mention statistics for each split, in train/dev/test order.
for split_batches in (train_batches, dev_batches, test_batches):
  batch_stat(split_batches)

# misc config
misc_dict = save_dynamic_config(reader)
with open(config.config_data_path, 'wb') as f:
  pickle.dump(misc_dict, f)


Max length: 20
Threshold 6: 835
Max length: 20
Threshold 6: 76
Max length: 0
Threshold 6: 0
# mentions: 47006
# mentions: 4469
# mentions: 0
All 47006, start 2149, end 1892
All 4469, start 186, end 230
All 0, start 0, end 0


In [4]:
# train.py
def get_f1(model: BiRecurrentConvCRF4NestedNER, mode: str, file_path: str = None,
           num_sentences: Optional[int] = None) -> float:
    """Evaluate ``model`` on the dev or test batches and return the F1 score.

    The batches, ``config``, ``logger``, the vocabularies and ``test_index``
    are read from notebook globals.

    Args:
        model: Model to evaluate; it is switched to eval mode and left there.
        mode: ``'dev'`` or ``'test'``, selecting which global batches to use.
        file_path: If given, predictions are written to this UTF-8 file, two
            lines per sentence (tokens, then ``start,end label`` items joined
            by ``|``). Sentences are placed at the position given by
            ``test_index``.
        num_sentences: Number of sentence slots to write. Defaults to
            ``len(test_index)`` (previously hard-coded to 1855). The file is
            extended automatically if a larger index is encountered.

    Returns:
        F1 as a fraction in ``[0, 1]``. Precision/recall default to 1.0 when
        there are no predictions/gold mentions respectively.

    Raises:
        ValueError: If ``mode`` is neither ``'dev'`` nor ``'test'``.
    """
    if mode == 'dev':
        batch_zip = zip(dev_token_iv_batches,
                        dev_token_ooev_batches,
                        dev_char_batches,
                        dev_label_batches,
                        dev_mask_batches)
    elif mode == 'test':
        batch_zip = zip(test_token_iv_batches,
                        test_token_ooev_batches,
                        test_char_batches,
                        test_label_batches,
                        test_mask_batches)
    else:
        raise ValueError("mode must be 'dev' or 'test', got {!r}".format(mode))

    # Output slot -> (token line, label line). A dict avoids any fixed capacity.
    results: Dict[int, Tuple[str, str]] = {}
    sentence_counter = 0
    pred_all, pred, recall_all, recall = 0, 0, 0, 0

    with torch.no_grad():
        model.eval()

        for token_iv_batch, token_ooev_batch, char_batch, label_batch, mask_batch in batch_zip:
            token_iv_batch_var = torch.LongTensor(np.array(token_iv_batch,dtype=int))
            token_ooev_batch_var = torch.LongTensor(np.array(token_ooev_batch))
            char_batch_var = torch.LongTensor(np.array(char_batch))
            mask_batch_var = torch.ByteTensor(np.array(mask_batch, dtype=np.uint8))
            if config.if_gpu:
                token_iv_batch_var = token_iv_batch_var.cuda()
                token_ooev_batch_var = token_ooev_batch_var.cuda()
                char_batch_var = char_batch_var.cuda()
                mask_batch_var = mask_batch_var.cuda()

            pred_sequence_entities = model.predict(token_iv_batch_var,
                                token_ooev_batch_var,
                                char_batch_var,
                                mask_batch_var)
            pred_entities = unpack_prediction(model, pred_sequence_entities)
            p_a, p, r_a, r = evaluate(label_batch, pred_entities)

            pred_all += p_a
            pred += p
            recall_all += r_a
            recall += r

            if file_path is None:
                continue

            for token_iv, token_ooev, mask, _, preds \
                    in zip(token_iv_batch, token_ooev_batch, mask_batch, label_batch, pred_entities):
                words = []
                for t_iv, t_ooev, m in zip(token_iv, token_ooev, mask):
                    if not m:
                        # First masked-out position marks the end of the sentence.
                        break
                    if t_iv > 0:
                        words.append(voc_iv_dict.get_instance(t_iv))
                    else:
                        # A non-positive in-vocabulary id falls back to the second vocabulary.
                        words.append(voc_ooev_dict.get_instance(t_ooev))
                # NOTE(review): test_index is also used when mode == 'dev' and a
                # file_path is supplied -- confirm that this is intended.
                index = test_index[sentence_counter]
                sentence_counter += 1
                labels = ["{},{} {}".format(ent[0], ent[1], label_dict.get_instance(ent[2]))
                          for ent in sorted(preds, key=lambda x: (x[0], x[1], x[2]))]
                results[index] = (' '.join(words) + '\n', '|'.join(labels) + '\n')

    if file_path is not None:
        if num_sentences is None:
            num_sentences = len(test_index)
        # Never drop a prediction, even if an index exceeds the expected count.
        num_slots = max([num_sentences] + [slot + 1 for slot in results])
        # The file is opened only once everything has been computed, and the
        # context manager guarantees that it is closed even if a write fails.
        with open(file_path, 'w', encoding='utf-8') as f:
            for slot in range(num_slots):
                # Slots without a prediction keep the original placeholder lines.
                word_line, label_line = results.get(slot, ('should be\n', '\n'))
                f.write(word_line)
                f.write(label_line)

    pred = pred / pred_all if pred_all > 0 else 1.
    recall = recall / recall_all if recall_all > 0 else 1.
    f1 = 2 / ((1. / pred) + (1. / recall)) if pred > 0. and recall > 0. else 0.
    logger.info("{} precision: {:.2f}%, recall: {:.2f}%, F1: {:.2f}%".format(mode, pred * 100., recall * 100., f1 * 100.))
    return f1


# prepare log file
# The log first goes to a timestamped '.tmp' file next to the model dumps.
serial_number = datetime.now().strftime('%y%m%d_%H%M%S')
log_file_path = "{}_{}.tmp".format(config.model_path, serial_number)
# exist_ok=True already makes this a no-op when the directory is present.
os.makedirs(config.model_root_path, mode=0o755, exist_ok=True)
logger = get_logger('Nested Mention', file=log_file_path)


# load data
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook (or a trusted source) produced.
with open(config.train_data_path, 'rb') as f:
  train_token_iv_batches, train_token_ooev_batches, train_char_batches, train_label_batches, train_mask_batches = pickle.load(f)
with open(config.dev_data_path, 'rb') as f:
  dev_token_iv_batches, dev_token_ooev_batches, dev_char_batches, dev_label_batches, dev_mask_batches = pickle.load(f)
with open(config.test_data_path, 'rb') as f:
  test_token_iv_batches, test_token_ooev_batches, test_char_batches, test_label_batches, test_mask_batches = pickle.load(f)
with open(config.idx_data_path, 'rb') as f:
  test_index = pickle.load(f)


# misc info
# Use a context manager so the file handle is closed deterministically.
with open(config.config_data_path, 'rb') as f:
  misc_config: Dict[str, Alphabet] = pickle.load(f)
voc_iv_dict, voc_ooev_dict, char_dict, label_dict = load_dynamic_config(misc_config)
config.voc_iv_size = voc_iv_dict.size()
config.voc_ooev_size = voc_ooev_dict.size()
config.char_size = char_dict.size()
config.label_size = label_dict.size()

with open(config.embed_path, 'rb') as f:
    vectors: List[np.ndarray] = pickle.load(f)
# The embedding dimension is the number of elements of the first vector.
config.token_embed = vectors[0].size
# Stack into one ndarray first: building a Tensor from a Python list of
# ndarrays converts element by element and is very slow for a large vocabulary.
embedd_word: Tensor = Tensor(np.asarray(vectors))

logger.info(config)  # print training setting

# Build the tagger from the sizes collected in `config` and the pre-trained
# word vectors, then move it to the GPU when one is in use.
ner_model = BiRecurrentConvCRF4NestedNER(config.token_embed, config.voc_iv_size, config.voc_ooev_size,
                                         config.char_embed, config.char_size, config.num_filters, config.kernel_size,
                                         config.label_size, embedd_word,
                                         hidden_size=config.hidden_size, layers=config.layers,
                                         word_dropout=config.word_dropout, char_dropout=config.char_dropout,
                                         lstm_dropout=config.lstm_dropout)
if config.if_gpu:
    ner_model = ner_model.cuda()

# Only parameters that require gradients are handed to the optimizer.
# NOTE(review): config.final_lr is defined but not passed to create_opt here --
# confirm whether create_opt applies its own default for AdaBound.
parameters = filter(lambda p: p.requires_grad, ner_model.parameters())
optimizer, lr_scheduler = create_opt(parameters, config.opt, lr=config.lr, l2=config.l2, lr_patience=config.lr_patience)

# Convert the gold labels of every training batch into the target format the
# model's forward pass consumes (done once, before training starts).
train_sequence_label_batches = [pack_target(ner_model, train_label_batch, train_mask_batch)
                                for train_label_batch, train_mask_batch in zip(train_label_batches, train_mask_batches)]

logger.info("{} batches expected for training".format(len(train_token_iv_batches)))
logger.info("")
# Best-so-far trackers updated after each dev evaluation.
best_model = None
best_per = float('-inf')
best_loss = float('inf')
# Materialise the zipped batches as a list so they can be shuffled in place each epoch.
train_all_batches = list(zip(train_token_iv_batches,
                             train_token_ooev_batches,
                             train_char_batches,
                             train_sequence_label_batches,
                             train_mask_batches))

# Main training loop: one pass over all training batches per epoch, followed by
# a dev evaluation every `config.check_every` epochs.
train_start_time = time.time()
num_batches = len(train_all_batches)
for e_ in range(1, config.epoch + 1):
    logger.info("Epoch {:d} (learning rate={:.4f}):".format(e_, optimizer.param_groups[0]['lr']))
    # Running sums used to report the batch-size-weighted average loss.
    train_err = 0.
    train_total = 0.

    if config.if_shuffle:
        shuffle(train_all_batches)
    batch_counter = 0
    start_time = time.time()
    # get_f1 switches the model to eval mode, so training mode is restored every epoch.
    ner_model.train()
    # Length of the progress line currently on screen (used to erase it).
    num_back = 0
    for token_iv_batch, token_ooev_batch, char_batch, label_batch, mask_batch in train_all_batches:
        batch_len = len(token_iv_batch)

        token_iv_batch_var = torch.LongTensor(np.array(token_iv_batch))
        token_ooev_batch_var = torch.LongTensor(np.array(token_ooev_batch))
        char_batch_var = torch.LongTensor(np.array(char_batch))
        mask_batch_var = torch.ByteTensor(np.array(mask_batch, dtype=np.uint8))
        if config.if_gpu:
            token_iv_batch_var = token_iv_batch_var.cuda()
            token_ooev_batch_var = token_ooev_batch_var.cuda()
            char_batch_var = char_batch_var.cuda()
            mask_batch_var = mask_batch_var.cuda()

        optimizer.zero_grad()
        # The forward pass returns the training loss for this batch.
        loss = ner_model.forward(token_iv_batch_var, token_ooev_batch_var, char_batch_var,
                                 label_batch, mask_batch_var)
        loss.backward()
        # presumably gradient-norm clipping with threshold config.clip_norm -- verify in training.utils
        clip_model_grad(ner_model, config.clip_norm)

        batch_counter += 1

        optimizer.step(None)

        # Accumulate statistics without tracking gradients.
        # NOTE(review): `loss` is added as-is, so train_err takes the type of
        # `loss` (presumably a 0-dim tensor) rather than a Python float.
        with torch.no_grad():
            train_err += loss * batch_len
            train_total += batch_len

        # update log
        # Every 10 batches, overwrite the previous progress line in place using backspaces.
        if batch_counter % 10 == 0:
            time_ave = (time.time() - start_time) / batch_counter
            time_left = (num_batches - batch_counter) * time_ave

            sys.stdout.write('\b' * num_back)
            sys.stdout.write(' ' * num_back)
            sys.stdout.write('\b' * num_back)
            log_info = "train: {:d}/{:d} loss: {:.4f}, time left (estimated): {:.2f}s" \
                       .format(batch_counter, num_batches, train_err / train_total, time_left)
            sys.stdout.write(log_info)
            sys.stdout.flush()
            num_back = len(log_info)

    # Erase the last progress line before writing the epoch summary.
    sys.stdout.write('\b' * num_back)
    sys.stdout.write(' ' * num_back)
    sys.stdout.write('\b' * num_back)
    logger.info("train: {:d} loss: {:.4f}, time: {:.2f}s"
                .format(num_batches, train_err / train_total, time.time() - start_time))

    # Skip evaluation on epochs that are not a multiple of check_every.
    if e_ % config.check_every != 0:
        continue

    # evaluating dev and always save the best
    cur_time = time.time()
    f1 = get_f1(ner_model, 'dev')
    logger.info("dev step took {:.4f} seconds".format(time.time() - cur_time))
    logger.info("")

    # early stop
    # Keep a deep copy of the model whenever the dev F1 improves.
    if f1 > best_per:
        best_per = f1
        del best_model
        best_model = copy.deepcopy(ner_model)
    # best_loss is tracked here but not read anywhere else in this cell.
    if train_err < best_loss:
        best_loss = train_err
    # Training stops as soon as adjust_learning_rate returns a falsy value.
    if not adjust_learning_rate(lr_scheduler, e_, train_err, f1):
        break

logger.info("training step took {:.4f} seconds".format(time.time() - train_start_time))
logger.info("best dev F1: {:.2f}%".format(best_per * 100.))
logger.info("")

# A fresh timestamp names the result file, the weights and the final log.
serial_number = datetime.now().strftime('%y%m%d_%H%M%S')
this_model_path = config.model_path + "_" + serial_number
# exist_ok=True already makes this a no-op when the directory is present.
os.makedirs(config.model_root_path, mode=0o755, exist_ok=True)

# best_model is only assigned after a dev evaluation; if none ran (for example
# config.epoch is smaller than config.check_every) fall back to the current
# model instead of failing on None below.
if best_model is None:
    logger.info("no dev evaluation was performed; using the last model state")
    best_model = ner_model

# remember to eval after loading the model. for the reason of batchnorm and dropout
# (get_f1 switches the model to eval mode itself.)
cur_time = time.time()
f1 = get_f1(best_model, 'test', file_path=this_model_path + '.result.txt')
logger.info("test step took {:.4f} seconds".format(time.time() - cur_time))

logger.info("Dumping model to {}".format(this_model_path + '.pt'))
torch.save(best_model.state_dict(), this_model_path + '.pt')

# Give the temporary log file its final name next to the saved model.
os.rename(log_file_path, this_model_path + '.log.txt')

2021-01-20 10:40:19,315 - Nested Mention - INFO - {'root_path': '.', 'data_set': 'NLPHW', 'lowercase': None, 'batch_size': 32, 'if_shuffle': True, 'voc_iv_size': 2231687, 'voc_ooev_size': 1647, 'char_size': 85, 'label_size': 5, 'token_embed': 200, 'char_embed': 128, 'word_dropout': 0.05, 'char_dropout': 0.0, 'num_filters': 256, 'kernel_size': 3, 'hidden_size': 256, 'layers': 2, 'lstm_dropout': 0.2, 'embed_path': './data/word_vec_NLPHW.pkl', 'epoch': 1, 'if_gpu': False, 'opt': <Optimizer.AdaBound: 'AdaBound'>, 'lr': 0.001, 'final_lr': 0.1, 'l2': 0.0, 'check_every': 1, 'clip_norm': 5, 'lr_patience': 3, 'data_path': './data/NLPHW', 'train_data_path': './data/NLPHW_train.pkl', 'dev_data_path': './data/NLPHW_dev.pkl', 'test_data_path': './data/NLPHW_test.pkl', 'idx_data_path': './data/NLPHW_idx.pkl', 'config_data_path': './data/NLPHW_config.pkl', 'model_root_path': './dumps', 'model_path': './dumps/NLPHW_model'}
2021-01-20 10:40:24,350 - Nested Mention - INFO - 470 batches expected for trai

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  exp_avg.mul_(beta1).add_(1 - beta1, grad)


train: 10/470 loss: 51.8946, time left (estimated): 607.66s                                                           train: 20/470 loss: 35.0792, time left (estimated): 595.50s                                                           train: 30/470 loss: 29.3548, time left (estimated): 615.26s                                                           train: 40/470 loss: 26.1153, time left (estimated): 596.52s                                                           train: 50/470 loss: 24.7462, time left (estimated): 616.



2021-01-20 10:53:16,864 - Nested Mention - INFO - test precision: 0.00%, recall: 100.00%, F1: 0.00%
2021-01-20 10:53:16,867 - Nested Mention - INFO - test step took 25.5701 seconds
2021-01-20 10:53:16,869 - Nested Mention - INFO - Dumping model to ./dumps/NLPHW_model_210120_105251.pt


In [7]:
# Build the submission file from the result file written by get_f1.
# That file alternates token lines and label lines, so only every second line
# (the label line) is kept. It is written as UTF-8, so it is read back with the
# same encoding instead of the platform default.
with open(this_model_path + '.result.txt', 'r', encoding='utf-8') as f:
  lines = f.readlines()
with open('181220010.txt', 'w', encoding='utf-8') as f:
  f.writelines(lines[1::2])