In [2]:
from pathlib import Path
from tokenizers import BertWordPieceTokenizer
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from bs4 import BeautifulSoup
import glob
import pandas as pd
import sys
import os
import time
import importlib
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
from pathlib import Path

from torch.utils.data.distributed import DistributedSampler
from torch.utils import data
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report

from tqdm import tqdm, trange
import collections
from pytorch_pretrained_bert.modeling import BertModel, BertForTokenClassification, BertLayerNorm
import pickle
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
#from pytorch_pretrained_bert.tokenization import BertTokenizer
from transformers import BertTokenizer

# Data Preprocess

In [3]:
dataset_path = sorted(glob.glob('/home/mchou001/1Kcitation/*.csv'))

In [4]:
"""
The below block of code will concatanate all the csv files in the downsampled GIANT dataset directories. Each directory contains 219 csv files.
"""
newDataset = []
for file in dataset_path:
    df = pd.read_csv(file, encoding = 'utf-8')
    newDataset.append(df)

concat_files = pd.concat(newDataset, axis = 0 , ignore_index=True)
citationString = concat_files['citationStringAnnotated']
citationString.to_csv("./bert_giant/nerdata-cite/500_citation.txt", header=False, index=None)
print(len(citationString))

505


In [5]:
citeList = []
with open("./bert_giant/nerdata-cite/500_citation.txt", mode = 'r') as f:
    for line in f:
        new_start_tag = "<citation> "
        new_end_tag = " </citation>"
        if line.startswith(""):
            citation_tag = new_start_tag + line + new_end_tag
            citeList.append(citation_tag)

### BIO Tagging Citation String

In [6]:
def remove_punct(withpunct):
    punctuations = '''!()-[]{};:'"\,<>/?@#$%^&*_~'''
    without_punct = ""
    char = 'nan'
    for char in withpunct:
        if char not in punctuations:
            without_punct = without_punct + char
    return(without_punct)

In [7]:
def update_in_alist(alist, key, value):
    return [(k,v) if (k != key) else (key, value) for (k, v) in alist]

In [8]:
soup = BeautifulSoup(str(citeList), "html.parser")
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=True)

In [9]:
def BIO_TAG(files):
    docs = []
    sents = []
    for d in soup.find_all("citation"):
        for word in d:
            tags = []
            NoneType = type(None)
            other_tag = 'O'
            if isinstance(word.name, NoneType) == True:
                withoutpunct = remove_punct(word)
                temp = withoutpunct.split()
                for token in temp:
                    tags.append((token, other_tag))
            else:
                prev_tag = other_tag
                withoutpunct = remove_punct(word.get_text())
                temp = withoutpunct.split()
                for token in temp:
                    #beginning of the token
                    if tags != 'O' and prev_tag == 'O' :
                        tags.append((token, "B-"+word.name))
                        tag = "B-"+word.name
                        prev_tag = tag
                    #inside of the token
                    elif prev_tag != 'O' and prev_tag == tag:
                        tags.append((token, "I-"+word.name))
                        tag = "I-"+word.name
                        prev_tag = tag
                    #adjacent of the token
                    elif prev_tag != 'O' and prev_tag != tag:
                        tags.append((token, "B-"+word.name))
                        tag = "B-"+word.name
                        prev_tag = tag

            sents = sents + tags

    docs.append(sents)
    return docs

In [10]:
start_time = time.time()
print("Start time", time.ctime(start_time))
cite_bio_tag = BIO_TAG(soup)
print("End time", (time.time() - start_time)/60.0)

Start time Sun Aug 28 14:13:35 2022
End time 0.005436929066975912


In [11]:
#cite_bio_tag

In [12]:
fp = open("/home/mchou001/bert_giant/nerdata-cite/citation500-ner.txt", 'w')
for i in cite_bio_tag:
    for j in i:
        fp.write('\t'.join(list(j))+'\n')
fp.close()

In [13]:
dfList=[]
colname=['tokens', 'tags']
df = pd.read_csv("/home/mchou001/bert_giant/nerdata-cite/citation500-ner.txt", sep = "\t", header = None)
dfList.append(df)
concatDf = pd.concat(dfList, axis =0)
concatDf.columns=colname
concatDf.to_csv("/home/mchou001/bert_giant/nerdata-cite/citation500-ner.csv",index = None, encoding = 'utf-8')

In [14]:
dataframe = pd.read_csv("/home/mchou001/bert_giant/nerdata-cite/citation500-ner.csv")

In [15]:
dataframe.count()

tokens    14213
tags      14214
dtype: int64

In [16]:
dataframe['tags'].value_counts()

I-title                        4347
O                              2433
I-author                       2206
I-container-title              1524
B-issued                        500
B-title                         489
B-container-title               478
B-author                        426
B-page                          398
B-volume                        340
I-publisher                     265
B-issue                         164
B-publisher                     162
B-url                           150
B-doi                           145
I-issued                         61
I-cite|journal|title=<title      35
I-event                          33
I-editor                         19
I-container-title-short           9
B-page-first                      9
B-event                           5
B-container-title-short           4
B-editor                          4
B-issn                            4
B-isbn                            2
B-cite|journal|title=<title       1
B-citation-label            

In [17]:
entities_to_remove = ["I-cite|journal|title=<title", "B-cite|journal|title=<title", "B-container-title-short", "I-container-title-short", 
                      "B-page-first", "I-page-first","B-issn", "I-issn", "B-isbn", "I-isbn", "B-citation-label","B-translator", "I-translator", 
                      "I-container-author", "B-container-author", "B-abstract", "I-abstract", "B-container-author", 
                      "B-interviewer", "I-interviewer","B-cite|chapter|title=<title", "I-cite|chapter|title=<title", 
                      "B-cite|book|title=<title", "I-cite|book|title=<title"]
dataframe = dataframe[~dataframe.tags.isin(entities_to_remove)]
dataframe.head()

Unnamed: 0,tokens,tags
0,Anonymous,O
1,1978,B-issued
2,New,B-title
3,Names,I-title
4,in,I-title


In [20]:
labels_to_ids = {k: v for v, k in enumerate(dataframe.tags.unique())}
ids_to_labels = {v: k for v, k in enumerate(dataframe.tags.unique())}
labels_to_ids

{'O': 0,
 'B-issued': 1,
 'B-title': 2,
 'I-title': 3,
 'B-container-title': 4,
 'I-container-title': 5,
 'B-volume': 6,
 'B-issue': 7,
 'B-page': 8,
 'B-url': 9,
 'B-doi': 10,
 'B-author': 11,
 'I-author': 12,
 'B-publisher': 13,
 'I-publisher': 14,
 'I-issued': 15,
 'B-event': 16,
 'I-event': 17,
 'B-editor': 18,
 'I-editor': 19}

In [21]:
dataframe.count()

tokens    14148
tags      14149
dtype: int64

## Spliting the dataset

In [22]:
#dataset = dataframe[['tokens', 'tags']]

In [23]:
train, dev = train_test_split(dataframe, test_size=0.2, random_state = 32, shuffle = False)

In [24]:
train.to_csv('./bert_giant/nerdata-cite/train500.txt', sep = '\t', header = None, index = False)
dev.to_csv('./bert_giant/nerdata-cite/dev500.txt', sep = '\t', header = None, index = False)

In [25]:
train.count()

tokens    11318
tags      11319
dtype: int64

In [26]:
dev.count()

tokens    2830
tags      2830
dtype: int64

### Process Test Samples

In [27]:
dfList=[]
colname=['tokens', 'tags']
df = pd.read_csv("/home/mchou001/bert_giant/nerdata-cite/test500.txt", sep = "\t", header = None)
dfList.append(df)
concatDf = pd.concat(dfList, axis =0)
concatDf.columns=colname
concatDf.to_csv("/home/mchou001/bert_giant/nerdata-cite/test500-ner.csv",index = None, encoding = 'utf-8')

In [28]:
test_set = pd.read_csv("/home/mchou001/bert_giant/nerdata-cite/test500-ner.csv")

In [29]:
test_set['tags'].value_counts()

I-title                        4347
O                              2433
I-author                       2206
I-container-title              1524
B-issued                        500
B-title                         489
B-container-title               478
B-author                        426
B-page                          398
B-volume                        340
I-publisher                     265
B-issue                         164
B-publisher                     162
B-url                           150
B-doi                           145
I-issued                         61
I-cite|journal|title=<title      35
I-event                          33
I-editor                         19
I-container-title-short           9
B-page-first                      9
B-event                           5
B-container-title-short           4
B-editor                          4
B-issn                            4
B-isbn                            2
B-cite|journal|title=<title       1
B-citation-label            

In [30]:
entities_to_remove = ["I-cite|journal|title=<title", "B-cite|journal|title=<title", "B-container-title-short", "I-container-title-short", 
                      "B-page-first", "I-page-first","B-issn", "I-issn", "B-isbn", "I-isbn", "B-citation-label","B-translator", "I-translator", 
                      "I-container-author", "B-container-author", "B-abstract", "I-abstract", "B-container-author", 
                      "B-interviewer", "I-interviewer","B-cite|chapter|title=<title", "I-cite|chapter|title=<title", 
                      "B-cite|book|title=<title", "I-cite|book|title=<title"]
test_set = test_set[~test_set.tags.isin(entities_to_remove)]
test_set.head()

Unnamed: 0,tokens,tags
0,Anonymous,O
1,1978,B-issued
2,New,B-title
3,Names,I-title
4,in,I-title


In [31]:
labels_to_ids = {k: v for v, k in enumerate(test_set.tags.unique())}
ids_to_labels = {v: k for v, k in enumerate(test_set.tags.unique())}
labels_to_ids

{'O': 0,
 'B-issued': 1,
 'B-title': 2,
 'I-title': 3,
 'B-container-title': 4,
 'I-container-title': 5,
 'B-volume': 6,
 'B-issue': 7,
 'B-page': 8,
 'B-url': 9,
 'B-doi': 10,
 'B-author': 11,
 'I-author': 12,
 'B-publisher': 13,
 'I-publisher': 14,
 'I-issued': 15,
 'B-event': 16,
 'I-event': 17,
 'B-editor': 18,
 'I-editor': 19}

In [32]:
test_set.to_csv('./bert_giant/nerdata-cite/test.txt', sep = '\t', header = None, index = False)

In [33]:
test_set.count()

tokens    14148
tags      14149
dtype: int64

## Setting up the data directory

In [34]:
def get_data_dir(local_path="bert_giant", server_path="bert_giant"):
    if (os.path.exists(os.getenv("HOME")+'/'+local_path)):
        return os.getenv("HOME")+'/'+local_path
    else:
        raise Exception('get data path error!')

In [35]:
data_dir = os.path.join(get_data_dir(), 'nerdata-cite/')

In [36]:
cuda_yes = torch.cuda.is_available()
# cuda_yes = False
print('Cuda is available?', cuda_yes)
device = torch.device("cuda:0" if cuda_yes else "cpu")
print('Device:', device)

Cuda is available? True
Device: cuda:0


## BERT Processor -- wordpeice tokenization, padding, turncating, embeddings features (input id, segment id, label id, etc.)

In [37]:
class InputExample(object):
    """A single training/test example for NER."""

    def __init__(self, guid, words, labels):
        """Constructs a InputExample.
        Args:
          guid: Unique id for the example(a sentence or a pair of sentences).
          words: list of words of sentence
          labels_a/labels_b: (Optional) string. The label seqence of the text_a/text_b. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        # list of words of the sentence,example: [EU, rejects, German, call, to, boycott, British, lamb .]
        self.words = words
        # list of label sequence of the sentence,like: [B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]
        self.labels = labels

In [38]:
class InputFeatures(object):
    """A single set of features of data.
    result of convert_examples_to_features(InputExample)
    """

    def __init__(self, input_ids, input_mask, segment_ids,  predict_mask, label_ids):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.predict_mask = predict_mask
        self.label_ids = label_ids

In [39]:
class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()
        
    @classmethod
    ## reading the BIO data
    def _read_data(cls, input_file):
        with open(input_file) as f:           
            out_lists = []
            entries = f.read().strip().split(".n")
            for entry in entries:
                words = []
                ner_labels = []
                for line in entry.splitlines():
                    pieces = line.strip().split()
                    if len(pieces) <= 1:
                        continue
                    word = pieces[0]
                    words.append(word)
                    ner_labels.append(pieces[-1])
                out_lists.append([words, ner_labels])
                
        return out_lists

In [48]:
class CiteDataProcessor(DataProcessor):
    def __init__(self):
        self._label_types = [ 'X','[CLS]','[SEP]','O', 'B-title', 'I-title', 'B-author', 'I-author', 'B-issued', 'I-issued', 'B-issue', 'I-issue', 'B-container-title', 'I-container-title',
                             'B-page', 'I-page', 'B-volume', 'I-volume', 'I-publisher', 'B-publisher', 'I-event', 'I-editor', 'B-event', 'B-editor',
                            'B-url', 'I-url', 'B-doi', 'I-doi']
        
        self._num_labels = len(self._label_types)
        self._label_map = {label: i for i,
                           label in enumerate(self._label_types)}
    def get_train_examples(self, data_dir):
        return self._create_examples(
            self._read_data(os.path.join(data_dir, "train500.txt")))

    def get_dev_examples(self, data_dir):
        return self._create_examples(
            self._read_data(os.path.join(data_dir, "dev500.txt")))

    def get_test_examples(self, data_dir):
        return self._create_examples(
            self._read_data(os.path.join(data_dir, "test.txt")))
    
    def get_labels(self):
        return self._label_types
    
    def get_num_labels(self):
        return self.get_num_labels
    
    def get_label_map(self):
        return self._label_map
    
    def get_start_label_id(self):
        return self._label_map['[CLS]']

    def get_stop_label_id(self):
        return self._label_map['[SEP]']
    
    def _create_examples(self, all_lists):
        examples = []
        for (i, one_lists) in enumerate(all_lists):
            guid = i
            words = one_lists[0]
            labels = one_lists[-1]
            examples.append(InputExample(
                guid=guid, words=words, labels=labels))
        return examples

In [49]:
# "Whether to run training."
do_train = True
# "Whether to run eval on the dev set."
do_eval = True
# "Whether to run the model in inference mode on the test set."
do_predict = True
# Whether load checkpoint file before train model
load_checkpoint = True
# "The vocabulary file that the BERT model was trained on."
max_seq_length = 500 #256
batch_size = 12 #32
# "The initial learning rate for Adam."
learning_rate0 = 5e-6 #0.000005
lr0_crf_fc = 8e-5 #0.00008
weight_decay_finetune = 1e-5 #0.00001
weight_decay_crf_fc = 5e-6 #0.000005
total_train_epochs = 30
gradient_accumulation_steps = 1
warmup_proportion = 0.1
output_dir = '/home/mchou001/bert_giant/nerdata-cite/'
bert_model_scale = 'bert-base-cased' ## try uncased
do_lower_case = True

In [50]:
tokenizer = BertTokenizer.from_pretrained(bert_model_scale, do_lower_case=do_lower_case)

In [51]:
def example2feature(example, tokenizer, label_map, max_seq_length):

    add_label = 'X'
    # tokenize_count = []
    tokens = ['[CLS]']
    predict_mask = [0]
    label_ids = [label_map['[CLS]']]
    for i, w in enumerate(example.words):
        sub_words = tokenizer.tokenize(w)
        if not sub_words:
            sub_words = ['[UNK]']
        # tokenize_count.append(len(sub_words))
        tokens.extend(sub_words)
        for j in range(len(sub_words)):
            if j == 0:
                predict_mask.append(1)
                label_ids.append(label_map[example.labels[i]])
            else:
                # '##xxx' -> 'X' (see bert paper)
                predict_mask.append(0)
                label_ids.append(label_map[add_label])

    # truncate
    if len(tokens) > max_seq_length - 1:
        print('Example No.{} is too long, length is {}, truncated to {}!'.format(example.guid, len(tokens), max_seq_length))
        tokens = tokens[0:(max_seq_length - 1)]
        predict_mask = predict_mask[0:(max_seq_length - 1)]
        label_ids = label_ids[0:(max_seq_length - 1)]
    tokens.append('[SEP]')
    predict_mask.append(0)
    label_ids.append(label_map['[SEP]'])

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    segment_ids = [0] * len(input_ids)
    input_mask = [1] * len(input_ids)

    feat=InputFeatures(
                # guid=example.guid,
                # tokens=tokens,
                input_ids=input_ids,
                input_mask=input_mask,
                segment_ids=segment_ids,
                predict_mask=predict_mask,
                label_ids=label_ids)

    return feat

In [52]:
class NerDataset(data.Dataset):
    def __init__(self, examples, tokenizer, label_map, max_seq_length):
        self.examples=examples
        self.tokenizer=tokenizer
        self.label_map=label_map
        self.max_seq_length=max_seq_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        feat=example2feature(self.examples[idx], self.tokenizer, self.label_map, max_seq_length)
        return feat.input_ids, feat.input_mask, feat.segment_ids, feat.predict_mask, feat.label_ids

    @classmethod
    def pad(cls, batch):

        seqlen_list = [len(sample[0]) for sample in batch]
        maxlen = np.array(seqlen_list).max()

        f = lambda x, seqlen: [sample[x] + [0] * (seqlen - len(sample[x])) for sample in batch] # 0: X for padding
        input_ids_list = torch.LongTensor(f(0, maxlen))
        input_mask_list = torch.LongTensor(f(1, maxlen))
        segment_ids_list = torch.LongTensor(f(2, maxlen))
        predict_mask_list = torch.ByteTensor(f(3, maxlen))
        label_ids_list = torch.LongTensor(f(4, maxlen))

        return input_ids_list, input_mask_list, segment_ids_list, predict_mask_list, label_ids_list
    

In [53]:
def f1_score(y_true, y_pred):
    '''
    0,1,2,3 are [CLS],[SEP],[X],O
    '''
    ignore_id=3

    num_proposed = len(y_pred[y_pred>ignore_id])
    num_correct = (np.logical_and(y_true==y_pred, y_true>ignore_id)).sum()
    num_gold = len(y_true[y_true>ignore_id])

    try:
        precision = num_correct / num_proposed
    except ZeroDivisionError:
        precision = 1.0

    try:
        recall = num_correct / num_gold
    except ZeroDivisionError:
        recall = 1.0

    try:
        f1 = 2*precision*recall / (precision + recall)
    except ZeroDivisionError:
        if precision*recall==0:
            f1=1.0
        else:
            f1=0

    return precision, recall, f1

In [54]:
np.random.seed(44)
torch.manual_seed(44)
if cuda_yes:
    torch.cuda.manual_seed_all(44)

In [55]:
citeProcessor = CiteDataProcessor()
label_list = citeProcessor.get_labels()
label_map = citeProcessor.get_label_map()
train_examples = citeProcessor.get_train_examples(data_dir)
dev_examples = citeProcessor.get_dev_examples(data_dir)
test_examples = citeProcessor.get_test_examples(data_dir)


total_train_steps = int(len(train_examples) / batch_size / gradient_accumulation_steps * total_train_epochs)

#_desired_epochs * (#words(not_sentences)_in_all_input_corpus / #tokens_per_batch)
print("***** Running training *****")
print("  Num train examples = %d"% len(train_examples))
print("  Num dev examples = %d"% len(dev_examples))
print("  Num test examples = %d"% len(test_examples))
print("  Batch size = %d"% batch_size)
print("  Num steps = %d"% total_train_steps)

***** Running training *****
  Num train examples = 291
  Num dev examples = 76
  Num test examples = 366
  Batch size = 12
  Num steps = 727


In [56]:
train_dataset = NerDataset(train_examples,tokenizer,label_map,max_seq_length)
dev_dataset = NerDataset(dev_examples,tokenizer,label_map,max_seq_length)
test_dataset = NerDataset(test_examples,tokenizer,label_map,max_seq_length)

In [57]:
train_dataloader = DataLoader(dataset=train_dataset,
                                batch_size=batch_size,
                                shuffle=True,
                                num_workers=4,
                                collate_fn=NerDataset.pad)

dev_dataloader = DataLoader(dataset=dev_dataset,
                                batch_size=batch_size,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=NerDataset.pad)

test_dataloader = DataLoader(dataset=test_dataset,
                                batch_size=batch_size,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=NerDataset.pad)


In [59]:
print('*** Use only BertForTokenClassification ***')

if load_checkpoint and os.path.exists(output_dir+'/ner_bert_checkpoint.pt'):
    checkpoint = torch.load(output_dir+'/ner_bert_checkpoint.pt', map_location='cpu')
    start_epoch = checkpoint['epoch']+1
    valid_acc_prev = checkpoint['valid_acc']
    valid_f1_prev = checkpoint['valid_f1']
    model = BertForTokenClassification.from_pretrained(
        bert_model_scale, state_dict=checkpoint['model_state'], num_labels=len(label_list))
    print('Loaded the pretrain NER_BERT model, epoch:',checkpoint['epoch'],'valid acc:',
            checkpoint['valid_acc'], 'valid f1:', checkpoint['valid_f1'])
else:
    start_epoch = 0
    valid_acc_prev = 0
    valid_f1_prev = 0
    model = BertForTokenClassification.from_pretrained(
        bert_model_scale, num_labels=len(label_list))

model.to(device)

# Prepare optimizer
named_params = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in named_params if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay_finetune},
    {'params': [p for n, p in named_params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate0, warmup=warmup_proportion, t_total=total_train_steps)

*** Use only BertForTokenClassification ***


In [60]:
def evaluate(model, predict_dataloader, batch_size, epoch_th, dataset_name):
    # print("***** Running prediction *****")
    model.eval()
    all_preds = []
    all_labels = []
    total=0
    correct=0
    start = time.time()
    with torch.no_grad():
        for batch in predict_dataloader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, predict_mask, label_ids = batch
            out_scores = model(input_ids, segment_ids, input_mask)
            # out_scores = out_scores.detach().cpu().numpy()
            _, predicted = torch.max(out_scores, -1)
            valid_predicted = torch.masked_select(predicted, predict_mask)
            valid_label_ids = torch.masked_select(label_ids, predict_mask)
            # print(len(valid_label_ids),len(valid_predicted),len(valid_label_ids)==len(valid_predicted))
            all_preds.extend(valid_predicted.tolist())
            all_labels.extend(valid_label_ids.tolist())
            total += len(valid_label_ids)
            correct += valid_predicted.eq(valid_label_ids).sum().item()

    test_acc = correct/total
    precision, recall, f1 = f1_score(np.array(all_labels), np.array(all_preds))
    end = time.time()
    print('Epoch:%d, Acc:%.2f, Precision: %.2f, Recall: %.2f, F1: %.2f on %s, Spend: %.3f minutes for evaluation' \
        % (epoch_th, 100.*test_acc, 100.*precision, 100.*recall, 100.*f1, dataset_name,(end-start)/60.0))
    print('--------------------------------------------------------------')
    return test_acc, f1

In [61]:
torch.cuda.empty_cache()
global_step_th = int(len(train_examples) / batch_size / gradient_accumulation_steps * start_epoch)
# for epoch in trange(start_epoch, total_train_epochs, desc="Epoch"):
for epoch in range(start_epoch, total_train_epochs):
    tr_loss = 0
    train_start = time.time()
    model.train()
    optimizer.zero_grad()
    # for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)

        input_ids, input_mask, segment_ids, predict_mask, label_ids = batch
        loss = model(input_ids, segment_ids, input_mask, label_ids)

        if gradient_accumulation_steps > 1:
            loss = loss / gradient_accumulation_steps

        loss.backward()
        tr_loss += loss.item()

        if (step + 1) % gradient_accumulation_steps == 0:
            # modify learning rate with special warm up BERT uses
            lr_this_step = learning_rate0 * warmup_linear(global_step_th/total_train_steps, warmup_proportion)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            optimizer.step()
            optimizer.zero_grad()
            global_step_th += 1

        print("Epoch:{}-{}/{}, CrossEntropyLoss: {} ".format(epoch, step, len(train_dataloader), loss.item()))

    print('--------------------------------------------------------------')
    print("Epoch:{} completed, Total training's Loss: {}, Spend: {}m".format(epoch, tr_loss, (time.time() - train_start) / 60.0))
    valid_acc, valid_f1 = evaluate(model, dev_dataloader, batch_size, epoch, 'Valid_set')
    # Save a checkpoint
    if valid_f1 > valid_f1_prev:
        # model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
        torch.save({'epoch': epoch, 'model_state': model.state_dict(), 'valid_acc': valid_acc,
            'valid_f1': valid_f1, 'max_seq_length': max_seq_length, 'lower_case': do_lower_case},
                    os.path.join(output_dir, 'ner_bert_checkpoint.pt'))
        valid_f1_prev = valid_f1

evaluate(model, test_dataloader, batch_size, total_train_epochs-1, 'Test_set')

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:1055.)
  next_m.mul_(beta1).add_(1 - beta1, grad)


Epoch:0-0/25, CrossEntropyLoss: 3.4760069847106934 
Epoch:0-1/25, CrossEntropyLoss: 3.4768404960632324 
Epoch:0-2/25, CrossEntropyLoss: 3.5022847652435303 
Epoch:0-3/25, CrossEntropyLoss: 3.464895486831665 
Epoch:0-4/25, CrossEntropyLoss: 3.4745612144470215 
Epoch:0-5/25, CrossEntropyLoss: 3.484229564666748 
Epoch:0-6/25, CrossEntropyLoss: 3.499748706817627 
Epoch:0-7/25, CrossEntropyLoss: 3.501962900161743 
Epoch:0-8/25, CrossEntropyLoss: 3.4719295501708984 
Epoch:0-9/25, CrossEntropyLoss: 3.456376314163208 
Epoch:0-10/25, CrossEntropyLoss: 3.459044933319092 
Epoch:0-11/25, CrossEntropyLoss: 3.4307644367218018 
Epoch:0-12/25, CrossEntropyLoss: 3.4462382793426514 
Epoch:0-13/25, CrossEntropyLoss: 3.4355318546295166 
Epoch:0-14/25, CrossEntropyLoss: 3.4081361293792725 
Epoch:0-15/25, CrossEntropyLoss: 3.3827037811279297 
Epoch:0-16/25, CrossEntropyLoss: 3.396536111831665 
Epoch:0-17/25, CrossEntropyLoss: 3.344524621963501 
Epoch:0-18/25, CrossEntropyLoss: 3.3639755249023438 
Epoch:0-19/

  app.launch_new_instance()


Epoch:0, Acc:2.29, Precision: 9.67, Recall: 2.61, F1: 4.11 on Valid_set, Spend: 0.020 minutes for evaluation
--------------------------------------------------------------
Epoch:1-0/25, CrossEntropyLoss: 3.080375909805298 
Epoch:1-1/25, CrossEntropyLoss: 3.0748934745788574 
Epoch:1-2/25, CrossEntropyLoss: 3.0389091968536377 
Epoch:1-3/25, CrossEntropyLoss: 2.9418721199035645 
Epoch:1-4/25, CrossEntropyLoss: 2.9286303520202637 
Epoch:1-5/25, CrossEntropyLoss: 2.859492301940918 
Epoch:1-6/25, CrossEntropyLoss: 2.7234814167022705 
Epoch:1-7/25, CrossEntropyLoss: 2.7067933082580566 
Epoch:1-8/25, CrossEntropyLoss: 2.657900094985962 
Epoch:1-9/25, CrossEntropyLoss: 2.547978639602661 
Epoch:1-10/25, CrossEntropyLoss: 2.484297513961792 
Epoch:1-11/25, CrossEntropyLoss: 2.3357393741607666 
Epoch:1-12/25, CrossEntropyLoss: 2.2557454109191895 
Epoch:1-13/25, CrossEntropyLoss: 2.11370587348938 
Epoch:1-14/25, CrossEntropyLoss: 2.0154199600219727 
Epoch:1-15/25, CrossEntropyLoss: 1.898229956626892

  if sys.path[0] == '':


Epoch:2-0/25, CrossEntropyLoss: 0.8429243564605713 
Epoch:2-1/25, CrossEntropyLoss: 1.2281277179718018 
Epoch:2-2/25, CrossEntropyLoss: 1.054638147354126 
Epoch:2-3/25, CrossEntropyLoss: 1.1366603374481201 
Epoch:2-4/25, CrossEntropyLoss: 0.9489879608154297 
Epoch:2-5/25, CrossEntropyLoss: 0.8757876753807068 
Epoch:2-6/25, CrossEntropyLoss: 1.003857135772705 
Epoch:2-7/25, CrossEntropyLoss: 1.1387221813201904 
Epoch:2-8/25, CrossEntropyLoss: 0.7742856740951538 
Epoch:2-9/25, CrossEntropyLoss: 1.44528329372406 
Epoch:2-10/25, CrossEntropyLoss: 0.8188692927360535 
Epoch:2-11/25, CrossEntropyLoss: 1.0911500453948975 
Epoch:2-12/25, CrossEntropyLoss: 0.8823626637458801 
Epoch:2-13/25, CrossEntropyLoss: 1.035467267036438 
Epoch:2-14/25, CrossEntropyLoss: 1.1135202646255493 
Epoch:2-15/25, CrossEntropyLoss: 0.8620625138282776 
Epoch:2-16/25, CrossEntropyLoss: 1.2825000286102295 
Epoch:2-17/25, CrossEntropyLoss: 0.9365834593772888 
Epoch:2-18/25, CrossEntropyLoss: 0.7196564674377441 
Epoch:2-

(0.8733405875952122, 0.8725759395915564)

In [62]:
#%%
'''
Test_set prediction using the best epoch of NER_BERT model
'''
checkpoint = torch.load(output_dir+'/ner_bert_checkpoint.pt', map_location='cpu')
epoch = checkpoint['epoch']
valid_acc_prev = checkpoint['valid_acc']
valid_f1_prev = checkpoint['valid_f1']
model = BertForTokenClassification.from_pretrained(
    bert_model_scale, state_dict=checkpoint['model_state'], num_labels=len(label_list))
# if os.path.exists(output_dir+'/ner_bert_crf_checkpoint.pt'):
model.to(device)
print('Loaded the pretrain NER_BERT model, epoch:',checkpoint['epoch'],'valid acc:', 
        checkpoint['valid_acc'], 'valid f1:', checkpoint['valid_f1'])

model.to(device)
# evaluate(model, train_dataloader, batch_size, total_train_epochs-1, 'Train_set')
evaluate(model, test_dataloader, batch_size, epoch, 'Test_set')

Loaded the pretrain NER_BERT model, epoch: 23 valid acc: 0.847912885662432 valid f1: 0.8504037399065023


  app.launch_new_instance()


Epoch:23, Acc:87.28, Precision: 87.69, Recall: 86.77, F1: 87.23 on Test_set, Spend: 0.040 minutes for evaluation
--------------------------------------------------------------


(0.8728327892636925, 0.8722701334363067)

## BERTModel + CRF

In [63]:
print('*** Use BertModel + CRF ***')
torch.cuda.empty_cache()

def log_sum_exp_1vec(vec):  # shape(1,m)
    max_score = vec[0, np.argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

def log_sum_exp_mat(log_M, axis=-1):  # shape(n,m)
    return torch.max(log_M, axis)[0]+torch.log(torch.exp(log_M-torch.max(log_M, axis)[0][:, None]).sum(axis))

def log_sum_exp_batch(log_Tensor, axis=-1): # shape (batch_size,n,m)
    return torch.max(log_Tensor, axis)[0]+torch.log(torch.exp(log_Tensor-torch.max(log_Tensor, axis)[0].view(log_Tensor.shape[0],-1,1)).sum(axis))

*** Use BertModel + CRF ***


In [64]:
class BERT_CRF_NER(nn.Module):

    def __init__(self, bert_model, start_label_id, stop_label_id, num_labels, max_seq_length, batch_size, device):
        super(BERT_CRF_NER, self).__init__()
        self.hidden_size = 768
        self.start_label_id = start_label_id
        self.stop_label_id = stop_label_id
        self.num_labels = num_labels
        # self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        self.device=device

        # use pretrainded BertModel
        self.bert = bert_model
        self.dropout = torch.nn.Dropout(0.2)
        # Maps the output of the bert into label space.
        self.hidden2label = nn.Linear(self.hidden_size, self.num_labels)

        # Matrix of transition parameters.  Entry i,j is the score of transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.num_labels, self.num_labels))

        # These two statements enforce the constraint that we never transfer *to* the start tag(or label),
        # and we never transfer *from* the stop label (the model would probably learn this anyway,
        # so this enforcement is likely unimportant)
        self.transitions.data[start_label_id, :] = -10000
        self.transitions.data[:, stop_label_id] = -10000

        nn.init.xavier_uniform_(self.hidden2label.weight)
        nn.init.constant_(self.hidden2label.bias, 0.0)
        # self.apply(self.init_bert_weights)

    def init_bert_weights(self, module):
        """ Initialize the weights.
        """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, BertLayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def _forward_alg(self, feats):
        '''
        this also called alpha-recursion or forward recursion, to calculate log_prob of all barX
        '''

        # T = self.max_seq_length
        T = feats.shape[1]
        batch_size = feats.shape[0]

        # alpha_recursion,forward, alpha(zt)=p(zt,bar_x_1:t)
        log_alpha = torch.Tensor(batch_size, 1, self.num_labels).fill_(-10000.).to(self.device)
        # normal_alpha_0 : alpha[0]=Ot[0]*self.PIs
        # self.start_label has all of the score. it is log,0 is p=1
        log_alpha[:, 0, self.start_label_id] = 0

        # feats: sentances -> word embedding -> lstm -> MLP -> feats
        # feats is the probability of emission, feat.shape=(1,tag_size)
        for t in range(1, T):
            log_alpha = (log_sum_exp_batch(self.transitions + log_alpha, axis=-1) + feats[:, t]).unsqueeze(1)

        # log_prob of all barX
        log_prob_all_barX = log_sum_exp_batch(log_alpha)
        return log_prob_all_barX

    def _get_bert_features(self, input_ids, segment_ids, input_mask):
        '''
        sentances -> word embedding -> lstm -> MLP -> feats
        '''
        bert_seq_out, _ = self.bert(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, output_all_encoded_layers=False)
        bert_seq_out = self.dropout(bert_seq_out)
        bert_feats = self.hidden2label(bert_seq_out)
        return bert_feats

    def _score_sentence(self, feats, label_ids):
        '''
        Gives the score of a provided label sequence
        p(X=w1:t,Zt=tag1:t)=...p(Zt=tag_t|Zt-1=tag_t-1)p(xt|Zt=tag_t)...
        '''

        # T = self.max_seq_length
        T = feats.shape[1]
        batch_size = feats.shape[0]

        batch_transitions = self.transitions.expand(batch_size,self.num_labels,self.num_labels)
        batch_transitions = batch_transitions.flatten(1)

        score = torch.zeros((feats.shape[0],1)).to(device)
        # the 0th node is start_label->start_word,the probability of them=1. so t begin with 1.
        for t in range(1, T):
            score = score + \
                batch_transitions.gather(-1, (label_ids[:, t]*self.num_labels+label_ids[:, t-1]).view(-1,1)) \
                    + feats[:, t].gather(-1, label_ids[:, t].view(-1,1)).view(-1,1)
        return score

    def _viterbi_decode(self, feats):
        '''
        Max-Product Algorithm or viterbi algorithm, argmax(p(z_0:t|x_0:t))
        '''

        # T = self.max_seq_length
        T = feats.shape[1]
        batch_size = feats.shape[0]

        # batch_transitions=self.transitions.expand(batch_size,self.num_labels,self.num_labels)

        log_delta = torch.Tensor(batch_size, 1, self.num_labels).fill_(-10000.).to(self.device)
        log_delta[:, 0, self.start_label_id] = 0

        # psi is for the vaule of the last latent that make P(this_latent) maximum.
        psi = torch.zeros((batch_size, T, self.num_labels), dtype=torch.long).to(self.device)  # psi[0]=0000 useless
        for t in range(1, T):
            # delta[t][k]=max_z1:t-1( p(x1,x2,...,xt,z1,z2,...,zt-1,zt=k|theta) )
            # delta[t] is the max prob of the path from  z_t-1 to z_t[k]
            log_delta, psi[:, t] = torch.max(self.transitions + log_delta, -1)
            # psi[t][k]=argmax_z1:t-1( p(x1,x2,...,xt,z1,z2,...,zt-1,zt=k|theta) )
            # psi[t][k] is the path choosed from z_t-1 to z_t[k],the value is the z_state(is k) index of z_t-1
            log_delta = (log_delta + feats[:, t]).unsqueeze(1)

        # trace back
        path = torch.zeros((batch_size, T), dtype=torch.long).to(self.device)

        # max p(z1:t,all_x|theta)
        max_logLL_allz_allx, path[:, -1] = torch.max(log_delta.squeeze(), -1)

        for t in range(T-2, -1, -1):
            # choose the state of z_t according the state choosed of z_t+1.
            path[:, t] = psi[:, t+1].gather(-1,path[:, t+1].view(-1,1)).squeeze()

        return max_logLL_allz_allx, path

    def neg_log_likelihood(self, input_ids, segment_ids, input_mask, label_ids):
        bert_feats = self._get_bert_features(input_ids, segment_ids, input_mask)
        forward_score = self._forward_alg(bert_feats)
        # p(X=w1:t,Zt=tag1:t)=...p(Zt=tag_t|Zt-1=tag_t-1)p(xt|Zt=tag_t)...
        gold_score = self._score_sentence(bert_feats, label_ids)
        # - log[ p(X=w1:t,Zt=tag1:t)/p(X=w1:t) ] = - log[ p(Zt=tag1:t|X=w1:t) ]
        return torch.mean(forward_score - gold_score)

    # this forward is just for predict, not for train
    # dont confuse this with _forward_alg above.
    def forward(self, input_ids, segment_ids, input_mask):
        # Get the emission scores from the BiLSTM
        bert_feats = self._get_bert_features(input_ids, segment_ids, input_mask)

        # Find the best path, given the features.
        score, label_seq_ids = self._viterbi_decode(bert_feats)
        return score, label_seq_ids

In [65]:
start_label_id = citeProcessor.get_start_label_id()
stop_label_id = citeProcessor.get_stop_label_id()

In [66]:
bert_model = BertModel.from_pretrained(bert_model_scale)
model = BERT_CRF_NER(bert_model, start_label_id, stop_label_id, len(label_list), max_seq_length, batch_size, device)

In [67]:
if load_checkpoint and os.path.exists(output_dir+'/ner_bert_crf_checkpoint.pt'):
    checkpoint = torch.load(output_dir+'/ner_bert_crf_checkpoint.pt', map_location='cuda:0')
    start_epoch = checkpoint['epoch']+1
    valid_acc_prev = checkpoint['valid_acc']
    valid_f1_prev = checkpoint['valid_f1']
    pretrained_dict=checkpoint['model_state']
    net_state_dict = model.state_dict()
    pretrained_dict_selected = {k: v for k, v in pretrained_dict.items() if k in net_state_dict}
    net_state_dict.update(pretrained_dict_selected)
    model.load_state_dict(net_state_dict)
    print('Loaded the pretrain NER_BERT_CRF model, epoch:',checkpoint['epoch'],'valid acc:',
            checkpoint['valid_acc'], 'valid f1:', checkpoint['valid_f1'])
else:
    start_epoch = 0
    valid_acc_prev = 0
    valid_f1_prev = 0

model.to(device)

BERT_CRF_NER(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
         

In [69]:
# Prepare optimizer
param_optimizer = list(model.named_parameters())

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
new_param = ['transitions', 'hidden2label.weight', 'hidden2label.bias']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) \
        and not any(nd in n for nd in new_param)], 'weight_decay': weight_decay_finetune},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) \
        and not any(nd in n for nd in new_param)], 'weight_decay': 0.0},
    {'params': [p for n, p in param_optimizer if n in ('transitions','hidden2label.weight')] \
        , 'lr':lr0_crf_fc, 'weight_decay': weight_decay_crf_fc},
    {'params': [p for n, p in param_optimizer if n == 'hidden2label.bias'] \
        , 'lr':lr0_crf_fc, 'weight_decay': 0.0}
]
optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate0, warmup=warmup_proportion, t_total=total_train_steps)
# optimizer = optim.Adam(model.parameters(), lr=learning_rate0)

def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x/warmup
    return 1.0 - x

def evaluate(model, predict_dataloader, batch_size, epoch_th, dataset_name):
    # print("***** Running prediction *****")
    model.eval()
    all_preds = []
    all_labels = []
    total=0
    correct=0
    start = time.time()
    with torch.no_grad():
        for batch in predict_dataloader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, predict_mask, label_ids = batch
            _, predicted_label_seq_ids = model(input_ids, segment_ids, input_mask)
            # _, predicted = torch.max(out_scores, -1)
            valid_predicted = torch.masked_select(predicted_label_seq_ids, predict_mask)
            valid_label_ids = torch.masked_select(label_ids, predict_mask)
            all_preds.extend(valid_predicted.tolist())
            all_labels.extend(valid_label_ids.tolist())
            # print(len(valid_label_ids),len(valid_predicted),len(valid_label_ids)==len(valid_predicted))
            total += len(valid_label_ids)
            correct += valid_predicted.eq(valid_label_ids).sum().item()

    test_acc = correct/total
    precision, recall, f1 = f1_score(np.array(all_labels), np.array(all_preds))
    end = time.time()
    print('Epoch:%d, Acc:%.2f, Precision: %.2f, Recall: %.2f, F1: %.2f on %s, Spend:%.3f minutes for evaluation' \
        % (epoch_th, 100.*test_acc, 100.*precision, 100.*recall, 100.*f1, dataset_name,(end-start)/60.0))
    print('--------------------------------------------------------------')
    return test_acc, f1

#%%
# train procedure
global_step_th = int(len(train_examples) / batch_size / gradient_accumulation_steps * start_epoch)

# train_start=time.time()
# for epoch in trange(start_epoch, total_train_epochs, desc="Epoch"):
for epoch in range(start_epoch, total_train_epochs):
    tr_loss = 0
    train_start = time.time()
    model.train()
    optimizer.zero_grad()
    # for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, predict_mask, label_ids = batch

        neg_log_likelihood = model.neg_log_likelihood(input_ids, segment_ids, input_mask, label_ids)

        if gradient_accumulation_steps > 1:
            neg_log_likelihood = neg_log_likelihood / gradient_accumulation_steps

        neg_log_likelihood.backward()

        tr_loss += neg_log_likelihood.item()

        if (step + 1) % gradient_accumulation_steps == 0:
            # modify learning rate with special warm up BERT uses
            lr_this_step = learning_rate0 * warmup_linear(global_step_th/total_train_steps, warmup_proportion)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            optimizer.step()
            optimizer.zero_grad()
            global_step_th += 1

        print("Epoch:{}-{}/{}, Negative loglikelihood: {} ".format(epoch, step, len(train_dataloader), neg_log_likelihood.item()))

    print('--------------------------------------------------------------')
    print("Epoch:{} completed, Total training's Loss: {}, Spend: {}m".format(epoch, tr_loss, (time.time() - train_start)/60.0))
    valid_acc, valid_f1 = evaluate(model, dev_dataloader, batch_size, epoch, 'Valid_set')

    # Save a checkpoint
    if valid_f1 > valid_f1_prev:
        # model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
        torch.save({'epoch': epoch, 'model_state': model.state_dict(), 'valid_acc': valid_acc,
            'valid_f1': valid_f1, 'max_seq_length': max_seq_length, 'lower_case': do_lower_case},
                    os.path.join(output_dir, 'ner_bert_crf_checkpoint.pt'))
        valid_f1_prev = valid_f1

evaluate(model, test_dataloader, batch_size, total_train_epochs-1, 'Test_set')

Epoch:0-0/25, Negative loglikelihood: 9148.796875 
Epoch:0-1/25, Negative loglikelihood: 9147.7060546875 
Epoch:0-2/25, Negative loglikelihood: 9146.423828125 
Epoch:0-3/25, Negative loglikelihood: 9151.236328125 
Epoch:0-4/25, Negative loglikelihood: 9151.658203125 
Epoch:0-5/25, Negative loglikelihood: 9151.412109375 
Epoch:0-6/25, Negative loglikelihood: 9149.064453125 
Epoch:0-7/25, Negative loglikelihood: 9146.9638671875 
Epoch:0-8/25, Negative loglikelihood: 9146.7763671875 
Epoch:0-9/25, Negative loglikelihood: 9146.5712890625 
Epoch:0-10/25, Negative loglikelihood: 9149.48828125 
Epoch:0-11/25, Negative loglikelihood: 9146.537109375 
Epoch:0-12/25, Negative loglikelihood: 9150.966796875 
Epoch:0-13/25, Negative loglikelihood: 9147.466796875 
Epoch:0-14/25, Negative loglikelihood: 9146.5146484375 
Epoch:0-15/25, Negative loglikelihood: 9147.984375 
Epoch:0-16/25, Negative loglikelihood: 9150.80078125 
Epoch:0-17/25, Negative loglikelihood: 9148.099609375 
Epoch:0-18/25, Negative



Epoch:0, Acc:91.40, Precision: 92.47, Recall: 91.58, F1: 92.02 on Valid_set, Spend:0.026 minutes for evaluation
--------------------------------------------------------------
Epoch:1-0/25, Negative loglikelihood: 9150.20703125 
Epoch:1-1/25, Negative loglikelihood: 9147.396484375 
Epoch:1-2/25, Negative loglikelihood: 9149.130859375 
Epoch:1-3/25, Negative loglikelihood: 9146.294921875 
Epoch:1-4/25, Negative loglikelihood: 9146.7421875 
Epoch:1-5/25, Negative loglikelihood: 9147.6328125 
Epoch:1-6/25, Negative loglikelihood: 9145.80078125 
Epoch:1-7/25, Negative loglikelihood: 9148.6298828125 
Epoch:1-8/25, Negative loglikelihood: 9145.87109375 
Epoch:1-9/25, Negative loglikelihood: 8319.8388671875 
Epoch:1-10/25, Negative loglikelihood: 9146.9140625 
Epoch:1-11/25, Negative loglikelihood: 9149.337890625 
Epoch:1-12/25, Negative loglikelihood: 9151.44140625 
Epoch:1-13/25, Negative loglikelihood: 9150.068359375 
Epoch:1-14/25, Negative loglikelihood: 9148.3271484375 
Epoch:1-15/25, Ne

KeyboardInterrupt: 

In [70]:
'''
Test_set prediction using the best epoch of NER_BERT_CRF model
'''
checkpoint = torch.load(output_dir+'/ner_bert_crf_checkpoint.pt', map_location='cpu')
epoch = checkpoint['epoch']
valid_acc_prev = checkpoint['valid_acc']
valid_f1_prev = checkpoint['valid_f1']
pretrained_dict=checkpoint['model_state']
net_state_dict = model.state_dict()
pretrained_dict_selected = {k: v for k, v in pretrained_dict.items() if k in net_state_dict}
net_state_dict.update(pretrained_dict_selected)
model.load_state_dict(net_state_dict)
print('Loaded the pretrain  NER_BERT_CRF  model, epoch:',checkpoint['epoch'],'valid acc:',
      checkpoint['valid_acc'], 'valid f1:', checkpoint['valid_f1'])

model.to(device)
#evaluate(model, train_dataloader, batch_size, total_train_epochs-1, 'Train_set')
evaluate(model, test_dataloader, batch_size, epoch, 'Test_set')

Loaded the pretrain  NER_BERT_CRF  model, epoch: 2 valid acc: 0.9248638838475499 valid f1: 0.9294763513513513




Epoch:2, Acc:97.52, Precision: 97.70, Recall: 97.51, F1: 97.60 on Test_set, Spend:0.060 minutes for evaluation
--------------------------------------------------------------


(0.9751904243743199, 0.9760328106976546)

In [71]:
model.eval()
with torch.no_grad():
    demon_dataloader = data.DataLoader(dataset=test_dataset,
                                batch_size=10,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=NerDataset.pad)
    for batch in demon_dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, predict_mask, label_ids = batch
        _, predicted_label_seq_ids = model(input_ids, segment_ids, input_mask)
        # _, predicted = torch.max(out_scores, -1)
        valid_predicted = torch.masked_select(predicted_label_seq_ids, predict_mask)
        # valid_label_ids = torch.masked_select(label_ids, predict_mask)
        for i in range(len(batch)):
            print(predicted_label_seq_ids[i])
            print(label_ids[i])
            new_ids = predicted_label_seq_ids[i].cpu().numpy()[predict_mask[i].cpu().numpy()==1]
            print(list(map(lambda i: label_list[i], new_ids)))
            print(test_examples[i].labels)
        #break
#%%
#print(citeProcessor.get_label_map())

  del sys.path[0]


tensor([ 1,  3,  8,  4,  5,  5,  5,  5,  3, 12, 13, 13,  0,  0,  0, 16, 10, 14,
         0,  0, 14,  0,  3, 24,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  3, 26,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       device='cuda:0')
tensor([ 1,  3,  8,  4,  5,  5,  5,  5,  3, 12, 13, 13,  0,  0,  0, 16, 10,  8,
         0,  0, 14,  0,  3, 24,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  3, 26,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       device=

In [72]:
print('Test Example 5: {} '.format(test_examples[0].words))
print('True Labels: {}'.format(test_examples[0].labels))
pred_id = predicted_label_seq_ids[0].cpu().numpy()[predict_mask[0].cpu().numpy()==1]
print('Predicted Labels: {}'.format(list(map(lambda i: label_list[i], pred_id))))

Test Example 5: ['Anonymous', '1978', 'New', 'Names', 'in', 'Volume', '52', '.', 'Journal', 'of', 'Helminthology', '52', '04', 'December', '389', '.', 'httpdx.doi.org10.1017s0022149x00017338', 'doi', '10.1017s0022149x00017338'] 
True Labels: ['O', 'B-issued', 'B-title', 'I-title', 'I-title', 'I-title', 'I-title', 'O', 'B-container-title', 'I-container-title', 'I-container-title', 'B-volume', 'B-issue', 'B-issued', 'B-page', 'O', 'B-url', 'O', 'B-doi']
Predicted Labels: ['B-author', 'I-author', 'I-author', 'I-author', 'I-author', 'I-author', 'I-author', 'O', 'B-issued', 'B-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'O', 'B-container-title', 'B-volume', 'B-page', 'O', 'B-publisher', 'I-publisher']


In [62]:
print('Test Example: 8 {} '.format(test_examples[8].words))
print('True Labels: {}'.format(test_examples[8].labels))
pred_id = predicted_label_seq_ids[8].cpu().numpy()[predict_mask[8].cpu().numpy()==1]
print('Predicted Labels: {}'.format(list(map(lambda i: label_list[i], pred_id))))

Test Example: 8 ['D.', 'Felmingham', 'and', 'R.', 'N.', 'Gruneberg', '“A', 'multicentre', 'collaborative', 'study', 'of', 'the', 'antimicrobial', 'susceptibility', 'of', 'communityacquired', 'lower', 'respiratory', 'tract', 'pathogens', '19921993', 'The', 'Alexander', 'Project”', 'Journal', 'of', 'Antimicrobial', 'Chemotherapy', 'vol.38', 'no.', 'suppl', 'A', 'pp.', '1–57', 'Jul.', '1996'] 
True Labels: ['B-author', 'I-author', 'I-author', 'I-author', 'I-author', 'I-author', 'B-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'B-container-title', 'I-container-title', 'I-container-title', 'I-container-title', 'O', 'O', 'O', 'O', 'O', 'B-page', 'B-issued', 'I-issued']


IndexError: index 8 is out of bounds for dimension 0 with size 6

In [63]:
print('Test Example: 0 {} '.format(test_examples[0].words))
print('True Labels: {}'.format(test_examples[0].labels))
pred_id = predicted_label_seq_ids[0].cpu().numpy()[predict_mask[0].cpu().numpy()==1]
print('Predicted Labels: {}'.format(list(map(lambda i: label_list[i], pred_id))))

Test Example: 0 ['Anonymous', '1978', 'New', 'Names', 'in', 'Volume', '52', '.', 'Journal', 'of', 'Helminthology', '52', '04', 'December', '389', '.', 'httpdx.doi.org10.1017s0022149x00017338', 'doi', '10.1017s0022149x00017338'] 
True Labels: ['O', 'B-issued', 'B-title', 'I-title', 'I-title', 'I-title', 'I-title', 'O', 'B-container-title', 'I-container-title', 'I-container-title', 'B-volume', 'B-issue', 'B-issued', 'B-page', 'O', 'B-url', 'O', 'B-doi']
Predicted Labels: ['O', 'B-issued', 'B-title', 'I-title', 'I-title', 'I-title', 'I-title', 'O', 'B-container-title', 'I-container-title', 'I-container-title', 'B-volume', 'B-issue', 'B-issued', 'B-page', 'O', 'B-url', 'O', 'B-doi']


In [73]:
print('Test Example: 50 {} '.format(test_examples[50].words))
print('True Labels: {}'.format(test_examples[50].labels))
pred_id = predicted_label_seq_ids[50].cpu().numpy()[predict_mask[50].cpu().numpy()==1]
print('Predicted Labels: {}'.format(list(map(lambda i: label_list[i], pred_id))))

Test Example: 50 ['Nestler', 'T.', 'R.', 'Schmid', 'W.', 'Münchgesang', 'V.', 'Bazhenov', 'J.', 'Schilm', 'T.', 'Leisegang', 'and', 'D.', 'C.', 'Meyer', '.', '“Separators', 'Technology', 'Review', 'Ceramic', 'Based', 'Separators', 'for', 'Secondary', 'Batteries.”', 'AIP', 'Publishing', 'LLC', '2014', '.', 'doi', '10.10631.4878486'] 
True Labels: ['B-author', 'I-author', 'I-author', 'I-author', 'I-author', 'I-author', 'I-author', 'I-author', 'I-author', 'I-author', 'I-author', 'I-author', 'I-author', 'I-author', 'I-author', 'I-author', 'O', 'B-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'I-title', 'B-publisher', 'I-publisher', 'I-publisher', 'B-issued', 'O', 'O', 'B-doi']


IndexError: index 50 is out of bounds for dimension 0 with size 6