# Enrichment for NER with Partial Annotations
 
 Some changes to this notebook are required before running it. They are pointed out at the relevant places in the notebook. The following need to be changed:
 
 * paths when saving/loading checkpoints
 * paths of datasets
 * name of pre-trained BERT model (if not dealing with Material Science corpora)

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import os
import re
import spacy
from tqdm import tqdm
import inflect
import numpy as np
import seaborn as sns
# General-purpose NLP helpers: the small English spaCy pipeline, NLTK's English
# stop-word list and an inflect engine.
# NOTE(review): none of these three objects is referenced in the visible part of
# this notebook — confirm they are still needed before paying for the loads.
nlp = spacy.load("en_core_web_sm")
stopwords = nltk.corpus.stopwords.words('english') 
engine = inflect.engine()

import warnings
warnings.filterwarnings("ignore", category=FutureWarning) 

from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification
from transformers import BertTokenizerFast
from transformers import pipeline
import torch.nn.functional as F
from seqeval.metrics import classification_report, precision_score, recall_score, f1_score
import conlleval
from seqeval.scheme import IOB2

# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing on the first `.to(device)` call.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## reading data

Change the dataset paths below to point to your own copies of the data before running.

The data files are in CSV format with two columns, headed "sentence" and "word_labels":

sentence: tokens separated by space (' ')
word_labels: B/I/O tags for tokens separated by commas (',')

In [None]:
# Partially annotated training split, used exactly as read from disk.
train_data = pd.read_csv('/kaggle/input/elsevier-tagged-data/matscholar_train_bio_partial.csv')

# Evaluation splits: the sentence column is coerced to str right after loading.
test_data = pd.read_csv(
    '/kaggle/input/elsevier-tagged-data/matscholar_test_bio.csv'
).astype({'sentence': str})
dev_data = pd.read_csv(
    '/kaggle/input/elsevier-tagged-data/matscholar_dev_bio.csv'
).astype({'sentence': str})

# Additional corpus, also used exactly as read from disk.
additional_data = pd.read_csv('/kaggle/input/elsevier-tagged-data/matpro_bio_tags.csv')

In [None]:
# Drop rows with missing values, then renumber the remaining rows 0..n-1.
# Without the reset, dropna() leaves gaps in the index, and any code that
# addresses rows by the integers 0..len-1 (as a DataLoader-driven dataset does)
# would hit a KeyError or read the wrong row.
test_data = test_data.dropna().reset_index(drop=True)
dev_data = dev_data.dropna().reset_index(drop=True)

## preparing data loader

Change the pre-trained BERT model below to one that matches the domain of your datasets.

In [None]:
# Length (in word pieces, special tokens included) that every example is
# truncated or padded to by the dataset class.
MAX_LEN = 256
# Batch size used by the training loaders.
TRAIN_BATCH_SIZE = 4
# Batch size used by the dev/test loaders.
VALID_BATCH_SIZE = 1
# NOTE(review): EPOCHS is not referenced in the visible code (the training
# loops use literal ranges) — confirm whether it is still needed.
EPOCHS = 3
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 1e-05
# Maximum gradient norm passed to gradient clipping in train().
MAX_GRAD_NORM = 10
# Tokenizers for the pre-trained checkpoint. `tokenizer` is the one used by the
# dataset class and by test().
# NOTE(review): `fast_tokenizer` is not used in the visible code — confirm.
tokenizer = BertTokenizer.from_pretrained('m3rg-iitd/matscibert')
fast_tokenizer = BertTokenizerFast.from_pretrained('m3rg-iitd/matscibert')

In [None]:
# Tag -> integer id for the three BIO classes, and the inverse lookup derived
# from it so the two mappings cannot drift apart.
label2id = {'B': 0, 'I': 1, 'O': 2}
id2label = {idx: tag for tag, idx in label2id.items()}

In [None]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """Tokenize a sentence word by word, repeating each word's tag per word piece.

    Word-piece tokenization makes it hard to match word-level labels back to
    individual pieces, so every word is tokenized on its own and its label is
    copied once for each piece it is split into.

    Args:
        sentence: Words separated by whitespace.
        text_labels: Tags separated by commas, one per word of ``sentence``.
        tokenizer: Object exposing ``tokenize(word)`` that returns the list of
            word pieces for a single word.

    Returns:
        A ``(tokenized_sentence, labels)`` tuple of two equally long lists:
        the word pieces and the tag assigned to each piece.

    Raises:
        TypeError: If ``text_labels`` is not a string (e.g. a NaN read from a
            CSV file with a missing value).

    Note:
        Words and tags are paired with ``zip``; if their counts differ, the
        surplus on the longer side is ignored.
    """
    if not isinstance(text_labels, str):
        # Fail with a clear message instead of printing the value and then
        # crashing on `.split` a few lines later.
        raise TypeError(
            "text_labels must be a comma-separated string, got {!r}".format(text_labels)
        )

    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence.strip().split(), text_labels.split(",")):
        # A word may be broken into several pieces; each piece inherits the
        # word's label.
        pieces = tokenizer.tokenize(word)
        tokenized_sentence.extend(pieces)
        labels.extend([label] * len(pieces))

    return tokenized_sentence, labels

In [None]:
class dataset(Dataset):
    """Torch dataset that turns (sentence, word_labels) rows into BERT inputs.

    Args:
        dataframe: pandas DataFrame with a ``sentence`` column (tokens
            separated by spaces) and a ``word_labels`` column (tags separated
            by commas).
        tokenizer: Tokenizer used to split words into pieces and to map the
            pieces to vocabulary ids.
        max_len: Number of positions every example is truncated or padded to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the ``ids``, ``mask`` and ``targets`` tensors of row ``index``.

        ``index`` is a position (0..len-1), independent of the DataFrame's own
        index labels. All three tensors have length ``max_len``.
        """
        # step 1: tokenize (and adapt corresponding labels).
        # Positional lookup: a frame whose index has gaps (e.g. after dropna())
        # must still be addressable by 0..len-1.
        sentence = self.data['sentence'].iloc[index]
        word_labels = self.data['word_labels'].iloc[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # step 2: add special tokens and give both of them the outside label.
        # The [SEP] label must be appended; `insert(-1, ...)` would place it
        # before the last real token and shift that token's tag onto [SEP].
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"]
        labels = ["O"] + labels + ["O"]

        # step 3: truncating/padding to exactly max_len positions.
        # NOTE: when a sentence is truncated the closing [SEP] is cut off too.
        maxlen = self.max_len
        if len(tokenized_sentence) > maxlen:
            tokenized_sentence = tokenized_sentence[:maxlen]
            labels = labels[:maxlen]
        else:
            n_pad = maxlen - len(tokenized_sentence)
            tokenized_sentence = tokenized_sentence + ['[PAD]'] * n_pad
            labels = labels + ["O"] * n_pad

        # step 4: attention mask is 1 for real tokens, 0 for padding.
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # step 5: convert tokens and tags to integer ids.
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)
        label_ids = [label2id[label] for label in labels]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(attn_mask, dtype=torch.long),
            'targets': torch.tensor(label_ids, dtype=torch.long),
        }

    def __len__(self):
        """Return the number of rows captured at construction time."""
        return self.len

In [None]:
# Report the (rows, columns) shape of every split.
for caption, frame in (
    ("training data (partial): {}", train_data),
    ("test data: {}", test_data),
    ("dev data: {}", dev_data),
    ("additional data: {}", additional_data),
):
    print(caption.format(frame.shape))

# Wrap each DataFrame in a torch dataset that shares one tokenizer and one
# maximum sequence length.
training_set = dataset(dataframe=train_data, tokenizer=tokenizer, max_len=MAX_LEN)
testing_set = dataset(dataframe=test_data, tokenizer=tokenizer, max_len=MAX_LEN)
dev_set = dataset(dataframe=dev_data, tokenizer=tokenizer, max_len=MAX_LEN)
additional_set = dataset(dataframe=additional_data, tokenizer=tokenizer, max_len=MAX_LEN)

In [None]:

# Training batches are reshuffled on every pass over the data.
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0,
}

# Evaluation/prediction batches keep the dataset order: shuffling gains nothing
# there and makes outputs impossible to line up with the source rows or to
# compare between runs.
test_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0,
}

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)
dev_loader = DataLoader(dev_set, **test_params)
additional_loader = DataLoader(additional_set, **train_params)

## model definition

Change the model name below to match the domain of your datasets.

In [None]:
# Token-classification model built from the pre-trained checkpoint, with one
# output class per entry of id2label; both label maps are passed through to
# from_pretrained.
model = BertForTokenClassification.from_pretrained('m3rg-iitd/matscibert', 
                                                   num_labels=len(id2label),
                                                   id2label=id2label,
                                                   label2id=label2id)
# Move the model to the selected device (as the last expression of the cell,
# this also displays the model in the notebook output).
model.to(device)

## training function

In [None]:
# Adam over all model parameters, using the LEARNING_RATE constant.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# Training function: one pass over a loader, fine-tuning the global BERT model.
def train(epoch, loader):
    """Run one training epoch of the module-level ``model`` over ``loader``.

    Relies on the module-level ``model``, ``optimizer``, ``device`` and
    ``MAX_GRAD_NORM``. Prints the running mean loss every 500 steps and the
    mean loss and mean per-batch token accuracy at the end of the epoch.

    Args:
        epoch: Index of the current epoch. Not used inside the function; kept
            so existing callers keep working.
        loader: Iterable of batches, each a dict holding ``ids``, ``mask`` and
            ``targets`` tensors.

    Returns:
        None.
    """
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0
    # put model in training mode
    model.train()

    for idx, batch in enumerate(loader):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()
        nb_tr_steps += 1

        if idx % 500 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per 500 training steps: {loss_step}")

        # Token accuracy over the non-padding positions (this includes the
        # [CLS] and [SEP] positions). Pure bookkeeping, so no gradients needed.
        with torch.no_grad():
            active = mask == 1
            batch_predictions = torch.argmax(tr_logits, dim=-1)[active]
            batch_targets = targets[active]
        tr_accuracy += accuracy_score(
            batch_targets.cpu().numpy(), batch_predictions.cpu().numpy()
        )

        # Backward pass. Clipping has to happen after backward() (so that it
        # sees this step's gradients) and before step() (so that the clipped
        # gradients are the ones applied).
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        optimizer.step()

    if nb_tr_steps == 0:
        # Empty loader: nothing was trained and there is nothing to average.
        print("No training batches were provided.")
        return

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

## testing function

In [None]:
# returns labels, predictions, tokens and probabilities
def test(model, testing_loader):
    """Run ``model`` over ``testing_loader`` and collect per-sentence outputs.

    Relies on the module-level ``device``, ``tokenizer`` and ``id2label``.
    Every example of every batch is handled on its own, so the result has one
    entry per sentence regardless of the loader's batch size. Only positions
    whose attention mask is 1 are kept (this includes [CLS] and [SEP]).

    Args:
        model: Token-classification model; called with ``input_ids`` and
            ``attention_mask`` and expected to return an object with a
            ``logits`` attribute.
        testing_loader: Iterable of batches, each a dict holding ``ids``,
            ``mask`` and ``targets`` tensors.

    Returns:
        A ``(labels, predictions, tokens, probs)`` tuple of four parallel
        lists with one inner list per sentence: gold tags, predicted tags,
        word-piece tokens, and the softmax probability of each predicted tag.
    """
    # put model in evaluation mode
    model.eval()

    eval_labels, eval_preds = [], []
    eval_tokens, eval_probs = [], []

    with torch.no_grad():
        for batch in tqdm(testing_loader):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.long)

            logits = model(input_ids=ids, attention_mask=mask).logits

            # Predicted class per position, and the probability of that class.
            pred_ids = torch.argmax(logits, dim=-1)
            max_probs = F.softmax(logits, dim=-1).max(dim=-1).values

            # Split the batch back into sentences before masking; flattening
            # the whole batch would glue all of its sentences together.
            for row_ids, row_mask, row_targets, row_preds, row_probs in zip(
                ids, mask, targets, pred_ids, max_probs
            ):
                active = row_mask == 1
                eval_labels.append([id2label[i] for i in row_targets[active].tolist()])
                eval_preds.append([id2label[i] for i in row_preds[active].tolist()])
                eval_tokens.append(
                    tokenizer.convert_ids_to_tokens(row_ids[active].tolist())
                )
                eval_probs.append(row_probs[active].tolist())

    return eval_labels, eval_preds, eval_tokens, eval_probs

## step 1: training on additional dataset

Change the path to which the model checkpoints are saved.

In [None]:
#training the model on matpro data
# for epoch in range(10):
#    print(f"Training epoch: {epoch + 1}")
#    train(epoch, additional_loader)
   
#    # test as it goes on
#    print("DEV TESTING")
#    labels, predictions, tokens, probs = test(model, dev_loader)
#    print(classification_report(labels, predictions, mode="strict", scheme=IOB2))
    
   # #save the model    
#    state = {'epoch': epoch + 1, 'state_dict': model.state_dict(),
#                  'optimizer': optimizer.state_dict()}

#    try:
#        name = '/kaggle/working/conll-matsci-' + str(epoch) + '-checkpoint.pt'
#        torch.save(state, name)
#    except:
#        pass

## load checkpoint

Change the checkpoint path below to the checkpoint you want to load.

In [None]:
def load_checkpoint(model, optimizer, filepath):
    """Restore model and optimizer state from a checkpoint file, if it exists.

    The checkpoint is expected to be a dict with the keys ``epoch``,
    ``state_dict`` (model weights) and ``optimizer`` (optimizer state).

    Args:
        model: Model whose ``load_state_dict`` receives the saved weights.
        optimizer: Optimizer whose ``load_state_dict`` receives the saved state.
        filepath: Path of the checkpoint file.

    Returns:
        ``(model, optimizer, start_epoch)``. The same model and optimizer
        objects that were passed in are returned. ``start_epoch`` is the epoch
        stored in the checkpoint, or 0 when no file exists at ``filepath``
        (model and optimizer are then left untouched).
    """
    start_epoch = 0
    if os.path.isfile(filepath):
        print("=> loading checkpoint '{}'".format(filepath))
        # Deserialize onto the CPU first so that a checkpoint written on a GPU
        # can also be read on a CPU-only machine; load_state_dict then copies
        # the values into the existing parameters on whatever device they use.
        checkpoint = torch.load(filepath, map_location='cpu')
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("=> loaded checkpoint '{}' (epoch {})"
                  .format(filepath, checkpoint['epoch']))
    else:
        print("=> no checkpoint found at '{}'".format(filepath))

    return model, optimizer, start_epoch

In [None]:
filepath = '/kaggle/input/checkpoints/matpro-2-checkpoint.pt'
model, optimizer, start_epoch = load_checkpoint(model, optimizer, filepath)

# Move every tensor held in the optimizer's per-parameter state to `device`;
# non-tensor entries are left as they are.
for param_state in optimizer.state.values():
    moved = {
        name: value.to(device)
        for name, value in param_state.items()
        if isinstance(value, torch.Tensor)
    }
    param_state.update(moved)


## testing over dev set

In [None]:
# Run the current model over the dev loader and keep the gold tags, predicted
# tags, tokens and prediction probabilities returned by test().
labels, predictions, tokens, probs = test(model, dev_loader)

In [None]:
from seqeval.scheme import IOB2

# Entity-level (rather than token-level) metrics from seqeval, using strict
# matching under the IOB2 scheme.
dev_report = classification_report(labels, predictions, mode="strict", scheme=IOB2)
print(dev_report)

In [None]:
# Build one "word <gold> <predicted>" line per token for conlleval; the token
# text itself is replaced by the placeholder 'word'.
lines = [
    'word ' + gold_tag + ' ' + pred_tag
    for gold_seq, pred_seq in zip(labels, predictions)
    for gold_tag, pred_tag in zip(gold_seq, pred_seq)
]
lines = '\n'.join(lines).splitlines()

res = conlleval.evaluate(lines)
print(conlleval.report(res))

## combination function (for labels and predictions) 

In [None]:
# Enrich the partial gold labels of one training instance with confident
# model predictions.
def combine_labels_predictions_sentence(tokens, labels, predictions, probs, threshold=0.50):
    """Merge partial gold labels with model predictions for one sentence.

    Gold entity tags ('B'/'I') always win. Where the gold label is 'O' and the
    model predicts an entity tag with probability of at least ``threshold``,
    the prediction is adopted: a predicted 'I' continues the entity when the
    previous output label is 'B' or 'I', and is promoted to 'B' when the
    previous output label is 'O'. The special tokens '[CLS]' and '[SEP]' are
    skipped; every other token appears in the output exactly once.

    Args:
        tokens: List of tokens in the instance.
        labels: List of labels for the instance, from the partially annotated
            set.
        predictions: List of predicted labels for the instance.
        probs: List of likelihoods of the predictions.
        threshold: Minimum likelihood a prediction needs in order to override
            an 'O' label.

    Returns:
        ``(added, sentence, new_labels)``: the number of entities started by
        adopted predictions, the list of kept tokens, and the enriched labels
        (same length as ``sentence``).
    """
    sentence = []
    new_labels = []
    added = 0

    for token, label, pred, prob in zip(tokens, labels, predictions, probs):
        # tokens added by BERT carry no annotation
        if token == '[CLS]' or token == '[SEP]':
            continue

        last_label = new_labels[-1] if new_labels else 'O'

        # Default: keep the gold label. This covers agreement, gold entity
        # tags (which are never overridden) and low-confidence predictions.
        new_label = label
        if label == 'O' and pred != 'O' and prob >= threshold:
            if pred == 'I' and last_label != 'O':
                # Continuation of the entity in progress, whether the previous
                # label is 'B' or 'I'. (Leaving out the 'I' case used to drop
                # the token from the sentence altogether.)
                new_label = 'I'
            else:
                # A predicted 'B', or an 'I' with no entity in progress: a new
                # entity starts here.
                new_label = 'B'
                added += 1

        sentence.append(token)
        new_labels.append(new_label)

    return added, sentence, new_labels

In [None]:
# get predicted labels and call combination functions, return dataframe with new data
def get_combined_dataset(model, loader):
    """Predict over ``loader`` and build an enriched training DataFrame.

    Each sentence returned by ``test`` is passed through
    ``combine_labels_predictions_sentence`` and stored as one row.

    Args:
        model: Model handed to ``test``.
        loader: Data loader handed to ``test``.

    Returns:
        pandas DataFrame with the columns ``sentence`` (tokens joined by
        spaces) and ``word_labels`` (labels joined by commas), one row per
        sentence. The total number of added entities is printed.
    """
    labels, predictions, tokens, probs = test(model, loader)

    # Collect plain dicts and build the frame once: DataFrame.append no longer
    # exists in pandas 2.x and copied the whole frame on every call.
    rows = []
    added = 0
    for gold, pred, toks, prob in tqdm(
        zip(labels, predictions, tokens, probs), total=len(labels)
    ):
        cur_added, sentence, new_labels = combine_labels_predictions_sentence(toks, gold, pred, prob)
        added += cur_added
        rows.append({
            'sentence': ' '.join(sentence),
            'word_labels': ','.join(new_labels),
        })

    print("New Entities Added: ", added)
    return pd.DataFrame(rows, columns=['sentence', 'word_labels'])
        

In [None]:
# Start the enrichment loop from the original, partially annotated training
# data and the dataset/loader built from it.
updated_train_data, updated_training_loader, updated_training_set = (
    train_data,
    training_loader,
    training_set,
)

## enrich and fine-tune

Change the path below to which the enriched models are saved.

In [None]:
# For each epoch: enrich the current training data with the model's own
# predictions, train on the enriched data, evaluate on dev, and save the model.
dev_results = []
for epoch in range(5):
    # Predict over the current training set and merge the predictions into
    # its labels; the result becomes the training data for this epoch.
    partial_loader_testing = DataLoader(updated_training_set, **test_params)
    updated_train_data = get_combined_dataset(model, partial_loader_testing)
    updated_training_set = dataset(updated_train_data, tokenizer, MAX_LEN)
    updated_training_loader = DataLoader(updated_training_set, **train_params)
    print("New training data (enriched): {}".format(updated_train_data.shape))

    # One training pass over the enriched data.
    print(f"Training epoch: {epoch + 1}")
    train(epoch, updated_training_loader)

    # Entity-level dev metrics after this epoch, kept for later inspection.
    print("DEV TESTING")
    labels, predictions, tokens, probs = test(model, dev_loader)
    precision = precision_score(labels, predictions, mode="strict", scheme=IOB2)
    recall = recall_score(labels, predictions, mode="strict", scheme=IOB2)
    f1 = f1_score(labels, predictions, mode="strict", scheme=IOB2)
    dev_results.append(
        {'precision': precision, 'recall': recall, 'f1': f1, 'epoch': epoch + 1}
    )
    print(classification_report(labels, predictions, mode="strict", scheme=IOB2))

    # Save the whole model object, one file per epoch.
    torch.save(
        model,
        '/kaggle/working/matscholar-matpro-enriched-model-'+str(epoch+1)+'.pt',
    )