In [1]:
# initial code adapted from https://www.depends-on-the-definition.com/named-entity-recognition-with-bert/
# which was outdated
# https://github.com/huggingface/transformers#Quick-tour-TF-20-training-and-PyTorch-interoperability
# provided a good reference to update although written for a different task

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tqdm.notebook import tnrange, tqdm_notebook

# Load Data

In [None]:
# Load the annotated corpus CSV; the file is decoded as latin1.
data = pd.read_csv(
    "./Data/entity-annotated-corpus/ner_dataset.csv",
    encoding="latin1",
)

In [None]:
# Peek at the first five rows of the raw frame.
data.head(n=5)

In [None]:
# Forward-fill missing cells with the last seen value in each column.
# `DataFrame.ffill()` replaces the deprecated `fillna(method="ffill")` spelling.
data = data.ffill()

In [None]:
# Convert "Sentence: 12" style labels to the integer 12.
# Going through `astype(str)` keeps this cell idempotent: re-running it on the
# already-converted integer column yields the same integers instead of raising.
data['Sentence #'] = (
    data['Sentence #']
    .astype(str)
    .str.split(': ')
    .str[-1]
    .astype(int)
)

In [None]:
# Show every row that belongs to sentence number 1.
is_first_sentence = data['Sentence #'] == 1
data.loc[is_first_sentence]

In [None]:
# Frequency of each tag, most common first.
data["Tag"].value_counts(sort=True, ascending=False)

In [None]:
class SentenceGetter(object):
    """Group a token-level frame into sentences of (word, POS, tag) tuples.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "Sentence #", "Word", "POS" and "Tag".

    Attributes
    ----------
    grouped : pandas.Series
        One list of (word, pos, tag) tuples per distinct "Sentence #" value.
    sentences : list
        The same lists, in the order produced by the group-by.
    n_sent : int
        Number of the sentence that `get_next` will return next (starts at 1).
    empty : bool
        Set to True once `get_next` has run past the last sentence.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            return list(zip(s["Word"].values.tolist(),
                            s["POS"].values.tolist(),
                            s["Tag"].values.tolist()))

        # Selecting the payload columns explicitly keeps the grouping column
        # out of the frames handed to `agg_func`.
        self.grouped = (
            self.data.groupby("Sentence #")[["Word", "POS", "Tag"]].apply(agg_func)
        )
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the next sentence, or None when there is none left.

        The sentence id may be an integer (1, 2, ...) or the raw
        "Sentence: N" label, so both spellings are tried. A label-based
        `.loc` lookup is used so an integer is never interpreted as a
        position.
        """
        for key in (self.n_sent, "Sentence: {}".format(self.n_sent)):
            if key in self.grouped.index:
                self.n_sent += 1
                return self.grouped.loc[key]
        self.empty = True
        return None

In [None]:
# Group the token rows into per-sentence lists.
getter = SentenceGetter(data=data)

In [None]:
# Keep only the word from each (word, pos, tag) triple.
sentences = [
    [token for token, _pos, _tag in sent]
    for sent in getter.sentences
]
sentences[0]

In [None]:
# Keep only the tag from each (word, pos, tag) triple.
labels = [
    [tag for _word, _pos, tag in sent]
    for sent in getter.sentences
]
print(labels[0])

In [None]:
# Build the label vocabulary. Sorting makes the tag -> index mapping identical
# on every kernel start (plain `set` iteration order for strings is not stable
# between interpreter runs), so stored indices and checkpoints stay comparable.
tag_values = sorted(set(data["Tag"].values))
# "PAD" is appended last, so it always takes the highest index.
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}

In [None]:
# Display the label vocabulary; a tag's position in this list is its integer id in tag2idx.
tag_values

# Load Model

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:
# Display the installed PyTorch version.
torch.__version__

In [None]:
# Display the installed TensorFlow version.
tf.__version__

In [None]:
# MAX_LEN: sequence length used when padding/truncating; bs: DataLoader batch size.
MAX_LEN, bs = 75, 32

In [None]:
# Have PyTorch use the local GPU when one is available, otherwise fall back to
# the CPU so that `.to(device)` calls keep working on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [None]:
# Load the cased BERT WordPiece tokenizer; lower-casing is disabled.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-cased',
    do_lower_case=False,
)

In [None]:
def tokenize_and_preserve_labels(sentence, text_labels):
    """Split every word into sub-word tokens and repeat its label per token.

    Parameters
    ----------
    sentence : iterable of str
        The words of one sentence.
    text_labels : iterable
        One label per word, aligned with `sentence`.

    Returns
    -------
    (list, list)
        The flat list of sub-word tokens and an equally long list of labels.
        Uses the module-level `tokenizer`.
    """
    pieces, piece_labels = [], []
    for token, tag in zip(sentence, text_labels):
        subwords = tokenizer.tokenize(token)
        pieces += subwords
        # A word split into k sub-words contributes its label k times.
        piece_labels += [tag] * len(subwords)
    return pieces, piece_labels

In [None]:
# Tokenize every sentence together with its word-level labels.
tokenized_texts_and_labels = list(
    map(tokenize_and_preserve_labels, sentences, labels)
)

In [None]:
# Split the (tokens, labels) pairs into two parallel tuples.
# NOTE: this rebinds `labels` from word-level to sub-word-level labels, so the
# tokenization cell above must not be re-run after this one without first
# rebuilding `labels` from `getter.sentences`.
tokenized_texts, labels = zip(*tokenized_texts_and_labels)

In [None]:
# Map tokens to vocabulary ids, then pad/truncate every sequence at the end to MAX_LEN.
token_id_sequences = [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts]
input_ids = pad_sequences(
    token_id_sequences,
    maxlen=MAX_LEN,
    dtype="long",
    value=0.0,
    truncating="post",
    padding="post",
)

In [None]:
# Map labels to integer ids, then pad with the "PAD" id / truncate at the end to MAX_LEN.
tag_id_sequences = [[tag2idx.get(l) for l in lab] for lab in labels]
tags = pad_sequences(
    tag_id_sequences,
    maxlen=MAX_LEN,
    value=tag2idx["PAD"],
    padding="post",
    dtype="long",
    truncating="post",
)

In [None]:
# 1.0 for real tokens, 0.0 for padding (id 0); kept as nested Python lists of floats.
attention_masks = (input_ids != 0.0).astype(float).tolist()

In [None]:
# Split ids, tags and masks in a single call so all three share the same
# shuffled indices (90% train / 10% validation, fixed seed).
(tr_inputs, val_inputs,
 tr_tags, val_tags,
 tr_masks, val_masks) = train_test_split(
    input_ids, tags, attention_masks,
    random_state=2018, test_size=0.1,
)

# Train with PyTorch Model

In [None]:
# Convert every split to an int64 tensor.
tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks = (
    torch.tensor(split).to(torch.int64)
    for split in (tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks)
)

In [None]:
# Each dataset item is an (input_ids, attention_mask, tags) triple.
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
valid_data = TensorDataset(val_inputs, val_masks, val_tags)

# Training batches are drawn in random order, validation batches in dataset order.
train_sampler = RandomSampler(train_data)
valid_sampler = SequentialSampler(valid_data)

train_dataloader = DataLoader(train_data, batch_size=bs, sampler=train_sampler)
valid_dataloader = DataLoader(valid_data, batch_size=bs, sampler=valid_sampler)

In [None]:
import transformers
from transformers import BertForTokenClassification, AdamW

In [None]:
from seqeval.metrics import f1_score

def flat_accuracy(preds, labels, avoid = 17): # 17 is the PAD token
    """Token-level accuracy over all positions whose label is not `avoid`.

    Parameters
    ----------
    preds : numpy.ndarray
        Scores with the class dimension on axis 2; the arg-max over that axis
        is taken as the predicted label id.
    labels : numpy.ndarray
        Gold label ids, with as many elements as `preds` has positions.
    avoid : int, optional
        Label id to exclude from the computation (the padding id).

    Returns
    -------
    float
        Fraction of non-excluded positions predicted correctly, or 0.0 when
        every position is excluded (instead of a 0/0 NaN).
    """
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    # Honour the caller-supplied id rather than a hard-coded constant.
    mask = labels_flat != avoid
    n_kept = int(np.sum(mask))
    if n_kept == 0:
        return 0.0
    return np.sum(pred_flat[mask] == labels_flat[mask]) / n_kept

In [None]:
# Number of passes over the training set and the gradient-norm clipping threshold.
epochs, max_grad_norm = 5, 1.0
# One scheduler step is taken per training batch.
total_steps = epochs * len(train_dataloader)

In [None]:
from transformers import get_linear_schedule_with_warmup

In [None]:
def load_model():
    """Return a fresh `bert-base-cased` token-classification model.

    The classification head has one output per entry in the module-level
    `tag2idx`; attention maps and hidden states are not returned.
    """
    return BertForTokenClassification.from_pretrained(
        "bert-base-cased",
        num_labels=len(tag2idx),
        output_attentions=False,
        output_hidden_states=False,
    )

In [None]:
from collections import defaultdict

In [None]:
def train(model, optimizer, scheduler):
    """Fine-tune `model` for `epochs` epochs and validate after each one.

    Relies on the notebook-level names `device`, `epochs`, `max_grad_norm`,
    `train_dataloader`, `valid_dataloader`, `tag_values` and `tag2idx`.

    Parameters
    ----------
    model : the token-classification model to train (moved to `device`).
    optimizer : optimizer stepped once per training batch.
    scheduler : learning-rate scheduler stepped once per training batch.

    Returns
    -------
    collections.defaultdict
        Lists with one entry per epoch under the keys 'train_loss',
        'eval_loss', 'eval_accuracy' and 'eval_f1'.
    """
    # Same device the batches are sent to below.
    model.to(device)
    pad_idx = tag2idx["PAD"]
    metrics = defaultdict(list)
    for _ in tnrange(epochs, desc="Epoch"):
        # ========================================
        #               Training
        # ========================================
        model.train()
        total_loss = 0

        # Wrapping the dataloader itself (not `enumerate(...)`) lets the
        # progress bar know the number of batches.
        for batch in tqdm_notebook(train_dataloader, desc="Train"):
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            # Clear gradients left over from the previous step.
            model.zero_grad()
            # Because `labels` is passed, the first output is the loss.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
            loss = outputs[0]
            loss.backward()
            total_loss += loss.item()
            # Clip the gradient norm to guard against exploding gradients.
            torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_loss / len(train_dataloader)
        print("Average train loss: {}".format(avg_train_loss))
        metrics['train_loss'].append(avg_train_loss)

        # ========================================
        #               Validation
        # ========================================
        model.eval()
        eval_loss, eval_accuracy, nb_eval_steps = 0, 0, 0
        predictions, true_labels = [], []
        for batch in valid_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            # No gradients are needed for evaluation.
            with torch.no_grad():
                # Labels are passed, so the outputs are (loss, logits, ...).
                outputs = model(b_input_ids, token_type_ids=None,
                                attention_mask=b_input_mask, labels=b_labels)
            logits = outputs[1].detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            eval_loss += outputs[0].mean().item()
            eval_accuracy += flat_accuracy(logits, label_ids, avoid=pad_idx)
            predictions.extend(np.argmax(logits, axis=2).tolist())
            true_labels.extend(label_ids.tolist())
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_steps
        metrics['eval_loss'].append(eval_loss)
        metrics['eval_accuracy'].append(eval_accuracy)
        print("Validation loss: {}".format(eval_loss))
        print("Validation Accuracy: {}".format(eval_accuracy))

        # seqeval expects one list of tag strings per sentence; keeping the
        # sentence boundaries also stops entities from being merged across
        # two adjacent sentences. Padded positions are dropped from both sides.
        pred_tags, valid_tags = [], []
        for pred_row, label_row in zip(predictions, true_labels):
            kept = [(tag_values[p_i], tag_values[l_i])
                    for p_i, l_i in zip(pred_row, label_row)
                    if tag_values[l_i] != "PAD"]
            if kept:
                pred_tags.append([p for p, _ in kept])
                valid_tags.append([l for _, l in kept])

        # seqeval's signature is f1_score(y_true, y_pred); compute it once.
        eval_f1 = f1_score(valid_tags, pred_tags)
        metrics['eval_f1'].append(eval_f1)
        print("Validation F1-Score: {}".format(eval_f1))
        print()
    return metrics

# Select training parameters and Train

In [None]:
# Instantiate a model so its parameters can be listed and grouped in the cells below.
model = load_model()

In [None]:
# List the dotted names of all parameters in the model.
[param_name for param_name, _ in model.named_parameters()]

In [None]:
# Cumulative groups of (name, parameter) pairs, from "classifier head only"
# up to the full model. Each encoder stage extends the previous group.
train_groups = {}
train_groups['classifier'] = list(model.classifier.named_parameters())

# Newer transformers releases build the token-classification model without a
# pooling layer (`model.bert.pooler` is None); in that case this group simply
# equals the classifier group instead of raising AttributeError.
_pooler = getattr(model.bert, 'pooler', None)
_pooler_params = list(_pooler.named_parameters()) if _pooler is not None else []
train_groups['pooler_classifier'] = train_groups['classifier'] + _pooler_params

_previous_group = 'pooler_classifier'
for _group, _layer_ids in [('encoder_11', [11]),
                           ('encoder_9_11', [10, 9]),
                           ('encoder_6_11', [8, 7, 6]),
                           ('encoder_3_11', [5, 4, 3])]:
    _params = list(train_groups[_previous_group])
    for _layer_id in _layer_ids:
        _params += list(model.bert.encoder.layer[_layer_id].named_parameters())
    train_groups[_group] = _params
    _previous_group = _group

train_groups['full'] = list(model.named_parameters())

# Name fragments of parameters excluded from weight decay. Hugging Face names
# layer-norm scales "LayerNorm.weight"; 'gamma'/'beta' are kept only for
# checkpoints that still use the older naming.
no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']

In [None]:
# All (name, parameter) pairs of the current model.
param_optimizer = [*model.named_parameters()]

In [None]:
# Train one fresh model per parameter group and collect its per-epoch metrics.
group_metrics = {}

# `train_groups` holds parameter objects of the model they were built from.
# A new model is loaded for every run, so each group is first translated into
# full dotted parameter names (via the 'full' group, which lists every
# parameter of that same model) and then re-resolved on the fresh model.
# Otherwise the optimizer would update a model that is never trained.
full_name_by_id = {id(p): n for n, p in train_groups['full']}

for group_name, group_params in train_groups.items():
    selected_names = {full_name_by_id[id(p)] for _, p in group_params}

    model = load_model()
    param_optimizer = []
    for name, param in model.named_parameters():
        is_selected = name in selected_names
        # Freeze everything outside the group so that only the selected
        # parameters receive gradients and take part in gradient clipping.
        param.requires_grad_(is_selected)
        if is_selected:
            param_optimizer.append((name, param))

    # AdamW reads the per-group key 'weight_decay'; any other key is ignored
    # and the default of 0.0 would be used.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.00}
    ]
    # NOTE(review): lr=3e-3 is far above the 1e-5..5e-5 range usually used for
    # BERT fine-tuning; kept unchanged here, confirm it is intended.
    optimizer = AdamW(optimizer_grouped_parameters,
    lr=3e-3,
    eps=1e-8
    )
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )
    metrics = train(model, optimizer, scheduler)
    group_metrics[group_name] = metrics

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

# Use plot styling from seaborn.
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curve.
plt.plot(loss_values, 'b-d', label="last layer training loss")
plt.plot(validation_loss_values, 'r-d', label="last layer validation loss")
plt.plot(full_loss_values, 'b-o', label="full training loss")
plt.plot(full_validation_loss_values, 'r-o', label="full validation loss")

# Label the plot.
plt.title("Learning curve")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

plt.show()