In [None]:
!pip install -U pytorch-crf
!pip install -U torchvision
!pip install -U transformers[torch]
!pip install -U scikit-learn
!pip install seqeval
!pip install datasets

In [None]:
import torch
import json
import gc
import time
import pandas as pd
import numpy as np
import torch.nn as nn
import warnings
warnings.filterwarnings("ignore")
from torchcrf import CRF
#from datasets import load_metric, Dataset
from transformers import (
    pipeline,
    AutoConfig,
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    PreTrainedModel,
    PretrainedConfig,
    BertModel,
    BertConfig,
    BertForTokenClassification
)
from datasets import load_dataset, load_metric, Dataset, DatasetDict

In [None]:
# Select the compute device: use the GPU when CUDA is available, otherwise
# fall back to the CPU so that `device` is always defined for the cells below.
if torch.cuda.is_available():
    device = torch.device("cuda")  # Use GPU
    print("GPU is available.")
else:
    device = torch.device("cpu")
    print("GPU is not available; falling back to CPU.")

GPU is available.


In [None]:
# Release cached CUDA blocks and print the current / peak allocated bytes.
# These statistics only exist for CUDA devices, so skip the report on
# machines without a GPU instead of failing.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(torch.cuda.memory_allocated(device))
    print(torch.cuda.max_memory_allocated(device))

0
0


In [None]:
# Checkpoint identifier shared by the tokenizer, the config and the BERT encoder
# built in BERT_CRF.
bert_name = "dslim/bert-large-NER"
# Tokenizer for the checkpoint; encode_examples() calls word_ids() and
# char_to_token() on its output.
tokenizer = AutoTokenizer.from_pretrained(bert_name)
# Checkpoint configuration; its id2label mapping is used later to turn label ids
# into tag strings.
config = AutoConfig.from_pretrained(bert_name)

# READ AND PREPROCESS DATA

In [None]:
# read dataset and separate training set from testing set
def read_dataset(file, encoding="utf-8"):
    """Load the training and testing splits from a JSON file.

    Args:
        file: Path of a JSON document whose top-level object has the keys
            'train' and 'test'.
        encoding: Text encoding used to read the file. Defaults to UTF-8 so
            the result does not depend on the platform's default encoding.

    Returns:
        A ``(train, test)`` tuple holding the values stored under those keys.

    Raises:
        KeyError: If either 'train' or 'test' is missing from the document.
    """
    with open(file, "r", encoding=encoding) as json_file:
        data = json.load(json_file)
    return data['train'], data['test']

# Path of the labelled corpus; read_dataset() expects 'train' and 'test' keys in it.
file = "label_data_train_test.json"
# Both splits are later indexed as item[0] (text) and item[1] (entity annotations).
train, test = read_dataset(file)

In [None]:
def encode_examples(text, entities, max_length=512):
    """Tokenize ``text`` and build per-token label ids plus a first-sub-token mask.

    Uses the notebook-level ``tokenizer``.

    Args:
        text: Raw input string.
        entities: Iterable of ``(entity_type, start_char, end_char)`` triples.
            ``end_char`` is treated as exclusive: the last character of the
            entity is looked up at ``end_char - 1``.
        max_length: Length the token sequence is truncated / padded to
            (default 512, the value that used to be hard-coded).

    Returns:
        A dict with the keys ``input_ids``, ``attention_mask``, ``labels`` and
        ``label_mask``. ``labels`` holds -100 for positions without a word id
        (special tokens and padding), 0 for unlabelled tokens and the B-/I- id
        of the entity otherwise. ``label_mask`` is True only on the first
        sub-token of each word.
    """
    # (B- id, I- id) per supported entity type; anything else is labelled 0.
    # 'PER' is accepted as an alias of 'PERSON' because the custom-metrics
    # cell of this notebook scores the person type under the name 'PER'.
    # NOTE(review): the ids are assumed to match config.id2label of the
    # checkpoint (3/4 for the person tags, 5/6 for the organisation tags) — confirm.
    label_ids_by_type = {
        'PERSON': (3, 4),
        'PER': (3, 4),
        'ORG': (5, 6),
    }

    tokens = tokenizer(text, truncation=True, padding='max_length', max_length=max_length)
    word_ids = tokens.word_ids()

    # Everything starts as "ignored"; real tokens are filled in below.
    labels = [-100] * len(tokens['input_ids'])

    # True only on the first sub-token of every word.
    labels_mask = []
    previous_word_idx = None
    for word_idx in word_ids:
        labels_mask.append(word_idx is not None and word_idx != previous_word_idx)
        previous_word_idx = word_idx

    for entity_type, start_char, end_char in entities:
        start_token = tokens.char_to_token(start_char)
        if start_token is None:
            continue  # The entity start does not map to any token.

        # Look up the entity's last character; if that character maps to no
        # token, walk left (at most back to start_char) until one does.
        last_char = end_char - 1
        end_token = tokens.char_to_token(last_char)
        while end_token is None and last_char > start_char:
            last_char -= 1
            end_token = tokens.char_to_token(last_char)
        if end_token is None:
            continue  # Skip entities that could not be mapped to tokens.

        b_label_id, i_label_id = label_ids_by_type.get(entity_type, (0, 0))

        # First token gets the B- id, the remaining tokens of the span the I- id.
        labels[start_token] = b_label_id
        for i in range(start_token + 1, end_token + 1):
            labels[i] = i_label_id

    # Every position that belongs to a word and is still unlabelled becomes 0.
    # Using word ids here (the same source as the mask) guarantees that every
    # position with label_mask == True carries a real label, never -100.
    for i, word_idx in enumerate(word_ids):
        if word_idx is not None and labels[i] == -100:
            labels[i] = 0

    return {"input_ids": tokens['input_ids'], "attention_mask": tokens['attention_mask'], "labels": labels, 'label_mask': labels_mask}

In [None]:
from torch.utils.data import Dataset, DataLoader
class NERDataset(Dataset):
    """Map-style dataset that pre-encodes every (text, entities) pair once.

    Each item is the dict returned by ``encode_examples`` for the text and
    entity list at the same index.
    """

    def __init__(self, texts, entities):
        """Encode all examples eagerly.

        Args:
            texts: Sequence of raw text strings.
            entities: Sequence of entity annotations, aligned with ``texts``.

        Raises:
            ValueError: If ``texts`` and ``entities`` differ in length; zipping
                them would otherwise silently drop the unmatched tail.
        """
        if len(texts) != len(entities):
            raise ValueError(
                f"texts and entities must have the same length "
                f"(got {len(texts)} and {len(entities)})"
            )
        self.texts = texts
        self.entities = entities
        self.encodings = [encode_examples(text, entity) for text, entity in zip(texts, entities)]

    def __getitem__(self, idx):
        """Return the pre-computed encoding at position ``idx``."""
        return self.encodings[idx]

    def __len__(self):
        """Number of encoded examples (always consistent with __getitem__)."""
        return len(self.encodings)

In [None]:
%%time
# Create an instance of the dataset
# Every element of `train` / `test` is indexed as item[0] (the text) and
# item[1] (its entity annotations); the two parallel lists feed NERDataset,
# which encodes all examples up front.
all_train_texts = [item[0] for item in train]
all_train_entities = [item[1] for item in train]
train_dataset = NERDataset(all_train_texts, all_train_entities)
all_test_texts = [item[0] for item in test]
all_test_entities = [item[1] for item in test]
test_dataset = NERDataset(all_test_texts, all_test_entities)

CPU times: user 951 ms, sys: 13.7 ms, total: 965 ms
Wall time: 969 ms


In [None]:
class BERT_CRF(nn.Module):
    """BERT encoder followed by a linear emission layer and a CRF.

    Only the positions selected by ``labels_mask`` are handed to the CRF, one
    sequence at a time, so sequences with a different number of selected
    positions can share a batch.
    """

    def __init__(self, dropout):
        """Build the encoder, dropout, emission layer and CRF.

        Args:
            dropout: Dropout probability applied to the encoder output before
                the linear layer.
        """
        super().__init__()
        bert_config = BertConfig.from_pretrained(bert_name)
        # forward() only reads 'last_hidden_state', so the encoder is not asked
        # to return attention maps or the hidden states of every layer.
        num_labels = bert_config.num_labels

        self.bert = BertModel.from_pretrained(bert_name, config=bert_config)

        self.dropout = nn.Dropout(p=dropout)

        self.linear = nn.Linear(self.bert.config.hidden_size, num_labels)

        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels_mask, labels = None):
        """Compute the training loss or decode tag sequences.

        Args:
            input_ids: Token ids, batch dimension first.
            attention_mask: Attention mask matching ``input_ids``.
            labels_mask: Mask matching ``input_ids``; truthy entries select the
                positions passed to the CRF. It is converted to bool so that
                a 0/1 integer mask is not misread as integer indexing.
            labels: Optional label ids matching ``input_ids``.

        Returns:
            With ``labels``: a scalar tensor, the sum of the per-sequence
            negative CRF scores (``reduction='token_mean'``) divided by the
            batch size. Sequences with no selected position contribute
            nothing; if no sequence contributes, a zero tensor that still
            supports ``backward()`` is returned.
            Without ``labels``: a list with one decoded tag-id list per
            sequence (empty for sequences with no selected position).
        """
        last_hidden_layer = self.bert(input_ids=input_ids, attention_mask=attention_mask)[
            'last_hidden_state']

        last_hidden_layer = self.dropout(last_hidden_layer)

        logits = self.linear(last_hidden_layer)

        batch_size = logits.shape[0]

        # Boolean indexing is required below; an integer mask would index rows.
        labels_mask = labels_mask.bool()

        if labels is not None:
            loss = None

            for seq_logits, seq_labels, seq_mask in zip(logits, labels, labels_mask):
                # Keep only the selected positions and re-add a batch
                # dimension of 1 for the CRF.
                seq_logits = seq_logits[seq_mask].unsqueeze(0)
                seq_labels = seq_labels[seq_mask].unsqueeze(0)

                if seq_logits.numel() == 0:
                    continue

                seq_loss = -self.crf(seq_logits, seq_labels,
                                     reduction='token_mean')
                loss = seq_loss if loss is None else loss + seq_loss

            if loss is None:
                # Nothing was selected in the whole batch: return a zero that
                # is still attached to the graph so callers can call
                # .item() / .backward() as usual.
                loss = logits.sum() * 0.0

            return loss / batch_size

        output_tags = []
        for seq_logits, seq_mask in zip(logits, labels_mask):
            seq_logits = seq_logits[seq_mask].unsqueeze(0)
            if seq_logits.numel() != 0:
                tags = self.crf.decode(seq_logits)
            else:
                tags = [[]]

            # Unpack the single-sequence "batch" result.
            output_tags.append(tags[0])

        return output_tags

In [None]:
# Mini-batch size shared by the training and the evaluation loader.
batch_size = 10
# Reshuffle the training examples at every epoch so that the batches are not
# always built from the same neighbouring examples in file order.
batch_train_dataset = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation keeps the original order.
batch_test_dataset = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
%%time
from sklearn.metrics import accuracy_score

if __name__ == "__main__":

    def _batch_first(batch, key):
        # batch[key] is a sequence of tensors; stack them and transpose so that
        # the batch dimension comes first, then move the result to the device.
        return torch.stack(batch[key], dim=0).t().to(device)

    # Model under training: BERT-CRF with the chosen dropout probability.
    dropout = 0.5
    model = BERT_CRF(dropout).to(device)

    # Optimiser and schedule length.
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    num_epochs = 8  # Adjust the number of training epochs as needed
    model.train()

    for epoch in range(num_epochs):
        print(f'epoch {epoch}')
        # Running sum of the per-batch losses of this epoch.
        loss_num = 0
        for batch in batch_train_dataset:
            optimizer.zero_grad()

            input_ids = _batch_first(batch, "input_ids")
            attention_mask = _batch_first(batch, "attention_mask")
            labels = _batch_first(batch, "labels")
            labels_mask = _batch_first(batch, "label_mask")

            # With labels supplied the model returns the loss directly.
            loss = model(input_ids, attention_mask, labels_mask, labels)
            loss_num += loss.item()

            loss.backward()
            optimizer.step()
        print('loss:',loss_num)

epoch 0
loss: 9.859933614730835
epoch 1
loss: 2.964125607162714
epoch 2
loss: 2.063571787904948
epoch 3
loss: 1.7061817408539355
epoch 4
loss: 1.4212387232109904
epoch 5
loss: 1.100999459857121
epoch 6
loss: 0.921869860845618
epoch 7
loss: 0.8531488277949393
CPU times: user 11min 9s, sys: 18.3 s, total: 11min 27s
Wall time: 11min 31s


In [None]:
%%time
def evaluate_model(model, eval_dataset, device):
    """Run the model over an evaluation loader and collect labels.

    Args:
        model: Callable as ``model(input_ids, attention_mask, labels_mask)``
            returning one predicted tag-id list per sequence; ``model.eval()``
            is called first.
        eval_dataset: Iterable of batches; each batch maps "input_ids",
            "attention_mask", "labels" and "label_mask" to a sequence of
            tensors that is stacked and transposed to batch-first layout.
        device: Device the batch tensors are moved to.

    Returns:
        ``(true_labels, predicted_labels)``: two lists with one entry per
        sequence. ``true_labels`` holds the gold label ids at the positions
        selected by the label mask; ``predicted_labels`` holds the model's
        outputs unchanged.
    """
    def _batch_first(batch, key):
        # Stack the per-position tensors and put the batch dimension first.
        return torch.stack(batch[key], dim=0).t().to(device)

    model.eval()
    true_labels = []  # Ground truth labels
    predicted_labels = []  # Predicted labels

    with torch.no_grad():
        for batch in eval_dataset:
            input_ids = _batch_first(batch, "input_ids")
            attention_mask = _batch_first(batch, "attention_mask")
            labels_mask = _batch_first(batch, "label_mask")

            # Make predictions
            predicted_tags = model(input_ids, attention_mask, labels_mask)

            labels = _batch_first(batch, "labels")

            # Index the existing tensors directly; wrapping them in
            # torch.tensor() again would copy every row (and emit a warning).
            true_labels.extend(
                label[mask.bool()].tolist() for label, mask in zip(labels, labels_mask)
            )
            predicted_labels.extend(predicted_tags)
    return (true_labels, predicted_labels)


# Evaluate the trained model on the test loader; both results are lists of
# per-sequence label-id lists.
true_labels, predicted_labels = evaluate_model(model, batch_test_dataset, device)

CPU times: user 2.89 s, sys: 1.98 ms, total: 2.89 s
Wall time: 2.91 s


# **Seqeval Overall Comparison Report on Test Dataset**

In [None]:
# Translate label ids into tag strings with the checkpoint's id2label table,
# keeping the per-sequence nesting of both lists.
predicted_ner = [
    [config.id2label[label_id] for label_id in sequence]
    for sequence in predicted_labels
]
true_ner = [
    [config.id2label[label_id] for label_id in sequence]
    for sequence in true_labels
]

In [None]:
# Load the seqeval metric and score the predicted tag sequences against the references.
# NOTE(review): `load_metric` is understood to be deprecated in the `datasets`
# library in favour of the separate `evaluate` package — confirm that the
# installed version still provides it.
metric = load_metric("seqeval")
results = metric.compute(predictions=predicted_ner, references=true_ner)
# Print only the overall_* entries of the result.
print({"precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"]})

{'precision': 0.7101167315175098, 'recall': 0.7832618025751072, 'f1': 0.7448979591836734, 'accuracy': 0.9768120393120393}


# **Seqeval Classification Report on Test Dataset**

In [None]:
from seqeval.metrics import classification_report
# Build and print seqeval's classification report (references first, then predictions).
results_classification = classification_report(true_ner, predicted_ner)
print(results_classification)

# **FINE TUNING: HYPERPARAMETER GRID SEARCH**

In [None]:
# Candidate values for the grid search below; every combination is trained and evaluated.
learning_rates = [1e-6, 1e-5, 1e-4]
batch_sizes = [6, 10, 16, 32, 64]
dropouts = [0.1, 0.2, 0.3, 0.4, 0.5]
# NOTE: `num_epochs` is a list here, whereas the training cell above binds the
# same name to a single int.
num_epochs = [2, 4, 6, 8, 10]

In [None]:
%%time
# Exhaustive grid search: for every (learning rate, batch size, dropout,
# number of epochs) combination a fresh BERT-CRF model is trained, evaluated
# on the test loader, and its scores are printed and appended to a text file.
# NOTE(review): hyperparameters are selected on the test split, so the best
# score reported here is an optimistic estimate.
trial = 0

best_f1 = 0
best_hyperparameters = None

# Opened in append mode so results of earlier runs are kept. The handle has
# its own name so it does not overwrite the notebook-level `file` path variable.
with open("grid_search_results_2.txt", "a") as results_file:
    for learning_rate in learning_rates:
        for batch_size in batch_sizes:
            # Training batches are reshuffled every epoch; evaluation order is fixed.
            batch_train_dataset = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            batch_test_dataset = DataLoader(test_dataset, batch_size=batch_size)
            for dropout in dropouts:
                for n_epoch in num_epochs:
                    model = BERT_CRF(dropout).to(device)
                    # Training loop
                    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
                    model.train()
                    for epoch in range(n_epoch):
                        print(f'n_epoch {n_epoch}, learning_rate {learning_rate}, batch_size {batch_size}, dropout {dropout}, epoch {epoch}')
                        for batch in batch_train_dataset:
                            optimizer.zero_grad()
                            # change the shape and make the batch size as the first dimension
                            input_ids = torch.stack(batch["input_ids"], dim=0).t().to(device)
                            attention_mask = torch.stack(batch["attention_mask"], dim=0).t().to(device)
                            labels = torch.stack(batch["labels"], dim=0).t().to(device)
                            labels_mask = torch.stack(batch["label_mask"], dim=0).t().to(device)
                            loss = model(input_ids, attention_mask, labels_mask, labels)
                            loss.backward()
                            optimizer.step()
                        # Loss of the last batch of the epoch.
                        print('loss:',loss.item())

                    # Evaluate the model
                    true_labels, predicted_labels = evaluate_model(model, batch_test_dataset, device)
                    predicted_ner = [[config.id2label[label] for label in sample] for sample in predicted_labels]
                    true_ner = [[config.id2label[label] for label in sample] for sample in true_labels]
                    eval_results = metric.compute(predictions=predicted_ner, references=true_ner)
                    results_classification = classification_report(true_ner, predicted_ner)

                    # Check if current F1 is the best
                    if eval_results['overall_f1'] > best_f1:
                        best_f1 = eval_results['overall_f1']
                        best_hyperparameters = {'learning_rate': learning_rate, 'batch size': batch_size, 'dropout': dropout, 'number of epochs': n_epoch}

                    # print results
                    print(f'Trial {trial + 1}')
                    print(f'learning rate: {learning_rate}, batch size: {batch_size}, dropout: {dropout}, number of epochs: {n_epoch}')
                    print(f"F1 Score: {eval_results['overall_f1']}")
                    print(f"precision: {eval_results['overall_precision']}")
                    print(f"recall: {eval_results['overall_recall']}")
                    print(results_classification)

                    # Write results to the file
                    results_file.write(f'Trial {trial + 1}\n')
                    results_file.write(f'learning rate: {learning_rate}, batch size: {batch_size}, dropout: {dropout}, number of epochs: {n_epoch}\n')
                    results_file.write(f"F1 Score: {eval_results['overall_f1']}\n")
                    results_file.write(f"precision: {eval_results['overall_precision']}\n")
                    results_file.write(f"recall: {eval_results['overall_recall']}\n")
                    results_file.write('-' * 80 + '\n')
                    results_file.flush()

                    trial += 1

                    # Free this trial's GPU memory before the next model is built.
                    # The optimizer (parameter references and its state) and the
                    # last loss (autograd graph) keep tensors alive just like the
                    # model does, so drop all three, collect garbage first, and
                    # only then ask the CUDA allocator to release its cache.
                    del model, optimizer, loss
                    gc.collect()
                    torch.cuda.empty_cache()

    # Write the best results to the file
    results_file.write('-' * 80 + '\n')
    results_file.write(f"Best F1 Score: {best_f1}\n")
    results_file.write(f"Best Hyperparameters: {best_hyperparameters}\n")

# **Custom Metrics**

In [None]:
def reformat_predictions(predictions, text):
    """Merge token-level predictions into entity spans.

    Args:
        predictions: Iterable of dicts with the keys 'entity' (a tag such as
            'B-XXX' or 'I-XXX'), 'word', 'start' and 'end' (character offsets
            into ``text``). The dicts are only read, never modified.
        text: The string the offsets refer to.

    Returns:
        A list of dicts with the keys 'entity_type', 'word', 'start' and
        'end', one per merged entity, in order of appearance. 'word' is the
        slice ``text[start:end]``.

    Notes:
        * A 'B-' tag on a word piece starting with '##' is treated as 'I-' of
          the same type, since a continuation piece cannot open an entity.
        * An 'I-' tag extends the current entity only if the types match; any
          other 'I-' tag is ignored.
        * NOTE(review): no adjacency check is made, so a matching 'I-' tag
          extends the current entity even if other text lies in between.
    """
    reformatted_predictions = []
    current_type = None
    current_start = None
    current_end = None
    current_entity = None

    for entity in predictions:
        # Work on a local copy of the tag so the caller's dict stays untouched.
        tag = entity['entity']
        if entity['word'].startswith('##') and tag.startswith('B-'):
            tag = 'I-' + tag[2:]

        if tag.startswith('B-'):
            # A new entity begins: emit the one collected so far, if any.
            if current_entity:
                reformatted_predictions.append({
                    'entity_type': current_type,
                    'word': current_entity,
                    'start': current_start,
                    'end': current_end
                })
            current_type = tag[2:]
            current_start = entity['start']
            current_end = entity['end']
            current_entity = text[current_start:current_end]
        elif tag.startswith('I-') and current_type == tag[2:]:
            # Continuation of the current entity: extend its end offset.
            current_end = entity['end']
            current_entity = text[current_start:current_end]

    # Emit the entity still being collected when the input ends.
    if current_entity:
        reformatted_predictions.append({
            'entity_type': current_type,
            'word': current_entity,
            'start': current_start,
            'end': current_end
        })

    return reformatted_predictions

In [None]:
# For every test example, run the token-level predictor, merge its output
# into entity spans and keep (surface text, entity type) pairs.
# NOTE(review): `nlp` is not defined in any cell visible here — presumably a
# token-classification pipeline created elsewhere; confirm before running.
all_entities = []

for text, entities in test:
    pred = reformat_predictions(nlp(text), text)
    all_entities.append([(item['word'], item['entity_type']) for item in pred])

In [None]:
def compute_confusion_matrix(pred, truth):
    """Count true positives, false positives and false negatives.

    Both arguments are reduced to sets first, so duplicates count once.
    Returns the tuple ``(TP, FP, FN)``.
    """
    gold = set(truth)
    guessed = set(pred)

    matched = gold & guessed
    spurious = guessed.difference(gold)
    missed = gold.difference(guessed)
    return len(matched), len(spurious), len(missed)

def compute_metrics_post_training(TP, FP, FN):
    """Derive precision, recall and F1 from confusion counts.

    Each score falls back to 0 when its denominator is not positive.
    Returns the tuple ``(precision, recall, f1)``.
    """
    predicted_total = TP + FP
    actual_total = TP + FN

    precision = 0
    if predicted_total > 0:
        precision = TP / predicted_total

    recall = 0
    if actual_total > 0:
        recall = TP / actual_total

    f1 = 0
    if precision + recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)

    return precision, recall, f1

# The tags scored below are 'PER' and 'ORG', while encode_examples() maps the
# person type under the name 'PERSON'. Normalise the truth labels so a person
# annotation is counted under 'PER' whichever spelling the data uses.
TRUTH_LABEL_ALIASES = {'PERSON': 'PER'}

# Running confusion counts per tag plus their sum over both tags.
metrics = {
    'PER': {'TP': 0, 'FP': 0, 'FN': 0},
    'ORG': {'TP': 0, 'FP': 0, 'FN': 0},
    'TOTAL': {'TP': 0, 'FP': 0, 'FN': 0}
}

for i, (text, entities_truth) in enumerate(test):
    entities_pred = all_entities[i]

    for tag in ['PER', 'ORG']:
        # Entities are compared as lower-cased surface strings, per document
        # and per tag; sets mean a repeated mention counts once.
        pred_set = set()
        truth_set = set()
        for entity, label in entities_pred:
            entity = entity.lower()
            if label == tag:
                pred_set.add(entity)
        for entity in entities_truth:
            label, start, end = entity
            label = TRUTH_LABEL_ALIASES.get(label, label)
            if label == tag:
                truth_set.add(text[start:end].lower())

        TP, FP, FN = compute_confusion_matrix(pred_set, truth_set)

        metrics[tag]['TP'] += TP
        metrics[tag]['FP'] += FP
        metrics[tag]['FN'] += FN

        metrics['TOTAL']['TP'] += TP
        metrics['TOTAL']['FP'] += FP
        metrics['TOTAL']['FN'] += FN

# Report precision / recall / F1 for each tag and for the total.
for tag, values in metrics.items():
    precision, recall, f1 = compute_metrics_post_training(values['TP'], values['FP'], values['FN'])
    print(f"{tag} -- Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")