**Load Data**

In [None]:
!pip install transformers
from transformers import pipeline
import json
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, BertTokenizerFast, BertForTokenClassification

**Read File**

In [None]:
def read_dataset(file):
  """Load the train and test splits from a JSON dataset file.

  Args:
    file: Path to a JSON file whose top-level object has 'train' and
      'test' keys.

  Returns:
    A (train, test) tuple holding the values stored under those two keys.

  Raises:
    KeyError: If either the 'train' or the 'test' key is missing.
  """
  # JSON text is UTF-8 by specification; relying on the platform default
  # encoding can mis-decode non-ASCII characters on some systems.
  with open(file, "r", encoding="utf-8") as json_file:
    data = json.load(json_file)
  # The file is already closed here; only the parsed object is needed.
  return data['train'], data['test']

# Dataset path, resolved relative to the notebook's working directory.
# The JSON object must provide the 'train' and 'test' keys read by read_dataset.
file = "label_data_train_test.json"
# Later cells unpack every entry of `train` as a (text, labels) pair.
train, test = read_dataset(file)

**Build Model**

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Single source of truth for the checkpoint, so the tokenizer and the model
# can never be loaded from two different names by accident.
NER_CHECKPOINT = "dslim/bert-base-NER"

tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT)
# Later cells read the 'entity', 'word', 'start' and 'end' keys of every item
# this pipeline returns.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

In [None]:
# Run the NER pipeline on every training text and keep only the tokens tagged
# as organisations or persons (the substring test matches both B- and I- tags).
ner_results = []
for text, labels in train:
  res = nlp(text)
  # Do not call this list `filter`: that would shadow the builtin for every
  # later cell of the notebook.
  org_per_tokens = [
      item for item in res
      if any(tag in item.get('entity', '') for tag in ('ORG', 'PER'))
  ]
  ner_results.append(org_per_tokens)

**Post-Processing: Align prediction words and Labels**

In [None]:
def align_prediction(nonsplit_ner_results, merge_subwords=False):
    """Group token-level NER predictions into (word, entity) spans.

    Args:
        nonsplit_ner_results: Iterable of token dicts with at least a 'word'
            key (the token text, '##'-prefixed for continuation pieces) and
            an 'entity' key (a tag such as 'B-PER' or 'I-ORG').
        merge_subwords: When False (default), every 'B-' tag opens a new
            span, even on a '##' continuation piece. When True, a '##' piece
            is always glued to the currently open span, whatever its tag.

    Returns:
        A list of {'word': str, 'entity': str} dicts in input order, where
        'entity' is the tag without its 'B-'/'I-' prefix.
    """
    def _join_pieces(pieces):
        # Space-join the tokens, then glue '##' continuation pieces onto the
        # preceding token; a leading '##' (span opened on a piece) is dropped.
        return ' '.join(pieces).replace(' ##', '').replace('##', '')

    aligned_predictions = []
    current_word = []
    current_entity = None
    for token in nonsplit_ner_results:
        word = token['word']
        entity = token['entity']
        entity_type = entity[2:] if entity[:2] in ('B-', 'I-') else entity

        glue_to_open_span = bool(
            merge_subwords and current_word and word.startswith('##')
        )
        # A span starts on an explicit B- tag, on the very first token (so a
        # leading I- token gets its own type instead of None), or when the
        # entity type changes without a B- tag.
        starts_new_span = not glue_to_open_span and (
            entity.startswith('B-')
            or not current_word
            or entity_type != current_entity
        )
        if starts_new_span:
            if current_word:
                aligned_predictions.append(
                    {'word': _join_pieces(current_word), 'entity': current_entity}
                )
            current_word = []
            current_entity = entity_type
        current_word.append(word)

    # Flush the span that is still open when the input ends.
    if current_word:
        aligned_predictions.append(
            {'word': _join_pieces(current_word), 'entity': current_entity}
        )
    return aligned_predictions
# Align the raw token predictions of every document, keeping document order.
predicted_labels = [align_prediction(doc_tokens) for doc_tokens in ner_results]


Example illustration: if the model splits one word into several sub-word pieces and tags them as separate entities, we do not merge the pieces back into the original word.


*   Below, 'An', 'ubha' and 'v Poddar' are treated as three separate words,
*   while in the ground truth they form a single name ('Anubhav Poddar').



In [None]:
# Show the raw person tokens of the document at index 1: tag, text and offsets.
for item in ner_results[1]:
  if 'PER' not in item['entity']:
    continue
  print([item[key] for key in ('entity', 'word', 'start', 'end')])

['B-PER', 'An', 434, 436]
['B-PER', '##ub', 436, 438]
['I-PER', '##ha', 438, 440]
['B-PER', '##v', 440, 441]
['I-PER', 'Po', 442, 444]
['I-PER', '##dd', 444, 446]
['I-PER', '##ar', 446, 448]
['B-PER', 'Ash', 450, 453]
['B-PER', '##ish', 453, 456]
['I-PER', 'Singh', 457, 462]
['I-PER', '##ania', 462, 466]


In [None]:
# Display the aligned word/entity predictions for the document at index 1.
predicted_labels[1]

[{'word': 'ETRON', 'entity': 'ORG'},
 {'word': 'Tetron Commercial Ltd & Services', 'entity': 'ORG'},
 {'word': 'Tetron Commercial Ltd', 'entity': 'ORG'},
 {'word': 'Tetron Commercial Ltd', 'entity': 'ORG'},
 {'word': 'An', 'entity': 'PER'},
 {'word': 'ubha', 'entity': 'PER'},
 {'word': 'v Poddar', 'entity': 'PER'},
 {'word': 'Ash', 'entity': 'PER'},
 {'word': 'ish Singhania', 'entity': 'PER'},
 {'word': 'Tetron Commercial Ltd', 'entity': 'ORG'},
 {'word': 'Tetron Commercial Ltd', 'entity': 'ORG'},
 {'word': 'Tetron Commercial Ltd', 'entity': 'ORG'}]

In [None]:
# The annotations tag people as 'PERSON' while the NER model emits 'PER'.
# Without this mapping no person prediction can ever equal its ground-truth
# pair, so every one of them is scored as a false positive.
GROUND_TRUTH_TAG_ALIASES = {'PERSON': 'PER'}

# Build, per document, the list of [surface text, tag] ground-truth pairs.
true_labels = []
for text, labels in train:
    result = []
    for ner, start, end in labels:
        # start/end are character offsets into the document text.
        word = text[start:end]
        result.append([word, GROUND_TRUTH_TAG_ALIASES.get(ner, ner)])

    true_labels.append(result)
true_labels[1]

[['TETRON COMMERCIAL LTD', 'ORG'],
 ['Tetron Commercial Ltd', 'ORG'],
 ['Tetron Commercial Ltd', 'ORG'],
 ['Tetron Commercial Ltd', 'ORG'],
 ['Anubhav Poddar', 'PERSON'],
 ['Ashish Singhania', 'PERSON'],
 ['Tetron Commercial Ltd', 'ORG'],
 ['Tetron Commercial Ltd', 'ORG'],
 ['Tetron Commercial Ltd', 'ORG']]

**Calculate Metrics**

A prediction counts as a true positive only when its (word, label) pair is exactly the same as a ground-truth pair of the same document.

In [None]:
def calculate_metrics(predicted_labels, ground_truth_labels, num):
    """Compute micro-averaged precision, recall and F1 over exact pair matches.

    Args:
        predicted_labels: Per-document lists of dicts with 'word' and
            'entity' keys.
        ground_truth_labels: Per-document lists of [word, entity] pairs.
        num: Number of leading documents to evaluate.

    Returns:
        A (precision, recall, f1_score) tuple of floats. A score whose
        denominator is zero is reported as 0.0.
    """
    from collections import Counter

    true_positives, false_positives, false_negatives = 0, 0, 0
    for i in range(num):
        predicted_counts = Counter(
            (label['word'], label['entity']) for label in predicted_labels[i]
        )
        ground_truth_counts = Counter(
            (label[0], label[1]) for label in ground_truth_labels[i]
        )

        # Multiset intersection: each ground-truth pair can be matched by at
        # most one prediction, so repeated identical predictions cannot push
        # the false-negative count below zero or recall above 1.
        matched = sum((predicted_counts & ground_truth_counts).values())

        true_positives += matched
        false_positives += sum(predicted_counts.values()) - matched
        false_negatives += sum(ground_truth_counts.values()) - matched

    # Guard every denominator so empty or fully wrong predictions give 0.0
    # instead of raising ZeroDivisionError.
    predicted_total = true_positives + false_positives
    ground_truth_total = true_positives + false_negatives
    precision = true_positives / predicted_total if predicted_total else 0.0
    recall = true_positives / ground_truth_total if ground_truth_total else 0.0
    if precision + recall:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    return precision, recall, f1_score

# Score the aligned predictions against the ground truth of every training document
precision, recall, f1_score = calculate_metrics(
    predicted_labels=predicted_labels,
    ground_truth_labels=true_labels,
    num=len(train),
)

# Report the three scores, one per line
for metric_name, metric_value in (
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1_score),
):
    print(metric_name, metric_value)

Precision: 0.23613690634120457
Recall: 0.1319887290523506
F1 Score: 0.16933028919330292
