# **Named Entity Recognition Fine Tuning with BERT**

In [None]:
# Install required libraries
!pip install datasets
!pip install transformers
!pip install seqeval
!pip install torchvision

In [None]:
# Import necessary libraries
import ast
import itertools
import os
import re

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from datasets import load_metric
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification

In [None]:
# Create necessary functions
def get_all_tokens_and_ner_tags(directory):
    """Load every annotation file found in ``directory`` into one DataFrame.

    Each directory entry is parsed with ``get_tokens_and_ner_tags`` and the
    per-file frames are concatenated under a fresh 0..n-1 index.

    Args:
        directory: Path of the folder that holds the tab-separated files.

    Returns:
        pandas.DataFrame with the columns ``tokens`` and ``ner_tags``; it is
        empty when the directory has no entries.
    """
    # os.listdir returns entries in arbitrary order; sort for a reproducible row order.
    frames = [
        get_tokens_and_ner_tags(os.path.join(directory, filename))
        for filename in sorted(os.listdir(directory))
    ]
    if not frames:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame({'tokens': [], 'ner_tags': []})
    return pd.concat(frames).reset_index(drop=True)
    
def get_tokens_and_ner_tags(filename):
    """Parse one ``token<TAB>tag`` file into a DataFrame of sentences.

    Every non-blank line contributes one token (first field) and one tag
    (second field). Lines that are empty or whitespace-only (for example a
    lone tab) separate sentences.

    Args:
        filename: Path of the UTF-8 encoded annotation file.

    Returns:
        pandas.DataFrame with one row per sentence and the columns ``tokens``
        and ``ner_tags``, each holding a list of strings.

    Raises:
        IndexError: If a non-blank line contains no tab-separated tag.
    """
    with open(filename, 'r', encoding="utf8") as f:
        # Strip only the line terminator so the tag stays intact even when the
        # last line of the file has no trailing newline.
        lines = [line.rstrip('\n') for line in f]
    # Runs of consecutive non-blank lines form one sentence each.
    split_list = [
        list(group)
        for is_blank, group in itertools.groupby(lines, lambda line: not line.strip())
        if not is_blank
    ]
    tokens = [[line.split('\t')[0] for line in sentence] for sentence in split_list]
    entities = [[line.split('\t')[1] for line in sentence] for sentence in split_list]
    return pd.DataFrame({'tokens': tokens, 'ner_tags': entities})
  
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align the word-level tags to word pieces.

    Relies on the notebook globals ``tokenizer``, ``task`` and
    ``label_encoding_dict``.

    Args:
        examples: Batch with a "tokens" entry (one word list per sentence) and
            a ``f"{task}_tags"`` entry (one tag list per sentence).

    Returns:
        The tokenizer output with an added "labels" entry holding one id list
        per sentence. Positions without a source word get -100 (the value
        ``compute_metrics`` filters out); because ``label_all_tokens`` is True,
        every piece of a word receives the word's label id.
    """
    label_all_tokens = True
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

    # Id for the literal '0' (digit zero) tag, presumably a mistyped 'O'.
    # It must resolve to the 'O' class rather than to whatever label happens
    # to sit at index 0; 0 is kept only as fallback when no 'O' label exists.
    outside_id = label_encoding_dict.get('O', 0)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # No source word for this position.
                label_ids.append(-100)
            elif label[word_idx] == '0':
                label_ids.append(outside_id)
            elif word_idx != previous_word_idx:
                # First piece of a word always carries the word's label.
                label_ids.append(label_encoding_dict[label[word_idx]])
            else:
                # Later pieces of the same word.
                label_ids.append(label_encoding_dict[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

def compute_metrics(p):
    """Turn raw model outputs into overall precision, recall, F1 and accuracy.

    Relies on the notebook globals ``label_list`` and ``metric``.

    Args:
        p: Pair ``(predictions, labels)``; the class id is taken as the argmax
            over axis 2 of ``predictions``.

    Returns:
        dict with the keys "precision", "recall", "f1" and "accuracy", read
        from the ``overall_*`` entries of ``metric.compute``.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Positions whose gold id is -100 are left out of the evaluation.
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

### **DATA PREPARATION**
Convert the text data to BERT trainable data format

In [None]:
# Read the whole training file (UTF-8) into memory as one string
with open('train.txt', encoding="utf-8") as f:
    content = f.read()

In the text file, the chunks of word lines are separated from one another by one of the following delimiters.

---


["\n\n", "\n\t\n"]

In [None]:
# Rewrite the "\n\n" delimiter as "\n\t\n" so that only one delimiter remains
content = content.replace("\n\n", "\n\t\n")

# Split the text on that common delimiter ("\n\t\n") and keep the chunks in a list
content_all = content.split("\n\t\n")

Minor formatting of the list.

In [None]:
# Drop the chunks that are empty or consist of a lone newline.
# Slice assignment keeps the same list object, as the in-place removal did.
content_all[:] = [chunk for chunk in content_all if chunk not in ("", "\n")]

In [None]:
# Print the number of chunks currently held in content_all
print(len(content_all))

3394


**Data Conversion**
---
Create a DataFrame and store the "doc_id", "tokens", "ner_tags" as columns

In [None]:
# Build one record per chunk: a running "doc_id" plus its tokens and tags.
records = []
for doc_id, sentence_block in enumerate(content_all):
    tokens = []
    ner_tags = []
    for line in sentence_block.split("\n"):
        if not line.strip():
            # A blank line (e.g. left by a trailing newline) carries no
            # token/tag pair; skip it instead of failing on the missing tag.
            continue
        split_word = line.split("\t")
        tokens.append(split_word[0])
        ner_tags.append(split_word[1])
    if not tokens:
        # Nothing usable in this chunk.
        continue
    # The lists are stored in their string form here; the next cell parses
    # them back into lists.
    records.append({"doc_id": doc_id, "tokens": str(tokens), "ner_tags": str(ner_tags)})

# Create the frame in one go instead of growing it cell by cell with .loc,
# which re-allocates the frame on every assignment.
content_all_df = pd.DataFrame(records, columns=["doc_id", "tokens", "ner_tags"])

The DataFrame stores "ner_tags" and "tokens" as strings (str), so convert them back to lists by evaluating their string representation.

In [None]:
def _to_list(value):
    """Parse a stringified list; return values that are already parsed unchanged."""
    # ast.literal_eval accepts Python literals only, so text that originates
    # from the data file cannot execute code the way eval could.
    return ast.literal_eval(value) if isinstance(value, str) else value


# The pass-through for non-strings makes this cell safe to run more than once.
content_all_df["ner_tags"] = content_all_df["ner_tags"].apply(_to_list)
content_all_df["tokens"] = content_all_df["tokens"].apply(_to_list)

In [None]:
# Display the first rows of the frame (doc_id, tokens, ner_tags)
content_all_df.head()

Unnamed: 0,doc_id,tokens,ner_tags
0,0.0,"[@paulwalk, It, 's, the, view, from, where, I,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-l..."
1,1.0,"[From, Green, Newsfeed, :, AHFA, extends, dead...","[O, O, O, O, B-group, O, O, O, O, O, O, O, O, ..."
2,2.0,"[Pxleyes, Top, 50, Photography, Contest, Pictu...","[B-corporation, O, O, O, O, O, O, O, O, O, O, O]"
3,3.0,"[today, is, my, last, day, at, the, office, .]","[O, O, O, O, O, O, O, O, O]"
4,4.0,"[4Dbling, 's, place, til, monday, ,, party, pa...","[B-person, O, O, O, O, O, O, O, O, O, O, O]"


**Get NER Tags / Labels / Entities**
---
Find the unique NER labels in the training data

In [None]:
# Keep the distinct tags of every row in a helper column "unique_ner_tags"
content_all_df["unique_ner_tags"] = content_all_df["ner_tags"].map(lambda tags: list(set(tags)))

# Pool the per-row tags into one set, then order the labels alphabetically
unique_ner_tags = set()
for row_tags in content_all_df["unique_ner_tags"]:
    unique_ner_tags.update(row_tags)
label_list = sorted(unique_ner_tags)

# Print the labels
print(label_list)

['B-corporation', 'B-creative-work', 'B-group', 'B-location', 'B-person', 'B-product', 'I-corporation', 'I-creative-work', 'I-group', 'I-location', 'I-person', 'I-product', 'O']


**Encode the Labels in Numeric representation**
---
Map the labels to their encoded values in a dictionary, so that we can later align the labels with the tokens and pass them to the model

In [None]:
# id -> label, numbered in the order of label_list
label_encoding_dict_non_reversed = {idx: tag for idx, tag in enumerate(label_list)}
# label -> id, the inverse of the mapping above
label_encoding_dict = {tag: idx for idx, tag in enumerate(label_list)}

In [None]:
# Keep only the model input ("tokens") and target ("ner_tags") columns
content_all_df = content_all_df.loc[:, ["tokens", "ner_tags"]]

**Train / Validation Split**
---
Split the training data set into two sets with the ratio 8:2, so that we can further evaluate the model performance

In [None]:
# Train/validation split: 20% of the rows are held out without shuffling (shuffle=False)
# NOTE(review): random_state presumably has no effect while shuffle=False — confirm against the scikit-learn docs
train, valid = train_test_split(content_all_df, test_size=0.20, shuffle=False, random_state=1)

In [None]:
# Give both splits a fresh 0..n-1 index; each pair of names refers to the same frame
train_df = train.reset_index(drop=True)
valid_df = valid.reset_index(drop=True)
train, valid = train_df, valid_df

In [None]:
# Print the number of rows in the training and validation splits
print(len(train_df))
print(len(valid_df))

2715
679


**Model Initialization**
---
Initialize the model (the DistilBERT checkpoint `distilbert-base-uncased`). Define the task name, model and tokenizer

In [None]:
# Task name, checkpoint and batch size shared by the cells below
task = "ner"
model_checkpoint = "distilbert-base-uncased"
batch_size = 16

# Pretrained checkpoint with a token-classification head sized to the label set
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
)

# Tokenizer belonging to the same checkpoint, and the collator built from it
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
data_collator = DataCollatorForTokenClassification(tokenizer)

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

**Tokenization**
---
Tokenize and embed the dataset with pretrained BERT

In [None]:
# Wrap each split in a Dataset, then tokenize it batch-wise and attach the aligned labels
train_dataset = Dataset.from_pandas(train_df)
train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)

valid_dataset = Dataset.from_pandas(valid_df)
valid_tokenized_datasets = valid_dataset.map(tokenize_and_align_labels, batched=True)


  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

**Hyperparameter Tuning**
---
Hyperparameters - We can iterate and tune the model with these parameters for better results

In [None]:
# Training configuration; the output directory is named after the task
args = TrainingArguments(
    output_dir=f"test-{task}",
    num_train_epochs=3,
    learning_rate=1e-4,
    weight_decay=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
)

# seqeval metric, used by compute_metrics to score the entity labels
metric = load_metric("seqeval")

**Training the Model**
---
Train the model with different parameters and finalize the optimal one

In [None]:
# Bundle the model, configuration, data and metric function, then fine-tune
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_tokenized_datasets,
    eval_dataset=valid_tokenized_datasets,
    compute_metrics=compute_metrics,
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2715
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 510
  Number of trainable parameters = 66372877
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.13098,0.487356,0.3392,0.4,0.966697
2,No log,0.116043,0.523723,0.4592,0.489344,0.970434
3,0.118800,0.12009,0.569061,0.4944,0.52911,0.971664


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 679
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 679
  Batch size = 16
Saving model checkpoint to test-ner/checkpoint-500
Configuration saved in test-ner/checkpoint-500/config.json
Model weights saved in test-ner/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-ner/checkpoint-500

TrainOutput(global_step=510, training_loss=0.11721972303063262, metrics={'train_runtime': 2239.6036, 'train_samples_per_second': 3.637, 'train_steps_per_second': 0.228, 'total_flos': 107800547177580.0, 'train_loss': 0.11721972303063262, 'epoch': 3.0})

# **Evaluation**

---

Evaluate the remaining 20 percent of training data. 
It is a good idea to take a part of training data for validation while training the model.

In [None]:
# Evaluate the model on the eval_dataset that was handed to the Trainer
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 679
  Batch size = 16


{'eval_loss': 0.12008974701166153,
 'eval_precision': 0.569060773480663,
 'eval_recall': 0.4944,
 'eval_f1': 0.529109589041096,
 'eval_accuracy': 0.9716637494678083,
 'eval_runtime': 59.5541,
 'eval_samples_per_second': 11.401,
 'eval_steps_per_second': 0.722,
 'epoch': 3.0}

**Save the Model**
---
Save the model to disk

In [None]:
# Save the trained model under the folder 'un-ner.model'
trainer.save_model('un-ner.model')

Saving model checkpoint to un-ner.model
Configuration saved in un-ner.model/config.json
Model weights saved in un-ner.model/pytorch_model.bin
tokenizer config file saved in un-ner.model/tokenizer_config.json
Special tokens file saved in un-ner.model/special_tokens_map.json


**Load the Model**
---
Load the model from disk

In [None]:
# Reload the tokenizer and the trained model from the saved folder;
# this rebinds the globals `tokenizer` and `model` used by the cells below
tokenizer = AutoTokenizer.from_pretrained('./un-ner.model/')
model = AutoModelForTokenClassification.from_pretrained('./un-ner.model/', num_labels=len(label_list))

**Error Analysis**
---
Do some Error Analysis with the Validation Dataset

In [None]:
# Recall the validation dataset that we preserved for evaluation purposes
valid = valid_df

# Compare the prediction results and the original labels in one DataFrame:
# one row per original word with its predicted label, the probability of every
# label, and the gold label.
result_columns = ["word", "predicted_label", *label_list, "default_label"]

model.eval()  # inference only
prediction_frames = []
for i in range(len(valid)):
    # Drop URL tokens together with their gold tags so both lists stay parallel
    words_org, ner_tags = [], []
    for split_word, gold_tag in zip(valid["tokens"][i], valid["ner_tags"][i]):
        if not split_word.startswith("http"):
            words_org.append(split_word)
            ner_tags.append(gold_tag)
    if not words_org:
        continue

    # Tokenize the words as given so that every word piece can be traced back
    # to its source word through word_ids. Matching predictions to gold tags by
    # word text would lose words the uncased tokenizer lower-cases or splits,
    # and would cross-join words that occur more than once in a sentence.
    encoding = tokenizer(words_org, is_split_into_words=True, truncation=True, return_tensors="pt")
    word_ids = encoding.word_ids(batch_index=0)
    encoding = encoding.to(model.device)

    # No gradients are needed for prediction
    with torch.no_grad():
        logits = model(
            input_ids=encoding["input_ids"],
            attention_mask=encoding["attention_mask"],
        ).logits[0].cpu()

    # ArgMax - label predictions, SoftMax - probability predictions
    predicted_ids = torch.argmax(logits, dim=-1).tolist()
    probabilities = torch.softmax(logits, dim=-1).tolist()

    rows = []
    previous_word_idx = None
    for position, word_idx in enumerate(word_ids):
        # Special tokens have no source word; a word is represented by its
        # first piece, so the remaining pieces are skipped.
        if word_idx is not None and word_idx != previous_word_idx:
            rows.append(
                [words_org[word_idx], label_list[predicted_ids[position]]]
                + probabilities[position]
                + [ner_tags[word_idx]]
            )
        previous_word_idx = word_idx
    prediction_frames.append(pd.DataFrame(rows, columns=result_columns))

# Combine all sentences once, after the loop (DataFrame.append no longer
# exists in pandas 2.x, and concatenating per iteration is quadratic).
if prediction_frames:
    final_df = pd.concat(prediction_frames, ignore_index=True)
else:
    final_df = pd.DataFrame(columns=result_columns)

# Keep only the columns needed for the comparison
eval_df = final_df.dropna(subset=['default_label', 'predicted_label'])
eval_df = eval_df[["word", "predicted_label", "default_label"]]

In [None]:
# Print the first rows of the evaluated dataset (word, predicted label, gold label)
print(eval_df.head())

         word predicted_label default_label
1         you               O             O
2         may               O             O
3        hope               O             O
4        that               O             O
5  everything               O             O


**Cross Table / Confusion Matrix**
---
Display the confusion matrix over the individual entity labels

In [None]:
# Cross table of gold ("Default") versus predicted labels, including the totals
pd.crosstab(
    eval_df['default_label'],
    eval_df['predicted_label'],
    rownames=['Default'],
    colnames=['Predicted'],
    margins=True,
)

Predicted,B-corporation,B-creative-work,B-group,B-location,B-person,B-product,I-creative-work,I-group,I-location,I-person,I-product,O,All
Default,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
B-corporation,2,0,0,0,0,0,0,0,0,0,0,0,2
B-creative-work,0,1,0,0,0,0,0,0,0,0,0,1,2
B-group,0,0,0,0,1,0,0,0,0,0,0,0,1
B-location,0,0,0,3,0,0,0,0,0,0,0,4,7
B-person,0,0,0,0,10,0,0,0,0,0,0,0,10
B-product,0,0,1,0,0,2,0,0,0,0,0,1,4
I-creative-work,0,0,0,1,0,0,1,0,1,0,1,5,9
I-location,0,0,0,1,0,0,0,0,2,0,0,25,28
I-person,0,0,0,0,0,0,0,0,0,7,0,2,9
I-product,0,0,0,0,0,0,0,1,0,0,17,7,25


From the above table, it is clearly seen that the entity **I-Location** is most often mismatched with **O**.

Similarly for **B-Location** and **O**

We should analyse and look deep in those entities

# **Accuracy | Precision, Recall, F-Measure**

Find the accuracy and performance

In [None]:
# Accuracy: share of rows in eval_df whose predicted label equals the gold label
accuracy = accuracy_score(eval_df['default_label'], eval_df['predicted_label'])

print(accuracy)

0.9890034364261169


In [None]:
# Macro-averaged precision, recall and F-score over the labels in eval_df
# (the fourth element of the returned tuple is None, as the printed result shows)
eval_score = precision_recall_fscore_support(eval_df['default_label'], eval_df['predicted_label'], average='macro')

print(eval_score)

(0.4444069404458563, 0.5053160139173113, 0.45676114220934466, None)


  _warn_prf(average, modifier, msg_start, len(result))


**Prediction Module**
---
Predict the **Test Data/Unseen Data**

In [None]:
# Read the whole test file "test.txt" (UTF-8) into memory as one string
with open('test.txt',  encoding="utf-8") as f:
    test_content = f.read()

In [None]:
# Rewrite the "\n\n" delimiter as "\n\t\n", then split the text on that common delimiter
test_content = test_content.replace("\n\n", "\n\t\n")
test_content_all = test_content.split("\n\t\n")

Minor formatting the text alignments if applicable

In [None]:
# Drop the chunks that are empty or consist of a lone newline.
# Slice assignment keeps the same list object, as the in-place removal did.
test_content_all[:] = [chunk for chunk in test_content_all if chunk not in ("", "\n")]

**Predict Labels and Confidence Scores**
---
Loop over the test data | Get prediction | Get Probabilities | Store every prediction in a DataFrame

In [None]:
# Predict labels and confidence scores
model.eval()  # inference only
prediction_frames = []
for i in range(len(test_content_all)):
    # Collapse every run of whitespace (tabs, newlines) into a single space
    sentence = re.sub(r'\s+', ' ', test_content_all[i])
    # truncation=True keeps overly long inputs within the tokenizer's length limit
    tokens = tokenizer(sentence, truncation=True)
    input_ids = torch.tensor(tokens['input_ids']).unsqueeze(0).to(model.device)
    attention_mask = torch.tensor(tokens['attention_mask']).unsqueeze(0).to(model.device)

    # Decode every id on its own to get the word pieces
    words = tokenizer.batch_decode(tokens['input_ids'])

    # No gradients are needed for prediction
    with torch.no_grad():
        predictions = model(input_ids=input_ids, attention_mask=attention_mask)

    logits = predictions.logits.squeeze(0).cpu()
    argmax_label_prediction = torch.argmax(logits, dim=1)
    softmax_proba_predictions = torch.softmax(logits, dim=1)

    # Glue "##" continuation pieces onto the preceding token; the label and
    # probabilities of a merged token are those of its first piece.
    new_tokens, new_labels, new_proba = [], [], []
    for j in range(len(words)):
        if words[j].startswith("##"):
            new_tokens[-1] = new_tokens[-1] + words[j][2:]
        else:
            new_labels.append(label_list[argmax_label_prediction[j]])
            new_proba.append(softmax_proba_predictions[j])
            new_tokens.append(words[j])

    # ArgMax - Label Predictions
    label_prediction = pd.DataFrame({'word': new_tokens, 'label': new_labels})

    # SoftMax - Probability Predictions
    proba_predictions = pd.DataFrame([x.tolist() for x in new_proba], columns=label_list)

    # Get both label prediction and probabilities in one dataframe
    prediction_frames.append(label_prediction.join(proba_predictions))

# Combine all documents once, after the loop (DataFrame.append no longer
# exists in pandas 2.x, and concatenating per iteration is quadratic).
if prediction_frames:
    final_df = pd.concat(prediction_frames, ignore_index=True)
else:
    final_df = pd.DataFrame(columns=['word', 'label', *label_list])

# Drop the [CLS] rows and blank out the [SEP] marker; .copy() ensures the
# assignment below writes to an independent frame, not a slice of final_df.
submission_df = final_df[final_df["word"] != '[CLS]'].copy()
submission_df["word"] = submission_df["word"].apply(lambda x: x.replace("[SEP]", ""))

In [None]:
# Print the first rows of the prediction results
print(submission_df.head())

  word label  B-corporation  B-creative-work   B-group  B-location  B-person  \
1    &     O       0.000108         0.000141  0.000241    0.000293  0.000285   
2   gt     O       0.000072         0.000112  0.000151    0.000248  0.000157   
3    ;     O       0.000033         0.000074  0.000073    0.000095  0.000089   
4    *     O       0.000074         0.000149  0.000219    0.000315  0.000273   
5  the     O       0.000075         0.000211  0.000185    0.000261  0.000190   

   B-product  I-corporation  I-creative-work   I-group  I-location  I-person  \
1   0.000129       0.000031         0.000109  0.000053    0.000066  0.000053   
2   0.000108       0.000027         0.000076  0.000052    0.000050  0.000035   
3   0.000058       0.000022         0.000072  0.000045    0.000034  0.000027   
4   0.000103       0.000026         0.000090  0.000051    0.000050  0.000039   
5   0.000112       0.000035         0.000156  0.000089    0.000056  0.000053   

   I-product         O  
1   0.000088 

From the above dataframe you can see the predicted **labels** and the corresponding **confidence scores**, calculated individually for each entity label

**Export the results**
---
Export test results to **text** file and **csv**

Select only the **Words** and corresponding **Predicted labels** to export text file

Select the **Words, Labels** and **Confidence scores** to export csv file

In [None]:
# Words and their predicted labels only -> tab-separated text file.
# The file is opened with mode='w' so that re-running this cell replaces the
# previous export instead of appending a duplicate copy of every row.
submission_txt = submission_df[["word", "label"]]
submission_txt.to_csv(r'submission.txt', header=False, index=False, sep='\t', mode='w')

# Words, labels and confidence scores -> csv file
submission_df.to_csv("submission.csv")