### Import Required Libraries

In [1]:
import os 
from functools import partial
import torch

from datasets import DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments

from conll_parser import read_conll_file, convert_to_hf_dataset
from ner_pipeline_utils import compute_classification_metrics, tokenize_and_align_labels, data_collator
from viz import print_sample_data, print_hf_dataset, print_tokenized_dataset, extract_named_entities




### Check GPU Availability

In [None]:
# Check if GPU is available
print("CUDA available:", torch.cuda.is_available())
print("Current device index:", torch.cuda.current_device())
print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

CUDA available: True
Current device index: 0
Device name: NVIDIA GeForce RTX 2070


### Define Paths and Training Configurations

In [None]:
# Define paths and configurations
data_path = "D:/files/conll2003"  # Path to CoNLL 2003 dataset
output_path = "./results"         # Directory to save model outputs
model_name = "bert-base-cased"    # Pre-trained model to use

# Training configurations
num_epochs = 1
learning_rate = 5e-5             # Initial learning rate
metric_name = "f1"               # Metric to determine the best checkpoint
eval_strategy = "steps"          # Evaluation frequency based on steps
eval_interval = 500              # Number of steps between evaluations

### Load CoNLL 2003 Data

In [None]:
# Load CoNLL 2003 dataset
train_data = read_conll_file(os.path.join(data_path, "eng.train"))
validation_data = read_conll_file(os.path.join(data_path, "eng.testa"))
test_data = read_conll_file(os.path.join(data_path, "eng.testb"))

# Display sample data from each dataset
print("Sample from Training Data:")
print_sample_data(train_data)

print("Sample from Validation Data:")
print_sample_data(validation_data)

print("Sample from Test Data:")
print_sample_data(test_data)

Sample from Training Data:
Number of sentences in the dataset: 14987
Showing the first 5 sentences:

Sentence 1:
  Token: ['-DOCSTART-', '-X-', '-X-', 'O']


Sentence 2:
  Token: ['EU', 'NNP', 'B-NP', 'B-ORG']
  Token: ['rejects', 'VBZ', 'B-VP', 'O']
  Token: ['German', 'JJ', 'B-NP', 'B-MISC']
  Token: ['call', 'NN', 'I-NP', 'O']
  Token: ['to', 'TO', 'B-VP', 'O']
  Token: ['boycott', 'VB', 'I-VP', 'O']
  Token: ['British', 'JJ', 'B-NP', 'B-MISC']
  Token: ['lamb', 'NN', 'I-NP', 'O']
  Token: ['.', '.', 'O', 'O']


Sentence 3:
  Token: ['Peter', 'NNP', 'B-NP', 'B-PER']
  Token: ['Blackburn', 'NNP', 'I-NP', 'I-PER']


Sentence 4:
  Token: ['BRUSSELS', 'NNP', 'B-NP', 'B-LOC']
  Token: ['1996-08-22', 'CD', 'I-NP', 'O']


Sentence 5:
  Token: ['The', 'DT', 'B-NP', 'O']
  Token: ['European', 'NNP', 'I-NP', 'B-ORG']
  Token: ['Commission', 'NNP', 'I-NP', 'I-ORG']
  Token: ['said', 'VBD', 'B-VP', 'O']
  Token: ['on', 'IN', 'B-PP', 'O']
  Token: ['Thursday', 'NNP', 'B-NP', 'O']
  Token: ['it',

### Create a Label Map

In [None]:
# Extract all labels from the training data
labels = [token[3] for sentence in train_data for token in sentence]

# Create a sorted list of unique labels and map them to integer indices
unique_labels = sorted(set(labels))
label_map = {label: i for i, label in enumerate(unique_labels)}

# Display the label map
for label, index in label_map.items():
    print(f"{index}: {label}")

0: B-LOC
1: B-MISC
2: B-ORG
3: B-PER
4: I-LOC
5: I-MISC
6: I-ORG
7: I-PER
8: O


### Convert Data to Hugging Face Dataset Format

In [None]:
# Convert CoNLL data to Hugging Face Dataset format
train_dataset = convert_to_hf_dataset(train_data, label_map)
validation_dataset = convert_to_hf_dataset(validation_data, label_map)
test_dataset = convert_to_hf_dataset(test_data, label_map)

# Combine datasets into a DatasetDict
datasets = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset,
})

# Display processed data
print("Sample from Training Data:")
print_hf_dataset(train_dataset)

print("Sample from Validation Data:")
print_hf_dataset(validation_dataset)

print("Sample from Test Data:")
print_hf_dataset(test_dataset)

Sample from Training Data:
Dataset contains 14987 samples.

Sentence 1:
  Tokens: ['-DOCSTART-']
  NER Tags: [8]

Sentence 2:
  Tokens: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
  NER Tags: [2, 8, 1, 8, 8, 8, 1, 8, 8]

Sentence 3:
  Tokens: ['Peter', 'Blackburn']
  NER Tags: [3, 7]

Sentence 4:
  Tokens: ['BRUSSELS', '1996-08-22']
  NER Tags: [0, 8]

Sentence 5:
  Tokens: ['The', 'European', 'Commission', 'said', 'on', 'Thursday', 'it', 'disagreed', 'with', 'German', 'advice', 'to', 'consumers', 'to', 'shun', 'British', 'lamb', 'until', 'scientists', 'determine', 'whether', 'mad', 'cow', 'disease', 'can', 'be', 'transmitted', 'to', 'sheep', '.']
  NER Tags: [8, 2, 6, 8, 8, 8, 8, 8, 8, 1, 8, 8, 8, 8, 8, 1, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]

Sample from Validation Data:
Dataset contains 3466 samples.

Sentence 1:
  Tokens: ['-DOCSTART-']
  NER Tags: [8]

Sentence 2:
  Tokens: ['CRICKET', '-', 'LEICESTERSHIRE', 'TAKE', 'OVER', 'AT', 'TOP', 'AFTER

### Tokenize and Align Labels

In [None]:
# Load pre-trained tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_map))

# Tokenize and align labels using partial for convenience
tokenize_and_align_labels = partial(tokenize_and_align_labels, tokenizer=tokenizer)
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

# Display tokenized datasets
print("Tokenized Datasets:\n", tokenized_datasets, end='\n\n')

print("Sample from Training Data:")
print_tokenized_dataset(tokenized_datasets['train'])

print("Sample from Validation Data:")
print_tokenized_dataset(tokenized_datasets['validation'])

print("Sample from Test Data:")
print_tokenized_dataset(tokenized_datasets['test'])

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/14987 [00:00<?, ? examples/s]

Map:   0%|          | 0/3466 [00:00<?, ? examples/s]

Map:   0%|          | 0/3684 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14987
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3466
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3684
    })
})


Sample from Training Data:
Sentence 1:
  Original Tokens: ['-DOCSTART-']
  Original NER Tags: [8]
  Tokenized Input IDs: [101, 118, 141, 9244, 9272, 12426, 1942, 118, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

### Set Up TrainingArguments and Trainer

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=output_path,
    eval_strategy=eval_strategy,
    eval_steps=eval_interval,
    save_steps=eval_interval,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=100,
    learning_rate=learning_rate,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

# Define data collator and metrics
data_collator = partial(data_collator, tokenizer=tokenizer)
compute_metrics = partial(compute_classification_metrics, label_list=unique_labels)

# Initialize and train the model using Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

  0%|          | 0/1874 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'loss': 0.3914, 'grad_norm': 2.698897123336792, 'learning_rate': 4.7331910352187837e-05, 'epoch': 0.05}
{'loss': 0.1227, 'grad_norm': 7.496398448944092, 'learning_rate': 4.466382070437567e-05, 'epoch': 0.11}
{'loss': 0.0919, 'grad_norm': 0.6311567425727844, 'learning_rate': 4.1995731056563505e-05, 'epoch': 0.16}
{'loss': 0.0784, 'grad_norm': 4.44400691986084, 'learning_rate': 3.932764140875134e-05, 'epoch': 0.21}
{'loss': 0.0832, 'grad_norm': 2.786243438720703, 'learning_rate': 3.665955176093917e-05, 'epoch': 0.27}


  0%|          | 0/434 [00:00<?, ?it/s]

Trainer is attempting to log a value of "              precision    recall  f1-score   support

         LOC       0.94      0.95      0.94      1837
        MISC       0.76      0.84      0.80       922
         ORG       0.90      0.85      0.87      1341
         PER       0.97      0.96      0.96      1842

   micro avg       0.91      0.91      0.91      5942
   macro avg       0.89      0.90      0.89      5942
weighted avg       0.91      0.91      0.91      5942
" of type <class 'str'> for key "eval/classification_report" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.0639970526099205, 'eval_precision': 0.9094664821643778, 'eval_recall': 0.9131605520026927, 'eval_f1': 0.9108744127047756, 'eval_classification_report': '              precision    recall  f1-score   support\n\n         LOC       0.94      0.95      0.94      1837\n        MISC       0.76      0.84      0.80       922\n         ORG       0.90      0.85      0.87      1341\n         PER       0.97      0.96      0.96      1842\n\n   micro avg       0.91      0.91      0.91      5942\n   macro avg       0.89      0.90      0.89      5942\nweighted avg       0.91      0.91      0.91      5942\n', 'eval_runtime': 16.8115, 'eval_samples_per_second': 206.169, 'eval_steps_per_second': 25.816, 'epoch': 0.27}
{'loss': 0.0695, 'grad_norm': 4.252267360687256, 'learning_rate': 3.3991462113127e-05, 'epoch': 0.32}
{'loss': 0.0537, 'grad_norm': 4.346560001373291, 'learning_rate': 3.1323372465314835e-05, 'epoch': 0.37}
{'loss': 0.0531, 'grad_norm': 0.05698142573237419, 'learning_rate': 

  0%|          | 0/434 [00:00<?, ?it/s]

Trainer is attempting to log a value of "              precision    recall  f1-score   support

         LOC       0.93      0.97      0.95      1837
        MISC       0.89      0.81      0.85       922
         ORG       0.87      0.92      0.90      1341
         PER       0.96      0.98      0.97      1842

   micro avg       0.92      0.94      0.93      5942
   macro avg       0.91      0.92      0.92      5942
weighted avg       0.92      0.94      0.93      5942
" of type <class 'str'> for key "eval/classification_report" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.05810042470693588, 'eval_precision': 0.9202359003595662, 'eval_recall': 0.9362167620329855, 'eval_f1': 0.9276636840418444, 'eval_classification_report': '              precision    recall  f1-score   support\n\n         LOC       0.93      0.97      0.95      1837\n        MISC       0.89      0.81      0.85       922\n         ORG       0.87      0.92      0.90      1341\n         PER       0.96      0.98      0.97      1842\n\n   micro avg       0.92      0.94      0.93      5942\n   macro avg       0.91      0.92      0.92      5942\nweighted avg       0.92      0.94      0.93      5942\n', 'eval_runtime': 17.7791, 'eval_samples_per_second': 194.948, 'eval_steps_per_second': 24.411, 'epoch': 0.53}
{'loss': 0.062, 'grad_norm': 3.221877336502075, 'learning_rate': 2.0651013874066168e-05, 'epoch': 0.59}
{'loss': 0.0542, 'grad_norm': 2.4324893951416016, 'learning_rate': 1.7982924226254002e-05, 'epoch': 0.64}
{'loss': 0.0452, 'grad_norm': 2.3677196502685547, 'learning_rate

  0%|          | 0/434 [00:00<?, ?it/s]

Trainer is attempting to log a value of "              precision    recall  f1-score   support

         LOC       0.97      0.96      0.96      1837
        MISC       0.87      0.90      0.88       922
         ORG       0.91      0.92      0.91      1341
         PER       0.97      0.97      0.97      1842

   micro avg       0.94      0.94      0.94      5942
   macro avg       0.93      0.94      0.93      5942
weighted avg       0.94      0.94      0.94      5942
" of type <class 'str'> for key "eval/classification_report" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.04237821698188782, 'eval_precision': 0.9397867130835945, 'eval_recall': 0.9436216762032985, 'eval_f1': 0.9416518252622397, 'eval_classification_report': '              precision    recall  f1-score   support\n\n         LOC       0.97      0.96      0.96      1837\n        MISC       0.87      0.90      0.88       922\n         ORG       0.91      0.92      0.91      1341\n         PER       0.97      0.97      0.97      1842\n\n   micro avg       0.94      0.94      0.94      5942\n   macro avg       0.93      0.94      0.93      5942\nweighted avg       0.94      0.94      0.94      5942\n', 'eval_runtime': 18.8097, 'eval_samples_per_second': 184.267, 'eval_steps_per_second': 23.073, 'epoch': 0.8}
{'loss': 0.0445, 'grad_norm': 1.8011629581451416, 'learning_rate': 7.310565635005337e-06, 'epoch': 0.85}
{'loss': 0.0485, 'grad_norm': 1.1676015853881836, 'learning_rate': 4.6424759871931695e-06, 'epoch': 0.91}
{'loss': 0.0496, 'grad_norm': 3.238689422607422, 'learning_rate'

TrainOutput(global_step=1874, training_loss=0.07855289532000889, metrics={'train_runtime': 359.6904, 'train_samples_per_second': 41.666, 'train_steps_per_second': 5.21, 'total_flos': 960565719981294.0, 'train_loss': 0.07855289532000889, 'epoch': 1.0})

### Extract Named Entities from Sentences

In [None]:
# Example sentences for NER extraction
sentence1 = "John Smith is a software engineer who works at Google."
named_entities1 = extract_named_entities(sentence1, tokenizer, model, label_map)
print(f"Example 1: {sentence1}")
print(f"Named Entities: {named_entities1}\n")

sentence2 = "The company Apple Inc. announced its new product, the iPhone 12, at a press conference held in San Francisco."
named_entities2 = extract_named_entities(sentence2, tokenizer, model, label_map)
print(f"Example 2: {sentence2}")
print(f"Named Entities: {named_entities2}\n")

sentence3 = "The actor Tom Hanks starred in the movie Forrest Gump."
named_entities3 = extract_named_entities(sentence3, tokenizer, model, label_map)
print(f"Example 3: {sentence3}")
print(f"Named Entities: {named_entities3}\n")

sentence4 = "Paris is the capital city of France."
named_entities4 = extract_named_entities(sentence4, tokenizer, model, label_map)
print(f"Example 4: {sentence4}")
print(f"Named Entities: {named_entities4}\n")

Example 1: John Smith is a software engineer who works at Google.
Named Entities: ['John', 'Smith', 'Google'] 

Example 2: The company Apple Inc. announced its new product, the iPhone 12, at a press conference held in San Francisco.
Named Entities: ['Apple', 'Inc', 'iPhone', '12', 'Francisco'] 


Example 3: The actor Tom Hanks starred in the movie Forrest Gump.
Named Entities: ['Tom', 'Hank', '##s', 'Forrest', 'G', '##ump'] 


Example 4: Paris is the capital city of France.
Named Entities: [] 


