# Step 1: Install Packages and Import Dependencies

In [None]:
!pip install -q seqeval ## to calculate evaluation metrics for NER
!pip install -q transformers ## to load pre-trained model
!pip install -q datasets ## to read dataset
!pip install evaluate ## to evaluate model
!pip install wandb --upgrade  ## to log model training
!pip install torchcrf #Ensure you have a library for CRF, such as torchcrf.

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver

## Step 2: Check CUDA Availability and Device Information

In [None]:
# print("CUDA available:", torch.cuda.is_available())
# print("Current device index:", torch.cuda.current_device())
# print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

In [None]:
import torch ## PyTorch for deep learning
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments ## Hugging Face Transformers
from datasets import load_dataset, Dataset, DatasetDict ## Hugging Face Datasets
import numpy as np  ## NumPy for numerical operations
from evaluate import load ## Hugging Face's evaluation library
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report  ## SeqEval for evaluation metrics

## Step 3: Read and Prepare Data

In [None]:
def read_conll_file(file_path):
    with open(file_path, "r") as f:
        content = f.read().strip()
        sentences = content.split("\n\n")
        data = []
        for sentence in sentences:
            tokens = sentence.split("\n")
            token_data = []
            for token in tokens:
                token_data.append(token.split())
            data.append(token_data)
    return data

In [13]:
## read data from files dataset
train_data = read_conll_file("/content/drive/MyDrive/conll2003/eng.train")
validation_data = read_conll_file("/content/drive/MyDrive/conll2003/eng.testa")
test_data = read_conll_file("/content/drive/MyDrive/conll2003/eng.testb")

In [12]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
## example of Training data
train_data[0:2]

[[['-DOCSTART-', '-X-', '-X-', 'O']],
 [['EU', 'NNP', 'B-NP', 'B-ORG'],
  ['rejects', 'VBZ', 'B-VP', 'O'],
  ['German', 'JJ', 'B-NP', 'B-MISC'],
  ['call', 'NN', 'I-NP', 'O'],
  ['to', 'TO', 'B-VP', 'O'],
  ['boycott', 'VB', 'I-VP', 'O'],
  ['British', 'JJ', 'B-NP', 'B-MISC'],
  ['lamb', 'NN', 'I-NP', 'O'],
  ['.', '.', 'O', 'O']]]

In [15]:
## size of every part of data we have.
print(f"Train-data size {len(train_data)} sentences")
print(f"Validation-data size {len(validation_data)} sentences")
print(f"Test-data size {len(test_data)} sentences")

Train-data size 14987 sentences
Validation-data size 3466 sentences
Test-data size 3684 sentences


In [16]:
### convert our data to Features and label
def convert_to_dataset(data, label_map):
    formatted_data = {"tokens": [], "ner_tags": []}
    for sentence in data:
        tokens = [token_data[0] for token_data in sentence]
        ner_tags = [label_map[token_data[3]] for token_data in sentence]
        formatted_data["tokens"].append(tokens)
        formatted_data["ner_tags"].append(ner_tags)
    return Dataset.from_dict(formatted_data)

In [17]:
## create map for our label list
label_list = sorted(list(set([token_data[3] for sentence in train_data for token_data in sentence])))
label_map = {label: i for i, label in enumerate(label_list)}
print(f"label List:{label_list}")
print("_"*70)
print(f"label Map:{label_map}")

label List:['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']
______________________________________________________________________
label Map:{'B-LOC': 0, 'B-MISC': 1, 'B-ORG': 2, 'B-PER': 3, 'I-LOC': 4, 'I-MISC': 5, 'I-ORG': 6, 'I-PER': 7, 'O': 8}


In [18]:
## Apply function convert_to_dataset for make my data ready for model
train_dataset = convert_to_dataset(train_data, label_map)
validation_dataset = convert_to_dataset(validation_data, label_map)
test_dataset = convert_to_dataset(test_data, label_map)

In [19]:
## collect dataset in one dict
datasets = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset,
})
print(f"describe dataset we have \n{datasets}")

describe dataset we have 
DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 14987
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3466
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3684
    })
})


## Step 4: Initialize Tokenizer and Model

In [20]:
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
def compute_metrics(eval_prediction):
    predictions, labels = eval_prediction
    predictions = np.argmax(predictions, axis=2)


    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]


    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "classification_report": classification_report(true_labels, true_predictions),
    }

In [22]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, padding=True
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

## Step 6: Tokenize Datasets and Set Training Arguments

In [23]:
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)
training_args = TrainingArguments(
    output_dir="./results",  # Directory to save model outputs
    evaluation_strategy="steps", # Evaluate every `eval_steps` steps
    eval_steps=500, # How often to evaluate
    save_steps=500,# How often to save the model
    num_train_epochs=1,# Number of training epochs
    per_device_train_batch_size=8,# Batch size for training
    per_device_eval_batch_size=8,# Batch size for evaluation
    logging_steps=100,# Log every `logging_steps` steps
    learning_rate=5e-5,# Learning rate
    load_best_model_at_end=True,# Load best model based on evaluation metric
    metric_for_best_model="f1", # Metric used to determine best model
    report_to=[], # Disable wandb logging
    no_cuda=False,  # Ensure CUDA is enabled if available
    push_to_hub=False
)

Map:   0%|          | 0/14987 [00:00<?, ? examples/s]

Map:   0%|          | 0/3466 [00:00<?, ? examples/s]

Map:   0%|          | 0/3684 [00:00<?, ? examples/s]



In [24]:
def data_collator(data):
    input_ids = [torch.tensor(item["input_ids"]) for item in data]
    attention_mask = [torch.tensor(item["attention_mask"]) for item in data]
    labels = [torch.tensor(item["labels"]) for item in data]


    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)


    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

In [25]:
trainer = Trainer(
    model=model, #The BERT model
    args=training_args, #specifies hyperparameters for training and evaluation.
    train_dataset=tokenized_datasets["train"], #training dataset.
    eval_dataset=tokenized_datasets["validation"], #validation dataset for evaluation during training.
    data_collator=data_collator, #A function or object responsible for batching and padding the inputs for the model.
    tokenizer=tokenizer, #This is useful for ensuring consistency in tokenization during evaluation.
    compute_metrics=compute_metrics,#A custom metric function used to calculate performance metrics such as F1-score, precision, and recall.
)

  trainer = Trainer(


In [None]:
## try to train our model we create it
 trainer.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1,Classification Report
500,0.0801,0.075086,0.903338,0.915348,0.909304,precision recall f1-score support  LOC 0.92 0.95 0.93 1837  MISC 0.83 0.82 0.82 922  ORG 0.88 0.85 0.86 1341  PER 0.94 0.98 0.96 1842  micro avg 0.90 0.92 0.91 5942  macro avg 0.89 0.90 0.90 5942 weighted avg 0.90 0.92 0.91 5942
1000,0.0488,0.062348,0.912828,0.930495,0.921577,precision recall f1-score support  LOC 0.92 0.96 0.94 1837  MISC 0.90 0.82 0.86 922  ORG 0.86 0.91 0.89 1341  PER 0.95 0.97 0.96 1842  micro avg 0.91 0.93 0.92 5942  macro avg 0.91 0.92 0.91 5942 weighted avg 0.91 0.93 0.92 5942
1500,0.0428,0.043829,0.937636,0.941266,0.939447,precision recall f1-score support  LOC 0.96 0.96 0.96 1837  MISC 0.87 0.90 0.89 922  ORG 0.91 0.91 0.91 1341  PER 0.97 0.97 0.97 1842  micro avg 0.94 0.94 0.94 5942  macro avg 0.93 0.93 0.93 5942 weighted avg 0.94 0.94 0.94 5942


TrainOutput(global_step=1874, training_loss=0.07494052777040984, metrics={'train_runtime': 504.4056, 'train_samples_per_second': 29.712, 'train_steps_per_second': 3.715, 'total_flos': 960565719981294.0, 'train_loss': 0.07494052777040984, 'epoch': 1.0})

In [None]:
# Save the tokenizer and model
 save_directory = "/content/drive/MyDrive/conll2003/bert_model"  # Directory to save the model and tokenizer
 tokenizer.save_pretrained(save_directory)
 model.save_pretrained(save_directory)
 print(f"BERT model and tokenizer saved to {save_directory}")

BERT model and tokenizer saved to /content/drive/MyDrive/conll2003/bert_model


In [26]:
# Load the tokenizer and model
save_directory = "/content/drive/MyDrive/conll2003/bert_model"  # Directory where the model and tokenizer are saved
loaded_tokenizer = AutoTokenizer.from_pretrained(save_directory)
loaded_model = AutoModelForTokenClassification.from_pretrained(save_directory)
print("BERT model and tokenizer loaded successfully.")

BERT model and tokenizer loaded successfully.


## First Example

In [27]:
sentence = "John Smith is a software engineer who works at Google."
# Tokenize and move the input to the model's device
tokenized_input = loaded_tokenizer(sentence, return_tensors="pt").to(loaded_model.device)
# Get model outputs
outputs = loaded_model(**tokenized_input)
# Decode predicted labels
predicted_labels = outputs.logits.argmax(-1).squeeze().tolist()
input_ids= tokenized_input["input_ids"].squeeze().tolist()

# Map labels and tokens to entities
entities = []
for idx,label_id in enumerate(predicted_labels):
    if label_id != -100 and label_list[label_id] != "O": # Ignore non-entity and padding
        token = loaded_tokenizer.convert_ids_to_tokens(input_ids[idx])
        entity_label = label_list[label_id]
        entities.append((token, entity_label))

print("Named Entities - Example 1:")
print("_"*20)
for i,ind in enumerate(entities):
    print(f"{i} => {ind}")

Named Entities - Example 1:
____________________
0 => ('John', 'B-PER')
1 => ('Smith', 'I-PER')
2 => ('Google', 'B-ORG')


## Second Example

In [28]:
sentence2 = "The company Apple Inc. announced its new product, the iPhone 12, at a press conference held in San Francisco."
# Tokenize and move the input to the model's device
tokenized_input2 = loaded_tokenizer(sentence2, return_tensors="pt").to(loaded_model.device)
# Get model outputs
outputs2 = loaded_model(**tokenized_input2)
# Decode predicted labels
predicted_labels2 = outputs2.logits.argmax(-1).squeeze().tolist()
input_ids2= tokenized_input2["input_ids"].squeeze().tolist()

# Map labels and tokens to entities
entities2 = []
for idx,label_id in enumerate(predicted_labels2):
    if label_id != -100 and label_list[label_id] != "O": # Ignore non-entity and padding
        token2 = loaded_tokenizer.convert_ids_to_tokens(input_ids2[idx])
        entity_label2 = label_list[label_id]
        entities2.append((token2, entity_label2))

# Print extracted entities

print("Named Entities - Example 2:")
print("_"*20)
for i,ind in enumerate(entities2):
    print(f"{i} => {ind}")

Named Entities - Example 2:
____________________
0 => ('Apple', 'B-ORG')
1 => ('Inc', 'I-ORG')
2 => ('iPhone', 'B-MISC')
3 => ('12', 'I-MISC')
4 => ('San', 'B-LOC')
5 => ('Francisco', 'I-LOC')


## Named Entity Recognition (NER) with BERT + CRF

In [29]:
!pip install torchcrf # install torchcrf to add crf layer



In [30]:
from TorchCRF import CRF ## import CRF layer
from transformers import AutoModel, AutoTokenizer ## import AutoModel and AutoTokenizer

In [31]:
from transformers import BertModel  ## import BertModel
import torch.nn as nn ## import nn
class BertCRFModel(nn.Module):  ## define BertCRFModel
    def __init__(self, model_name2, num_labels):  ## initialize the model
        super(BertCRFModel, self).__init__()  ## call the parent class constructor

        # Load pre-trained BERT model
        self.bert = AutoModel.from_pretrained(model_name2)
        self.num_labels = num_labels

        # Conditional Random Field (CRF) layer
        self.crf = CRF(num_labels)

        # Output layer for token classification
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        # Get BERT embeddings
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]  # (batch_size, seq_len, hidden_size)

        # Pass BERT output through the classifier to get logits for each token
        logits = self.classifier(sequence_output)  # (batch_size, seq_len, num_labels)

        # Masking padding tokens
        mask = attention_mask.bool()  # Create mask from attention_mask (1 for real tokens, 0 for padding)

        if labels is not None:
            # Replace -100 labels with an ignored token that isn't part of the CRF layers' label set
            labels = torch.where(labels == -100, torch.tensor(self.num_labels).to(labels.device), labels)

            # Ensure labels are within the range [0, num_labels-1]
            labels = torch.clamp(labels, min=0, max=self.num_labels-1)

            # Compute loss using CRF
            loss = -self.crf(logits, labels, mask=mask)  # Use mask to ignore padding tokens
            return loss

        # Otherwise, return the predicted labels
        predictions = self.crf.viterbi_decode(logits, mask=mask)  # Decode sequences using CRF
        return predictions

In [32]:
def tokenize_and_align_labels2(examples):
    # Tokenize the sentences using the tokenizer
    tokenized_inputs2 = tokenizer(
        examples["tokens"], truncation=True, padding=True, is_split_into_words=True
    )

    # Align the labels to the tokenized inputs
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs2.word_ids(i)  # Word ids for tokenized input
        label_ids = [label[word_id] if word_id is not None else -100 for word_id in word_ids]
        labels.append(label_ids)

    tokenized_inputs2["labels"] = labels  # Add labels to tokenized input
    return tokenized_inputs2

In [33]:
model_name2 = "bert-base-cased"  # Name of the pre-trained BERT model
num_labels = len(label_list)  # Number of NER labels
tokenizer2 = AutoTokenizer.from_pretrained(model_name2)  # Load tokenizer for the BERT model
model_BERT_CRF = BertCRFModel(model_name2, num_labels)  # Initialize the custom BERT + CRF model

In [34]:
def data_collator_2(data):
    input_ids = [torch.tensor(item["input_ids"]) for item in data]
    attention_mask = [torch.tensor(item["attention_mask"]) for item in data]
    labels = [torch.tensor(item["labels"]) for item in data]

    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

In [35]:
# Remove unnecessary columns from the dataset
tokenized_datasets2 = tokenized_datasets.remove_columns([col for col in tokenized_datasets["train"].column_names if col not in ["input_ids", "attention_mask", "labels"]])

# Define the training arguments
training_args2 = TrainingArguments(
    output_dir="./results",  # Directory to save model outputs
    evaluation_strategy="steps", # Evaluate every `eval_steps` steps
    eval_steps=500, # How often to evaluate
    save_steps=500, # How often to save the model
    num_train_epochs=1, # Number of training epochs
    per_device_train_batch_size=8, # Batch size for training
    per_device_eval_batch_size=8, # Batch size for evaluation
    logging_steps=100, # Log every `logging_steps` steps
    learning_rate=5e-5, # Learning rate
    load_best_model_at_end=True, # Load best model based on evaluation metric
    metric_for_best_model="f1", # Metric used to determine best model
    report_to=[], # Disable wandb logging
    no_cuda=False,  # Ensure CUDA is enabled if available
    push_to_hub=False,
    remove_unused_columns=False  # Prevent automatic removal of unused columns
)

In [36]:
# Instantiate and train the model using the Trainer
trainer2 = Trainer(
    model=model_BERT_CRF,  # The BERT + CRF model
    args=training_args2,  # Training arguments
    train_dataset=tokenized_datasets2["train"],  # Training dataset
    eval_dataset=tokenized_datasets2["validation"],  # Validation dataset
    data_collator=data_collator_2,  # Data collator
    tokenizer=tokenizer2,  # Tokenizer for tokenization consistency
    compute_metrics=compute_metrics,  # Custom metric function (e.g., F1-score)
)

  trainer2 = Trainer(


In [None]:
# Start training
 trainer.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1,Classification Report
500,0.0146,0.057442,0.927038,0.930158,0.928595,precision recall f1-score support  LOC 0.96 0.94 0.95 1837  MISC 0.87 0.87 0.87 922  ORG 0.86 0.94 0.90 1341  PER 0.97 0.95 0.96 1842  micro avg 0.93 0.93 0.93 5942  macro avg 0.92 0.92 0.92 5942 weighted avg 0.93 0.93 0.93 5942
1000,0.0087,0.056471,0.932734,0.94278,0.93773,precision recall f1-score support  LOC 0.95 0.96 0.95 1837  MISC 0.87 0.89 0.88 922  ORG 0.92 0.92 0.92 1341  PER 0.96 0.97 0.96 1842  micro avg 0.93 0.94 0.94 5942  macro avg 0.92 0.93 0.93 5942 weighted avg 0.93 0.94 0.94 5942
1500,0.0057,0.05962,0.93911,0.9448,0.941946,precision recall f1-score support  LOC 0.96 0.95 0.96 1837  MISC 0.89 0.90 0.89 922  ORG 0.91 0.93 0.92 1341  PER 0.96 0.97 0.97 1842  micro avg 0.94 0.94 0.94 5942  macro avg 0.93 0.94 0.93 5942 weighted avg 0.94 0.94 0.94 5942


TrainOutput(global_step=1874, training_loss=0.017585012389857844, metrics={'train_runtime': 562.8501, 'train_samples_per_second': 26.627, 'train_steps_per_second': 3.329, 'total_flos': 960565719981294.0, 'train_loss': 0.017585012389857844, 'epoch': 1.0})

In [None]:
# Save the trained model
 model_save_path = "/content/drive/MyDrive/conll2003/bert_crf_model2.pth"  # Specify the path where you want to save the model
 torch.save(model_BERT_CRF.state_dict(), model_save_path)
 print(f"Model saved to {model_save_path}")

Model saved to /content/drive/MyDrive/conll2003/bert_crf_model2.pth


In [None]:
 model_BERT_CRF = BertCRFModel(model_name2, num_labels)  # Initialize the model structure
 model_BERT_CRF.load_state_dict(torch.load("/content/drive/MyDrive/conll2003/bert_crf_model2.pth"))  # Load the saved state_dict
 model_BERT_CRF.eval()  # Set the model to evaluation mode
 loaded_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
 print("Model loaded successfully.")

In [37]:
# Ensure the model is loaded to the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the model
model_BERT_CRF = BertCRFModel(model_name2, num_labels)  # Initialize the model structure
model_BERT_CRF.load_state_dict(torch.load("/content/drive/MyDrive/conll2003/bert_crf_model.pth", map_location=device))  # Map to the correct device
model_BERT_CRF.to(device)  # Move the model to the appropriate device

print("Model loaded successfully on device:", device)

  model_BERT_CRF.load_state_dict(torch.load("/content/drive/MyDrive/conll2003/bert_crf_model.pth", map_location=device))  # Map to the correct device


Model loaded successfully on device: cpu


In [38]:
sentence = "John Smith is a software engineer who works at Google."

# Tokenize and prepare the input
tokenized_input = loaded_tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
input_ids = tokenized_input["input_ids"].squeeze().tolist()  # Convert tensor to list for decoding

# Move tensors to the model's device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_BERT_CRF.to(device)
tokenized_input = {key: val.to(device) for key, val in tokenized_input.items()}

# Perform inference
with torch.no_grad():
    # Pass inputs through the BERT encoder
    outputs = model_BERT_CRF.bert(
        input_ids=tokenized_input["input_ids"],
        attention_mask=tokenized_input["attention_mask"]
    )
    sequence_output = outputs[0]  # This gives us (batch_size, seq_len, hidden_size)

    # Pass through the classifier to get logits
    logits = model_BERT_CRF.classifier(sequence_output)  # (batch_size, seq_len, num_labels)

    # Decode the most probable sequence using CRF
    mask = tokenized_input["attention_mask"]
    predictions = model_BERT_CRF.crf.viterbi_decode(logits, mask=mask)[0]  # CRF decoding returns sequences

# Map tokens and labels to entities
entities = []
previous_label = None
current_entity = []

# Iterate through each token and its prediction
for idx, label_id in enumerate(predictions):
    token = loaded_tokenizer.convert_ids_to_tokens(input_ids[idx])  # Get token

    # Skip special tokens (e.g., '[CLS]', '[SEP]')
    if token in loaded_tokenizer.all_special_tokens:
        continue

    # Exclude punctuation or stop words
    if token in ['.', ',', '!', '?', '(', ')', '[', ']', ':', '-', 'at', 'is', 'a', 'who', 'and']:
        continue

    # Get label
    entity_label = label_list[label_id]

    # If label is the same as previous, continue accumulating in the current entity
    if previous_label == entity_label and entity_label != "O" and entity_label != "I-MISC":
        current_entity.append(token)
    else:
        # If there was a previous entity, add it to the final list
        if current_entity:
            entities.append((" ".join(current_entity), previous_label))
        # Start a new entity
        current_entity = [token]

    previous_label = entity_label

# If there's a remaining entity, append it to the list
if current_entity:
    entities.append((" ".join(current_entity), previous_label))

# Print extracted entities
print("Named Entities - Example 1:")
print("_"*20)
for i,ind in enumerate(entities):
    print(f"{i} => {ind}")

Named Entities - Example 1:
____________________
0 => ('John', 'B-LOC')
1 => ('Smith', 'I-MISC')
2 => ('software', 'I-MISC')
3 => ('engineer', 'O')
4 => ('works', 'O')
5 => ('Google', 'O')
