<a href="https://colab.research.google.com/github/joshIsac/LargeLanguageModel/blob/main/2348523_LLM_lab5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
!pip install transformers



In [28]:
import tensorflow as tf
from transformers import BertTokenizer,TFBertForTokenClassification
from transformers import BertConfig
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

In [43]:
# Load model directly
from transformers import AutoTokenizer, TFAutoModelForTokenClassification ,BertForTokenClassification, AdamW

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = TFAutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

All PyTorch model weights were used when initializing TFBertForTokenClassification.

All the weights of TFBertForTokenClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.


In [38]:
import torch

In [66]:
# Define the sentences and their labels
sentences = [
    "Albert Einstein was born in Ulm.",
    "Microsoft was founded by Bill Gates and Paul Allen.",
    "Cats and Dogs are Enemies",
     "Barack Obama was the president of the USA.",
    "The Eiffel Tower is in Paris.",
    "Steve Jobs co-founded Apple Inc.",
    "The Amazon rainforest is very large."
]


# Corresponding labels (1: B-PER, 2: I-PER, 3: B-ORG, 4: I-ORG, 5: B-LOC, 6: I-LOC)
labels = [
    [1, 2, 0, 0, 5, 0],  # Albert Einstein -> [B-PER, I-PER], Ulm -> [B-LOC]
    [3, 0, 0, 1, 2, 0, 0],  # Microsoft -> [B-ORG], Bill Gates -> [B-PER]
    [7, 0, 7, 0, 0],
    [1, 0, 0, 0, 0, 0],  # Barack Obama -> [B-PER], USA -> [B-LOC]
    [0, 0, 0, 0, 5, 0],  # Eiffel Tower -> [B-LOC]
    [1, 2, 0, 3, 4, 0],  # Steve Jobs -> [B-PER], Apple Inc. -> [B-ORG]
    [0, 0, 0, 0, 0, 0]  # No entities
]

# Updated label mapping
label_map = {
    "O": 0,      # Outside
    "B-PER": 1,  # Beginning of person entity
    "I-PER": 2,  # Inside a person entity
    "B-ORG": 3,  # Beginning of organization entity
    "I-ORG": 4,  # Inside an organization entity
    "B-LOC": 5,  # Beginning of location entity
    "I-LOC": 6,
    "B-ANM":7
}

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define maximum length for tokenized sequences
max_len = 20  # Maximum length of a sentence

# Initialize lists to hold input IDs, attention masks, and label IDs
input_ids = []
attention_masks = []
label_ids = []

# Tokenize sentences
for sent, label in zip(sentences, labels):
    # Print the current sentence and label for debugging
    print(f"Processing sentence: '{sent}' with label: {label}")

    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'  # Use 'pt' for PyTorch tensors
    )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    label_ids.append(label + [0] * (max_len - len(label)))  # Pad labels to `max_len`

# Check if the lists are populated
print(f"Number of input_ids: {len(input_ids)}")
print(f"Number of attention_masks: {len(attention_masks)}")
print(f"Number of label_ids: {len(label_ids)}")

# Convert lists to PyTorch tensors
input_ids_tensor = torch.cat(input_ids)
attention_masks_tensor = torch.cat(attention_masks)
label_ids_tensor = torch.tensor(label_ids)

# Print shapes after conversion
print(f"input_ids shape: {input_ids_tensor.shape}")
print(f"attention_masks shape: {attention_masks_tensor.shape}")
print(f"label_ids shape: {label_ids_tensor.shape}")

Processing sentence: 'Albert Einstein was born in Ulm.' with label: [1, 2, 0, 0, 5, 0]
Processing sentence: 'Microsoft was founded by Bill Gates and Paul Allen.' with label: [3, 0, 0, 1, 2, 0, 0]
Processing sentence: 'Cats and Dogs are Enemies' with label: [7, 0, 7, 0, 0]
Processing sentence: 'Barack Obama was the president of the USA.' with label: [1, 0, 0, 0, 0, 0]
Processing sentence: 'The Eiffel Tower is in Paris.' with label: [0, 0, 0, 0, 5, 0]
Processing sentence: 'Steve Jobs co-founded Apple Inc.' with label: [1, 2, 0, 3, 4, 0]
Processing sentence: 'The Amazon rainforest is very large.' with label: [0, 0, 0, 0, 0, 0]
Number of input_ids: 7
Number of attention_masks: 7
Number of label_ids: 7
input_ids shape: torch.Size([7, 20])
attention_masks shape: torch.Size([7, 20])
label_ids shape: torch.Size([7, 20])


In [67]:
from torch.utils.data import DataLoader, TensorDataset
# Create a dataset and DataLoader
dataset = TensorDataset(input_ids_tensor, attention_masks_tensor, label_ids_tensor)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [68]:
# Initialize the BERT model for token classification
num_labels = len(label_map)
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [69]:
# Define the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Set model to training mode
model.train()

# Training loop
for epoch in range(10):  # Train for 3 epochs
    for batch in dataloader:
        b_input_ids = batch[0]
        b_attention_masks = batch[1]
        b_label_ids = batch[2]

        optimizer.zero_grad()  # Clear previous gradients
        outputs = model(b_input_ids, attention_mask=b_attention_masks, labels=b_label_ids)
        loss = outputs[0]  # Model outputs are in the form of (loss, logits)
        loss.backward()  # Backpropagation
        optimizer.step()  # Update parameters

        print(f"Epoch: {epoch + 1}, Loss: {loss.item()}")

# Prediction Phase
model.eval()  # Set model to evaluation mode




Epoch: 1, Loss: 2.125270128250122
Epoch: 1, Loss: 1.7665526866912842
Epoch: 1, Loss: 1.4622747898101807
Epoch: 1, Loss: 1.2250856161117554
Epoch: 2, Loss: 0.7862497568130493
Epoch: 2, Loss: 0.3825218677520752
Epoch: 2, Loss: 0.6831575632095337
Epoch: 2, Loss: 0.5507920384407043
Epoch: 3, Loss: 0.49747714400291443
Epoch: 3, Loss: 0.38640812039375305
Epoch: 3, Loss: 0.6580871939659119
Epoch: 3, Loss: 0.3335500657558441
Epoch: 4, Loss: 0.4211150109767914
Epoch: 4, Loss: 0.30498963594436646
Epoch: 4, Loss: 0.3168627619743347
Epoch: 4, Loss: 0.6423942446708679
Epoch: 5, Loss: 0.18715675175189972
Epoch: 5, Loss: 0.3755376935005188
Epoch: 5, Loss: 0.4342638850212097
Epoch: 5, Loss: 0.1765187531709671
Epoch: 6, Loss: 0.3221607506275177
Epoch: 6, Loss: 0.23087266087532043
Epoch: 6, Loss: 0.06773670762777328
Epoch: 6, Loss: 0.44058284163475037
Epoch: 7, Loss: 0.21616104245185852
Epoch: 7, Loss: 0.22697961330413818
Epoch: 7, Loss: 0.10615048557519913
Epoch: 7, Loss: 0.27485552430152893
Epoch: 8, 

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [70]:
sentences = [
    "Albert Einstein was born in Ulm.",
    "Microsoft was founded by Bill Gates and Paul Allen.",
    "Cats and Dogs are Enemies",
]

# Tokenize new sentences
input_ids_pred = []
attention_masks_pred = []

for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids_pred.append(encoded_dict['input_ids'])
    attention_masks_pred.append(encoded_dict['attention_mask'])

input_ids_pred_tensor = torch.cat(input_ids_pred)
attention_masks_pred_tensor = torch.cat(attention_masks_pred)



In [71]:
# Set model to evaluation mode
model.eval()

# Define new sentences for prediction
new_sentences = [
    "Steve Jobs co-founded Apple in Cupertino.",
    "Bill Gates is a philanthropist.",
    "The Great Wall of China is remarkable.",
    "Albert Einstein was born in Ulm.",
    "Microsoft was founded by Bill Gates and Paul Allen.",
    "Cats and Dogs are Enemies"
]

# Tokenize new sentences
input_ids_pred = []
attention_masks_pred = []

for sent in new_sentences:
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_attention_mask=True,
        truncation=True,
        return_tensors='pt'
    )

    input_ids_pred.append(encoded_dict['input_ids'])
    attention_masks_pred.append(encoded_dict['attention_mask'])

# Convert to tensors
input_ids_pred_tensor = torch.cat(input_ids_pred)
attention_masks_pred_tensor = torch.cat(attention_masks_pred)

# Make predictions
with torch.no_grad():  # Disable gradient calculation
    outputs = model(input_ids_pred_tensor, attention_mask=attention_masks_pred_tensor)
    logits = outputs[0]  # Get the logits
    predictions = torch.argmax(logits, dim=2)  # Get predicted labels

# Decode predictions
for i in range(len(new_sentences)):
    predicted_labels = []
    for j in range(predictions.shape[1]):
        label_index = predictions[i][j].item()
        # Get the corresponding label from the mapping (0 corresponds to 'O', etc.)
        if label_index in label_map.values():
            for label, index in label_map.items():
                if index == label_index:
                    predicted_labels.append(label)
                    break
        else:
            predicted_labels.append("O") # Default to 'O' if not found

    # Print the results
    print(f"Sentence: '{new_sentences[i]}'")
    print(f"Predicted Labels: {predicted_labels}")


Sentence: 'Steve Jobs co-founded Apple in Cupertino.'
Predicted Labels: ['B-PER', 'I-PER', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Sentence: 'Bill Gates is a philanthropist.'
Predicted Labels: ['B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Sentence: 'The Great Wall of China is remarkable.'
Predicted Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Sentence: 'Albert Einstein was born in Ulm.'
Predicted Labels: ['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Sentence: 'Microsoft was founded by Bill Gates and Paul Allen.'
Predicted Labels: ['B-PER', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Sentence: 'Cats and Dogs are Enemies'
Predicted Labels: ['B-ANM', 'O', 'B-ANM', 'O', 'O', 'O', 'O', 'O'