<a href="https://colab.research.google.com/github/joshIsac/LargeLanguageModel/blob/main/2348523_LLM_lab5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
!pip install transformers



In [54]:
import tensorflow as tf
from transformers import BertTokenizer,TFBertForTokenClassification
from transformers import BertConfig
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

In [75]:
# Load model directly
from transformers import AutoTokenizer, TFAutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = TFAutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

All PyTorch model weights were used when initializing TFBertForTokenClassification.

All the weights of TFBertForTokenClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.


In [106]:
sentences = [
    "Albert Einstein was born in Ulm.",
    "Microsoft was founded by Bill Gates and Paul Allen.",
]

#assign label for each sentence
labels = [
    [1, 2, 0, 0, 0, 0],
    [1, 0, 0, 0, 1, 2, 0]
]


max_len = 20  # Maximum length of a sentence

# Initialize lists
input_ids = []
attention_masks = []
label_ids = []

# Tokenize sentences
for sent, label in zip(sentences, labels):
    # Print the current sentence and label for debugging
    print(f"Processing sentence: '{sent}' with label: {label}")

    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='tf'
    )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    label_ids.append(label + [0] * (max_len - len(label)))

# Check if the lists are populated
print(f"Number of input_ids: {len(input_ids)}")
print(f"Number of attention_masks: {len(attention_masks)}")
print(f"Number of label_ids: {len(label_ids)}")

# Convert lists to numpy arrays
input_ids = np.array(input_ids)
attention_masks = np.array(attention_masks)
label_ids = np.array(label_ids)

# Print shapes after conversion
print(f"input_ids shape: {input_ids.shape}")
print(f"attention_masks shape: {attention_masks.shape}")
print(f"label_ids shape: {label_ids.shape}")



Processing sentence: 'Albert Einstein was born in Ulm.' with label: [1, 2, 0, 0, 0, 0]
Processing sentence: 'Microsoft was founded by Bill Gates and Paul Allen.' with label: [1, 0, 0, 0, 1, 2, 0]
Number of input_ids: 2
Number of attention_masks: 2
Number of label_ids: 2
input_ids shape: (2, 1, 20)
attention_masks shape: (2, 1, 20)
label_ids shape: (2, 20)


In [119]:
# Create TensorFlow dataset
dataset = tf.data.Dataset.from_tensor_slices(({
    'input_ids': input_ids,
    'attention_mask': attention_masks
}, label_ids))

# Check the contents of the dataset
for element in dataset:
    print(element)

({'input_ids': <tf.Tensor: shape=(1, 20), dtype=int32, numpy=
array([[  101,  3986, 16127,  1108,  1255,  1107,   158, 13505,   119,
          102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 20), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int32)>}, <tf.Tensor: shape=(20,), dtype=int64, numpy=array([1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])>)
({'input_ids': <tf.Tensor: shape=(1, 20), dtype=int32, numpy=
array([[  101,  6998,  1108,  1771,  1118,  2617, 12702,  1105,  1795,
         4522,   119,   102,     0,     0,     0,     0,     0,     0,
            0,     0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 20), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int32)>}, <tf.Tensor: shape=(20,), dtype=int64, numpy=array([1, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0,

In [120]:
# Compile the model
optimizer ='adam'
loss = 'sparse_categorical_crossentropy'
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])


In [121]:
# Train the model
model.fit(dataset, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7b445aa49a50>

In [110]:
# prompt: #implement NER on the Sentences above

# Predict on new sentences
new_sentences = [
    "Albert Einstein was born in Ulm.",
    "Microsoft was founded by Bill Gates and Paul Allen.",
]


# Tokenize and prepare new sentences
input_ids_new = []
attention_masks_new = []

for sent in new_sentences:
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='tf'
    )
    input_ids_new.append(encoded_dict['input_ids'])
    attention_masks_new.append(encoded_dict['attention_mask'])

input_ids_new = np.array(input_ids_new)
attention_masks_new = np.array(attention_masks_new)

# Make predictions
predictions = model.predict({
    'input_ids': input_ids_new.squeeze(),
    'attention_mask': attention_masks_new.squeeze()
})

# Process predictions
predicted_labels = np.argmax(predictions.logits, axis=-1)

# Decode labels (replace with your actual label mapping)
id_to_label = {0: 'O', 1: 'PER', 2: 'ORG', 3: 'LOC'}

for sent, predicted_ids in zip(new_sentences, predicted_labels):
    tokens = tokenizer.tokenize(sent)
    predicted_tags = [id_to_label[id] for id in predicted_ids]
    print(f"Sentence: {sent}")
    for token, tag in zip(tokens, predicted_tags):
        print(f"{token}: {tag}")
    print("----")


Sentence: Albert Einstein was born in Ulm.
Albert: O
Einstein: O
was: O
born: O
in: O
U: O
##lm: O
.: O
----
Sentence: Microsoft was founded by Bill Gates and Paul Allen.
Microsoft: O
was: O
founded: O
by: O
Bill: O
Gates: O
and: O
Paul: O
Allen: O
.: O
----
