In [1]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the CoNLL-2003 dataset
dataset = load_dataset("conll2003")
print(dataset)

README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})


In [3]:
print(dataset["train"][0])

{'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}


In [4]:
# Initialize the tokenizer for BERT
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [5]:
# Access the NER label mappings
label_names = dataset["train"].features["ner_tags"].feature.names
print(label_names)

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [6]:
# Example of a sentence and its corresponding labels for demonstration
example_sentence = dataset["train"][0]["tokens"]
example_labels = dataset["train"][0]["ner_tags"]

# Tokenize the sentence
encoding = tokenizer(example_sentence, is_split_into_words=True)
print("Tokens:", encoding.tokens())

Tokens: ['[CLS]', 'eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.', '[SEP]']


In [7]:
def align_labels_with_tokens(labels, word_ids):
    aligned_labels = []
    previous_word_id = None
    for word_id in word_ids:
        if word_id is None:
            # Special tokens (like [CLS] and [SEP]) should have no label
            aligned_labels.append(-100)
        elif word_id != previous_word_id:
            # Assign label to the first subword token
            aligned_labels.append(labels[word_id])
        else:
            # Ignore the label for other subword tokens
            aligned_labels.append(-100)
        previous_word_id = word_id
    return aligned_labels


In [12]:
def tokenize_and_align_labels(batch):
    # Tokenize the input text with truncation and padding to a fixed length
    encodings = tokenizer(
        batch["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=128
    )

    # Align labels with tokens
    all_labels = []
    for i in range(len(encodings["input_ids"])):
        word_ids = encodings.word_ids(batch_index=i)  # Get word IDs for each example
        labels = batch["ner_tags"][i]
        aligned_labels = align_labels_with_tokens(labels, word_ids)
        # Pad labels to the max length
        aligned_labels = aligned_labels + [-100] * (128 - len(aligned_labels))
        all_labels.append(aligned_labels)

    # Update the encoding dictionary with the aligned labels
    encodings["labels"] = all_labels
    return encodings

# Apply the preprocessing function to the dataset
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [13]:
# Check the first example in the tokenized dataset
sample = tokenized_dataset["train"][0]
print("Tokens:", tokenizer.convert_ids_to_tokens(sample["input_ids"]))
print("Labels:", sample["labels"])

Tokens: ['[CLS]', 'eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]

In [14]:
from transformers import AutoModelForTokenClassification

# Load the pre-trained BERT model for token classification
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(label_names))

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
training_args = TrainingArguments(
    output_dir="./results",           # Directory to save the model checkpoints
    evaluation_strategy="epoch",      # Evaluate the model at the end of each epoch
    learning_rate=3e-5,               # Typical learning rate for fine-tuning transformers
    per_device_train_batch_size=8,    # Batch size for training
    per_device_eval_batch_size=8,     # Batch size for evaluation
    num_train_epochs=3,               # Number of training epochs
    weight_decay=0.01,                # Weight decay to prevent overfitting
    report_to="none"                  # Disables logging to wandb
)



In [19]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer
)

  trainer = Trainer(


In [20]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0609,0.048082


Epoch,Training Loss,Validation Loss
1,0.0609,0.048082
2,0.0273,0.053611
3,0.0125,0.053376


TrainOutput(global_step=5268, training_loss=0.04589948847878586, metrics={'train_runtime': 1426.3639, 'train_samples_per_second': 29.532, 'train_steps_per_second': 3.693, 'total_flos': 2751824963545344.0, 'train_loss': 0.04589948847878586, 'epoch': 3.0})

In [21]:
results = trainer.evaluate()
print("Evaluation Results:", results)

Evaluation Results: {'eval_loss': 0.05337599664926529, 'eval_runtime': 24.4175, 'eval_samples_per_second': 133.101, 'eval_steps_per_second': 16.668, 'epoch': 3.0}


In [23]:
import torch

def ner_prediction(sentence):
    # Tokenize the input sentence and get the token IDs
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True)

    # Move inputs to the appropriate device if using GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    model.to(device)

    # Get predictions from the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted labels for each token
    predictions = torch.argmax(outputs.logits, dim=2)

    # Decode tokens and predicted labels
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    predicted_labels = [label_names[label] for label in predictions[0].cpu().numpy()]

    # Merge subwords for better readability
    merged_tokens = []
    merged_labels = []
    current_token = ""
    current_label = None

    for token, label in zip(tokens, predicted_labels):
        if token.startswith("##"):
            # If it's a subword, add it to the current token
            current_token += token[2:]
        else:
            # Append the previous token and label if they exist
            if current_token:
                merged_tokens.append(current_token)
                merged_labels.append(current_label)
            # Start a new token
            current_token = token
            current_label = label
    # Append the final token and label
    if current_token:
        merged_tokens.append(current_token)
        merged_labels.append(current_label)

    # Print the merged tokens with their labels
    for token, label in zip(merged_tokens, merged_labels):
        if label != "O":  # Only print tokens with named entity labels
            print(f"{token:10} : {label}")

# Example sentence for testing
sentence = "Apple was founded by Steve Jobs in Cupertino, California."
ner_prediction(sentence)

apple      : B-ORG
steve      : B-PER
jobs       : I-PER
cupertino  : B-LOC
california : B-LOC


In [27]:
# Save the model and tokenizer
model.save_pretrained("./ner_model")
tokenizer.save_pretrained("./ner_model")

('./ner_model/tokenizer_config.json',
 './ner_model/special_tokens_map.json',
 './ner_model/vocab.txt',
 './ner_model/added_tokens.json',
 './ner_model/tokenizer.json')