In [None]:
# !pip install transformers datasets torch accelerate seqeval
# For GPU training
# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [1]:
!pip install seqeval

In [2]:
!pip install -U datasets


In [None]:
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from seqeval.metrics import accuracy_score, classification_report, f1_score
import numpy as np


In [None]:

# Load ModernBERT tokenizer and model
model_name = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:

# Example: Load CoNLL-2003 dataset for NER
# You can replace this with your own dataset
dataset = load_dataset("conll2003")

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [None]:
# Define label names and create label mappings
label_list = dataset["train"].features["ner_tags"].feature.names
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for i, label in enumerate(label_list)}

print(f"Labels: {label_list}")
print(f"Number of labels: {len(label_list)}")

Labels: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
Number of labels: 9


In [4]:
# Initialize model for token classification
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize inputs and align labels with tokenized tokens"""
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding=False
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens get -100 label (ignored in loss)
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First token of a word gets the label
                label_ids.append(label[word_idx])
            else:
                # Subsequent tokens of same word get -100 (or same label)
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [5]:
# Tokenize datasets
tokenized_train = dataset["train"].map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names
)
tokenized_valid = dataset["validation"].map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["validation"].column_names
)

In [None]:

# Data collator
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True
)


In [None]:
def compute_metrics(eval_pred):
    """Compute seqeval metrics for NER evaluation"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [id_to_label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id_to_label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    }

In [None]:


# Training arguments
training_args = TrainingArguments(
    output_dir="./modernbert-ner",
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    logging_steps=100,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=2e-5,
    fp16=True,  # Use mixed precision if you have a compatible GPU
    dataloader_pin_memory=False,
    remove_unused_columns=True,  # Change from False to True
    push_to_hub=False,  # Set to True if you want to push to HuggingFace Hub
)


In [None]:
# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [None]:
# Train the model
print("Starting training...")
trainer.train()

Starting training...


W0628 18:37:21.148000 1914 torch/_inductor/utils.py:1137] [1/0] Not enough SMs to use max_autotune_gemm mode


Step,Training Loss,Validation Loss,Accuracy,F1
500,0.1179,0.104912,0.971029,0.82414
1000,0.0398,0.057582,0.985398,0.910766
1500,0.0278,0.051213,0.988026,0.929259
2000,0.0123,0.04922,0.988883,0.934884
2500,0.0113,0.046781,0.989155,0.936273


TrainOutput(global_step=2634, training_loss=0.10961505901931178, metrics={'train_runtime': 668.867, 'train_samples_per_second': 62.977, 'train_steps_per_second': 3.938, 'total_flos': 1577408010395238.0, 'train_loss': 0.10961505901931178, 'epoch': 3.0})

In [None]:
# Save the model
trainer.save_model()
tokenizer.save_pretrained("./modernbert-ner")

('./modernbert-ner/tokenizer_config.json',
 './modernbert-ner/special_tokens_map.json',
 './modernbert-ner/tokenizer.json')

In [None]:
# Evaluate on test set
test_results = trainer.evaluate(tokenized_valid)
print(f"Test results: {test_results}")

print("Training completed!")

Test results: {'eval_loss': 0.04691711440682411, 'eval_accuracy': 0.9892527549550251, 'eval_f1': 0.9363408521303258, 'eval_runtime': 16.8501, 'eval_samples_per_second': 192.877, 'eval_steps_per_second': 12.107, 'epoch': 3.0}
Training completed!


In [None]:
# Inference on new text
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
test_text = "Amazon founder Jeff Bezos, sourish and Tesla CEO Elon Musk attended the World Economic Forum in Davos, Switzerland, where they discussed space exploration with NASA administrator Bill Nelson and European Space Agency director Josef Aschbacher."
inputs = tokenizer(test_text, return_tensors="pt", truncation=True, padding=True)
inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to same device as model
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    predicted_labels = [id_to_label[pred.item()] for pred in predictions[0]]

    # Better aligned output
    print(f"\nInput text: {test_text}")
    print("-" * 50)
    print(f"{'Token':<15} {'Label':<10}")
    print("-" * 50)
    for token, label in zip(tokens, predicted_labels):
        print(f"{token:<15} {label:<10}")


Input text: Amazon founder Jeff Bezos, sourish and Tesla CEO Elon Musk attended the World Economic Forum in Davos, Switzerland, where they discussed space exploration with NASA administrator Bill Nelson and European Space Agency director Josef Aschbacher.
--------------------------------------------------
Token           Label     
--------------------------------------------------
[CLS]           O         
Amazon          B-ORG     
Ġfounder        O         
ĠJeff           B-PER     
ĠBe             I-PER     
zos             I-PER     
,               O         
Ġsour           B-PER     
ish             O         
Ġand            O         
ĠTesla          B-ORG     
ĠCEO            O         
ĠEl             B-PER     
on              I-PER     
ĠMusk           I-PER     
Ġattended       O         
Ġthe            O         
ĠWorld          B-MISC    
ĠEconomic       I-MISC    
ĠForum          I-MISC    
Ġin             O         
ĠDav            B-LOC     
os              I-LO

In [None]:
# ignore first and last special characters

In [7]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("joe-xhedi/ModernBERT-NER")
model = AutoModelForTokenClassification.from_pretrained("joe-xhedi/ModernBERT-NER")

# Create NER pipeline
ner_pipeline = pipeline("ner",
                       model=model,
                       tokenizer=tokenizer,
                       aggregation_strategy="simple")

# Example usage
text = "John Doe works at OpenAI in San Francisco."
results = ner_pipeline(text)
print(results)


Device set to use cuda:0 

[{'entity_group': 'PER', 'score': np.float32(0.9963356), 'word': 'John Doe', 'start': 0, 'end': 8}, {'entity_group': 'ORG', 'score': np.float32(0.98997074), 'word': ' OpenAI', 'start': 17, 'end': 24}, {'entity_group': 'LOC', 'score': np.float32(0.82463753), 'word': ' San Francisco', 'start': 27, 'end': 41}]
