In [1]:
import datasets
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer

In [2]:
# Load the CoNLL-2003 dataset
dataset = datasets.load_dataset("conll2003")

# Load a pre-trained NER model and tokenizer
model_checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

Downloading model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
# Set a consistent sequence length for tokenization and padding
consistent_sequence_length = 128  # You can adjust this as needed

# Define the data processing function
def tokenize_and_pad(examples):
    # Tokenize the input text and pad to the specified sequence length
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",  # Pad to the specified length
        max_length=consistent_sequence_length,
        is_split_into_words=True,
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        label_ids = label[:consistent_sequence_length] + [-100] * (consistent_sequence_length - len(label))
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [20]:
# Tokenize and preprocess the dataset using the data processing function
tokenized_datasets = dataset.map(tokenize_and_pad, batched=True)


Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [21]:
# Define training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir="./ner_model",
    evaluation_strategy="steps",
    eval_steps=500,
    save_total_limit=2,
    save_steps=500,
    learning_rate=2e-5,
    num_train_epochs=3,
    report_to="tensorboard",
    load_best_model_at_end=True,
)

In [22]:
# Define Trainer and start training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [None]:
# Start training the NER model
trainer.train()

Step,Training Loss,Validation Loss
500,0.4715,0.310439
1000,0.2728,0.260073


In [None]:
# Save the trained model to the specified output directory
model.save_pretrained("/content/ner_model")


In [None]:
# Evaluate the model's performance on the validation dataset
results = trainer.evaluate()
print(results)