In [2]:
from datasets import load_dataset
from transformers import BertTokenizerFast, BertForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

dataset = load_dataset("conll2003")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [3]:
# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

# Tokenize the dataset and align labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        padding=True,  # Apply padding
        max_length=128,
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # Only label the first token of a given word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the tokenization and alignment function to the dataset
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Remove columns not needed for training
tokenized_dataset = tokenized_dataset.remove_columns(["tokens", "pos_tags", "chunk_tags", "ner_tags"])

# Set the format to PyTorch tensors
tokenized_dataset.set_format("torch")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [4]:
# Load the BERT model for token classification
model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(dataset['train'].features['ner_tags'].feature.names))

# Data collator that will dynamically pad the inputs received, as well as the labels.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(results)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.1591,0.039387
2,0.0283,0.036488
3,0.0155,0.036638


{'eval_loss': 0.03663837909698486, 'eval_runtime': 19.3991, 'eval_samples_per_second': 167.534, 'eval_steps_per_second': 10.516, 'epoch': 3.0}


In [5]:
# Evaluate the model
results = trainer.evaluate()
print(results)

{'eval_loss': 0.03663837909698486, 'eval_runtime': 19.7711, 'eval_samples_per_second': 164.381, 'eval_steps_per_second': 10.318, 'epoch': 3.0}


In [8]:
import torch
# Load the tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

# Example sentences
sentences = [
    "Hugging Face Inc. is a company based in New York City.",
    "Bert is a neural network-based technique for natural language processing."
]

# Tokenize sentences
tokenized_inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

# Move inputs to the appropriate device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Move tokenized inputs to the device
input_ids = tokenized_inputs["input_ids"].to(device)
attention_mask = tokenized_inputs["attention_mask"].to(device)

# Get model predictions
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)

# Get the logits
logits = outputs.logits

# Convert logits to predictions
predictions = torch.argmax(logits, dim=2).cpu().numpy()

# Get the label names
label_names = dataset['train'].features['ner_tags'].feature.names

# Function to align predictions with the input tokens
def align_predictions(predictions, label_ids):
    preds = []
    for pred in predictions:
        preds.append([label_names[p] for p in pred])
    return preds

# Align the predictions
aligned_predictions = align_predictions(predictions, input_ids.cpu().numpy())

# Print the results
for sentence, tokens, preds in zip(sentences, tokenized_inputs["input_ids"], aligned_predictions):
    print(f"Sentence: {sentence}")
    tokenized_tokens = tokenizer.convert_ids_to_tokens(tokens)
    print("Tokenized Tokens:", tokenized_tokens)
    print("Predictions:", preds)

Sentence: Hugging Face Inc. is a company based in New York City.
Tokenized Tokens: ['[CLS]', 'Hu', '##gging', 'Face', 'Inc', '.', 'is', 'a', 'company', 'based', 'in', 'New', 'York', 'City', '.', '[SEP]']
Predictions: ['O', 'B-ORG', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'O', 'O']
Sentence: Bert is a neural network-based technique for natural language processing.
Tokenized Tokens: ['[CLS]', 'Bert', 'is', 'a', 'neural', 'network', '-', 'based', 'technique', 'for', 'natural', 'language', 'processing', '.', '[SEP]', '[PAD]']
Predictions: ['O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
