In [1]:
# Install required libraries
#!pip install transformers torch datasets seqeval

# Verify installations
import transformers
import torch
print(f"Transformers version: {transformers.__version__}")
print(f"PyTorch version: {torch.__version__}")

# Basic BERT architecture explanation
#
# Kept as comments rather than a bare triple-quoted string: a string expression
# at the end of a cell is echoed by Jupyter as an escaped repr in the output.
#
# BERT (Bidirectional Encoder Representations from Transformers):
# - Transformer-based architecture
# - 12/24 layers (base/large)
# - 768/1024 hidden units (base/large)
# - Pre-trained on masked language modeling and next sentence prediction
# - Special tokens: [CLS] (sequence-level classification) and [SEP] (segment separator)

Transformers version: 4.48.2
PyTorch version: 2.5.1+cu124


'\nBERT (Bidirectional Encoder Representations from Transformers):\n- Transformer-based architecture\n- 12/24 layers (base/large)\n- 768/1024 hidden units (base/large)\n- Pre-trained on masked language modeling and next sentence prediction\n- Special [CLS] and [SEP] tokens for sentence classification\n'

In [3]:
#!pip install datasets transformers torch seqeval

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (f

In [4]:
from datasets import DatasetDict, Dataset
from transformers import BertTokenizerFast

# Custom CoNLL-2003 parser
def read_conll(file_path):
    """Parse a CoNLL-style column file into parallel token / tag sequences.

    Each non-blank line is split on whitespace; the first column is taken as
    the token and the last column as its NER tag. Blank lines end the current
    sentence. Lines starting with ``-DOCSTART-`` are skipped.

    Args:
        file_path: Path of the file to read.

    Returns:
        dict with two keys:
            "tokens":   list of sentences, each a list of token strings.
            "ner_tags": list of sentences, each a list of tag strings,
                        aligned one-to-one with "tokens".
    """
    tokens = []
    ner_tags = []
    current_tokens = []
    current_tags = []

    # Explicit encoding so parsing does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line.startswith("-DOCSTART-"):
                continue
            if not line:
                # Blank line: sentence boundary. Ignore runs of blank lines.
                if current_tokens:
                    tokens.append(current_tokens)
                    ner_tags.append(current_tags)
                    current_tokens = []
                    current_tags = []
                continue
            parts = line.split()
            current_tokens.append(parts[0])
            current_tags.append(parts[-1])

    # Flush the last sentence: without this, a file that does not end with a
    # blank line would silently lose its final sentence.
    if current_tokens:
        tokens.append(current_tokens)
        ner_tags.append(current_tags)

    return {"tokens": tokens, "ner_tags": ner_tags}

# Load datasets
# One Dataset per CoNLL-2003 split, each parsed from its own file.
raw_datasets = DatasetDict({
    split_name: Dataset.from_dict(read_conll(split_path))
    for split_name, split_path in (
        ("train", "eng.train"),
        ("validation", "eng.testa"),
        ("test", "eng.testb"),
    )
})

# Initialize tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

# Label mapping
# A tag's position in label_list is its integer class id.
label_list = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
label2id = dict(zip(label_list, range(len(label_list))))
id2label = dict(enumerate(label_list))

# Tokenization function
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Uses the module-level ``tokenizer`` and ``label2id``. Every sequence is
    truncated / padded to 128 positions.

    Args:
        examples: batch mapping with "tokens" (list of word lists) and
            "ner_tags" (list of tag-string lists, one tag per word).

    Returns:
        The tokenizer output with an added "labels" entry: for each sequence,
        the first sub-token of every word carries ``label2id`` of the word's
        tag, and every other position (positions whose word id is None, and
        continuation sub-tokens of a word) carries -100.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=128,
        padding="max_length",
    )

    aligned = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            # Only the first sub-token of a real word receives the word's tag.
            starts_new_word = word is not None and word != last_word
            if starts_new_word:
                row_labels.append(label2id[word_tags[word]])
            else:
                row_labels.append(-100)
            last_word = word
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

# Process datasets
# Run the alignment function batch-wise over every split and drop the raw
# columns, leaving only the tokenizer outputs and the aligned labels.
tokenized_datasets = raw_datasets.map(
    function=tokenize_and_align_labels,
    remove_columns=raw_datasets["train"].column_names,
    batched=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [5]:
from transformers import BertForTokenClassification, TrainingArguments, Trainer

# Load model
# bert-base-cased with a token-classification head sized to the NER tag set.
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

# Training configuration
training_args = TrainingArguments(
    output_dir="ner_model",
    # `evaluation_strategy` is deprecated in favour of `eval_strategy`
    # (transformers >= 4.41; this notebook runs 4.48.2).
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_strategy="epoch",
    report_to="none"
)

# Initialize trainer
# Inputs are already padded to a fixed length by tokenize_and_align_labels,
# so no custom data collator is passed here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"]
)

# Start training
trainer.train()

# Save model
# Model and tokenizer are written to the same directory.
trainer.save_model("ner_model")
tokenizer.save_pretrained("ner_model")

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.1634,0.040387
2,0.0281,0.0381
3,0.0146,0.037397


('ner_model/tokenizer_config.json',
 'ner_model/special_tokens_map.json',
 'ner_model/vocab.txt',
 'ner_model/added_tokens.json',
 'ner_model/tokenizer.json')

In [6]:
#!pip install seqeval

In [7]:
# Evaluate the fine-tuned model on the test split with seqeval (per-entity-type precision/recall/F1)
from seqeval.metrics import classification_report
import numpy as np

# Get predictions
# Highest-scoring class id for every token position of every test sequence.
predictions = trainer.predict(tokenized_datasets["test"])
preds = np.argmax(predictions.predictions, axis=2)

# Convert labels and predictions to text format
true_labels = []
pred_labels = []

for i, pred_row in enumerate(preds):
    example = tokenized_datasets["test"][i]

    # Drop padded positions first (attention_mask == 0) ...
    keep = np.asarray(example["attention_mask"], dtype=bool)
    gold_row = np.array(example["labels"])[keep]
    kept_preds = pred_row[keep]

    # ... then drop positions labelled -100, i.e. those that
    # tokenize_and_align_labels did not assign a tag to.
    scored = gold_row != -100
    true_labels.append([id2label[gold_id] for gold_id in gold_row[scored]])
    pred_labels.append([id2label[pred_id] for pred_id in kept_preds[scored]])

# Generate classification report
print(classification_report(true_labels, pred_labels))

              precision    recall  f1-score   support

         LOC       0.93      0.93      0.93      1666
        MISC       0.78      0.84      0.81       702
         ORG       0.88      0.91      0.90      1661
         PER       0.97      0.96      0.96      1615

   micro avg       0.91      0.92      0.91      5644
   macro avg       0.89      0.91      0.90      5644
weighted avg       0.91      0.92      0.92      5644

