In [2]:
pip install datasets transformers

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [3]:
from datasets import load_dataset
from transformers import BertTokenizerFast
import pickle
import numpy as np
from collections import Counter

In [4]:
# The `conll2003` repository ships a loading script. Opting in explicitly
# avoids the interactive "[y/N]" prompt, which would otherwise block an
# unattended Restart & Run All.
# NOTE(review): this executes code downloaded from the Hub — inspect
# https://hf.co/datasets/conll2003 before trusting it.
dataset = load_dataset("conll2003", trust_remote_code=True)
train_data = dataset["train"]
val_data = dataset["validation"]
test_data = dataset["test"]

# Cased BERT tokenizer; `tokenize_and_align_labels` relies on the `word_ids()`
# of the encodings it produces.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
def tokenize_and_align_labels(sentences, labels):
    """Tokenize pre-split sentences and expand word-level labels to token level.

    Uses the module-level `tokenizer`. For every produced token:

    * tokens without a source word (`word_ids()` yields ``None``) get ``-100``;
    * the first token of a word gets that word's label;
    * any further token of the same word repeats the word's label, unless that
      label is ``0``, in which case it gets ``-100``.

    Args:
        sentences: batch of sentences, each a list of word strings.
        labels: per-sentence lists of word-level label ids, parallel to
            `sentences`.

    Returns:
        A ``(encodings, token_labels)`` pair: the tokenizer output for the
        whole batch and one token-level label list per entry of `labels`.
    """
    encodings = tokenizer(
        sentences,
        padding=True,
        truncation=True,
        return_offsets_mapping=True,
        is_split_into_words=True
    )

    def _expand(batch_pos, word_tags):
        # Walk the token -> word mapping of one sentence, remembering the word
        # index of the previous token to detect continuation pieces.
        token_tags = []
        last_word = None
        for current_word in encodings.word_ids(batch_index=batch_pos):
            if current_word is None:
                tag = -100
            elif current_word != last_word:
                tag = word_tags[current_word]
            elif word_tags[current_word] != 0:
                tag = word_tags[current_word]
            else:
                tag = -100
            token_tags.append(tag)
            last_word = current_word
        return token_tags

    token_labels = [_expand(pos, tags) for pos, tags in enumerate(labels)]
    return encodings, token_labels

In [6]:
# Read whole columns instead of iterating the splits row by row: column access
# materialises only the requested field, while row iteration builds a dict
# with every field for each example (and was done twice per split).
train_sentences = list(train_data["tokens"])
train_labels = list(train_data["ner_tags"])

val_sentences = list(val_data["tokens"])
val_labels = list(val_data["ner_tags"])

test_sentences = list(test_data["tokens"])
test_labels = list(test_data["ner_tags"])

# Each split is tokenized as a single batch, so padding is per split.
tokenized_train, aligned_train_labels = tokenize_and_align_labels(train_sentences, train_labels)
tokenized_val, aligned_val_labels = tokenize_and_align_labels(val_sentences, val_labels)
tokenized_test, aligned_test_labels = tokenize_and_align_labels(test_sentences, test_labels)

print("Data preparation complete!")

Data preparation complete!


In [None]:
# Persist encodings and aligned labels for all three splits as one tuple,
# ordered (train, val, test) with each split stored as (encodings, labels).
with open("./ner_tokenized_data.pkl", "wb") as pickle_file:
    pickle.dump(
        (
            tokenized_train,
            aligned_train_labels,
            tokenized_val,
            aligned_val_labels,
            tokenized_test,
            aligned_test_labels,
        ),
        pickle_file,
    )

## Checking Dataset

In [12]:
# Count how often each label id occurs across all training tokens.
label_counts = Counter(np.concatenate(aligned_train_labels))

# -100 marks ignored positions, not a class, so drop it if present.
label_counts.pop(-100, None)

# Most frequent label first.
sorted_labels = label_counts.most_common()

for label, count in sorted_labels:
    print(f"{label}: {count}")

print(f"Total number of classes: {len(label_counts)}")

0: 169578
3: 13357
5: 13329
1: 12276
2: 10325
4: 6119
7: 5949
8: 1969
6: 1706
Total number of classes: 9


In [18]:
def check_b_i_consistency(aligned_labels, dataset_name, id2label=None):
    """Report `I-` tags that do not continue an entity of the same type.

    Labels arrive as integer ids, so they are first translated to tag names.
    Comparing `str(label_id)` against the "I-"/"B-" prefixes can never match
    and would make the check pass vacuously.

    An `I-X` tag is accepted only when the previous non-ignored tag in the
    same sentence is `B-X` or `I-X`; every sentence starts from an implicit
    "O". Positions labelled ``-100`` are skipped.

    Args:
        aligned_labels: iterable of per-sentence label sequences (ids, or tag
            strings which are used as-is).
        dataset_name: name used in the printed messages.
        id2label: optional mapping or sequence from label id to tag name.
            Defaults to the tag order of the Hugging Face `conll2003`
            `ner_tags` feature.

    Returns:
        None. Prints one line per violation and a final summary line.
    """
    if id2label is None:
        id2label = (
            "O",
            "B-PER", "I-PER",
            "B-ORG", "I-ORG",
            "B-LOC", "I-LOC",
            "B-MISC", "I-MISC",
        )

    errors = 0
    for sentence_labels in aligned_labels:
        prev_label = "O"
        for label in sentence_labels:
            if label == -100:
                continue
            label_str = label if isinstance(label, str) else id2label[label]

            if label_str.startswith("I-"):
                entity_type = label_str[2:]
                # Valid predecessors: the opening tag or another inside tag
                # of the very same entity type.
                if prev_label not in (f"B-{entity_type}", label_str):
                    print(f"⚠️ Invalid sequence: {prev_label} -> {label_str} w {dataset_name}")
                    errors += 1

            prev_label = label_str

    print(f"\nFound {errors} errors B/I in {dataset_name}")

# Run the B/I consistency check on every split, in train / val / test order.
for split_labels, split_name in (
    (aligned_train_labels, "Training set"),
    (aligned_val_labels, "Val set"),
    (aligned_test_labels, "Test set"),
):
    check_b_i_consistency(split_labels, split_name)


Found 0 errors B/I in Training set

Found 0 errors B/I in Val set

Found 0 errors B/I in Test set


In [15]:
# Length of the longest encoded training sequence.
max_length = max(map(len, tokenized_train["input_ids"]))
print(f"Max sequence length: {max_length} tokens")

Max sequence length: 173 tokens
