In [1]:
!pip -q install -U transformers datasets accelerate evaluate seqeval scikit-learn

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [3]:
import transformers
import datasets
import evaluate
import seqeval

# Report the installed version of each core library, one line per library.
for label, library in (
    ("transformers:", transformers),
    ("datasets:", datasets),
    ("evaluate:", evaluate),
):
    print(label, library.__version__)

transformers: 4.57.3
datasets: 4.4.2
evaluate: 0.4.6


In [4]:
# Import check: confirms the seqeval package installed in the first cell can be
# imported. `classification_report` is not referenced again in the cells shown here.
from seqeval.metrics import classification_report
print("seqeval ready")

seqeval ready


In [7]:
from datasets import load_dataset

# Load CoNLL-2003 from the "refs/convert/parquet" revision of the dataset repo.
ner_ds = load_dataset("conll2003", revision="refs/convert/parquet")

print(ner_ds)
print("\nColumns:", ner_ds["train"].column_names)

# Show every field of the first training record.
print("\nExample:")
first_example = ner_ds["train"][0]
for field in first_example:
    print(field, ":", first_example[field])

conll2003/train/0000.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/312k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/283k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

Columns: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags']

Example:
id : 0
tokens : ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
pos_tags : [22, 42, 16, 21, 35, 37, 16, 21, 7]
chunk_tags : [11, 21, 11, 12, 21, 22, 11, 12, 0]
ner_tags : [3, 0, 7, 0, 0, 0, 7, 0, 0]


In [8]:
# Tag names of the `ner_tags` feature. Later cells index this list with the
# integer tag ids to turn them back into strings.
label_list = ner_ds["train"].features["ner_tags"].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [9]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer here and the model loaded in a later cell.
model_checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [10]:
# We will label only the FIRST sub-token of each word, and ignore the rest with -100
# This is standard practice for token classification with wordpiece tokenizers.

# NOTE(review): same expression as the earlier `label_list` cell; presumably
# repeated so this cell can be run without it — confirm, or drop the duplicate.
label_list = ner_ds["train"].features["ner_tags"].feature.names

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word-level NER tags.

    Args:
        examples: Batch mapping with "tokens" (one list of words per sentence)
            and "ner_tags" (one list of integer tag ids per sentence).

    Returns:
        The tokenizer output for the batch with an added "labels" entry. For
        each sentence, the first sub-token of every word carries that word's
        tag id; positions with no word (word id ``None``) and continuation
        sub-tokens carry -100.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    aligned = []
    for row, tags in enumerate(examples["ner_tags"]):
        ids = encoded.word_ids(batch_index=row)
        # Pair every position with the word id of the position before it
        # (None for the first position) so a change of word id marks the
        # first sub-token of a new word.
        preceding = [None] + list(ids[:-1])
        aligned.append([
            tags[current] if current is not None and current != before else -100
            for current, before in zip(ids, preceding)
        ])

    encoded["labels"] = aligned
    return encoded

# Run the alignment function over every split, one batch at a time.
tokenized_ner = ner_ds.map(tokenize_and_align_labels, batched=True)

print(tokenized_ner)
print("\nKeys:", tokenized_ner["train"].column_names)

# Sanity check on one example: fetch the raw and tokenized rows once each.
idx = 0
raw_example = ner_ds["train"][idx]
encoded_example = tokenized_ner["train"][idx]

print("\nExample tokens:", raw_example["tokens"][:20])
print("Example tags:", [label_list[tag] for tag in raw_example["ner_tags"][:20]])

print("\nTokenized input_ids length:", len(encoded_example["input_ids"]))
print("Tokenized labels length:", len(encoded_example["labels"]))
print("Number of ignored labels (-100):", encoded_example["labels"].count(-100))

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

Keys: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels']

Example tokens: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
Example tags: ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

Tokenized input_ids length: 12
Tokenized labels length: 12
Number of ignored labels (-100): 3


In [11]:
import numpy as np
import evaluate
from transformers import DataCollatorForTokenClassification

# Data collator for token classification; it is handed to the Trainer in a later cell.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# NOTE: this rebinds the global name `seqeval`, which an earlier cell imported as
# a module, to the metric object returned by evaluate.load. compute_metrics
# calls `.compute(...)` on this object.
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    Args:
        p: A ``(logits, labels)`` pair. The class id for each position is the
            argmax of ``logits`` over the last axis; ``labels`` holds the
            reference ids, with -100 marking positions to skip.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        corresponding ``overall_*`` entries of the seqeval result.
    """
    logits, labels = p
    predictions = np.argmax(logits, axis=-1)

    true_predictions, true_labels = [], []
    for pred_row, gold_row in zip(predictions, labels):
        # Keep only positions that carry a real label (anything other than -100).
        kept = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(pred_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    scores = seqeval.compute(predictions=true_predictions, references=true_labels)
    return dict(
        precision=scores["overall_precision"],
        recall=scores["overall_recall"],
        f1=scores["overall_f1"],
        accuracy=scores["overall_accuracy"],
    )

print("NER metrics ready ✅")

Downloading builder script: 0.00B [00:00, ?B/s]

NER metrics ready ✅


In [12]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

import torch

# Store the human-readable tag names in the model config so the saved checkpoint
# reports "B-PER", "I-LOC", ... instead of generic LABEL_0, LABEL_1, ...
id2label = dict(enumerate(label_list))
label2id = {tag_name: tag_id for tag_id, tag_name in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

args = TrainingArguments(
    output_dir="models_artifacts/conll2003_distilbert_ner",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=200,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    # fp16 mixed precision requires a CUDA device; fall back to full precision
    # elsewhere so the notebook still runs on CPU-only machines.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ner["train"],
    eval_dataset=tokenized_ner["validation"],
    # `tokenizer=` is deprecated on Trainer (it emits a FutureWarning and is
    # slated for removal in transformers v5); `processing_class` replaces it.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("NER Trainer ready ✅")

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NER Trainer ready ✅


  trainer = Trainer(


In [13]:
# Baseline: score the model on the validation split before any fine-tuning
# (the classifier head is still randomly initialised at this point).
trainer.evaluate()

{'eval_loss': 2.241661310195923,
 'eval_model_preparation_time': 0.0015,
 'eval_precision': 0.012512528694752497,
 'eval_recall': 0.06512958599798048,
 'eval_f1': 0.020992107618453524,
 'eval_accuracy': 0.035979907324481135,
 'eval_runtime': 3.07,
 'eval_samples_per_second': 1058.649,
 'eval_steps_per_second': 33.225}

In [14]:
# Fine-tune with the Trainer configured above: 3 epochs, with evaluation and a
# checkpoint at the end of each epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,Precision,Recall,F1,Accuracy
1,0.0646,0.057936,0.0015,0.901552,0.90929,0.905404,0.983684
2,0.0359,0.046334,0.0015,0.922267,0.928475,0.925361,0.987364
3,0.0208,0.044657,0.0015,0.928333,0.935207,0.931757,0.98824


TrainOutput(global_step=2634, training_loss=0.0664691525421635, metrics={'train_runtime': 158.3953, 'train_samples_per_second': 265.936, 'train_steps_per_second': 16.629, 'total_flos': 525319502290632.0, 'train_loss': 0.0664691525421635, 'epoch': 3.0})

In [15]:
# Evaluate on the held-out test split. The returned keys keep the "eval_" prefix
# even though these are test-set numbers.
test_metrics = trainer.evaluate(tokenized_ner["test"])
test_metrics

{'eval_loss': 0.11721900850534439,
 'eval_model_preparation_time': 0.0015,
 'eval_precision': 0.8784674063800277,
 'eval_recall': 0.8971317280453258,
 'eval_f1': 0.8877014716187805,
 'eval_accuracy': 0.9773016043932379,
 'eval_runtime': 4.0141,
 'eval_samples_per_second': 860.228,
 'eval_steps_per_second': 26.905,
 'epoch': 3.0}

In [16]:
# Save the trainer's current model and the tokenizer into the same directory,
# a "best" subfolder of the training output_dir.
save_dir = "models_artifacts/conll2003_distilbert_ner/best"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

print("NER model saved to:", save_dir)

NER model saved to: models_artifacts/conll2003_distilbert_ner/best
