In [1]:
!pip install transformers datasets evaluate seqeval

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3

In [3]:


import torch
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)
import numpy as np

# Load CoNLL-2003 dataset
dataset = load_dataset("conll2003")
print("Dataset loaded successfully!")

# Get label list
label_list = dataset["train"].features["ner_tags"].feature.names
num_labels = len(label_list)
print(f"Available labels: {label_list}")
print(f"Number of labels: {num_labels}")

# Tokenizer and Model
model_checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label={i: label for i, label in enumerate(label_list)},
    label2id={label: i for i, label in enumerate(label_list)}
)

# Tokenization function
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128
    )
    
    all_labels = []
    for i, labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(labels[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        
        # Pad labels to same length as input
        while len(label_ids) < len(tokenized_inputs["input_ids"][i]):
            label_ids.append(-100)
        
        all_labels.append(label_ids)
    
    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs

# Apply tokenization
tokenized_train = dataset["train"].map(tokenize_and_align_labels, batched=True)
tokenized_val = dataset["validation"].map(tokenize_and_align_labels, batched=True)
tokenized_test = dataset["test"].map(tokenize_and_align_labels, batched=True)

# Data Collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# Metric
metric = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, labels)
    ]
    
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# TrainingArguments
training_args = TrainingArguments(
    output_dir="./ner-model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    report_to="none",
    load_best_model_at_end=True,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train
print("Starting training...")
trainer.train()

# Evaluate
results = trainer.evaluate(tokenized_test)
print(f"Test results: {results}")

# Save model
trainer.save_model("./ner-model")
tokenizer.save_pretrained("./ner-model")
print("Model saved successfully!")

2025-09-04 08:04:26.910561: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756973067.156417      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756973067.225672      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


README.md: 0.00B [00:00, ?B/s]

conll2003.py: 0.00B [00:00, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Dataset loaded successfully!
Available labels: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
Number of labels: 9


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(


Starting training...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1801,0.057899,0.899385,0.911051,0.90518,0.98414
2,0.0379,0.047846,0.921791,0.929245,0.925503,0.987549
3,0.0205,0.045892,0.926056,0.934636,0.930326,0.98866


Test results: {'eval_loss': 0.12466607242822647, 'eval_precision': 0.8868154865713289, 'eval_recall': 0.9009567682494685, 'eval_f1': 0.8938301986289331, 'eval_accuracy': 0.978305362259517, 'eval_runtime': 7.4832, 'eval_samples_per_second': 461.432, 'eval_steps_per_second': 28.865, 'epoch': 3.0}
Model saved successfully!


In [4]:
from transformers import pipeline

# Create NER pipeline - yeh automatically device handle karega
ner_pipeline = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1  # 0 for GPU, -1 for CPU
)
# Test inference
test_texts = [
    "Apple is looking at buying U.K. startup for $1 billion",
    "John Smith works at Microsoft in New York",
    "I visited Paris and met Marie Curie"
]

print("🧪 Model Predictions:")
print("=" * 50)

for text in test_texts:
    print(f"\nText: {text}")
    results = ner_pipeline(text)
    
    if results:
        for entity in results:
            print(f"  {entity['word']} -> {entity['entity_group']} (confidence: {entity['score']:.3f})")
    else:
        print("  No entities detected")

Device set to use cuda:0


🧪 Model Predictions:

Text: Apple is looking at buying U.K. startup for $1 billion
  Apple -> ORG (confidence: 0.996)
  U. K -> LOC (confidence: 0.806)

Text: John Smith works at Microsoft in New York
  John Smith -> PER (confidence: 0.999)
  Microsoft -> ORG (confidence: 0.997)
  New York -> LOC (confidence: 0.998)

Text: I visited Paris and met Marie Curie
  Paris -> LOC (confidence: 0.999)
  Marie Curie -> PER (confidence: 0.815)


In [5]:
import shutil

# Zip the folder
shutil.make_archive("/kaggle/working/ner-model", 'zip', "/kaggle/working/ner-model")


'/kaggle/working/ner-model.zip'