In [None]:
pip install torch transformers scikit-learn seqeval wandb datasets seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB

In [None]:
import pickle
import torch
import numpy as np
import wandb
from datasets import Dataset
from transformers import (
    BertForTokenClassification,
    BertTokenizerFast,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification
)
from seqeval.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score

In [None]:
# Pickle file holding the six objects unpacked below: tokenizer output and
# aligned labels for the train / validation / test splits, in that order.
pkl_file_path = "./ner_tokenized_data.pkl"

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted source.
with open(pkl_file_path, "rb") as pkl_handle:
    (
        tokenized_train,
        aligned_train_labels,
        tokenized_val,
        aligned_val_labels,
        tokenized_test,
        aligned_test_labels,
    ) = pickle.load(pkl_handle)

def convert_to_dataset(tokenized_inputs, aligned_labels):
    """Build a `Dataset` from tokenizer output and its aligned labels.

    Args:
        tokenized_inputs: Mapping-like object that provides the
            "input_ids" and "attention_mask" entries.
        aligned_labels: Label sequences stored under the "labels" column
            (presumably one sequence per example, aligned to the tokens —
            verify against the preprocessing step that wrote the pickle).

    Returns:
        A `Dataset` with the columns "input_ids", "attention_mask" and "labels".
    """
    columns = {key: tokenized_inputs[key] for key in ("input_ids", "attention_mask")}
    columns["labels"] = aligned_labels
    return Dataset.from_dict(columns)

# Wrap each (encodings, labels) pair in a Dataset; the order of the pairs
# matches the order of the names on the left: train, validation, test.
train_dataset, val_dataset, test_dataset = (
    convert_to_dataset(encodings, tags)
    for encodings, tags in (
        (tokenized_train, aligned_train_labels),
        (tokenized_val, aligned_val_labels),
        (tokenized_test, aligned_test_labels),
    )
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
MODEL_NAME = "bert-base-cased"

# Tag inventory. A tag's position in this list is its integer class id:
# compute_metrics maps ids back to tag strings by indexing into label_list.
label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"]

# Id <-> tag mappings stored in the model config, so the saved checkpoint
# reports real tag names instead of generic LABEL_0 ... LABEL_8 at inference.
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}

tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

# The size of the classification head is derived from label_list rather than
# hard-coded, so the two cannot drift apart when the tag set changes.
model = BertForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(eval_pred):
    """Compute precision, recall, F1 and token accuracy for an evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` holds
            per-class scores and is reduced with argmax over axis 2;
            ``labels`` holds integer class ids, with -100 marking positions
            that are excluded from scoring.

    Returns:
        dict with the keys "precision", "recall", "f1" and "accuracy".
        The first three come from the seqeval scorers over the tag
        sequences; "accuracy" is computed over all kept tokens flattened
        together.
    """
    scores, label_ids = eval_pred
    predicted_ids = np.argmax(scores, axis=2)

    gold_tags = []
    predicted_tags = []
    for pred_row, label_row in zip(predicted_ids, label_ids):
        # Keep only positions whose gold label is not the -100 ignore marker.
        kept = [(p, l) for p, l in zip(pred_row, label_row) if l != -100]
        gold_tags.append([label_list[l] for _, l in kept])
        predicted_tags.append([label_list[p] for p, _ in kept])

    return {
        "precision": precision_score(gold_tags, predicted_tags),
        "recall": recall_score(gold_tags, predicted_tags),
        "f1": f1_score(gold_tags, predicted_tags),
        "accuracy": accuracy_score(
            np.concatenate(gold_tags), np.concatenate(predicted_tags)
        ),
    }

In [None]:
# Training configuration: evaluate and checkpoint every 500 steps, keep at most
# two checkpoints on disk, and reload the checkpoint with the best "f1"
# (the key returned by compute_metrics) once training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    # NOTE: if the environment is pinned to an older transformers release that
    # predates the rename, use the old keyword instead.
    eval_strategy="steps",
    eval_steps=500,
    # Stated explicitly: load_best_model_at_end requires the save strategy to
    # match the evaluation strategy.
    save_strategy="steps",
    save_steps=500,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_steps=500,
    logging_steps=100,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=2,
    report_to="none",  # disable external experiment trackers
)

# Collator built from the tokenizer; it assembles the per-example features
# into batches for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument, which
    # emits a FutureWarning on recent transformers releases.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# Run fine-tuning with the arguments and datasets configured on `trainer`.
# Left as the cell's last expression so the returned training summary is displayed.
trainer.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
500,0.1915,0.166663,0.79678,0.821371,0.808888,0.953648
1000,0.1053,0.094892,0.89992,0.91082,0.905337,0.976146
1500,0.0667,0.081495,0.919423,0.915216,0.917315,0.97935
2000,0.0432,0.074568,0.927866,0.935941,0.931886,0.982538
2500,0.0424,0.083168,0.920078,0.931635,0.92582,0.981859
3000,0.0401,0.077544,0.935053,0.936479,0.935766,0.983267
3500,0.0285,0.077747,0.933941,0.937377,0.935656,0.983775
4000,0.0254,0.076352,0.939218,0.944106,0.941655,0.984826
4500,0.0187,0.07628,0.93725,0.944734,0.940977,0.984928
5000,0.0199,0.070346,0.942078,0.948502,0.945279,0.985657


TrainOutput(global_step=5268, training_loss=0.10272242578033615, metrics={'train_runtime': 1731.2988, 'train_samples_per_second': 24.33, 'train_steps_per_second': 3.043, 'total_flos': 3719263427291754.0, 'train_loss': 0.10272242578033615, 'epoch': 3.0})

In [None]:
# Score the test split with the trainer's current model, then write the model
# and the tokenizer into the same directory so they can be reloaded together.
export_dir = "./NER_HUGGING_FACE_3_EPOCH_BERT"

print("\n📊 Final Evaluation on Test Set:")
metrics = trainer.evaluate(eval_dataset=test_dataset)
print(metrics)

trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)


📊 Final Evaluation on Test Set:


{'eval_loss': 0.18106015026569366, 'eval_precision': 0.9033656062701706, 'eval_recall': 0.9088969292142128, 'eval_f1': 0.9061228264890862, 'eval_accuracy': 0.9734677584062088, 'eval_runtime': 33.1983, 'eval_samples_per_second': 104.011, 'eval_steps_per_second': 13.013, 'epoch': 3.0}


('/content/drive/MyDrive/NER_data/NER_HUGGING_FACE_3_EPOCH_BERT/tokenizer_config.json',
 '/content/drive/MyDrive/NER_data/NER_HUGGING_FACE_3_EPOCH_BERT/special_tokens_map.json',
 '/content/drive/MyDrive/NER_data/NER_HUGGING_FACE_3_EPOCH_BERT/vocab.txt',
 '/content/drive/MyDrive/NER_data/NER_HUGGING_FACE_3_EPOCH_BERT/added_tokens.json',
 '/content/drive/MyDrive/NER_data/NER_HUGGING_FACE_3_EPOCH_BERT/tokenizer.json')