In [49]:
# transformers not support NumPy 2.0 yet
!pip install -q numpy~=1.26.4 transformers~=4.46.2
!pip install -q datasets seqeval matplotlib pydantic

In [76]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import json

from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import load_dataset, DatasetDict
from transformers import DataCollatorForTokenClassification
from transformers import pipeline
from seqeval.metrics import classification_report
import matplotlib.pyplot as plt

from pydantic import BaseModel
from pprint import pprint

import torch

# Pick the compute device: Apple MPS when available, otherwise CUDA, otherwise CPU.
if torch.backends.mps.is_available():
  device = torch.device("mps")
elif torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

# Review dataset

In [None]:
# Load the PII NER dataset "acram/pii_ner" from the Hugging Face Hub.
# https://huggingface.co/datasets/acram/pii_ner
# NOTE(review): trust_remote_code=True permits running a loading script shipped with
# the dataset repository on this machine -- confirm the repository has been reviewed.
dataset = load_dataset("acram/pii_ner", trust_remote_code=True)

# Bare expression so the notebook displays the loaded object.
dataset

In [None]:
# Show the first five training examples as a table.
# Column width is left unlimited so long text cells are not truncated in the display.
pd.set_option("display.max_colwidth", None)
pd.DataFrame(dataset["train"].select(range(5)))

# Define the BIO (Beginning, Inside, Outside) tagging scheme

In [None]:
# Run settings gathered in a single pydantic model.
class Config(BaseModel):
  seed: int = 42
  model_name: str = "bert-base-cased"  # pretrained backbone checkpoint
  # NOTE(review): 1024 exceeds the 512 positions BERT-base checkpoints normally
  # support -- confirm how this value is used when tokenizing.
  train_seq_len: int = 1024  # maximum input sequence length for training
  train_batch_size: int = 4  # examples per training batch
  eval_batch_size: int = 4  # examples per evaluation batch
  epochs: int = 3  # number of training epochs
  lr: float = 2e-5  # learning rate
  # Label set in BIO (Beginning, Inside, Outside) format.
  labels: list = [
    'O',
    'B-NAME_STUDENT', 'I-NAME_STUDENT',
    'B-STREET_ADDRESS', 'I-STREET_ADDRESS',
    'B-USERNAME', 'B-EMAIL', 'B-URL_PERSONAL', 'B-PHONE_NUM', 'B-DRIVING_LICENSE',
    'B-PASSPORT', 'B-PAN_NUMBER', 'B-ID_NUM', 'B-AADHAR_ID',
  ]
  # The three defaults below are evaluated once, while the class body runs, from the
  # default `labels` above; overriding `labels` on an instance does not update them.
  id2label: dict = {idx: tag for idx, tag in enumerate(labels)}  # integer id -> BIO tag
  label2id: dict = {tag: idx for idx, tag in id2label.items()}  # BIO tag -> integer id
  num_labels: int = len(labels)  # number of PII (NER) tags

config = Config()
# Print the tag -> id mapping ordered by id.
pprint(sorted(config.label2id.items(), key=lambda pair: pair[1]))


In [None]:
# Build a token-classification pipeline from a part-of-speech checkpoint on the
# selected device, then run it on one sample sentence.
model = "vblagoje/bert-english-uncased-finetuned-pos"
pos_classifier = pipeline("token-classification", model=model, device=device)
pos_classifier("My name is Frank, my email is frank@gmail.com.")