In [344]:
# transformers does not support NumPy 2.0 yet, so NumPy is held on the 1.26 series.
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install -q numpy~=1.26.4 transformers~=4.46.2
%pip install -q datasets seqeval matplotlib pydantic

In [345]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import json

from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import load_dataset, DatasetDict
from transformers import DataCollatorForTokenClassification
from transformers import pipeline
from seqeval.metrics import classification_report

from pydantic import BaseModel
from pprint import pprint

import torch

# Pick the compute device: Apple MPS takes priority, then CUDA, otherwise CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# 下載資料

In [None]:
# Download the PII External Dataset archive from Kaggle
# NOTE(review): assumes the ./sample_data/ directory already exists — confirm
!curl -L -o ./sample_data/pii-detect-gpt3-5-synthetic-data-8k.zip \
  https://www.kaggle.com/api/v1/datasets/download/dileepjayamal/pii-detect-gpt3-5-synthetic-data-8k

# Unzip into ./sample_data/ (-o overwrites existing files without asking, -q keeps it quiet)
!unzip -o -q ./sample_data/pii-detect-gpt3-5-synthetic-data-8k.zip -d ./sample_data/

In [347]:
def json_to_jsonl(input_file, output_file):
    """Convert a JSON file into a JSON Lines file.

    Args:
        input_file: Path of a UTF-8 JSON file whose top-level value is iterable
            (one output line is written per item).
        output_file: Path of the JSONL file to create or overwrite.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        records = json.load(source)
    with open(output_file, 'w', encoding='utf-8') as target:
        # ensure_ascii=False keeps non-ASCII characters readable in the output
        target.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in records
        )

# Source JSON file and the JSONL file that is generated from it
input_file = 'sample_data/PII_Detect_GPT3.5_Generated_data_v1.json'
output_file = 'sample_data/PII_Detect_GPT3.5_Generated_data_v1.jsonl'
# Run the conversion; output_file is loaded by a later cell
json_to_jsonl(input_file, output_file)


# 資料包含什麼？

In [None]:
# Load the JSONL file through the 'json' loader of the datasets library
immutable_dataset = load_dataset('json', data_files=output_file)

# Show the loaded dataset object as the cell output
immutable_dataset

In [None]:
# Drop the 'trailing_whitespace' column, then keep only the rows whose
# 'tokens' and 'labels' lists have the same length
dataset = (
    immutable_dataset
    .remove_columns(['trailing_whitespace'])
    .filter(lambda row: len(row['tokens']) == len(row['labels']))
)
dataset

In [None]:
# Show the first 5 training rows
pd.set_option('display.max_colwidth', None)  # do not truncate long cell contents
pd.DataFrame(dataset['train'].select(range(5)))

In [None]:
# Show the BIO tags that occur in the data
label_names = set()
for data in dataset['train']:
    label_names.update(data['labels'])
# A set has no defined order, so sort the names to get a stable, readable list
label_names = sorted(label_names)
pprint(label_names)

# 訓練設定

In [None]:
# Training-related settings
class Config(BaseModel):
  """Settings for the fine-tuning run, declared as a pydantic model.

  The fields without a default value (tags, id2tag, tag2id, num_tags) have to
  be supplied when the model is instantiated.
  """
  seed: int = 42
  model_name: str = "bert-base-cased" # name of the pretrained backbone
  train_seq_len: int = 1024 # max size of the input sequence for training
  train_batch_size: int = 4 # size of the input batch in training
  eval_batch_size: int = 4 # size of the input batch in evaluation
  epochs: int = 3 # number of epochs to train
  lr: float = 2e-5 # learning rate
  tags: list # tags in BIO (Beginning, Inside, Outside) format
  id2tag: dict # mapping from integer label to BIO tag
  tag2id: dict # mapping from BIO tag to integer label
  num_tags: int # number of PII (NER) tags

# Complete tag vocabulary: 'O' first, then a B-/I- pair for every entity type.
# The position of a tag in this list is its integer label.
tags_name = [
  'O',
  'B-URL_PERSONAL', 'I-URL_PERSONAL',
  'B-ID_NUM', 'I-ID_NUM',
  'B-NAME_STUDENT', 'I-NAME_STUDENT',
  'B-PHONE_NUM', 'I-PHONE_NUM',
  'B-STREET_ADDRESS', 'I-STREET_ADDRESS',
  'B-USERNAME', 'I-USERNAME',
  'B-EMAIL', 'I-EMAIL',
]
id2tag = {index: tag for index, tag in enumerate(tags_name)}
config = Config(
  tags=tags_name,
  id2tag=id2tag,
  tag2id={tag: index for index, tag in id2tag.items()},
  num_tags=len(tags_name),
)

# Print both mappings ordered by integer label
pprint(f'id2tag: {sorted(config.id2tag.items(), key=lambda pair: pair[0])}')
pprint(f'tag2id: {sorted(config.tag2id.items(), key=lambda pair: pair[1])}')

In [None]:
# Print the first max_display tokens and their labels as two aligned rows
max_display = 20

for i in range(5):
    example = dataset['train'][i]
    tokens = example['tokens'][:max_display]
    labels = example['labels'][:max_display]
    # Each column is as wide as the longer of token/label, plus one separating space
    widths = [max(len(token), len(label)) + 1 for token, label in zip(tokens, labels)]
    line1 = "".join(token.ljust(width) for token, width in zip(tokens, widths))
    line2 = "".join(label.ljust(width) for label, width in zip(labels, widths))
    pprint(line1, width=200)
    pprint(line2, width=200)
    print()

# 先觀察 Fine-tuning 前的表現

In [None]:
# Checkpoint used for the "before fine-tuning" baseline
# NOTE(review): the checkpoint name suggests a part-of-speech tagger rather than a
# PII/NER model — confirm this is the intended baseline
model = "vblagoje/bert-english-uncased-finetuned-pos"
# Build a token-classification pipeline on the selected device
classifier = pipeline(
  task="token-classification",
  model=model,
  device=device,)
# Run it on a sample sentence that contains a name, an e-mail address and a phone number
classifier("My name is Frank, my email is frank@gmail.com, and my phone number is 123-456-7890.")

# 了解 Tokenizer 行為

In [None]:
# Load the tokenizer that belongs to the backbone named in the config
tokenizer = AutoTokenizer.from_pretrained(config.model_name)

# Use the first training example as a worked example
input_tokens = dataset["train"][0]["tokens"]
input_labels = dataset["train"][0]["labels"]
# The input is already split into words, hence is_split_into_words=True
input_token_ids = tokenizer(input_tokens, is_split_into_words=True)

# As the output shows, the tokenizer adds the special tokens used by the model
# ([CLS] at the start and [SEP] at the end) and leaves most words unchanged.
# The word Bareilly, however, is split into three sub-words: Bar, ##eil and ##ly
pprint(input_token_ids.tokens())

In [None]:
# Raw data: number of word-level tokens and number of tags
pprint(f'length of token: {len(input_tokens)}')
pprint(f'length of tag: {len(input_labels)}')
# Number of tokens produced by the tokenizer; the difference is what causes
# the mismatch between our inputs and the labels
pprint(f'length of token id: {len(input_token_ids.tokens())}')

In [None]:
# word_ids() lets us map every token back to the word it came from
input_token_ids.word_ids()

# 重新校準 Tokenizer 與標籤

稍作處理後，我們可以擴展標籤列表以匹配標記。首先，我們將應用的規則是特殊標記獲得 -100 標籤。這是因為默認情況下，-100 是我們將使用的損失函數中被忽略的索引。然後，每個標記獲得與其所在單詞開始標記相同的標籤，因為它們是同一實體的一部分。對於單詞內但不在開頭的標記，我們將 B- 替換為 I- ，因為該標記不是開始實體：

In [358]:
def align_labels_with_tokens(word_ids, labels, tag2id=None):
    """Expand word-level BIO tags into one integer label per tokenizer token.

    Args:
        word_ids: For every token, the index of the word it belongs to, as
            returned by the tokenizer's ``word_ids()``; ``None`` marks a
            special token.
        labels: BIO tag strings, one per word.
        tag2id: Mapping from tag string to integer label. Defaults to
            ``config.tag2id``.

    Returns:
        A list with one entry per item of ``word_ids``: -100 for special
        tokens, the tag id for the first token of a word, and for the
        following tokens of the same word the id of the matching ``I-`` tag
        (``B-XXX`` becomes ``I-XXX``; other tags are kept as they are).

    Raises:
        KeyError: If a tag in ``labels`` is not present in ``tag2id``.
    """
    if tag2id is None:
        tag2id = config.tag2id

    new_labels = []
    current_word_id = None
    for word_id in word_ids:
        if word_id is None:
            # Special token
            new_labels.append(-100)
            continue

        tag = labels[word_id]
        if word_id != current_word_id:
            # First token of a new word keeps the word's own tag
            current_word_id = word_id
        elif tag.startswith("B-"):
            # Continuation of the same word: B-XXX turns into I-XXX. The lookup is
            # done by name so it does not depend on how the ids are numbered.
            inside_tag = "I-" + tag[2:]
            if inside_tag in tag2id:
                tag = inside_tag
        # Index directly so an unknown tag fails here instead of yielding None
        new_labels.append(tag2id[tag])

    return new_labels

In [None]:
# Align the labels of the first example with its tokenizer output
labels = align_labels_with_tokens(input_token_ids.word_ids(), input_labels)

# Both lengths are printed so they can be compared
pprint(f'length of token id: {len(input_token_ids.tokens())}')
pprint(f'length of labels: {len(labels)}')

In [None]:
# First max_display aligned labels; B-NAME_STUDENT has id 5 and I-NAME_STUDENT has id 6
pprint(labels[:max_display])
# The matching tokens (original note: Russell Contreras)
pprint(input_token_ids.tokens()[:max_display])

# 批次重新 Tokenizer 與標籤

要預處理整個數據集，我們需要對所有輸入進行分詞，並對所有標籤應用 `align_labels_with_tokens()`。為了利用快速分詞器的速度，最好一次分詞大量文本，因此我們將編寫一個批次處理函數，並使用 Dataset.map() 方法，選項設置為 batched=True。

與之前的示例不同的是，當分詞器的輸入是文本列表（或在我們的情況下，是單詞列表的列表）時，word_ids() 函數需要獲取我們想要單詞 ID 的示例索引，因此我們也添加了這一點：

In [361]:
def tokenize_and_align_labels(dataset):
    """Tokenize a batch of pre-split examples and attach token-level labels.

    Args:
        dataset: A batch with a 'tokens' entry (a list of word lists) and a
            'labels' entry (a list of tag lists).

    Returns:
        The tokenizer output for the batch with an added "labels" entry that
        holds one aligned label list per example.
    """
    encoded = tokenizer(
        dataset['tokens'], truncation=True, is_split_into_words=True
    )
    # word_ids() needs the index of the example inside the batch
    encoded["labels"] = [
        align_labels_with_tokens(encoded.word_ids(index), word_labels)
        for index, word_labels in enumerate(dataset['labels'])
    ]
    return encoded

In [None]:
# Tokenize the dataset in batches and drop the original columns of the train
# split, so that only what tokenize_and_align_labels returns is kept
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

In [None]:
# Show the first 3 tokenized training rows
pd.set_option('display.max_colwidth', None)  # do not truncate long cell contents
pd.DataFrame(tokenized_datasets['train'].select(range(3)))
