In [2]:
# transformers not support NumPy 2.0 yet
!pip install -q numpy~=1.26.4 transformers~=4.46.2
!pip install -q datasets pydantic peft

# 訓練 PII 遮掩模型

In [4]:
import pandas as pd

from transformers import (
  AutoTokenizer,
  AutoModelForCausalLM,
  TrainingArguments,
  Trainer,
)
from datasets import load_dataset, DatasetDict
from transformers import DataCollatorForSeq2Seq
from transformers import pipeline

from pydantic import BaseModel
from pprint import pprint

import torch

# Pick the compute device: Apple MPS takes precedence, then CUDA, then the CPU
if torch.backends.mps.is_available():
  device = torch.device("mps")
elif torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

## 下載資料

In [17]:
# Load the full `train` split of the "ai4privacy/pii-masking-65k" dataset
immutable_dataset = load_dataset("ai4privacy/pii-masking-65k", split="train")

## 資料包含什麼？

In [None]:
# Reserve 0.05% of the rows as the test split. shuffle=False keeps the rows in
# their original order, so the train/test membership is the same across runs.
test_dataset = immutable_dataset.train_test_split(test_size=0.0005, shuffle=False)

# Split the remaining rows 80/20 into training and validation sets, again
# without shuffling so that these two splits are also the same across runs.
train_dataset = test_dataset["train"].train_test_split(test_size=0.2, shuffle=False)

# NOTE: this rebinds `immutable_dataset` from the single loaded split to a
# DatasetDict holding the three splits.
immutable_dataset = DatasetDict({
  "train": train_dataset["train"],
  "validation": train_dataset["test"],
  "test": test_dataset["test"],
})
immutable_dataset


In [None]:
# Drop the token-level columns; the features needed downstream are 'masked_text' and 'unmasked_text'
dataset = immutable_dataset.remove_columns(['token_entity_labels', 'tokenised_unmasked_text'])
dataset

In [None]:
# Show the first `first_n_data` training rows as a DataFrame
first_n_data = 3
# Lift the column-width limit so long text cells are shown in full
pd.set_option('display.max_colwidth', None)
pd.DataFrame(dataset['train'].select(range(first_n_data)))

## 訓練設定

In [9]:
# Training-related settings
class Config(BaseModel):
  """Settings for this notebook: model name, output path and training hyperparameters."""
  seed: int = 42 # random seed
  model_name: str = 'distilbert/distilgpt2' # a distilled model, chosen to keep the parameter count low
  # model_name: str = 'microsoft/Phi-3.5-mini-instruct' # alternative model name
  saved_model_path: str = 'sample_data/saved_encoder_model' # path to save the trained model
  train_seq_len: int = 1024 # max size of input sequence for training
  train_batch_size: int = 4 # size of the input batch in training
  eval_batch_size: int = 4 # size of the input batch in evaluation
  epochs: int = 1 # train for a single epoch only, to keep training fast
  lr: float = 2e-5 # learning rate, controls how fast or slow the model learns
  weight_decay: float = 0.01 # weight decay, helps the model stay simple and avoid overfitting by penalizing large weights.

# Single shared settings instance read by the cells below
config = Config()


## 詠唱格式化 (Prompt Formatting)

下一步是格式化我們的詠唱 (Prompt)。為此，我們將創建一個格式化函數。

In [12]:
def instruction_formatter(x):
  """Build the masking prompt for one dataset record.

  Args:
    x: Mapping with an 'unmasked_text' entry holding the raw text.

  Returns:
    The prompt string: a fixed instruction, the raw text after 'Input:' and a
    trailing 'Output:' cue. The line breaks and the indentation are part of
    the returned prompt.
  """
  pad = ' ' * 4
  prompt_lines = [
    '',  # the prompt starts with a line break
    pad + 'Given the information below, mask the personal identifiable information.',
    '',
    pad + f"Input: {x['unmasked_text']}",
    '',
    pad + 'Output:',
    '  ',  # the prompt ends with a line break followed by two spaces
  ]
  return '\n'.join(prompt_lines)


## 先觀察 Fine-tuning 前的表現

In [None]:
# Load the pretrained (not yet fine-tuned) model into a text-generation pipeline
generator = pipeline(
  task='text-generation',
  model=config.model_name,
  device=device,
  max_length=1024,
  # Request greedy (deterministic) decoding explicitly. `model_kwargs` is passed
  # to the model's `from_pretrained`, not to generation, and 0.0 is not a valid
  # sampling temperature, so `temperature=0.0` there is not a reliable way to
  # turn sampling off.
  do_sample=False,
)


In [None]:
# Show what the pretrained model generates for the first training example.
# The prompt is kept in `prompt` rather than `input` so that the `input()`
# builtin is not shadowed for the rest of the notebook.
prompt = instruction_formatter(dataset['train'][0])
print(f'輸入: {prompt}')
response = generator(
  prompt,
  # Pad with the end-of-sequence token id during generation
  pad_token_id=generator.tokenizer.eos_token_id,
)
print(response[0]['generated_text'])

## 數據預處理

In [None]:
# Load the tokenizer that belongs to the pretrained model
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
# Print the tokenizer to check whether it defines a PADDING token
pprint(tokenizer)

In [58]:
# Give the tokenizer a PADDING token by reusing its end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

In [63]:
def process_func(x):
  """Tokenize one record into causal-LM training features.

  The prompt built by `instruction_formatter` and the target text
  (`x['masked_text']` followed by the end-of-sequence token) are tokenized
  separately and concatenated. The combined sequence is then capped at
  `config.train_seq_len` tokens, because the two parts are only truncated
  individually and their concatenation could otherwise be longer than the
  configured training sequence length.

  Args:
    x: One dataset record with 'unmasked_text' and 'masked_text' entries.

  Returns:
    A dict with 'input_ids', 'attention_mask' and 'labels', all of the same
    length. Label positions that belong to the prompt are set to -100 so that
    only the response tokens contribute to the loss.
  """
  instruction = tokenizer(
    instruction_formatter(x),
    truncation=True)
  # Tokenize the response and append the end-of-sequence token
  response = tokenizer(
    x['masked_text'] + tokenizer.eos_token,
    truncation=True,)
  # Combine the instruction and response
  token_ids = instruction['input_ids'] + response['input_ids']
  attention_mask = instruction['attention_mask'] + response['attention_mask']
  # The labels are the response token ids; the instruction positions are ignored via -100
  labels = [-100] * len(instruction['input_ids']) + response['input_ids']

  # Cap the combined sequence at the configured training length
  max_len = config.train_seq_len
  return {
    'input_ids': token_ids[:max_len],
    'attention_mask': attention_mask[:max_len],
    'labels': labels[:max_len],
  }

In [None]:
# Tokenize every split one record at a time and drop the original text
# columns, so that only the features returned by `process_func` remain.
tokenized_dataset = dataset.map(
  function=process_func,
  remove_columns=dataset['train'].column_names,
  batched=False,
)

In [None]:
# Inspect the first tokenized training example.
# The row is indexed first (`[0]`) and the column second, so only that one
# example is read instead of fetching a whole column to take its first item.
print('=== 輸入資料 ===')
print(tokenizer.decode(tokenized_dataset['train'][0]['input_ids']))
print()
print('=== 標註資料 ===')
# Positions labelled -100 are skipped before decoding
print(tokenizer.decode(
  [token_id for token_id in tokenized_dataset['train'][0]['labels'] if token_id != -100]
  ))


## 訓練模型

您現在可以開始訓練您的模型了！使用 AutoModelForCausalLM 加載預訓練的模型：

In [65]:
# Load the pretrained causal language model named in the config
model = AutoModelForCausalLM.from_pretrained(config.model_name)

In [None]:
# Training hyperparameters, taken from `config`
training_args = TrainingArguments(
  output_dir='sample_data/train_output_pii_masking',
  seed=config.seed, # use the seed from the config instead of leaving it implicit
  learning_rate=config.lr,
  per_device_train_batch_size=config.train_batch_size,
  per_device_eval_batch_size=config.eval_batch_size,
  num_train_epochs=config.epochs,
  weight_decay=config.weight_decay,
  eval_strategy='epoch', # evaluate once per epoch
  save_strategy='epoch', # save a checkpoint once per epoch
  load_best_model_at_end=True,
  report_to='none', # Disable wandb on colab
)

trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=tokenized_dataset['train'],
  eval_dataset=tokenized_dataset['validation'],
  # Pads the features of each batch (padding=True)
  data_collator=DataCollatorForSeq2Seq(tokenizer, padding=True),
  # `processing_class` replaces the `tokenizer` argument, which is deprecated
  # in the transformers 4.46 series that this notebook installs.
  processing_class=tokenizer,
)

In [None]:
# Report the total number of parameters and how many of them are trainable
print(
  f'Parameters: {model.num_parameters():,}, '
  f'Trainable Parameters: {model.num_parameters(only_trainable=True):,}'
)

In [None]:
# Start training; this may take a while
trainer.train()

In [None]:
# Save the model to the path set in the config
trainer.save_model(config.saved_model_path)

In [None]:
# Load the saved model into a text-generation pipeline
generator = pipeline(
  task='text-generation',
  model=config.saved_model_path,
  device=device,
  tokenizer=tokenizer,
  max_length=1024,
  # Request greedy (deterministic) decoding explicitly. `model_kwargs` is passed
  # to the model's `from_pretrained`, not to generation, and 0.0 is not a valid
  # sampling temperature, so `temperature=0.0` there is not a reliable way to
  # turn sampling off.
  do_sample=False,
)

In [None]:
# Show what the saved model generates for the first training example.
# The prompt is kept in `prompt` rather than `input` so that the `input()`
# builtin is not shadowed.
prompt = instruction_formatter(dataset['train'][0])
print(f'輸入: {prompt}')
response = generator(prompt)
print(response[0]['generated_text'])