Ref :

https://huggingface.co/docs/peft/main/en/task_guides/token-classification-lora

https://colab.research.google.com/drive/1ViGQuvsBhAbzcXvOyNawlWwux-VXMx1L?usp=sharing

필요 라이브러리 설치

In [1]:
!pip install -q peft transformers datasets evaluate seqeval

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31

필요 패키지 임포트

In [2]:
from datasets import load_dataset
from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, TrainingArguments, Trainer
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import evaluate
import torch
import numpy as np

모델 및 기본 파라미터 세팅

In [3]:
model = "roberta-base"
lr = 1e-3
batchSize = 16
epoches = 3

BioNLP2004 데이터셋

In [4]:
bioNlp = load_dataset("tner/bionlp2004")
bioNlp["train"][0]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/156k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/332k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16619 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1927 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3856 [00:00<?, ? examples/s]

{'tokens': ['Since',
  'HUVECs',
  'released',
  'superoxide',
  'anions',
  'in',
  'response',
  'to',
  'TNF',
  ',',
  'and',
  'H2O2',
  'induces',
  'VCAM-1',
  ',',
  'PDTC',
  'may',
  'act',
  'as',
  'a',
  'radical',
  'scavenger',
  '.'],
 'tags': [0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

sequence 라벨링 평가에 사용될 precision, accuracy, F1, recall를 지원하는 모듈

In [5]:
seqEval = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [6]:
labelList = [
    "O",
    "B-DNA",
    "I-DNA",
    "B-protein",
    "I-protein",
    "B-cell_type",
    "I-cell_type",
    "B-cell_line",
    "I-cell_line",
    "B-RNA",
    "I-RNA",
]

In [7]:
def compute_metrics(p) :
  predictions, labels = p
  predictions = np.argmax(predictions, axis=2)

  true_predictions = [
      [ labelList[p] for (p, l) in zip(prediction, label) if l != -100 ]
      for prediction, label in zip(predictions, labels)
  ]

  true_labels = [
      [ labelList[l] for (p, l) in zip(prediction, label) if l != -100 ]
      for prediction, label in zip(predictions, labels)
  ]

  results = seqEval.compute(predictions = true_predictions, references = true_labels)

  return {
      "precision" : results["overall_precision"],
      "recall" : results["overall_recall"],
      "f1" : results["overall_f1"],
      "accuracy" : results["overall_accuracy"]
  }


Tokenizer 불러오기

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model, add_prefix_space=True)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [9]:
def tokenize_and_align_labels(examples) :
  tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
  labels = []
  for i, label in enumerate(examples[f"tags"]) :
    word_ids = tokenized_inputs.word_ids(batch_index = i)
    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids :
      if word_idx is not None and word_idx != previous_word_idx :
        label_ids.append(label[word_idx])
      else :
        label_ids.append(-100)
      previous_word_idx = word_idx
    labels.append(label_ids)

  tokenized_inputs["labels"] = labels

  return tokenized_inputs


bionlp 데이터셋 토크나이징

In [10]:
tokenizedBioNlp = bioNlp.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/16619 [00:00<?, ? examples/s]

Map:   0%|          | 0/1927 [00:00<?, ? examples/s]

Map:   0%|          | 0/3856 [00:00<?, ? examples/s]

In [11]:
data_collator = DataCollatorForTokenClassification(tokenizer = tokenizer)

In [12]:
id2label = {
    0: "O",
    1: "B-DNA",
    2: "I-DNA",
    3: "B-protein",
    4: "I-protein",
    5: "B-cell_type",
    6: "I-cell_type",
    7: "B-cell_line",
    8: "I-cell_line",
    9: "B-RNA",
    10: "I-RNA",
}
label2id = {
    "O": 0,
    "B-DNA": 1,
    "I-DNA": 2,
    "B-protein": 3,
    "I-protein": 4,
    "B-cell_type": 5,
    "I-cell_type": 6,
    "B-cell_line": 7,
    "I-cell_line": 8,
    "B-RNA": 9,
    "I-RNA": 10,
}

In [13]:
model = AutoModelForTokenClassification.from_pretrained(model, num_labels = 11, id2label = id2label, label2id = label2id)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PEFT 모델 설정

In [14]:
peft_config = LoraConfig(task_type = TaskType.TOKEN_CLS, inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias="all")

In [15]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 700,427 || all params: 124,661,782 || trainable%: 0.5618618543412126


train config 설정

In [16]:
training_args = TrainingArguments(
    output_dir = "roberta-base-lora-token-classification",
    learning_rate = lr,
    per_device_train_batch_size = batchSize,
    per_device_eval_batch_size = batchSize,
    num_train_epochs = epoches,
    weight_decay = 0.01,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
)

모델 training 시작

In [17]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenizedBioNlp["train"],
    eval_dataset = tokenizedBioNlp["validation"],
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [18]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.217,0.174498,0.73977,0.797587,0.767591,0.942134
2,0.1758,0.163847,0.757652,0.811273,0.783546,0.943806
3,0.1559,0.151644,0.778175,0.823159,0.800035,0.948312


TrainOutput(global_step=3117, training_loss=0.2012183501776266, metrics={'train_runtime': 526.612, 'train_samples_per_second': 94.675, 'train_steps_per_second': 5.919, 'total_flos': 1993053683590728.0, 'train_loss': 0.2012183501776266, 'epoch': 3.0})

학습 끝난 모델 load

In [19]:
peft_model_id = "./roberta-base-lora-token-classification/checkpoint-3117"
config = PeftConfig.from_pretrained(peft_model_id)
inference_model = AutoModelForTokenClassification.from_pretrained(
    config.base_model_name_or_path, num_labels=11, id2label=id2label, label2id=label2id
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(inference_model, peft_model_id)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


inference test

In [20]:
test_text = "The activation of IL-2 gene expression and NF-kappa B through CD28 requires reactive oxygen production by 5-lipoxygenase."
inputs = tokenizer(test_text, return_tensors="pt")

In [23]:
with torch.no_grad():
    logits = model(**inputs).logits

tokens = inputs.tokens()
predictions = torch.argmax(logits, dim=2)

for token, prediction in zip(tokens, predictions[0].numpy()):
    print((token, model.config.id2label[prediction]))

('<s>', 'O')
('The', 'O')
('Ġactivation', 'O')
('Ġof', 'O')
('ĠIL', 'B-DNA')
('-', 'I-DNA')
('2', 'I-DNA')
('Ġgene', 'I-DNA')
('Ġexpression', 'O')
('Ġand', 'O')
('ĠNF', 'B-protein')
('-', 'I-protein')
('k', 'I-protein')
('appa', 'I-protein')
('ĠB', 'I-protein')
('Ġthrough', 'O')
('ĠCD', 'B-protein')
('28', 'I-protein')
('Ġrequires', 'O')
('Ġreactive', 'O')
('Ġoxygen', 'O')
('Ġproduction', 'O')
('Ġby', 'O')
('Ġ5', 'B-protein')
('-', 'I-protein')
('lip', 'I-protein')
('oxy', 'I-protein')
('gen', 'I-protein')
('ase', 'I-protein')
('.', 'O')
('</s>', 'O')
