<a href="https://colab.research.google.com/github/jinseriouspark/embedding_for_all/blob/main/%5Bw2%5D_fine_tuning_%EC%8B%A4%EC%8A%B5%EC%BD%94%EB%93%9C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine Tuning
- pretrained model with task

In [None]:
!pip install datasets
!pip install transformers[torch]
!pip install accelrate -U

In [1]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, load_metric

## load dataset

In [161]:
# Load the 'ko' configuration of the WikiANN dataset.
data = load_dataset('wikiann','ko')
# Tag names, read from the feature schema of the train split's 'ner_tags' column.
label_names = data['train'].features['ner_tags'].feature.names


In [179]:
# Map each integer tag id to its tag name (position in label_names -> name).
idx2label = dict(enumerate(label_names))
idx2label

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC'}

In [3]:
# Show the shape of every split together with the tag names.
data.shape, label_names

({'validation': (10000, 4), 'test': (10000, 4), 'train': (20000, 4)},
 ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'])

In [4]:
# The train split holds the tokens and their NER tags.
data['train']

Dataset({
    features: ['tokens', 'ner_tags', 'langs', 'spans'],
    num_rows: 20000
})

In [5]:
# Look at the first training example.
data['train'][0]

{'tokens': ['현재',
  '대한민국',
  'K리그',
  '챌린지의',
  '서울',
  '이랜드',
  'FC에서',
  '활약하고',
  '있다',
  '.'],
 'ner_tags': [0, 5, 3, 4, 3, 4, 4, 0, 0, 0],
 'langs': ['ko', 'ko', 'ko', 'ko', 'ko', 'ko', 'ko', 'ko', 'ko', 'ko'],
 'spans': ['LOC: 대한민국', 'ORG: K리그 챌린지의', 'ORG: 서울 이랜드 FC에서']}

## load model : koelectra
: pretrained ELECTRA language model 한국어판

- 생성기(generator)가 일부 입력 토큰을 다른 토큰으로 바꾸면, 판별기(discriminator)가 각 토큰이 '진짜'(원본) 토큰인지 '가짜'(대체된) 토큰인지 판단하도록 학습함
- 이 방법을 사용하면 일부 마스킹된 토큰뿐 아니라 모든 입력 토큰에서 학습할 수 있어, BERT 등과 비교하여 효과가 우수함
- WordPiece를 사용



(참고자료)
- 링크 1 : https://huggingface.co/monologg/koelectra-base-v3-generator
- 링크 2 : https://github.com/monologg/KoELECTRA/blob/master/README_EN.md
- 링크 3 : https://github.com/monologg/KoELECTRA/blob/master/finetune/README_EN.md

In [6]:
model_name = "monologg/koelectra-small-v3-discriminator"
# Use the GPU when one is visible and fall back to CPU otherwise, instead of
# failing on a hard-coded 'cuda'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Token-classification model with one output class per NER tag.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels = len(label_names)).to(device)


Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at monologg/koelectra-small-v3-discriminator and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Print the first training example, then the token ids obtained by encoding
# its words joined with single spaces.
first_example = data['train'][0]
print(first_example)
text = ' '.join(first_example['tokens'])
print(tokenizer.encode(text))

{'tokens': ['현재', '대한민국', 'K리그', '챌린지의', '서울', '이랜드', 'FC에서', '활약하고', '있다', '.'], 'ner_tags': [0, 5, 3, 4, 3, 4, 4, 0, 0, 0], 'langs': ['ko', 'ko', 'ko', 'ko', 'ko', 'ko', 'ko', 'ko', 'ko', 'ko'], 'spans': ['LOC: 대한민국', 'ORG: K리그 챌린지의', 'ORG: 서울 이랜드 FC에서']}
[2, 6339, 7001, 47, 19611, 18756, 4234, 6265, 23358, 10839, 4073, 4129, 8377, 4279, 4219, 3249, 4176, 18, 3]


In [8]:
# Encode several sentences at once (batch encoding); the inputs are already
# split into words, hence is_split_into_words = True.
batch_result_line2 = tokenizer.batch_encode_plus(data['train']['tokens'][:3], is_split_into_words = True)
batch_result_line2

{'input_ids': [[2, 6339, 7001, 47, 19611, 18756, 4234, 6265, 23358, 10839, 4073, 4129, 8377, 4279, 4219, 3249, 4176, 18, 3], [2, 11, 11, 11, 3598, 25892, 11, 11, 11, 3], [2, 11554, 4172, 4162, 2780, 4031, 3311, 4112, 2241, 15, 3]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [9]:
# input_ids, token_type_ids, attention_mask를 위한 값 생성

def tokenize_adjust_labels(all_samples):
  """Tokenize pre-split sentences and align word-level NER tags to the tokens.

  Args:
    all_samples: a batch with a 'tokens' entry (one list of words per
      sentence) and a parallel 'ner_tags' entry (one tag id per word).

  Returns:
    The tokenizer output for the batch, extended with a 'labels' entry that
    holds one label id per produced token. Tokens that belong to no word
    (word id None) get -100; every other token gets the tag of the word it
    was produced from, so all pieces of a word share that word's tag.
  """
  tokenized_samples = tokenizer.batch_encode_plus(all_samples['tokens'],
                                                  is_split_into_words = True)

  total_adjusted_labels = []
  for k, word_label_ids in enumerate(all_samples['ner_tags']):
    # One entry per token: the index of the source word, or None,
    # e.g. [None, 0, 1, 2, 2, 3, 3, 4, ..., 9, None]
    word_id_list = tokenized_samples.word_ids(batch_index = k)

    # Look the tag up by word id rather than by a running counter: a counter
    # drifts out of step with the tags whenever a word id is skipped in
    # word_id_list (a word that produced no token).
    adjusted_label_ids = [
        -100 if wid is None else word_label_ids[wid]
        for wid in word_id_list
    ]
    total_adjusted_labels.append(adjusted_label_ids)

  tokenized_samples['labels'] = total_adjusted_labels
  return tokenized_samples


In [10]:
# Run tokenize_adjust_labels over the dataset in batches (batched = True).
tokenized_dataset = data.map(tokenize_adjust_labels, batched = True)

In [71]:
# Look at the first processed training example.
tokenized_dataset['train'][0]

{'tokens': ['현재',
  '대한민국',
  'K리그',
  '챌린지의',
  '서울',
  '이랜드',
  'FC에서',
  '활약하고',
  '있다',
  '.'],
 'ner_tags': [0, 5, 3, 4, 3, 4, 4, 0, 0, 0],
 'langs': ['ko', 'ko', 'ko', 'ko', 'ko', 'ko', 'ko', 'ko', 'ko', 'ko'],
 'spans': ['LOC: 대한민국', 'ORG: K리그 챌린지의', 'ORG: 서울 이랜드 FC에서'],
 'input_ids': [2,
  6339,
  7001,
  47,
  19611,
  18756,
  4234,
  6265,
  23358,
  10839,
  4073,
  4129,
  8377,
  4279,
  4219,
  3249,
  4176,
  18,
  3],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 0, 5, 3, 3, 4, 4, 3, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, -100]}

In [11]:
data # before preprocessing

DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 20000
    })
})

In [12]:
tokenized_dataset # after tokenization: the tokenizer outputs and the 'labels' column have been added

DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 20000
    })
})

In [13]:
# Collator later passed to the Trainer to assemble token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [14]:
!pip install seqeval

# NOTE(review): the recorded output of this cell warns that `load_metric`
# will require `trust_remote_code=True` from the next major `datasets` release.
metric = load_metric('seqeval')



  metric = load_metric('seqeval')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [25]:
def compute_metrics(pred):
  """Turn raw predictions into seqeval scores.

  Args:
    pred: a (predictions, labels) pair; predictions carry one score per tag
      on axis 2, labels carry the reference tag ids with -100 on positions
      to be ignored.

  Returns:
    A flat dict with the four overall_* scores followed by one
    '<entity>_f1' entry for every remaining key of the seqeval result.
  """
  scores, references = pred
  predicted_ids = np.argmax(scores, axis=2)

  # Drop the positions labelled -100 (special tokens), then map ids to names.
  true_predictions = []
  true_labels = []
  for predicted_row, reference_row in zip(predicted_ids, references):
    kept = [(p, l) for p, l in zip(predicted_row, reference_row) if l != -100]
    true_predictions.append([label_names[p] for p, _ in kept])
    true_labels.append([label_names[l] for _, l in kept])

  results = metric.compute(predictions = true_predictions, references = true_labels)

  overall_keys = (
      "overall_precision",
      "overall_recall",
      "overall_f1",
      "overall_accuracy",
  )
  flattened_results = {key: results[key] for key in overall_keys}

  # Everything that is not an overall score is a per-entity dict; keep its f1.
  for name, value in results.items():
    if name not in flattened_results:
      flattened_results[f'{name}_f1'] = value['f1']

  return flattened_results

### 1. Trainer 를 활용하여 학습하기

In [26]:
# Train through the high-level Trainer abstraction.
training_args = TrainingArguments(
    output_dir = './find_tune_bert_output',
    evaluation_strategy='steps', # evaluation strategy to use during training (no, steps, epoch)
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 7,
    weight_decay = 0.01,
    logging_steps = 1000,
    save_strategy = 'no', # do not save checkpoints during training
    use_cpu = False
)

# Train on the tokenized train split and evaluate on the validation split,
# scoring with compute_metrics.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset['train'],
    eval_dataset = tokenized_dataset['validation'],
    data_collator = data_collator,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

trainer.train()

Step,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Loc F1,Org F1,Per F1
1000,0.2146,0.290305,0.763195,0.834043,0.797048,0.917495,0.872935,0.681205,0.796561
2000,0.2538,0.237364,0.818328,0.828968,0.823614,0.92988,0.894029,0.718694,0.817802
3000,0.2192,0.226512,0.824571,0.843662,0.834007,0.933968,0.895636,0.740318,0.832225
4000,0.1967,0.220646,0.826071,0.852936,0.839288,0.935432,0.9047,0.749934,0.82902
5000,0.1814,0.216909,0.837071,0.85375,0.845328,0.937919,0.905687,0.760548,0.838792


Step,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Loc F1,Org F1,Per F1
1000,0.2146,0.290305,0.763195,0.834043,0.797048,0.917495,0.872935,0.681205,0.796561
2000,0.2538,0.237364,0.818328,0.828968,0.823614,0.92988,0.894029,0.718694,0.817802
3000,0.2192,0.226512,0.824571,0.843662,0.834007,0.933968,0.895636,0.740318,0.832225
4000,0.1967,0.220646,0.826071,0.852936,0.839288,0.935432,0.9047,0.749934,0.82902
5000,0.1814,0.216909,0.837071,0.85375,0.845328,0.937919,0.905687,0.760548,0.838792
6000,0.1661,0.216868,0.840269,0.848957,0.844591,0.938178,0.904957,0.758254,0.837846
7000,0.1577,0.211649,0.841288,0.860424,0.850748,0.940121,0.908809,0.766986,0.847248
8000,0.1544,0.211022,0.846809,0.856069,0.851414,0.940529,0.909014,0.768458,0.847706


TrainOutput(global_step=8750, training_loss=0.18901681256975447, metrics={'train_runtime': 490.969, 'train_samples_per_second': 285.15, 'train_steps_per_second': 17.822, 'total_flos': 342151515102240.0, 'train_loss': 0.18901681256975447, 'epoch': 7.0})

In [None]:
# Client library for the Hugging Face Hub (see the upload cell further down).
!pip install huggingface_hub

In [109]:
# Run the trained model over the test split.
testset_result =trainer.predict(tokenized_dataset['test'])
testset_result[0].shape # per the original note: 10_000 examples x 164 token positions x 7 NER tags (output not recorded here; TODO confirm)

In [174]:
# Display the Trainer object.
trainer

<transformers.trainer.Trainer at 0x7a3606b8eaa0>

In [144]:
# Token-level result: predicted tag names for the first 10 positions of the
# first test example.
[idx2label[i] for i in testset_result[0][0].argmax(axis=1)][:10]

['O', 'O', 'O', 'O', 'B-PER', 'B-PER', 'I-PER', 'O', 'O', 'O']

In [None]:
# Display the model architecture.
model

## model save

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

# 1) Upload directly through the trainer
#trainer.push_to_hub() # the name given in output_dir is used

# 2) Save the model to a local path first, then upload it

path = 'koelectra-small-v3-discriminator-ft-wikiann-ko-ner'
trainer.save_model(f'./{path}')

from transformers import AutoTokenizer, AutoModelForTokenClassification
# Reload from the directory that was just written (reuses `path` rather than
# repeating the literal).
tokenizer = AutoTokenizer.from_pretrained(f'./{path}')
model = AutoModelForTokenClassification.from_pretrained(f'./{path}')

# Store the NER tag names in the config. Both directions are set so that
# label2id does not keep entries that disagree with id2label.
model.config.id2label = idx2label
model.config.label2id = {label: idx for idx, label in idx2label.items()}

model.push_to_hub(path)
tokenizer.push_to_hub(path)

In [181]:
# inference
from transformers import AutoTokenizer, AutoModelForTokenClassification
repo_id = 'seriouspark/koelectra-small-v3-discriminator-ft-wikiann-ko-ner'
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForTokenClassification.from_pretrained(repo_id)

# Try a single sentence
try_text = '아침에 일어나 학교를 간 김고은'
inputs = tokenizer([try_text], return_tensors = 'pt')
with torch.no_grad():
  outputs = model(**inputs)
# Best-scoring tag id for every token position of the (only) sentence.
result_line = outputs.logits[0].argmax(dim = -1)

# Show the NER result per token.
# Tag names come from the loaded model's own config, so this cell does not
# depend on `idx2label` from the training session.
# NOTE(review): assumes the uploaded config carries the tag names (the upload
# cell sets id2label before pushing) — confirm on the Hub.
text_tokens = tokenizer.tokenize(try_text)
# [1:-1] drops the first and last position (the special tokens around the sentence).
ner_tag = [model.config.id2label[i.item()] for i in result_line][1:-1]
for tokens, tag in zip(text_tokens, ner_tag):
  print(tokens, tag)

config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

아침 O
##에 O
일어나 O
학교 O
##를 O
간 O
김 B-PER
##고 B-PER
##은 B-PER


### 2. training loop 직접 구현하기

In [None]:
# Install the libraries used in part 2 (hand-written training loop).
!pip install datasets
!pip install transformers
!pip install accelerate -U

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel

# Use the GPU when available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_name = "monologg/koelectra-small-v3-discriminator"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Bare encoder loaded through AutoModel (no task-specific model class is requested).
model = AutoModel.from_pretrained(model_name).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/458 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/56.6M [00:00<?, ?B/s]

In [None]:
# load nsmc dataset
dataset = load_dataset('nsmc')

# get label info: print the distinct values of the train split's 'label' column
print(set(dataset['train']['label']))

# define the tokenizing function
def tokenize_function(examples):
  """Tokenize the 'document' field of a batch with max-length padding and truncation."""
  return tokenizer(examples['document'], padding = 'max_length', truncation = True)

# tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched = True)

Downloading data:   0%|          | 0.00/11.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/150000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# model + additional layers

class StackMoreLayers(nn.Module):
  """Pretrained encoder followed by a bidirectional LSTM and a 1-unit linear head.

  Args:
    model_name: checkpoint name or path passed to AutoModel.from_pretrained.
    lstm_hidden_size: hidden units per LSTM direction. Defaults to 512, the
      value that used to be hard-coded.
  """

  def __init__(self, model_name, lstm_hidden_size = 512):
    super().__init__()
    self.encoder = AutoModel.from_pretrained(model_name)
    self.lstm = nn.LSTM(input_size = self.encoder.config.hidden_size,
                        hidden_size = lstm_hidden_size,
                        batch_first = True, # inputs are (batch, seq, feature)
                        bidirectional = True)
    # The LSTM is bidirectional, so the head receives 2 * lstm_hidden_size
    # features (1024 with the default).
    self.interaction_head = nn.Linear(2 * lstm_hidden_size, 1)

  def forward(self, input_ids, attention_mask):
    """Return the linear head's raw output for every sequence position.

    Args:
      input_ids: token ids forwarded to the encoder.
      attention_mask: attention mask forwarded to the encoder.

    Returns:
      The output of interaction_head applied to the LSTM output sequence;
      the last dimension has size 1 and no sigmoid is applied.
    """
    encoded = self.encoder(input_ids = input_ids, attention_mask = attention_mask)
    # nn.LSTM returns (output_sequence, (h_n, c_n)); only the sequence is used.
    lstm_out, _ = self.lstm(encoded.last_hidden_state)
    return self.interaction_head(lstm_out)

# model + additional layers

class StackMoreLayers2(nn.Module):
  """Like StackMoreLayers, with a second pretrained encoder registered as a sub-module.

  Args:
    model_name: checkpoint name or path passed to AutoModel.from_pretrained.
    lstm_hidden_size: hidden units per LSTM direction. Defaults to 512, the
      value that used to be hard-coded.
  """

  def __init__(self, model_name, lstm_hidden_size = 512):
    super().__init__()
    self.encoder = AutoModel.from_pretrained(model_name)
    # Second copy of the pretrained encoder. It is registered on the module,
    # but forward() below never calls it.
    self.additional_layer = AutoModel.from_pretrained(model_name)
    self.lstm = nn.LSTM(input_size = self.encoder.config.hidden_size,
                        hidden_size = lstm_hidden_size,
                        batch_first = True, # inputs are (batch, seq, feature)
                        bidirectional = True)
    # The LSTM is bidirectional, so the head receives 2 * lstm_hidden_size
    # features (1024 with the default).
    self.interaction_head = nn.Linear(2 * lstm_hidden_size, 1)

  def forward(self, input_ids, attention_mask):
    """Return the linear head's raw output for every sequence position.

    Args:
      input_ids: token ids forwarded to the encoder.
      attention_mask: attention mask forwarded to the encoder.

    Returns:
      The output of interaction_head applied to the LSTM output sequence;
      the last dimension has size 1 and no sigmoid is applied.
    """
    encoded = self.encoder(input_ids = input_ids, attention_mask = attention_mask)
    # nn.LSTM returns (output_sequence, (h_n, c_n)); only the sequence is used.
    lstm_out, _ = self.lstm(encoded.last_hidden_state)
    return self.interaction_head(lstm_out)

In [None]:
# The NSMC column keeps its name 'label': set_format below and the training
# loop both look it up under that name. Renaming it to 'labels' first made
# set_format ask for a column that no longer existed.
# Without this set_format call the rows cannot be assembled into batch tensors.
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "label"])
train_dataloader = DataLoader(tokenized_datasets['train'], shuffle = True, batch_size = 4)
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size = 4)

In [None]:
# NOTE(review): this optimizer is built over `model` (the bare encoder), not
# over the stacked `new_model` that the training loop calls. Rebuild it from
# new_model.parameters() once that model exists.
optimizer = optim.AdamW(model.parameters(), lr = 5e-5)
# The stacked models end in a plain nn.Linear, so they emit raw scores rather
# than probabilities. nn.BCELoss requires inputs in [0, 1];
# BCEWithLogitsLoss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()

embedding 과 embedding_project 가 Electra 모델 구조에서 2번 사용되는 이유? 각각 다른 역할을 수행하기 때문

1. Embedding : 모델의 입력 토큰을 벡터 형태로 변환하는 역할을 수행
  - 단어 ID에 해당하는 임베딩 벡터를 찾아와 모델에 입력
  - Electra 모델의 electraEmbeddings 레이어에는 단어 임베딩, 위치 임베딩, 토큰 타입 임베딩 등이 포함됨
  - 이런 임베딩은 모델에 입력되기 전에 적절한 차원으로 변환

2. EmbeddingProject : 임베딩 레이어의 출력을 더 큰 차원으로 변환하는 역할을 함
  - Electra 모델의 embedding_project 레이어는 Embedding 레이어의 출력 벡터차원을 더 큰 차원으로 확장
  - 모델이 더 복잡한 특징을 학습하고 더 깊은 이해를 할 수 있음
  - 모델의 용량을 늘리고 더 복잡한 관계 학습이 가능

In [None]:
from torchsummary import summary

summary(model, input_size = (128,))  # the original electra model

In [None]:
# Instantiate both stacked variants and move them to the selected device.
new_model = StackMoreLayers(model_name).to(device)
new_model2 = StackMoreLayers2(model_name).to(device)

In [None]:
# Question: did the ElectraEncoder layers all disappear (from the summary)?
summary(new_model, input_size = (128,))   # original electra model + LSTM + interaction head

In [None]:
# Fetch one batch from the training DataLoader to inspect it.
next(iter(train_dataloader))

In [None]:
# Freeze the embedding layer: none of its parameters will receive gradients.
print('임베딩 레이어 freeze')
print(new_model.encoder.embeddings)
new_model.encoder.embeddings.requires_grad_(False)

print('-'* 10)


# Freeze the first encoder layer (all of its parameters).
print('첫번째 어텐션 레이어 freeze')
print(new_model.encoder.encoder.layer[0])
new_model.encoder.encoder.layer[0].requires_grad_(False)
# Revised step: freeze the self-attention of layer[0]. Its parameters are
# already covered by the call above, so this leaves the state unchanged.
new_model.encoder.encoder.layer[0].attention.self.requires_grad_(False)

# (The earlier version of this cell froze layer[1].attention.self instead.)

In [None]:
# Display the stacked model.
new_model

In [None]:
## training
import os

num_epochs = 3

# Optimise the model that is actually trained below (new_model), not the bare
# encoder held in `model`.
optimizer = optim.AdamW(new_model.parameters(), lr = 5e-5)
# new_model ends in a plain linear layer and returns raw scores, so the
# sigmoid has to be part of the loss (nn.BCELoss would need probabilities).
criterion = nn.BCEWithLogitsLoss()


def _batch_loss(batch):
  """Run new_model on one DataLoader batch and return its scalar loss."""
  batch = dict(batch)
  # The target column is called 'label' in NSMC; accept 'labels' as well in
  # case the column was renamed before the DataLoader was built.
  label_key = 'labels' if 'labels' in batch else 'label'
  target = batch.pop(label_key).to(device).float()
  inputs = {k: v.to(device) for k, v in batch.items()}
  outputs = new_model(**inputs)
  # Score taken at the last sequence position, as in the original cell.
  # NOTE(review): the inputs are padded with padding = 'max_length', so this
  # position is presumably a padding token for most reviews — confirm that
  # this is the intended read-out position.
  return criterion(outputs[:, -1].squeeze(1), target)


for epoch in range(num_epochs):
  new_model.train()
  total_loss = 0
  for batch in train_dataloader:
    loss = _batch_loss(batch)
    total_loss += loss.item()

    optimizer.zero_grad() # reset gradients; otherwise they accumulate across steps
    loss.backward()       # backprop: compute the gradients
    optimizer.step()      # update the weights with the computed gradients

  avg_train_loss = total_loss / len(train_dataloader)
  print(f'Epoch {epoch +1} / {num_epochs} | Train Loss : {avg_train_loss}')

  # Validation loop
  new_model.eval() # eval mode: dropout and similar layers are disabled for inference
  total_eval_loss = 0
  with torch.no_grad():
    # test_dataloader is the evaluation loader defined in this notebook.
    for batch in test_dataloader:
      total_eval_loss += _batch_loss(batch).item()

  avg_eval_loss = total_eval_loss / len(test_dataloader)
  print(f'Epoch {epoch+1} / {num_epochs} | Eval Loss : {avg_eval_loss}')

# save_pretrained is called on the pretrained encoder only, so the added LSTM
# and head are not part of this checkpoint.
new_model.encoder.save_pretrained(f'model_{epoch+1}')
# To keep the weights of the added layers as well, store the full state dict
# as a .pt file inside the directory created for it.
os.makedirs('./koelectra-lstm/', exist_ok = True)
torch.save(new_model.state_dict(), './koelectra-lstm/model.pt')