https://wikidocs.net/166802

https://huggingface.co/learn/nlp-course/ko/chapter2/4

In [7]:
from datasets import load_dataset
from transformers import BertTokenizer
from transformers import AutoTokenizer

In [31]:
import os
# Process-level environment switches, set before any model or tokenizer work is done.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid and presumably slows
# GPU training — confirm it is still wanted for regular runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# Presumably set to silence the tokenizers parallelism/fork warning — TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [27]:
from huggingface_hub import notebook_login
# Shows the interactive Hugging Face Hub login widget (the VBox output of this cell).
# NOTE(review): the checkpoints and datasets loaded in this notebook look public, so this
# login is presumably optional — confirm before requiring a token.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Fine-Tuning

In [2]:
# Load the 'yelp_review_full' dataset. Per the DatasetDict repr shown in the next cell it has
# a 650,000-row train split and a 50,000-row test split, each with 'label' and 'text' columns.
dataset = load_dataset('yelp_review_full')

Downloading readme:   0%|          | 0.00/6.72k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/299M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [5]:
# Peek at one training example (row 100): its integer label first, then the raw review text.
s = dataset['train'][100]
print(s['label'], s['text'], sep = '\n')

0
My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\nThe cashier took my friends's order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid's meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \"serving off their orders\" when they didn't have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\nThe manager was rude when giving me my order. She didn't make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\nI've eaten at various McDonalds restaurants for over 30 years. I've worked at more than one lo

In [105]:
type(dataset)

datasets.dataset_dict.DatasetDict

In [109]:
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-cased')

def tokenize_func(examples):
    """Tokenize the 'text' field of a batch of examples.

    Args:
        examples: a mapping with a 'text' entry; when used with
            ``dataset.map(..., batched = True)`` this is a batch of rows.

    Returns:
        The output of the module-level ``tokenizer`` called with
        ``padding = 'max_length'`` and ``truncation = True``.
    """
    texts = examples['text']
    encoded = tokenizer(texts, padding = 'max_length', truncation = True)
    return encoded

# The dataset's map method applies tokenization to the whole dataset (both splits, per the progress bars below)
tokenized_datasets = dataset.map(tokenize_func, batched = True) # batched = True maps in batches, which helps speed

Map:   0%|          | 0/650000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [111]:
# For practice, cut the example dataset down to a smaller size:
# shuffle each split with a fixed seed (42) and keep the first 1000 rows.
small_train_dataset = tokenized_datasets['train'].shuffle(seed = 42).select(range(1000))
small_eval_dataset = tokenized_datasets['test'].shuffle(seed = 42).select(range(1000))

### PyTorch Version

In [112]:
from transformers import AutoModelForSequenceClassification

In [113]:
# Load the pretrained BERT checkpoint with a sequence-classification head sized for 5 labels.
# The warning printed below lists checkpoint weights that were not used and reports that some
# BertForSequenceClassification weights were not initialized from the checkpoint.
# NOTE(review): 5 presumably matches the number of Yelp rating classes — confirm against the dataset features.
model = AutoModelForSequenceClassification.from_pretrained('google-bert/bert-base-cased', num_labels = 5)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at google-bert/bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-c

In [115]:
from transformers import TrainingArguments

# NOTE(review): this binding is replaced by the fuller TrainingArguments built in a later cell
# before the Trainer is created, so this cell looks redundant.
training_args = TrainingArguments(output_dir = 'test_trainer') # sets the checkpoint save path

In [117]:
# Setup for accuracy evaluation (unlike Keras/TensorFlow, accuracy is not reported directly during training.)

import numpy as np
import evaluate

# Accuracy metric object used by compute_metrics.
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    """Turn raw model outputs into an accuracy score.

    Args:
        eval_pred: a pair that unpacks into ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        (taken over the last axis of ``logits``) against ``labels``.
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit along the last axis.
    predicted_classes = np.argmax(logits, axis = -1)
    return metric.compute(predictions = predicted_classes, references = labels)

In [118]:
from transformers import TrainingArguments, Trainer

# Training configuration handed to the Trainer.
#   output_dir          - directory passed for checkpoint output
#   evaluation_strategy - 'epoch': evaluate once per epoch (see the per-epoch table in the training output)
#   num_train_epochs    - was left blank, which is a SyntaxError; set explicitly to 3, matching
#                         the original "default = 3" note and the 3-epoch run recorded below
# NOTE(review): newer transformers releases rename `evaluation_strategy` to `eval_strategy` —
# confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir = 'test_trainer',
    evaluation_strategy = 'epoch',
    num_train_epochs = 3, # default = 3
)

In [120]:
# Wire the model, training arguments, the two 1000-row subsets and the accuracy callback
# into a Trainer.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = small_train_dataset,
    eval_dataset = small_eval_dataset,
    compute_metrics = compute_metrics
)

In [121]:
# Run fine-tuning; the table below records validation loss and accuracy after each epoch.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.079821,0.548
2,No log,1.002188,0.576
3,No log,1.050602,0.583


TrainOutput(global_step=375, training_loss=0.9944601236979167, metrics={'train_runtime': 129.4787, 'train_samples_per_second': 23.17, 'train_steps_per_second': 2.896, 'total_flos': 789354427392000.0, 'train_loss': 0.9944601236979167, 'epoch': 3.0})

In [122]:
# Evaluate on the Trainer's eval dataset (small_eval_dataset); the result below matches the
# epoch-3 row of the training log.
trainer.evaluate()

{'eval_loss': 1.0506024360656738,
 'eval_accuracy': 0.583,
 'eval_runtime': 10.3182,
 'eval_samples_per_second': 96.916,
 'eval_steps_per_second': 12.114,
 'epoch': 3.0}

### Keras Version

In [123]:
# Data format conversion is required for this version.
# NOTE(review): this rebinds `dataset`, which previously held the Yelp data.
dataset = load_dataset('glue', 'cola')

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/251k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/37.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/37.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8551 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1043 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1063 [00:00<?, ? examples/s]

In [125]:
# Pull the three CoLA splits out of the DatasetDict (sizes are shown in the next cell).
train_dataset = dataset['train']
valid_dataset = dataset['validation']
test_dataset = dataset['test']

In [130]:
len(train_dataset), len(valid_dataset), len(test_dataset)

(8551, 1043, 1063)

In [136]:
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-cased')
# Tokenize every training sentence in a single call, with padding enabled and NumPy output.
tokenized_data = tokenizer(train_dataset['sentence'], return_tensors = 'np', padding = True)

# Convert the tokenizer output into a plain dict (the form later passed to model.fit).
tokenized_data = dict(tokenized_data)

# Labels as a NumPy array, taken from the same training split as the sentences.
labels = np.array(train_dataset['label'])

In [149]:
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam # optimizer setup

# NOTE(review): this rebinds `model`, which previously held the fine-tuned PyTorch model.
model = TFAutoModelForSequenceClassification.from_pretrained('google-bert/bert-base-cased')
# No loss is passed to compile — presumably the model falls back to its own built-in loss; TODO confirm.
model.compile(optimizer = Adam(3e-5))

# Train on the tokenized training sentences and their labels with Keras defaults.
model.fit(tokenized_data, labels)

# Tokenizer

In [6]:
test_sent_kor = '종근당건강 락토핏 솔루션 2 예민한 장 450mg 30캡슐 X 3박스 3개월분'

In [28]:
# Checkpoint used for the Korean-capable tokenizer.
model_name = 'klue/roberta-small'

# Three tokenizers to compare. Per the reprs shown in later cells, BertTokenizer gives the
# slow implementation (is_fast=False) while AutoTokenizer resolves to BertTokenizerFast.
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-cased')
tokenizer_auto = AutoTokenizer.from_pretrained('bert-base-cased')
tokenizer_hug = AutoTokenizer.from_pretrained(model_name)

In [10]:
tokenizer_bert

BertTokenizer(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [35]:
# Round-trip the Korean sentence through the bert-base-cased BertTokenizer.
# As the output below shows, most of the Korean text comes back as [UNK].
incode = tokenizer_bert(test_sent_kor)
print(incode)

decode = tokenizer_bert.decode(incode['input_ids'])
print(decode)

{'input_ids': [101, 100, 100, 100, 123, 100, 100, 10181, 1306, 1403, 100, 161, 100, 100, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] [UNK] [UNK] [UNK] 2 [UNK] [UNK] 450mg [UNK] X [UNK] [UNK] [SEP]


In [11]:
tokenizer_auto

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [36]:
# Same round trip with the AutoTokenizer-loaded bert-base-cased tokenizer;
# the printed ids and decoded string are identical to the BertTokenizer result.
incode = tokenizer_auto(test_sent_kor)
print(incode)

decode = tokenizer_auto.decode(incode['input_ids'])
print(decode)

{'input_ids': [101, 100, 100, 100, 123, 100, 100, 10181, 1306, 1403, 100, 161, 100, 100, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] [UNK] [UNK] [UNK] 2 [UNK] [UNK] 450mg [UNK] X [UNK] [UNK] [SEP]


In [38]:
tokenizer_hug

BertTokenizerFast(name_or_path='klue/roberta-small', vocab_size=32000, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [39]:
# Same round trip with the klue/roberta-small tokenizer; per the output below the Korean
# sentence is recovered in full, wrapped in [CLS] ... [SEP].
incode = tokenizer_hug(test_sent_kor)
print(incode)

decode = tokenizer_hug.decode(incode['input_ids'])
print(decode)

{'input_ids': [0, 1558, 2169, 2481, 2332, 2280, 943, 2386, 2946, 8463, 22, 11028, 2470, 1526, 13103, 2037, 2064, 3740, 2941, 3381, 60, 23, 13473, 23, 2019, 2429, 2377, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] 종근당건강 락토핏 솔루션 2 예민한 장 450mg 30캡슐 X 3박스 3개월분 [SEP]


### 인코딩

In [60]:
# Split an English sentence into subword tokens with the klue/roberta-small tokenizer
# (the resulting pieces are printed in the next cell).
seq = 'using Using a Transformer network is simple'
tokens = tokenizer_hug.tokenize(seq)

In [61]:
print(len(tokens), tokens)

15 ['us', '##ing', 'U', '##s', '##ing', 'a', 'Trans', '##form', '##er', 'net', '##work', 'is', 's', '##im', '##ple']


In [62]:
print(len(tokenizer_hug(seq)['input_ids']), tokenizer_hug(seq)['input_ids'])
# [CLS], tokens..., [SEP]: the result has 2 more entries than the token list

17 [0, 26099, 4586, 57, 2041, 4586, 68, 23877, 16240, 3762, 17640, 24384, 11376, 86, 6828, 31439, 2]


In [63]:
# Map each subword token to its vocabulary id (no special tokens are added here).
ids = tokenizer_hug.convert_tokens_to_ids(tokens)

print(ids)
# The next step is converting these id values into a tensor (the original note was left unfinished).

[26099, 4586, 57, 2041, 4586, 68, 23877, 16240, 3762, 17640, 24384, 11376, 86, 6828, 31439]


### 디코딩

토큰들을 병합하여, 읽을 수 있는 원본 문장을 도출하는 작업

In [64]:
# Decode the plain token ids back into text.
decoded_string = tokenizer_hug.decode(ids)

print(decoded_string)
# Decoding the full tokenizer output instead also shows the [CLS] / [SEP] markers (see output below).
print(tokenizer_hug.decode(tokenizer_hug(seq)['input_ids']))

using Using a Transformer network is simple
[CLS] using Using a Transformer network is simple [SEP]


### 다중 시퀀스 처리

In [72]:
# Passing return_tensors = 'pt' returns the result as tensors
tokenized_inputs = tokenizer_hug(seq, return_tensors = 'pt')
print(tokenized_inputs)

{'input_ids': tensor([[    0, 26099,  4586,    57,  2041,  4586,    68, 23877, 16240,  3762,
         17640, 24384, 11376,    86,  6828, 31439,     2]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [73]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [92]:
# Checkpoint name shared by the tokenizer and the classification model loaded below.
checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'

# NOTE(review): these rebind `tokenizer` and `model` from the fine-tuning sections above.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [93]:
# Manual pipeline: text -> tokens -> ids -> tensor.
tokens = tokenizer.tokenize(seq)
ids = tokenizer.convert_tokens_to_ids(tokens)
# Built from the flat id list, so this is a 1-D tensor (see the printed output).
input_ids = torch.tensor(ids)

print(tokens)
print(ids)
print(input_ids)

['using', 'using', 'a', 'transform', '##er', 'network', 'is', 'simple']
[2478, 2478, 1037, 10938, 2121, 2897, 2003, 3722]
tensor([ 2478,  2478,  1037, 10938,  2121,  2897,  2003,  3722])


In [94]:
# Calling the tokenizer directly: per the output below the ids come back as a 2-D tensor
# with two extra ids (101 at the start, 102 at the end) compared with the manual id list.
tokenized_inputs = tokenizer(seq, return_tensors = 'pt')

print(tokenized_inputs['input_ids'])

tensor([[  101,  2478,  2478,  1037, 10938,  2121,  2897,  2003,  3722,   102]])


In [95]:
# Change the shape: wrap the id list in another list so the tensor is 2-D (one row)
input_ids = torch.tensor([ids])

In [96]:
model(input_ids)

SequenceClassifierOutput(loss=None, logits=tensor([[ 3.0906, -2.6761]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

### Padding

In [97]:
# Two id sequences of different lengths (name corrected from the misspelled `bathced_ids`).
batched_ids = [
    [200, 200, 200],
    [200, 200]
]

# When several texts are tokenized, the resulting sequences end up with different sizes.
# There are two ways to resolve this: 1. Padding  2. Truncating.

In [99]:
tokenizer.pad_token_id

0

In [100]:
# Compare logits for two sequences run on their own versus batched together with padding.
seq_ids_1 = [[200, 200, 200]]
seq_ids_2 = [[200, 200]]
# The shorter sequence is padded by hand with the tokenizer's pad id so both rows have equal length.
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

print(model(torch.tensor(seq_ids_1)).logits)
print(model(torch.tensor(seq_ids_2)).logits)
# NOTE(review): in the printed output the second batched row does not match seq_ids_2 run alone.
# No attention_mask is passed here, so presumably the pad position is being attended to —
# confirm by also passing an attention_mask tensor of [[1, 1, 1], [1, 1, 0]].
print(model(torch.tensor(batched_ids)).logits)

tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3373, -1.2163]], grad_fn=<AddmmBackward0>)


### Transformer API

In [102]:
checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# A list of several texts can be passed in as well
seq = "I've been waiting for a HuggingFace course my whole life."
seq_list = [
    "I've been waiting for a HuggingFace course my whole life.",
    "So have I!"
]

# Without padding, the two rows in the output below have different lengths.
model_inputs = tokenizer(seq_list) # works through the __call__ method
print(model_inputs)

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 2061, 2031, 1045, 999, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}


In [104]:
# With padding = 'longest', the output below shows the shorter row filled with id 0 up to the
# length of the longer row, and its attention_mask set to 0 at those positions.
model_inputs = tokenizer(seq_list, padding = 'longest')
print(model_inputs)

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 2061, 2031, 1045, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}
