In [None]:
from transformers import AutoTokenizer, AutoModel

text = "What is Huggingface Transformers?"
# BERT
model_ckpt = "bert-base-uncased"
bert_model = AutoModel.from_pretrained(model_ckpt)
bert_tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# PyTorch 이므로 return tensor를 pt로 
encoded_input = bert_tokenizer(text, return_tensors='pt')
bert_output = bert_model(**encoded_input)

  from .autonotebook import tqdm as notebook_tqdm


## HuggingFace에서 RoBERTa

In [2]:
from transformers import AutoModel, AutoTokenizer

# 분류 헤드 렌덤으로 초기화된 모델 -> 파인튜닝 필수
model_ckpt = 'klue/roberta-base'
model = AutoModel.from_pretrained(model_ckpt) 

# 분류 헤드가 포함된 모델 
# from transformers import AutoModelForSequenceClassification
# model_id = 'SamLowe/roberta-base-go_emotions' # Down Stream(emotion)관련 분류기 
# classification_model = AutoModelForSequenceClassification.from_pretrained(model_id)

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_ckpt = 'klue/roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

tokenized = tokenizer("토크나이저는 텍스트를 토큰 단위로 나눈다")
print(tokenized)
# input_ids는 각 토큰이 id로 변환된 값
# token_type_ids는 두개 이상의 문장이 입력으로 주어졌을 때 각 토큰이 어떤 문장에 속해있는지
# attention_mask는 토큰이 원본 문장에 존재하면 1, padding된 토큰들은 0

{'input_ids': [0, 9157, 7461, 2190, 2259, 8509, 2138, 1793, 2855, 5385, 2200, 20950, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [4]:
# 토크나이저는 input_ids를 토큰으로 재변환 가능 
print(tokenizer.convert_ids_to_tokens(tokenized['input_ids']))

['[CLS]', '토크', '##나이', '##저', '##는', '텍스트', '##를', '토', '##큰', '단위', '##로', '나눈다', '[SEP]']


In [5]:
# BERT기반 모델은 [CLS]토큰이 무조건 맨 앞에 위치한다. 
# 보퉁 A, B 두 개의 모델을 받아 NSP로 사전학습 되어있으므로, 문장을 분리할 [SEP] 토큰이 필요하다.
print(tokenizer.decode(tokenized['input_ids']))

[CLS] 토크나이저는 텍스트를 토큰 단위로 나눈다 [SEP]


In [6]:
print(tokenizer.decode(tokenized['input_ids'], skip_special_tokens=True))

토크나이저는 텍스트를 토큰 단위로 나눈다


## 토크나이저에 여러 문장 넣기

In [7]:
tokenizer(['첫 번째 문장', '두 번째 문장'])

{'input_ids': [[0, 1656, 1141, 3135, 6265, 2], [0, 864, 1141, 3135, 6265, 2]], 'token_type_ids': [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [8]:
first_tokenized_result = tokenizer(['첫 번째 문장','두 번째 문장'])['input_ids']
tokenizer.batch_decode(first_tokenized_result)

['[CLS] 첫 번째 문장 [SEP]', '[CLS] 두 번째 문장 [SEP]']

## BERT와 RoBERTa의 토크나이저 비교

- BERT 에서는 사전 학습 과정에서 NSP로 됨 
- RoBERTa 에서는 NSP 작업을 사전 학습 과정에서 제거했기 때문에 문장 토큰 구분이 필요 없음

In [8]:
bert_tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')
bert_tokenizer([['첫 번째 문장','두 번째 문장']])

tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

{'input_ids': [[2, 1656, 1141, 3135, 6265, 3, 864, 1141, 3135, 6265, 3]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [9]:
roberta_tokenizer = AutoTokenizer.from_pretrained('klue/roberta-base')
roberta_tokenizer([['첫 번째 문장', '두 번째 문장']])

{'input_ids': [[0, 1656, 1141, 3135, 6265, 2, 864, 1141, 3135, 6265, 2]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [9]:
# 로컬 데이터도 불러올 수 있음 
from datasets import load_dataset

klue_mrc_dataset = load_dataset('klue','mrc')

In [10]:
klue_mrc_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'context', 'news_category', 'source', 'guid', 'is_impossible', 'question_type', 'question', 'answers'],
        num_rows: 17554
    })
    validation: Dataset({
        features: ['title', 'context', 'news_category', 'source', 'guid', 'is_impossible', 'question_type', 'question', 'answers'],
        num_rows: 5841
    })
})

## 모델 학습
- YNAT 연랍 뉴스 기사의 제목을 바탕으로 카테고리 예측 모델 

In [11]:
klue_tc_train = load_dataset('klue','ynat', split = 'train')
klue_tc_eval = load_dataset('klue','ynat', split = 'validation')
klue_tc_train

Dataset({
    features: ['guid', 'title', 'label', 'url', 'date'],
    num_rows: 45678
})

In [12]:
klue_tc_train[0]

{'guid': 'ynat-v1_train_00000',
 'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영',
 'label': 3,
 'url': 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=227&oid=001&aid=0008508947',
 'date': '2016.06.30. 오전 10:36'}

In [13]:
klue_tc_train.features['label'].names

['IT과학', '경제', '사회', '생활문화', '세계', '스포츠', '정치']

In [14]:
klue_tc_train = klue_tc_train.remove_columns(['guid','url','date'])
klue_tc_eval = klue_tc_eval.remove_columns(['guid','url','date'])
klue_tc_train

Dataset({
    features: ['title', 'label'],
    num_rows: 45678
})

In [15]:
# int2str -> ID를 string으로 변환해주는 메서드
klue_tc_train.features['label'].int2str(1)
klue_tc_train.features['label']

ClassLabel(names=['IT과학', '경제', '사회', '생활문화', '세계', '스포츠', '정치'], id=None)

In [16]:
klue_tc_label = klue_tc_train.features['label']

def make_str_label(batch):
    batch['label_str'] = klue_tc_label.int2str(batch['label'])
    return batch

klue_tc_train = klue_tc_train.map(make_str_label, batched = True, batch_size = 1000)

klue_tc_train[0]

{'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영', 'label': 3, 'label_str': '생활문화'}

In [None]:
# 전체 train 데이터에서 1000개의 샘플을 떼어내는데 이는 test로 반환됨 -> train_dataset 
train_dataset = klue_tc_train.train_test_split(test_size = 1000, shuffle = True, seed = 42)['test']
dataset = klue_tc_eval.train_test_split(test_size = 1000, shuffle = True, seed = 42)

test_dataset = dataset['test']
# dataset의 train중 1000개 valid_dataset으로 
valid_dataset = dataset['train'].train_test_split(test_size=1000, shuffle=True, seed=42)['test']

## HuggingFace의 Trainer API

In [34]:
import torch
import numpy as np
from transformers import (
    Trainer,
    TrainingArguments,
    AutoModelForSequenceClassification,
    AutoTokenizer
)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

def tokenize_function(examples):
    return tokenizer(examples['title'], padding = "max_length", truncation = True)

model_ckpt = 'klue/roberta-base'
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels = len(train_dataset.features['label'].names))

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
train_dataset = train_dataset.map(tokenize_function, batched = False)
valid_dataset = valid_dataset.map(tokenize_function, batched = False)
test_dataset = test_dataset.map(tokenize_function, batched = False)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [35]:
training_args = TrainingArguments(
    output_dir = "./results",
    num_train_epochs = 1,
    per_device_train_batch_size = 8, # 배치 크기 
    per_device_eval_batch_size = 8,
    eval_strategy = 'epoch', # 평가 수행 빈도 설정 
    learning_rate = 5e-5,
    push_to_hub = False,
)

# HugginhFace의 Trainer 모델은 compute_metrics를 함수로 정의한 후 받음
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis = -1)
    return {"accuracy":(predictions == labels).mean()}

In [36]:
# Trainer API에서는 자동으로 모델을 이동시킴 -> to(device)
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = valid_dataset,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics,
)

trainer.train()

trainer.evaluate(test_dataset)

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.718538,0.784


{'eval_loss': 0.7148598432540894,
 'eval_accuracy': 0.776,
 'eval_runtime': 478.1066,
 'eval_samples_per_second': 2.092,
 'eval_steps_per_second': 0.261,
 'epoch': 1.0}

## Trainer API를 사용하지 않는 경우

In [67]:
import torch
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from torch.optim import AdamW # 가중치 조정 Adam

# "title"에 대한 전처리 
def tokenizer_function(example):
    return tokenizer(example['title'], padding = "max_length", truncation = True)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_ckpt = "klue/roberta-base"

num_labels = len(train_dataset.features['label'].names)
print(f"Label:{train_dataset.features['label'].names}")
print(f"Number of labels: {num_labels}") # 확인용 출력
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels = num_labels)

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model.to(device)



Label:['IT과학', '경제', '사회', '생활문화', '세계', '스포츠', '정치']
Number of labels: 7


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [62]:
train_dataset = train_dataset.map(tokenize_function, batched = False)
valid_dataset = valid_dataset.map(tokenize_function, batched = False)
test_dataset = test_dataset.map(tokenize_function, batched = False)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [63]:
def make_dataloader(dataset, batch_size, shuffle = True):
    # PyTorch에 맞게 with_format으로 tensor화
    # with_format은 torch.Tensor와 동일한 효과 
    # batched = True로 배치 단위로 데이터 처리 
    dataset = dataset.map(tokenize_function, batched = True).with_format('torch')
    # transformer는 labels를 입력으로 기대하므로 컬럼명 변경 
    # outputs = model(input_ids, attention_mask, labels)
    dataset = dataset.rename_column("label","labels")
    dataset = dataset.remove_columns(column_names = ["title"]) # 불필요한 컬럼 제거 
    return DataLoader(dataset, batch_size = batch_size, shuffle = shuffle)

train_dataloader = make_dataloader(train_dataset, batch_size = 8, shuffle = True)
valid_dataloader = make_dataloader(valid_dataset, batch_size = 8, shuffle = False)
test_dataloader = make_dataloader(test_dataset, batch_size = 8, shuffle = False)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [71]:
def train_epoch(model, data_loader, optimizer):
    model.train() # 모델 학습 모드
    total_loss = 0
    
    # 데이터로더를 불러와서 batch 단위로 반복 
    for batch in tqdm(data_loader):
        optimizer.zero_grad()
        
        # 각 배치 단위 데이터에서 트랜스포머에 맞는 입력값 추출
        # to(device)로 장치 통일 
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # print("--- Batch Info ---")
        # print(f"input_ids shape: {input_ids.shape}, dtype: {input_ids.dtype}")
        # print(f"attention_mask shape: {attention_mask.shape}, dtype: {attention_mask.dtype}")
        # print(f"labels shape: {labels.shape}, dtype: {labels.dtype}")
        # print("------------------")
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        total_loss += loss.item() # item() -> 파이썬의 float 값으로 변화
    
    avg_loss = total_loss / len(data_loader)
    
    return avg_loss

In [69]:
def evaluate(model, data_loader):
    model.eval() # 모델 추론 단계
    
    total_loss = 0
    predictions = []
    true_labels = []
    
    with torch.inference_mode():
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            loss = outputs.loss
            
            total_loss += loss.item()
            
            preds = torch.argmax(logits, dim = 1)
            
            # 예측 리스트와 정답 레이블 확장 
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
        
    avg_loss = total_loss / len(data_loader)
    acc = np.mean(np.asarray(predictions) == np.asarray(true_labels))
    
    return avg_loss, acc
            

In [72]:
num_epochs = 1
optimizer = AdamW(model.parameters(), lr=5e-5)

# 학습 루프
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    train_loss = train_epoch(model, train_dataloader, optimizer)
    print(f"Training loss: {train_loss}")
    valid_loss, valid_accuracy = evaluate(model, valid_dataloader)
    print(f"Validation loss: {valid_loss}")
    print(f"Validation accuracy: {valid_accuracy}")

# Testing
_, test_accuracy = evaluate(model, test_dataloader)
print(f"Test accuracy: {test_accuracy}")

Epoch 1/1


  0%|          | 0/125 [00:00<?, ?it/s]

Training loss: 1.0435428705215455


  0%|          | 0/125 [00:00<?, ?it/s]

Validation loss: 0.8621064586639404
Validation accuracy: 0.729


  0%|          | 0/125 [00:00<?, ?it/s]

Test accuracy: 0.728
