In [1]:
!pip install transformers datasets accelerate -q
!pip install wandb -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dep

# 텍스트 분류

네이버 영화 리뷰 감성 분류: Naver sentiment movie corpus (NSMC)

데이터셋
- train: 150K
- test: 50K

In [2]:
import torch
from datasets import load_dataset
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, pipeline
from transformers import Trainer, TrainingArguments
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
import os
import gc
from tqdm.notebook import trange, tqdm

In [3]:
dataset_id = "e9t/nsmc"
model_id = "klue/roberta-small"

# 데이터셋 로드

In [4]:
ds = load_dataset(dataset_id, trust_remote_code=True)

Downloading builder script:   0%|          | 0.00/3.18k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.33M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.89M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/150000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [5]:
print(ds)

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})


In [6]:
ds['train'][0]

{'id': '9976970', 'document': '아 더빙.. 진짜 짜증나네요 목소리', 'label': 0}

# 토크나이저 설정

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

토크나이징 함수 정의

In [8]:
def tokenize_function(examples):
    return tokenizer(examples["document"], truncation=True, padding=True)

데이터셋에 토크나이징 적용

In [9]:
tokenized_datasets = ds.map(tokenize_function, batched=True)

Map:   0%|          | 0/150000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets['train'][0]

구글 코랩 T4에서 실습시 오래 걸리기 때문에 상위 10%씩의 데이터만 사용

In [12]:
tokenized_datasets['train'] = tokenized_datasets['train'].select(range(len(tokenized_datasets['train'])//10))
tokenized_datasets['test'] = tokenized_datasets['test'].select(range(len(tokenized_datasets['test']) // 10))

# DataCollator 설정

In [13]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## 확인

In [14]:
ts = tokenized_datasets['train']
print(ts.features)

{'id': Value(dtype='string', id=None), 'document': Value(dtype='string', id=None), 'label': ClassLabel(names=['negative', 'positive'], id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


In [15]:
samples = [{k: ts[i][k] for k in ts.features if k in ['input_ids', 'token_type_ids', 'attention_mask', 'label']} for i in range(4)]

In [16]:
# DataCollator 적용
batch = data_collator(samples)

In [None]:
samples

In [20]:
print(batch['input_ids'].shape)
print(batch['token_type_ids'].shape)
print(batch['attention_mask'].shape)
# print(batch['label'].shape

torch.Size([4, 105])
torch.Size([4, 105])
torch.Size([4, 105])


# 모델 로드

In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

커스텀 레이블 추가

In [23]:
# 레이블 정의
id2label = {0: "부정", 1: "긍정"}
label2id = {"부정": 0, "긍정": 1}

# 설정 로드 및 레이블 지정
config = AutoConfig.from_pretrained(model_id,
                                    num_labels=2,
                                    id2label=id2label,
                                    label2id=label2id)



config.json:   0%|          | 0.00/545 [00:00<?, ?B/s]

In [24]:
model = AutoModelForSequenceClassification.from_pretrained(model_id, config=config)

model.safetensors:   0%|          | 0.00/273M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 평가 메트릭 정의

In [25]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Trainer 설정

In [26]:
os.environ["WANDB_PROJECT"]="HF_workshop"

wandb 계정이 있다면 셀 실행 / 없다면 패스

In [27]:
from google.colab import userdata
import wandb

wandb_api_key = userdata.get('WANDB_API_KEY')
if wandb_api_key:
    wandb.login(key=wandb_api_key)
    print("Successfully logged in to Weights & Biases")
else:
    print("WANDB_API_KEY not found in Colab secrets")

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Successfully logged in to Weights & Biases


In [28]:

training_args = TrainingArguments(
    output_dir="./model-by-trainer",
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_ratio=0.1, # 전체 스텝의 10%는 웜업에 사용
    learning_rate=1e-4, # 학습률
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="wandb", # wandb 계정이 없다면 "none" 으로 설정
    run_name="nsmc-text-classfication",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,

)



# 훈련

## Trainer 활용 훈련

참고: 샘플링하지 않고 전체 데이터 훈련시
- T4 에서는 에포크 당 25분 걸림
- RTX4090에서는 에포크 당 3분40초 걸림

In [29]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mjangmin-o[0m ([33mozlab[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3256,0.342808,0.8548,0.856295,0.854941,0.857653
2,0.2231,0.323206,0.8688,0.873652,0.849438,0.899286
3,0.1555,0.412214,0.8744,0.87584,0.873423,0.878271


TrainOutput(global_step=705, training_loss=0.2607790051622594, metrics={'train_runtime': 491.109, 'train_samples_per_second': 91.629, 'train_steps_per_second': 1.436, 'total_flos': 1478118839675040.0, 'train_loss': 0.2607790051622594, 'epoch': 3.0})

## 커스텀 훈련 루프

In [None]:
# 메모리 해제 (완벽하게 해제되지는 않음)
del model
del trainer
torch.cuda.empty_cache()  # GPU 메모리 정리
gc.collect()  # 가비지 컬렉션 실행

1413

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_id, config=config)
model.to(device)

compact_tokenized_datasets = tokenized_datasets.remove_columns(['id', 'document'])

train_dataloader = DataLoader(compact_tokenized_datasets["train"], batch_size=32, shuffle=True, collate_fn=data_collator)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f"Epoch {epoch+1}/{num_epochs} completed")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch 1/3 completed


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch 2/3 completed


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch 3/3 completed


## 커스텀 훈련 검증 평가

In [None]:
model.eval()
eval_dataloader = DataLoader(compact_tokenized_datasets["test"], batch_size=64, collate_fn=data_collator)

total_preds = []
total_labels = []

with torch.no_grad():
    for batch in tqdm(eval_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=-1)
        total_preds.extend(preds.cpu().numpy())
        total_labels.extend(batch["labels"].cpu().numpy())

accuracy = accuracy_score(total_labels, total_preds)
precision, recall, f1, _ = precision_recall_fscore_support(total_labels, total_preds, average='binary')

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

  0%|          | 0/79 [00:00<?, ?it/s]

Accuracy: 0.8678
F1 Score: 0.8671890697207152
Precision: 0.8790224032586558
Recall: 0.8556701030927835


# 파이프라인을 통한 테스트

In [30]:
classification_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device)

In [32]:
classification_pipeline("환불각.")

[{'label': '부정', 'score': 0.8619368076324463}]