<a href="https://colab.research.google.com/github/haegomm/ai_practice/blob/master/distilbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DistilBERT로 뉴스 기사 분류 모델 학습하기

In [None]:
!pip install tqdm boto3 requests regex sentencepiece sacremoses datasets



## DistilBERT pre-trained tokenizer 불러오기

In [None]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader

# Fetch the tokenizer for the uncased DistilBERT checkpoint through torch.hub.
tokenizer = torch.hub.load(
    'huggingface/pytorch-transformers',  # hub repository
    'tokenizer',                         # hub entry point
    'distilbert-base-uncased',           # checkpoint name
)

Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main


## [MY CODE] AG News 데이터셋 불러오기

In [None]:
# Load the AG News dataset by its hub identifier; the 'train' and 'test'
# splits of `ds` feed the data loaders.
ds = load_dataset(path="fancyzhx/ag_news")


def collate_fn(batch):
  """Turn a list of dataset rows into a padded batch of token ids and labels.

  Args:
    batch: Sequence of rows, each a mapping with a 'text' entry and a
      'label' entry.

  Returns:
    Tuple ``(texts, labels)`` of ``torch.long`` tensors. ``texts`` holds the
    token ids of every row padded to a common length; ``labels`` holds one
    class index per row.
  """
  raw_texts = [row['text'] for row in batch]
  raw_labels = [row['label'] for row in batch]

  # padding=True pads every text to the longest one in this batch.
  # truncation=True caps each text at the tokenizer's maximum length so an
  # unusually long article cannot index past the encoder's position-embedding
  # table; texts within the limit are encoded exactly as before.
  encoded = tokenizer(raw_texts, padding=True, truncation=True)

  texts = torch.tensor(encoded.input_ids, dtype=torch.long)
  labels = torch.tensor(raw_labels, dtype=torch.long)

  return texts, labels

# Wrap the train and test splits in batched loaders; only the training
# split is shuffled.
loader_options = {'batch_size': 64, 'collate_fn': collate_fn}
train_loader = DataLoader(ds['train'], shuffle=True, **loader_options)
test_loader = DataLoader(ds['test'], shuffle=False, **loader_options)

In [None]:
# Load the bare pretrained DistilBERT encoder and display its architecture.
model = torch.hub.load(
    'huggingface/pytorch-transformers',
    'model',
    'distilbert-base-uncased',
)
model

Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main


DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

## [MY CODE] 출력 차원 조정

In [None]:
from torch import nn

# DistilBERT-based text classification model
class TextClassifier(nn.Module):
  """Pretrained DistilBERT encoder followed by a linear classification head.

  The head reads the hidden state of the first token of each sequence and
  maps it to one logit per class.

  Args:
    num_classes: Number of output classes. Defaults to 4 (World, Sports,
      Business, Science/Technology).
    pad_token_id: Token id used for padding. Defaults to 0, the padding index
      of the DistilBERT word-embedding table.
  """

  def __init__(self, num_classes=4, pad_token_id=0):
    super().__init__()

    # Load the pretrained DistilBERT encoder.
    self.encoder = torch.hub.load('huggingface/pytorch-transformers', 'model', 'distilbert-base-uncased')
    # 768 is the encoder's hidden size; the head emits one logit per class.
    self.classifier = nn.Linear(768, num_classes)
    self.pad_token_id = pad_token_id

  def forward(self, x, attention_mask=None):
    """Compute class logits for a batch of token ids.

    Args:
      x: Integer tensor of token ids, one row per sequence.
      attention_mask: Optional tensor with the same shape as ``x`` holding 1
        for real tokens and 0 for padding. When omitted it is derived from
        ``x`` by treating ``pad_token_id`` as padding.

    Returns:
      Tensor of logits with one row per sequence and one column per class.
    """
    if attention_mask is None:
      # Without a mask the encoder attends to padding, so the output for a
      # sequence would depend on how much padding its batch happens to need.
      attention_mask = (x != self.pad_token_id).long()

    x = self.encoder(x, attention_mask=attention_mask)['last_hidden_state']
    x = self.classifier(x[:, 0])  # classify from the first ([CLS]) token

    return x

# Instantiate the classifier; this rebinds `model`, which previously held the
# bare encoder loaded for inspection.
model = TextClassifier()

Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main


In [None]:
# Freeze the DistilBERT encoder (transfer learning).
# Every encoder weight is excluded from gradient updates, so only the
# classification layer is trained; this speeds up training and limits
# overfitting.
for param in model.encoder.parameters():
  param.requires_grad_(False)

## [MY CODE] 정확도 평가 수정: 가장 높은 확률을 가진 클래스 선택

In [None]:
def accuracy(model, dataloader, device=None):
  """Return the fraction of samples in ``dataloader`` that ``model`` classifies correctly.

  The model is switched to evaluation mode and gradients are disabled for the
  duration of the call, so dropout does not perturb the predictions. The
  model's previous training/evaluation mode is restored before returning.

  Args:
    model: Module mapping a batch of inputs to per-class scores on its last
      dimension.
    dataloader: Iterable yielding ``(inputs, labels)`` tensor pairs.
    device: Device the batches are moved to. When omitted, the device of the
      model's first parameter is used; a model without parameters leaves the
      batches where they are.

  Returns:
    Accuracy as a float in [0, 1], or 0.0 when the dataloader yields no samples.
  """
  if device is None:
    first_param = next(model.parameters(), None)
    if first_param is not None:
      device = first_param.device

  was_training = model.training
  model.eval()

  cnt = 0
  acc = 0

  try:
    with torch.no_grad():
      for inputs, labels in dataloader:
        if device is not None:
          inputs, labels = inputs.to(device), labels.to(device)

        # Pick the class with the highest score.
        preds = torch.argmax(model(inputs), dim=-1)

        cnt += labels.shape[0]
        acc += (labels == preds).sum().item()
  finally:
    # Put the model back in whatever mode the caller had it in.
    model.train(was_training)

  return acc / cnt if cnt else 0.0

## [MY CODE] 다중 클래스 분류 손실 함수 사용
## [LOG] train loss 출력

In [None]:
from torch.optim import Adam
import numpy as np
import matplotlib.pyplot as plt


# Train on the GPU when one is available; otherwise fall back to the CPU
# instead of failing on a hard-coded 'cuda' device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

lr = 0.001
model = model.to(device)
loss_fn = nn.CrossEntropyLoss()  # multi-class classification loss over the class logits

optimizer = Adam(model.parameters(), lr=lr)
n_epochs = 10

# Per-epoch history of the summed training loss and the train/test accuracy.
train_losses = []
train_accuracies = []
test_accuracies = []

# If the model exposes an `encoder` whose weights are all frozen, it is used
# as a fixed feature extractor and should not run with dropout enabled.
encoder = getattr(model, 'encoder', None)
encoder_frozen = encoder is not None and not any(
    p.requires_grad for p in encoder.parameters()
)

for epoch in range(n_epochs):
  total_loss = 0.
  model.train()
  if encoder_frozen:
    # model.train() re-enabled dropout in the frozen encoder; switch it back
    # off so the classifier is trained on deterministic features.
    encoder.eval()

  for inputs, labels in train_loader:
    optimizer.zero_grad()
    inputs, labels = inputs.to(device), labels.to(device)

    preds = model(inputs)  # logits, one column per class
    loss = loss_fn(preds, labels)
    loss.backward()
    optimizer.step()

    # Accumulates the sum of the per-batch losses over the epoch.
    total_loss += loss.item()

  train_losses.append(total_loss)
  train_acc = accuracy(model, train_loader)
  test_acc = accuracy(model, test_loader)
  train_accuracies.append(train_acc)
  test_accuracies.append(test_acc)

  print(f"Epoch {epoch + 1}/{n_epochs} | Train Loss: {total_loss:.4f} | Train Accuracy: {train_acc:.3f} | Test Accuracy: {test_acc:.3f}")

Epoch 1/10 | Train Loss: 885.9392 | Train Accuracy: 0.873 | Test Accuracy: 0.870
Epoch 2/10 | Train Loss: 702.1948 | Train Accuracy: 0.881 | Test Accuracy: 0.881
Epoch 3/10 | Train Loss: 671.8452 | Train Accuracy: 0.881 | Test Accuracy: 0.879
Epoch 4/10 | Train Loss: 659.7851 | Train Accuracy: 0.883 | Test Accuracy: 0.881
Epoch 5/10 | Train Loss: 650.7018 | Train Accuracy: 0.881 | Test Accuracy: 0.877
Epoch 6/10 | Train Loss: 644.9291 | Train Accuracy: 0.880 | Test Accuracy: 0.879
Epoch 7/10 | Train Loss: 642.3082 | Train Accuracy: 0.884 | Test Accuracy: 0.885
Epoch 8/10 | Train Loss: 637.2493 | Train Accuracy: 0.883 | Test Accuracy: 0.881
Epoch 9/10 | Train Loss: 637.4252 | Train Accuracy: 0.883 | Test Accuracy: 0.881
Epoch 10/10 | Train Loss: 635.1702 | Train Accuracy: 0.884 | Test Accuracy: 0.881
