In [2]:
!pip install tqdm boto3 requests regex sentencepiece sacremoses datasets



In [4]:
import torch
from torchinfo import summary
from datasets import load_dataset
from torch.utils.data import DataLoader
import transformers
from transformers import DistilBertTokenizer, DistilBertConfig, DistilBertModel

# DistilBERTTokenizer 가져오기 (pretrained)
# 이 tokenizer는 문장을 토큰화해서 모델이 이해할 수 있는 input_ids로 변환해줌
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
tokenizer

DistilBertTokenizer(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

## special token
- [PAD] : 길이를 맞춰주기위해 넣어주는 토큰
- [UNK] : Tokenizer에 없는 단어를 대체하는 토큰
- [CLS] : 문장의 시작을 구분하는 토큰
- [SEP] : 문장의 끝을 구분하는 토큰
- [MASK] : 특정 토큰을 가리는 토큰

In [11]:
# 뉴스 기사 데이터셋의 5%만 가져오기
train_ds = load_dataset("fancyzhx/ag_news", split="train[:5%]")
test_ds = load_dataset("fancyzhx/ag_news", split="test[:5%]")

README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [51]:
# 데이터셋 형태 확인하기

train_text = train_ds['text']
train_label = train_ds['label']

test_text = test_ds['text']
test_label = test_ds['label']

print(f'train data length: {len(train_text)}')
print(f'train label length: {len(train_label)}')
print(f'train sample: {train_text[0]} | {train_label[0]}')

print(f'test data length: {len(test_text)}')
print(f'test label length: {len(test_label)}')
print(f'test sample: {test_text[0]} | {test_label[0]}')

train data length: 6000
train label length: 6000
train sample: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again. | 2
test data length: 380
test label length: 380
test sample: Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul. | 2


# huggingface : dataset 정보
* text: a string feature => 기사  
* label: a classification label (4개로 분류됨)
 : World (0), Sports (1), Business (2), Sci/Tech (3)

In [55]:
# 데이터를 배치로 묶는 함수 정의
def collate_fn(batch):
    max_len = 400  # 입력 문장의 최대 길이 설정
    texts, labels = [], []  # 입력 문장들과 라벨들을 저장할 리스트

    # 배치 내 각 샘플에 대해 text와 label 추출
    for row in batch:
        labels.append(row['label'])
        texts.append(row['text'])

    # tokenizer로 텍스트를 토큰화 : truncation=False로 수정
    texts = torch.LongTensor(
        tokenizer(texts, padding=True, truncation=False, max_length=max_len).input_ids
    )

    # 라벨 리스트를 LongTensor로 변환
    labels = torch.LongTensor(labels)

    # 모델 학습에 필요한 입력 (토큰화된 문장들)과 정답 라벨 반환
    return texts, labels

# 학습용 DataLoader 정의 (shuffle=True로 배치 순서 랜덤화)
train_loader = DataLoader(
    train_ds, batch_size=64, shuffle=True, collate_fn=collate_fn
)

# 테스트용 DataLoader 정의 (shuffle=False로 배치 순서 고정)
test_loader = DataLoader(
    test_ds, batch_size=64, shuffle=False, collate_fn=collate_fn
)

In [57]:
# 모델 가져오기
config = DistilBertConfig()
model = DistilBertModel(config)
print(model)
print(summary(model))

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [59]:
from torch import nn

# 텍스트 분류 모델 정의 (DistilBERT + Linear layer)
class TextClassifier(nn.Module):
    def __init__(self):
        super().__init__()

        # 사전학습된 DistilBERT 모델을 encoder로 사용 (pretrained transformer)
        self.encoder = DistilBertModel.from_pretrained('distilbert-base-uncased')

        # [CLS] 토큰 분류기 정의
        self.classifier = nn.Linear(768, 4) # label의 분류가 4개이므로 출력 차원 조절

    def forward(self, x):
        # encoder에 input_ids 전달
        x = self.encoder(x)['last_hidden_state']

        # [CLS] 토큰 위치 벡터를 classification head에 전달
        x = self.classifier(x[:, 0])

        return x  # logit 출력

model = TextClassifier()

In [61]:
for param in model.encoder.parameters():
    param.requires_grad = False

In [63]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

In [67]:
from torch.optim import Adam
import numpy as np
import matplotlib.pyplot as plt

lr = 0.001
model = model.to(device)
# 일반 분류 : loss_fn으로 CrossEntropyLoss 사용
loss_fn = nn.CrossEntropyLoss() 

optimizer = Adam(model.parameters(), lr=lr)
n_epochs = 10

for epoch in range(n_epochs):
    total_loss = 0.
    model.train()

    for data in train_loader:
        model.zero_grad()

        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

        preds = model(inputs)

        loss = loss_fn(preds, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1:3d} | Train Loss: {total_loss:.4f}')

Epoch   1 | Train Loss: 125.3775
Epoch   2 | Train Loss: 80.8265
Epoch   3 | Train Loss: 64.4224
Epoch   4 | Train Loss: 58.1567
Epoch   5 | Train Loss: 53.9230
Epoch   6 | Train Loss: 51.5969
Epoch   7 | Train Loss: 49.8435
Epoch   8 | Train Loss: 48.0503
Epoch   9 | Train Loss: 46.9408
Epoch  10 | Train Loss: 46.2360


* nn.crossEntropyLoss에 label로 정수형 LongTensor를 넣어줬어야 함  
  labels.to(device).float()에서 labels.to(device)로 수정

In [78]:
def accuracy(model, dataloader):
    cnt = 0      # 전체 샘플 수
    acc = 0      # 정답 개수 누적

    for data in dataloader:
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

        preds = model(inputs)  # (batch_size, num_classes = 4)
        # print(preds.shape)
        preds = torch.argmax(preds, dim=-1) # 출력 차원 조정

        cnt += labels.size(0)  # 총 샘플 수 누적
        acc += (labels == preds).sum().item()  # 예측이 맞은 수 누적

    return acc / cnt  # 정확도 반환

# 평가 시 gradient 계산 비활성화
with torch.no_grad():
    model.eval()  # 평가 모드로 전환 (계산 비활성화)
    train_acc = accuracy(model, train_loader)
    test_acc = accuracy(model, test_loader)

    print(f"=========> Train acc: {train_acc:.3f} | Test acc: {test_acc:.3f}")

