## Step 1: Dataset 준비 및 사전 작업

In [2]:
# 필요한 라이브러리 설치 및 불러오기
!pip install tqdm boto3 requests regex sentencepiece sacremoses datasets

import torch
from datasets import load_dataset
from torch.utils.data import DataLoader

# Fetch the pre-trained DistilBERT tokenizer through torch.hub
tokenizer = torch.hub.load(
    'huggingface/pytorch-transformers',
    'tokenizer',
    'distilbert-base-uncased',
)

# Load the AG News dataset (its 'train' and 'test' splits are used further down)
ds = load_dataset("fancyzhx/ag_news")

# Collate function that tokenizes a batch without truncation
def collate_fn(batch):
    """Convert a batch of dataset rows into tensors for the model.

    Args:
        batch: sequence of rows, each indexable by ``'label'`` and ``'text'``.

    Returns:
        A tuple ``(input_ids, attention_mask, labels)``. The first two are taken
        from the tokenizer output (requested as PyTorch tensors) and ``labels``
        is a ``torch.LongTensor`` built from the row labels.
    """
    pairs = [(row['label'], row['text']) for row in batch]
    label_list = [label for label, _ in pairs]
    text_list = [text for _, text in pairs]

    # Only padding is requested; no truncation argument is passed, so texts are
    # tokenized in full.
    encoded = tokenizer(text_list, padding=True, return_tensors="pt")

    return encoded["input_ids"], encoded["attention_mask"], torch.LongTensor(label_list)

# Build the data loaders: the training split is shuffled, the test split is not
train_loader = DataLoader(
    ds['train'],
    batch_size=64,
    shuffle=True,
    collate_fn=collate_fn,
)
test_loader = DataLoader(
    ds['test'],
    batch_size=64,
    shuffle=False,
    collate_fn=collate_fn,
)

Collecting boto3
  Downloading boto3-1.35.31-py3-none-any.whl.metadata (6.6 kB)
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting botocore<1.36.0,>=1.35.31 (from boto3)
  Downloading botocore-1.35.31-py3-none-any.whl.metadata (5.6 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3)
  Downloading s3transfer-0.10.2-py3-none-any.whl.metadata (1.7 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multi

Downloading: "https://github.com/huggingface/pytorch-transformers/zipball/main" to /root/.cache/torch/hub/main.zip
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

## Step 2: 모델 정의 및 수정

In [5]:
from torch import nn

# Text classification model that uses DistilBERT as its encoder
class TextClassifier(nn.Module):
    """Pre-trained encoder followed by a single linear classification head.

    Args:
        num_classes: number of output classes. Defaults to 4, the value that
            was previously hard-coded.
        model_name: checkpoint name handed to ``torch.hub.load``. Defaults to
            ``'distilbert-base-uncased'``, the checkpoint used before.
    """

    def __init__(self, num_classes=4, model_name='distilbert-base-uncased'):
        super().__init__()
        self.encoder = torch.hub.load('huggingface/pytorch-transformers', 'model', model_name)
        # Size the head from the encoder's config when it exposes a hidden size;
        # otherwise fall back to the previously hard-coded 768.
        encoder_config = getattr(self.encoder, 'config', None)
        hidden_size = getattr(encoder_config, 'hidden_size', 768)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class scores computed from the first sequence position.

        Args:
            input_ids: token ids forwarded to the encoder.
            attention_mask: attention mask forwarded to the encoder.

        Returns:
            Output of the linear head applied to index 0 along dimension 1 of
            the encoder's ``'last_hidden_state'``.
        """
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs['last_hidden_state']
        # Only the first position (the [CLS] token) is used for classification
        return self.classifier(hidden_states[:, 0])

# Instantiate the classifier, then move it onto the GPU
model = TextClassifier()
model = model.to('cuda')

# Cross-entropy loss for multi-class classification
loss_fn = nn.CrossEntropyLoss()


Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main


## Step 3: 함수 수정 및 평가

In [6]:
from torch.optim import Adam

# Optimizer: Adam over all model parameters, with a reduced learning rate
optimizer = Adam(params=model.parameters(), lr=1e-4)

# Accuracy for multi-class classification
def accuracy(model, dataloader):
    """Return the fraction of examples whose arg-max prediction equals the label.

    The model is switched to eval mode and is left in eval mode on return.
    Gradients are disabled during the pass.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` that
            returns per-class scores along the last dimension.
        dataloader: iterable yielding ``(input_ids, attention_mask, labels)``
            tensor triples.

    Returns:
        Accuracy as a float; ``0.0`` when the dataloader yields no examples
        (previously this raised ``ZeroDivisionError``).
    """
    # Use the device the model's parameters live on instead of assuming CUDA,
    # so the function also works for a model kept on the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    n_seen = 0
    n_correct = 0
    model.eval()

    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            scores = model(input_ids, attention_mask)
            preds = torch.argmax(scores, dim=-1)  # class with the highest score

            n_seen += labels.size(0)
            n_correct += (preds == labels).sum().item()

    if n_seen == 0:
        return 0.0
    return n_correct / n_seen

# Training loop: the loss summed over all batches is printed after every epoch
n_epochs = 10
for epoch in range(n_epochs):
    model.train()
    total_loss = 0

    for input_ids, attention_mask, labels in train_loader:
        input_ids = input_ids.to('cuda')
        attention_mask = attention_mask.to('cuda')
        labels = labels.to('cuda')

        optimizer.zero_grad()
        preds = model(input_ids, attention_mask)
        loss = loss_fn(preds, labels)
        loss.backward()
        optimizer.step()

        # .item() extracts a Python number, so no autograd graph is kept alive
        total_loss += loss.item()

    print(f"Epoch {epoch+1:2d} | Train Loss: {total_loss:.4f}")

# Measure accuracy on both splits once training has finished
train_acc = accuracy(model, train_loader)
test_acc = accuracy(model, test_loader)
print(f"=========> Train Accuracy: {train_acc:.3f} | Test Accuracy: {test_acc:.3f}")


Epoch  1 | Train Loss: 407.3806
Epoch  2 | Train Loss: 246.2814
Epoch  3 | Train Loss: 171.6969
Epoch  4 | Train Loss: 122.9436
Epoch  5 | Train Loss: 89.0730
Epoch  6 | Train Loss: 72.3245
Epoch  7 | Train Loss: 54.9770
Epoch  8 | Train Loss: 50.3001
Epoch  9 | Train Loss: 45.6764
Epoch 10 | Train Loss: 36.9729


## Step 4: 샘플 출력

In [7]:
import random

# Print sample predictions for a quick qualitative check
def print_sample_predictions(model, dataloader, num_samples=10):
    """Print decoded text, prediction and label for random examples of one batch.

    Only the first batch yielded by ``dataloader`` is inspected. Examples are
    drawn without replacement, so no example is printed twice; at most
    ``min(num_samples, batch size)`` examples are shown. Nothing is printed
    when the dataloader or its first batch is empty.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` that
            returns per-class scores along the last dimension.
        dataloader: iterable yielding ``(input_ids, attention_mask, labels)``
            tensor triples.
        num_samples: maximum number of examples to print.
    """
    model.eval()

    first_batch = next(iter(dataloader), None)
    if first_batch is None:
        return
    input_ids, attention_mask, labels = first_batch

    # Use the device the model's parameters live on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    labels = labels.to(device)

    with torch.no_grad():
        preds = torch.argmax(model(input_ids, attention_mask), dim=-1)

    batch_size = input_ids.shape[0]
    n_to_show = max(0, min(num_samples, batch_size))
    # random.sample draws distinct indices, unlike repeated random.randint calls
    chosen = random.sample(range(batch_size), n_to_show)

    for i, idx in enumerate(chosen):
        print(f"Sample {i + 1}:")
        print(f"Text: {tokenizer.decode(input_ids[idx], skip_special_tokens=True)}")
        print(f"Prediction: {preds[idx].item()} | Actual: {labels[idx].item()}")
        print('-' * 50)

# Show sample predictions drawn from the test loader
print_sample_predictions(model=model, dataloader=test_loader)


Sample 1:
Text: johnson back to his best as d - backs end streak new york ( reuters ) - randy johnson struck out 14 batters in 8 1 / 3 innings to help the arizona diamondbacks end a nine - game losing streak with a 2 - 0 win over the host new york mets in the national league sunday.
Prediction: 1 | Actual: 1
--------------------------------------------------
Sample 2:
Text: storage, servers bruise hp earnings update earnings per share rise compared with a year ago, but company misses analysts'expectations by a long shot.
Prediction: 3 | Actual: 3
--------------------------------------------------
Sample 3:
Text: card fraud unit nets 36, 000 cards in its first two years, the uk's dedicated card fraud unit, has recovered 36, 000 stolen cards and 171 arrests - and estimates it saved 65m.
Prediction: 3 | Actual: 3
--------------------------------------------------
Sample 4:
Text: spam suspension hits sohu. com shares ( ft. com ) ft. com - shares in sohu. com, a leading us - listed chinese 

모델을 10 epoch 동안 학습한 결과, **Train Accuracy: 99.7%**, **Test Accuracy: 93.4%**로 높은 성능을 기록했습니다. 이를 통해 모델이 뉴스 기사 분류 작업에서 매우 좋은 성능을 보이는 것을 확인할 수 있습니다. 다만 Train과 Test 정확도 사이에 약 6.3%p의 차이가 있으므로, 학습 데이터에 어느 정도 과적합되었을 가능성이 있습니다.