In [None]:
!pip install tqdm boto3 requests regex sentencepiece sacremoses datasets

Collecting boto3
  Downloading boto3-1.35.32-py3-none-any.whl.metadata (6.6 kB)
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting botocore<1.36.0,>=1.35.32 (from boto3)
  Downloading botocore-1.35.32-py3-none-any.whl.metadata (5.6 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3)
  Downloading s3transfer-0.10.2-py3-none-any.whl.metadata (1.7 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multi

In [None]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader

In [None]:
from transformers import DistilBertTokenizer



# Tokenizer loaded from the "distilbert-base-uncased" checkpoint name; collate_fn below
# reads this module-level `tokenizer` when it builds each batch.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# Load the dataset (the loaders below use its 'train' and 'test' splits)
ds = load_dataset("fancyzhx/ag_news")


def collate_fn(batch, max_len=400):
    """Turn a list of dataset rows into a padded id tensor and a label tensor.

    Args:
        batch: Rows of the dataset; each row is indexed with ``'text'`` and
            ``'label'``.
        max_len: Upper bound on the token length handed to the tokenizer as
            ``max_length`` (truncation is enabled). Defaults to 400, the value
            that used to be hard-coded inside the function.

    Returns:
        A 2-tuple ``(input_ids, labels)``: ``input_ids`` is the
        ``'input_ids'`` entry of the tokenizer output (requested with
        ``return_tensors="pt"`` and ``padding=True``), ``labels`` is
        ``torch.tensor`` of the collected labels.

    Note:
        Only ``input_ids`` is returned; every other entry of the tokenizer
        output is dropped here, so downstream code receives no padding mask
        from this function.
    """
    texts, labels = [], []

    # Single pass over the batch so that any iterable (not only a list) works.
    for row in batch:
        labels.append(row['label'])
        texts.append(row['text'])

    # padding=True pads to the longest sequence in this batch.
    encodings = tokenizer(texts, padding=True, truncation=True, max_length=max_len, return_tensors="pt")

    return encodings['input_ids'], torch.tensor(labels)

# Both loaders batch 32 rows at a time and tokenize through collate_fn.
# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(ds['train'], batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(ds['test'], batch_size=32, shuffle=False, collate_fn=collate_fn)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

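이제 pre-trained DistilBERT를 불러옵니다. 먼저 Hugging Face `transformers`의 `from_pretrained`로 분류용 DistilBERT를 불러와 보고, 이후 셀에서는 PyTorch hub(`torch.hub.load`)에서 제공하는 DistilBERT encoder를 불러와 분류기를 직접 구성해 봅시다.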

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
# NOTE(review): `model` is rebound to a TextClassifier instance in a later cell and
# nothing in between reads this object, so this load looks redundant — confirm
# whether the cell is still needed.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)
# Use the GPU when one is present; fall back to CPU instead of raising on a
# CUDA-less runtime.
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch import nn
import torch

class TextClassifier(nn.Module):
    """DistilBERT encoder followed by one linear classification layer.

    The encoder is fetched with ``torch.hub.load`` from the
    ``huggingface/pytorch-transformers`` hub repository
    (``distilbert-base-uncased``). The linear head maps the 768-wide hidden
    state of the first token to ``num_classes`` logits.

    Args:
        num_classes: Number of output classes (4 for the AG News dataset).
    """

    def __init__(self, num_classes=4):  # the AG News dataset has 4 classes
        super().__init__()
        self.encoder = torch.hub.load('huggingface/pytorch-transformers', 'model', 'distilbert-base-uncased')
        self.classifier = nn.Linear(768, num_classes)  # output width follows num_classes

    def forward(self, input_ids, attention_mask=None):
        """Return class logits for a batch of token ids.

        Args:
            input_ids: Token-id tensor; the code indexes the encoder output as
                ``[:, 0]``, so a (batch, sequence) layout is expected.
            attention_mask: Optional mask forwarded to the encoder. When it is
                omitted, a mask is built from ``input_ids`` by marking every
                position that differs from the encoder's ``pad_token_id``, so
                padded positions are not attended to.

        Returns:
            Tensor of logits produced by the linear head from the first
            token's last hidden state.
        """
        if attention_mask is None:
            # Callers in this notebook pass only input_ids from padded batches;
            # without a mask the encoder would attend to the padding tokens.
            encoder_config = getattr(self.encoder, 'config', None)
            pad_token_id = getattr(encoder_config, 'pad_token_id', None)
            if pad_token_id is not None:
                attention_mask = (input_ids != pad_token_id).long()

        # Run the encoder and take its last hidden state.
        hidden = self.encoder(input_ids, attention_mask=attention_mask)['last_hidden_state']
        # Classify from the output at the first ([CLS]) position.
        return self.classifier(hidden[:, 0])

# Initialize the model (num_classes is set to 4 to match AG News)
model = TextClassifier(num_classes=4)  # the AG News dataset has 4 classes
# NOTE(review): `criterion` is not referenced again in the visible cells; the
# training cell creates its own `loss_fn` — confirm whether this is still needed.
criterion = nn.CrossEntropyLoss()


Downloading: "https://github.com/huggingface/pytorch-transformers/zipball/main" to /root/.cache/torch/hub/main.zip


In [None]:
# Freeze every encoder weight; the classifier layer is left untouched and
# therefore remains trainable.
for param in model.encoder.parameters():
    param.requires_grad_(False)

In [None]:
from torch.optim import Adam
import numpy as np
import matplotlib.pyplot as plt

# Hyperparameters
lr = 0.001
# Pick the GPU when available and fall back to CPU otherwise, instead of
# failing on a hard-coded 'cuda' on CPU-only runtimes.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=lr)
n_epochs = 10

for epoch in range(n_epochs):
    total_loss = 0.
    model.train()
    for inputs, labels in train_loader:
        # Reset gradients left over from the previous step.
        model.zero_grad()
        inputs, labels = inputs.to(device), labels.to(device)

        # Model output; expected to be (batch_size, num_classes) logits.
        preds = model(inputs)

        # Compute the loss, backpropagate and update the weights.
        loss = loss_fn(preds, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # total_loss is the SUM of the per-batch losses over the epoch (not a mean).
    print(f"Epoch {epoch:3d} | Train Loss: {total_loss:.4f}")





We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch   0 | Train Loss: 1564.4881
Epoch   1 | Train Loss: 1305.3833
Epoch   2 | Train Loss: 1276.3482
Epoch   3 | Train Loss: 1253.8022
Epoch   4 | Train Loss: 1242.6226
Epoch   5 | Train Loss: 1236.6764
Epoch   6 | Train Loss: 1226.4422
Epoch   7 | Train Loss: 1221.2127
Epoch   8 | Train Loss: 1218.9671
Epoch   9 | Train Loss: 1219.6964


In [None]:
def accuracy(model, dataloader):
    """Return the fraction of samples in ``dataloader`` that ``model`` labels correctly.

    Args:
        model: Callable module mapping a batch of inputs to per-class scores;
            the predicted class is the ``argmax`` over the last dimension.
        dataloader: Iterable yielding ``(inputs, labels)`` tensor pairs.

    Returns:
        ``correct / total`` as a float, or ``0`` when the dataloader yields no
        samples.

    Batches are moved to the device of the model's first parameter rather
    than to a hard-coded ``'cuda'``, so the helper also works for a model
    living on the CPU. Gradient tracking is disabled inside the function, so
    callers do not need to wrap the call in ``torch.no_grad()`` themselves.
    """
    first_param = next(model.parameters(), None)
    # A parameter-free model gives no device hint; leave batches where they are.
    device = first_param.device if first_param is not None else None

    total = 0
    correct = 0

    with torch.no_grad():
        for inputs, labels in dataloader:
            if device is not None:
                inputs, labels = inputs.to(device), labels.to(device)

            # argmax over the class dimension gives the multi-class prediction.
            preds = torch.argmax(model(inputs), dim=-1)

            total += labels.shape[0]
            correct += (labels == preds).sum().item()

    return correct / total if total > 0 else 0  # guard against an empty dataloader


# Put the model in evaluation mode, then score both splits without tracking gradients.
model.eval()
with torch.no_grad():
    train_acc = accuracy(model, train_loader)
    test_acc = accuracy(model, test_loader)
print(f"=========> Train acc: {train_acc:.3f} | Test acc: {test_acc:.3f}")


