In [1]:
!pip install tqdm boto3 requests regex sentencepiece sacremoses datasets



In [2]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import OpenAIGPTTokenizer, OpenAIGPTForSequenceClassification
from torch import nn
from transformers import OpenAIGPTTokenizer, OpenAIGPTModel

tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
tokenizer.pad_token = tokenizer.unk_token


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


In [4]:
# 데이터셋 로드 및 전처리
ds = load_dataset('fancyzhx/ag_news')


In [5]:
len_classes = len(ds['train'].features['label'].names)

class TextClassifier(nn.Module):
  def __init__(self, n_class):
    super(TextClassifier, self).__init__()

    self.encoder = torch.hub.load('huggingface/pytorch-transformers', 'model', 'openai-gpt')
    self.classifier = nn.Linear(768, n_class)

  def forward(self, input_ids, attention_mask):
    outputs = self.encoder(input_ids, attention_mask)['last_hidden_state']
    x = outputs.mean(dim=1)
    x = self.classifier(x)
    return x

model = TextClassifier(len_classes)
model = model.to('cuda')


Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main


In [6]:
def collate_fn(batch):
    max_len = 400
    texts, labels = [], []
    for row in batch:
        labels.append(row['label'])
        texts.append(row['text'])

    # 패딩을 위해 tokenizer에 pad_token을 설정 (unk_token 사용 중)
    encodings = tokenizer(texts, padding=True, truncation=True, max_length=max_len, return_tensors="pt")

    return encodings['input_ids'], encodings['attention_mask'], torch.tensor(labels)

train_loader = DataLoader(ds['train'], batch_size=16, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(ds['test'], batch_size=16, shuffle=False, collate_fn=collate_fn)


In [7]:
 for param in model.encoder.parameters():
      param.requires_grad = False

In [None]:
from torch.optim import Adam
import numpy as np

# 하이퍼파라미터 설정
optimizer = Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

n_epochs = 10

for epoch in range(n_epochs):
    total_loss = 0.
    model.train()

    for data in train_loader:
        inputs, attention_mask, labels = data
        inputs, attention_mask, labels = inputs.to('cuda'), attention_mask.to('cuda'), labels.to('cuda')

        optimizer.zero_grad()
        logits = model(input_ids=inputs, attention_mask=attention_mask)

        # 손실 계산 및 역전파
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{n_epochs}, Loss: {total_loss:.4f}")


Epoch 1/10, Loss: 2961.4942
Epoch 2/10, Loss: 2699.3613
Epoch 3/10, Loss: 2643.2096
Epoch 4/10, Loss: 2611.1112
Epoch 5/10, Loss: 2599.9903


In [None]:
def accuracy(model, dataloader):
    cnt = 0
    acc = 0

    for data in dataloader:
        inputs, labels = data
        inputs, labels = inputs.to('cuda'), labels.to('cuda')

        preds = model(inputs)
        preds = torch.argmax(preds, dim=-1)  # 다중 클래스 분류를 위한 argmax 사용

        cnt += labels.shape[0]
        acc += (labels == preds).sum().item()

    return acc / cnt if cnt > 0 else 0  # cnt가 0인 경우 방지


with torch.no_grad():
    model.eval()
    train_acc = accuracy(model, train_loader)
    test_acc = accuracy(model, test_loader)
    print(f"=========> Train acc: {train_acc:.3f} | Test acc: {test_acc:.3f}")