<a href="https://colab.research.google.com/github/tolluset/hh-ai-1/blob/w3-ad/week3/3-advance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [3주차 심화과제] GPT로 뉴스 기사 분류 모델 학습하기



In [None]:
!pip install tqdm boto3 requests regex sentencepiece sacremoses datasets

## ✅ Tokenizer 및 `TextClassifier`의 encoder를 GPT로 변경
- ✅ `distilbert-base-uncased`로 설정한 tokenizer와 encoder를 `openai-gpt`로 변경해줍니다.
- ✅ GPT의 tokenizer는 padding token이 없어, 다음 코드로 padding token을 추가해줍니다:

`tokenizer.pad_token = tokenizer.unk_token`

In [None]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader

tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'openai-gpt')

tokenizer.pad_token = tokenizer.unk_token

In [10]:
ds = load_dataset("fancyzhx/ag_news")

def collate_fn(batch):
  max_len = 400
  texts, labels = [], []
  for row in batch:
    labels.append(row['label'])
    texts.append(row['text'])

  encoding = tokenizer(texts, padding=True, return_tensors='pt')

  texts, attention_mask = encoding['input_ids'], encoding['attention_mask']
  labels = torch.tensor(labels)

  return texts, attention_mask, labels


train_loader = DataLoader(
    ds['train'], batch_size=64, shuffle=True, collate_fn=collate_fn
)
test_loader = DataLoader(
    ds['test'], batch_size=64, shuffle=False, collate_fn=collate_fn
)

In [11]:
model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'openai-gpt')
model

Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main


OpenAIGPTModel(
  (tokens_embed): Embedding(40478, 768)
  (positions_embed): Embedding(512, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
  )
)

# ✅ Classify 할 때 사용하는 token representation 변경
- ✅ 현재 `TextClassifier`는 첫 번째 token의 representation으로 label을 예측하고 있습니다.
- ✅ 하지만 GPT의 pre-train 방식을 생각하면 첫 번째 token으로 label을 예측하면 안됩니다.
- ✅ GPT의 pre-train 방식과 이전 RNN 실습에서 아이디어를 얻어 classify를 할 때 사용하는 token representation을 적절하게 변경하시면 됩니다.

In [12]:
from torch import nn


class TextClassifier(nn.Module):
  def __init__(self):
    super().__init__()

    self.encoder = torch.hub.load('huggingface/pytorch-transformers', 'model', 'openai-gpt')
    self.classifier = nn.Linear(768, 4)

  def forward(self, input_ids, attention_mask):
    x = self.encoder(input_ids, attention_mask)['last_hidden_state']
    x = x.mean(dim=1)
    x = self.classifier(x)

    return x


model = TextClassifier()

Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main


In [13]:
for param in model.encoder.parameters():
  param.requires_grad = False

# ✅ 학습 결과 report
- ✅  DistilBERT 실습과 같이 매 epoch 마다의 train loss를 출력하고 최종 모델의 test accuracy를 report합니다.

In [14]:
from torch.optim import Adam
import numpy as np
import matplotlib.pyplot as plt
import itertools


lr = 0.001
model = model.to('cuda')
loss_fn = nn.CrossEntropyLoss()

optimizer = Adam(model.parameters(), lr=lr)
n_epochs = 10

for epoch in range(n_epochs):
  total_loss = 0.
  model.train()
  for data in itertools.islice(train_loader, 100):
    model.zero_grad()
    inputs, masks, labels = data
    inputs, masks, labels = inputs.to('cuda'), masks.to('cuda'), labels.to('cuda')

    preds = model(inputs, masks)
    loss = loss_fn(preds, labels)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

  print(f"Epoch {epoch:3d} | Train Loss: {total_loss}")

Epoch   0 | Train Loss: 71.27290883660316
Epoch   1 | Train Loss: 47.523161709308624
Epoch   2 | Train Loss: 45.09037318825722
Epoch   3 | Train Loss: 43.43977831304073
Epoch   4 | Train Loss: 43.18365207314491
Epoch   5 | Train Loss: 40.86907768249512
Epoch   6 | Train Loss: 40.35617744922638
Epoch   7 | Train Loss: 38.736176893115044
Epoch   8 | Train Loss: 39.7614456564188
Epoch   9 | Train Loss: 38.679342806339264


In [15]:
def accuracy(model, dataloader):
  cnt = 0
  acc = 0

  for data in itertools.islice(dataloader, 100):
    inputs, attention_mask, labels = data
    inputs, attention_mask, labels = inputs.to('cuda'), attention_mask.to('cuda'), labels.to('cuda')

    preds = model(inputs, attention_mask)
    preds = torch.argmax(preds, dim=-1)
    # preds = (preds > 0).long()[..., 0]

    cnt += labels.shape[0]
    acc += (labels == preds).sum().item()

  return acc / cnt


with torch.no_grad():
  model.eval()
  train_acc = accuracy(model, train_loader)
  test_acc = accuracy(model, test_loader)
  print(f"=========> Train acc: {train_acc:.3f} | Test acc: {test_acc:.3f}")

