# [3주차 심화과제] GPT로 뉴스 기사 분류 모델 학습하기


In [1]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader

# Load the "openai-gpt" tokenizer through torch.hub.
tokenizer = torch.hub.load(
    "huggingface/transformers",  # the repo moved from huggingface/pytorch-transformers to huggingface/transformers
    "tokenizer",  # entry point: the 'tokenizer' function in 'hubconf.py' at the root of github.com/huggingface/transformers
    "openai-gpt",  # forwarded as an argument to the tokenizer function
    trust_repo=True,  # mark huggingface/transformers as a trusted repo -> it is added to the local trusted list
    clean_up_tokenization_spaces=True,  # True is the current default but is slated to become False; relates to spaces in the middle of sentences > https://github.com/huggingface/transformers/issues/31884
)

# Reuse the unknown token as the padding token; later cells pad batches with this tokenizer.
tokenizer.pad_token = tokenizer.unk_token

Using cache found in /Users/obov/.cache/torch/hub/huggingface_transformers_main


In [2]:
# Load the "fancyzhx/ag_news" dataset via datasets.load_dataset.
ds = load_dataset("fancyzhx/ag_news")

# Display the loaded splits (the recorded output shows 'train' and 'test').
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [3]:
# Tokenize the first training text padded to a fixed length and show its
# attention mask; in the recorded output the 1s come first and the 0s follow.
tokenizer(
    ds["train"][0]["text"],
    padding="max_length",
    truncation=False,  # truncation is switched off for this demo call
    max_length=400,
).attention_mask

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [4]:
# Collect the distinct label ids of the training split and count them.
unique_labels = {*ds["train"]["label"]}
count_of_label = len(unique_labels)
count_of_label

4

In [5]:
# Read the class names attached to the 'label' feature of the training split.
label_names = ds["train"].features["label"].names

# Display them (recorded output: World, Sports, Business, Sci/Tech).
label_names

['World', 'Sports', 'Business', 'Sci/Tech']

In [12]:
def collate_fn(batch):
    """Turn a list of dataset rows into padded tensors for the classifier.

    Args:
        batch: Iterable of rows, each a mapping with a "text" string and an
            integer "label".

    Returns:
        A tuple ``(input_ids, attention_mask, labels)`` of ``torch.LongTensor``.
        ``input_ids`` and ``attention_mask`` are padded to the longest
        sequence in the batch and never exceed ``max_len`` tokens; ``labels``
        holds one class id per row.
    """
    # Cap the sequence length. Without truncation a long article can exceed
    # the encoder's position table (512 rows) and break the forward pass.
    max_len = 400
    texts = [row["text"] for row in batch]
    labels = [row["label"] for row in batch]

    tokenized = tokenizer(
        texts,
        padding=True,  # pad only up to the longest sequence in this batch
        truncation=True,
        max_length=max_len,
    )

    input_ids = torch.LongTensor(tokenized.input_ids)
    attention_mask = torch.LongTensor(tokenized.attention_mask)
    labels = torch.LongTensor(labels)

    return input_ids, attention_mask, labels


# Both loaders batch 64 rows through collate_fn; only the training split is shuffled.
train_loader = DataLoader(ds["train"], collate_fn=collate_fn, shuffle=True, batch_size=64)
test_loader = DataLoader(ds["test"], collate_fn=collate_fn, shuffle=False, batch_size=64)

In [13]:
# Load the "openai-gpt" model through torch.hub and display its module tree.
model = torch.hub.load(
    "huggingface/transformers",  # the repo moved from huggingface/pytorch-transformers to huggingface/transformers
    "model",  # entry point: the 'model' function
    "openai-gpt",
)
model

Using cache found in /Users/obov/.cache/torch/hub/huggingface_transformers_main


OpenAIGPTModel(
  (tokens_embed): Embedding(40478, 768)
  (positions_embed): Embedding(512, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x Block(
      (attn): Attention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
  )
)

In [14]:
from torch import nn


class TextClassifier(nn.Module):
    """OpenAI GPT encoder followed by a linear classification head.

    Args:
        num_classes: Number of output classes. Defaults to 4.
    """

    def __init__(self, num_classes=4):
        super().__init__()

        self.encoder = torch.hub.load(
            "huggingface/transformers",  # the repo moved from huggingface/pytorch-transformers to huggingface/transformers
            "model",  # entry point: the 'model' function
            "openai-gpt",
        )

        # 768 is the width of the encoder's hidden states.
        self.classifier = nn.Linear(768, num_classes)

    def forward(self, x, attention_mask):
        """Classify each sequence from the hidden state of its last real token.

        GPT attends left-to-right, so the hidden state at position 0 has only
        seen the first token. The last non-padded position is the one that has
        seen the whole text, so that is what feeds the classifier.

        Args:
            x: Token ids, one row per sequence.
            attention_mask: Same layout as ``x``; 1 for real tokens and 0 for
                padding. Padding is assumed to sit on the right.

        Returns:
            Per-class scores, one row per sequence.
        """
        hidden = self.encoder(x, attention_mask=attention_mask)["last_hidden_state"]
        # Index of the last real token per row; clamp guards an all-padding row.
        last_idx = (attention_mask.long().sum(dim=1) - 1).clamp(min=0)
        rows = torch.arange(hidden.size(0), device=hidden.device)
        return self.classifier(hidden[rows, last_idx])


# Build the classifier and bind it to `model`.
model = TextClassifier()

Using cache found in /Users/obov/.cache/torch/hub/huggingface_transformers_main


In [15]:
# Freeze every encoder parameter tensor and keep a running count of them.
count_of_params = 0
for count_of_params, param in enumerate(model.encoder.parameters(), start=1):
    param.requires_grad_(False)
count_of_params

146

In [16]:
from torch.optim import Adam
import numpy as np
import matplotlib.pyplot as plt


# Training setup: move the model to the Apple "mps" device and optimise with
# Adam against a multi-class cross-entropy loss.
lr = 0.001
model = model.to("mps")
loss_fn = nn.CrossEntropyLoss()

optimizer = Adam(model.parameters(), lr=lr)
n_epochs = 10

# Dump the first batch once so shapes and values can be inspected.
print_first_input = True

for epoch in range(n_epochs):
    total_loss = 0.0
    model.train()
    for data in train_loader:
        model.zero_grad()
        inputs, attention_mask, labels = data
        # Labels stay int64: CrossEntropyLoss expects integer class indices
        # when the target has one entry per sample.
        inputs, attention_mask, labels = (
            inputs.to("mps"),
            attention_mask.to("mps"),
            labels.to("mps"),
        )

        preds = model(
            inputs,
            attention_mask=attention_mask,
        )
        if print_first_input:
            print("preds", preds)
            print("labels", labels)
            print("Inputs:", inputs)
            print("attention_mask:", attention_mask)
            print("shape of preds", preds.shape)
            print("shape of labels", labels.shape)
            print_first_input = False
        loss = loss_fn(preds, labels)
        loss.backward()
        optimizer.step()

        # Sum of per-batch losses over the epoch (not an average).
        total_loss += loss.item()

    print(f"Epoch {epoch:3d} | Train Loss: {total_loss}")

preds tensor([[-0.3225, -0.4071, -0.2274,  0.1971],
        [-0.1983, -0.1241, -0.1521, -0.2517],
        [ 0.0647, -0.1943,  0.3770,  0.2341],
        [ 0.0201, -0.1693,  0.3812,  0.4029],
        [ 0.0343, -0.5757,  0.4767,  0.1297],
        [-0.1553, -0.3875,  0.0891,  0.1721],
        [-0.3104, -0.2127, -0.1722, -0.1212],
        [-0.3331, -0.2350, -0.3308, -0.2784],
        [ 0.0927, -0.1623,  0.1655,  0.3982],
        [-0.2221, -0.2272, -0.1613, -0.1490],
        [ 0.2842, -0.5479,  0.5183,  0.1601],
        [-0.2120, -0.3115, -0.3846, -0.0692],
        [-0.2448,  0.0625, -0.1625, -0.3189],
        [-0.0490, -0.4538,  0.4566,  0.3958],
        [-0.0589, -0.3221,  0.1623,  0.2479],
        [-0.1304, -0.4387, -0.0235, -0.1868],
        [-0.3861, -0.2374, -0.5719, -0.2121],
        [-0.3271, -0.1654, -0.1998, -0.1558],
        [-0.4325, -0.1782, -0.2832, -0.2776],
        [-0.4521, -0.3148, -0.2991, -0.0677],
        [-0.4045, -0.1081, -0.2345, -0.4536],
        [-0.1453, -0.3648,  

In [17]:
def accuracy(model, dataloader):
    """Return the fraction of samples in *dataloader* that *model* gets right.

    Args:
        model: Module called as ``model(inputs, attention_mask)`` that returns
            per-class scores with the class dimension last.
        dataloader: Iterable of ``(inputs, attention_mask, labels)`` tensor
            batches.

    Returns:
        Correct predictions divided by total samples, or 0.0 when the
        dataloader yields no samples.
    """
    # Follow the model's own device instead of hard-coding one, so evaluation
    # runs wherever the model currently lives.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    n_seen = 0
    n_correct = 0

    # Gradients are never needed for scoring; this also keeps memory flat when
    # the caller forgets to wrap the call itself.
    with torch.no_grad():
        for inputs, attention_mask, labels in dataloader:
            if device is not None:
                inputs = inputs.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)

            scores = model(inputs, attention_mask)
            preds = torch.argmax(scores, dim=-1)

            n_seen += labels.shape[0]
            n_correct += (labels == preds).sum().item()

    return n_correct / n_seen if n_seen else 0.0


# Score both splits in inference mode: eval() on the model, gradients disabled.
model.eval()
with torch.no_grad():
    train_acc = accuracy(model, train_loader)
    test_acc = accuracy(model, test_loader)
print(f"=========> Train acc: {train_acc:.3f} | Test acc: {test_acc:.3f}")



In [None]:
# Display the test split.
ds["test"]

In [16]:
import random


def pick_random_number(num_rows=7600):
    """Return a uniformly random row index in ``[0, num_rows)``.

    Args:
        num_rows: Number of rows to choose among. Defaults to 7600, the size
            of the test split, so a bare call still yields an index in
            0..7599.

    Raises:
        ValueError: If ``num_rows`` is not positive.
    """
    return random.randrange(num_rows)

In [34]:
# Draw one random index via pick_random_number().
rnd_num = pick_random_number()