# [3주차 심화과제] GPT로 뉴스 기사 분류 모델 학습하기


In [17]:
import torch
from datasets import load_dataset, DatasetDict
from torch.utils.data import DataLoader

# Load the pretrained "openai-gpt" tokenizer through torch.hub.
tokenizer = torch.hub.load(
    "huggingface/transformers",  # the repo was renamed from huggingface/pytorch-transformers to huggingface/transformers
    "tokenizer",  # entry point: the 'tokenizer' function from the 'hubconf.py' at the root of the github.com/huggingface/transformers repo
    "openai-gpt",  # passed through as the argument of the tokenizer function
    trust_repo=True,  # mark huggingface/transformers as a trusted repo -> it is added to the local trusted_list
    clean_up_tokenization_spaces=True,  # True is the current default but False is planned to become the default: relates to an issue with spaces in the middle of sentences > https://github.com/huggingface/transformers/issues/31884
)

# Reuse the unknown token as the padding token
# (presumably because this tokenizer defines no pad token of its own -- confirm).
tokenizer.pad_token = tokenizer.unk_token

Using cache found in /Users/obov/.cache/torch/hub/huggingface_transformers_main


In [3]:
# Load the AG News dataset identified by "fancyzhx/ag_news".
ds = load_dataset("fancyzhx/ag_news")

# Notebook display of the loaded splits.
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [4]:
# Inspect the attention mask of the first training text when it is padded to
# max_length=400. Truncation is disabled here, so a text longer than 400 tokens
# would presumably not be shortened -- confirm against the tokenizer docs.
tokenizer(
    ds["train"][0]["text"],
    padding="max_length",
    truncation=False,
    max_length=400,
).attention_mask

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [5]:
# Collect the distinct label values of the training split and count them.
unique_labels = set(ds["train"]["label"])
count_of_label = len(unique_labels)
# Notebook display of the number of classes.
count_of_label

4

In [6]:
# Human-readable class names stored in the "label" feature of the training split.
label_names = ds["train"].features["label"].names

# Notebook display of the class names.
label_names

['World', 'Sports', 'Business', 'Sci/Tech']

In [7]:
def collate_fn(batch, max_len=400):
    """Tokenize a batch of dataset rows into fixed-length tensors.

    Args:
        batch: Sequence of rows, each a mapping with a "text" string and an
            integer "label".
        max_len: Number of tokens every sequence is padded or truncated to.

    Returns:
        A tuple ``(input_ids, attention_mask, labels)`` of ``torch.LongTensor``.
        The first two hold one row of ``max_len`` entries per sample; ``labels``
        holds one class index per sample.
    """
    texts = [row["text"] for row in batch]
    labels = [row["label"] for row in batch]

    # truncation=True is required for a rectangular batch: with truncation
    # disabled, a text longer than max_len keeps its full length, the rows
    # become ragged, and building the LongTensor below fails.
    tokenized = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_len,
    )

    input_ids = torch.LongTensor(tokenized.input_ids)
    attention_mask = torch.LongTensor(tokenized.attention_mask)
    labels = torch.LongTensor(labels)

    return input_ids, attention_mask, labels


# Batch the full training split: 64 samples per batch, reshuffled by the loader,
# tokenized on the fly by collate_fn.
train_loader = DataLoader(
    ds["train"], batch_size=64, shuffle=True, collate_fn=collate_fn
)
# Batch the full test split in its stored order (no shuffling).
test_loader = DataLoader(
    ds["test"], batch_size=64, shuffle=False, collate_fn=collate_fn
)

In [18]:
# Shrink both splits to their first 1200 samples
small_ds = DatasetDict(
    {
        "train": ds["train"].select(range(1200)),
        "test": ds["test"].select(range(1200)),
    }
)
print(small_ds)
# Notebook display of the original dataset, shown next to the reduced one.
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1200
    })
})


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [19]:
# Loaders over the 1200-sample subsets, with the same batch size and collate
# function as the full-size loaders.
small_train_loader = DataLoader(
    small_ds["train"], batch_size=64, shuffle=True, collate_fn=collate_fn
)
small_test_loader = DataLoader(
    small_ds["test"], batch_size=64, shuffle=False, collate_fn=collate_fn
)

In [20]:
# Load the pretrained "openai-gpt" model through torch.hub to inspect its layers.
model = torch.hub.load(
    "huggingface/transformers",  # the repo was renamed from huggingface/pytorch-transformers to huggingface/transformers
    "model",  # entry point: the 'model' function
    "openai-gpt",
)
# Notebook display of the module structure.
model

Using cache found in /Users/obov/.cache/torch/hub/huggingface_transformers_main


OpenAIGPTModel(
  (tokens_embed): Embedding(40478, 768)
  (positions_embed): Embedding(512, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x Block(
      (attn): Attention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
  )
)

In [21]:
from torch import nn


class TextClassifier(nn.Module):
    """Text classifier: pretrained "openai-gpt" encoder plus a linear head.

    Args:
        num_labels: Number of output classes (size of the logits' last
            dimension). Defaults to 4.
    """

    def __init__(self, num_labels=4):
        super().__init__()

        self.encoder = torch.hub.load(
            "huggingface/transformers",  # the repo was renamed from huggingface/pytorch-transformers to huggingface/transformers
            "model",  # entry point: the 'model' function
            "openai-gpt",
        )

        # 768 is the embedding width of the openai-gpt encoder.
        self.classifier = nn.Linear(768, num_labels)

    def forward(self, x, attention_mask):
        """Return class logits for a batch of token ids.

        Args:
            x: Token ids, one row per sample.
            attention_mask: Same layout as ``x``; 1 marks real tokens and 0
                marks padding. Assumes padding sits on the right -- confirm
                the tokenizer's padding side.

        Returns:
            Logits with one row per sample and one column per class.
        """
        hidden = self.encoder(x, attention_mask=attention_mask)["last_hidden_state"]

        # GPT attends causally, so the hidden state at position 0 has only seen
        # the first token. The last non-padded position is the one that has seen
        # the whole text, so that is the state fed to the classifier.
        last_idx = (attention_mask.long().sum(dim=1) - 1).clamp(min=0)
        batch_idx = torch.arange(hidden.size(0), device=hidden.device)
        pooled = hidden[batch_idx, last_idx]

        return self.classifier(pooled)


# Build the classifier and bind it to `model`.
model = TextClassifier()

Using cache found in /Users/obov/.cache/torch/hub/huggingface_transformers_main


In [22]:
# Freeze every encoder parameter tensor so gradients are only computed for the
# rest of the model; count_of_params ends up as the number of tensors frozen.
count_of_params = 0
for count_of_params, param in enumerate(model.encoder.parameters(), start=1):
    param.requires_grad = False
# Notebook display of how many parameter tensors were frozen.
count_of_params

146

In [24]:
from torch.optim import Adam
import numpy as np
import matplotlib.pyplot as plt


lr = 0.001
# Use Apple's MPS backend when present; fall back to the CPU so the cell still
# runs on machines without it.
device = "mps" if torch.backends.mps.is_available() else "cpu"
model = model.to(device)
loss_fn = nn.CrossEntropyLoss()

optimizer = Adam(model.parameters(), lr=lr)
n_epochs = 10

# Dump the tensors of the very first batch once, as a sanity check.
print_first_input = True

for epoch in range(n_epochs):
    total_loss = 0.0
    model.train()
    for data in small_train_loader:
        model.zero_grad()
        inputs, attention_mask, labels = data
        # Labels stay integer class indices: CrossEntropyLoss treats a float
        # target as per-class probabilities, which would have to match the
        # shape of the logits.
        inputs, attention_mask, labels = (
            inputs.to(device),
            attention_mask.to(device),
            labels.to(device),
        )

        preds = model(
            inputs,
            attention_mask=attention_mask,
        )
        if print_first_input:
            print("preds", preds)
            print("labels", labels)
            print("Inputs:", inputs)
            print("attention_mask:", attention_mask)
            print("shape of preds", preds.shape)
            print("shape of labels", labels.shape)
            print_first_input = False
        loss = loss_fn(preds, labels)
        loss.backward()
        optimizer.step()

        # Sum of the per-batch losses over the epoch (not an average).
        total_loss += loss.item()

    print(f"Epoch {epoch:3d} | Train Loss: {total_loss}")

preds tensor([[ 0.2036,  0.2971, -0.1594, -0.3522],
        [ 0.3905,  0.2316,  0.0215, -0.4728],
        [-0.1445,  0.2422, -0.3864,  0.3181],
        [ 0.2350,  0.3727, -0.3274, -0.2913],
        [-0.3744,  0.3153, -0.0712,  0.2391],
        [ 0.2991,  0.2121,  0.0256, -0.3510],
        [-0.1296,  0.1954, -0.0808, -0.0113],
        [ 0.1382,  0.1816,  0.0725, -0.0424],
        [ 0.1876,  0.3315, -0.1588, -0.4182],
        [ 0.0225,  0.1703,  0.2058, -0.1790],
        [-0.1828,  0.0166,  0.6989,  0.6424],
        [ 0.2088,  0.1230,  0.1605, -0.0570],
        [ 0.3022, -0.0395, -0.0153, -0.0515],
        [ 0.2959,  0.1639, -0.0540, -0.2521],
        [-0.0214,  0.4468, -0.1476, -0.2114],
        [-0.0543, -0.0319,  0.3513,  0.3223],
        [ 0.2580,  0.1050, -0.0041, -0.3195],
        [ 0.1381,  0.2139, -0.0832, -0.0648],
        [ 0.3217,  0.3227, -0.3821, -0.3009],
        [ 0.3551,  0.0060,  0.0700,  0.1509],
        [-0.0806, -0.1372,  0.0981,  0.2891],
        [-0.1211,  0.3356, -

In [25]:
def accuracy(model, dataloader):
    """Return the fraction of samples whose arg-max prediction equals the label.

    Args:
        model: Callable as ``model(inputs, attention_mask)`` returning logits
            with the class dimension last; must expose ``parameters()``.
        dataloader: Iterable of ``(inputs, attention_mask, labels)`` tensor
            batches.

    Returns:
        Accuracy as a float in ``[0, 1]``; ``0.0`` when the loader yields no
        samples.
    """
    # Run on whatever device the model lives on instead of a fixed backend.
    first_param = next(iter(model.parameters()), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    total = 0
    correct = 0

    # Evaluation needs no gradients, regardless of how the caller invokes this.
    with torch.no_grad():
        for inputs, attention_mask, labels in dataloader:
            inputs = inputs.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            preds = torch.argmax(model(inputs, attention_mask), dim=-1)

            total += labels.shape[0]
            correct += (labels == preds).sum().item()

    # Guard against an empty loader rather than dividing by zero.
    return correct / total if total else 0.0


# Switch to inference mode, then score both reduced splits without tracking gradients.
model.eval()
with torch.no_grad():
    train_acc = accuracy(model, small_train_loader)
    test_acc = accuracy(model, small_test_loader)
print(f"=========> Train acc: {train_acc:.3f} | Test acc: {test_acc:.3f}")



In [None]:
# Notebook display of the test split.
ds["test"]

In [16]:
import random


def pick_random_number(n_rows=7600):
    """Return a uniformly random row index between 0 and ``n_rows - 1``.

    Args:
        n_rows: Number of rows to draw from. Defaults to 7600, the row count of
            the AG News test split, so the default range is 0..7599.

    Returns:
        An integer index in ``[0, n_rows - 1]``.

    Raises:
        ValueError: If ``n_rows`` is not positive.
    """
    return random.randrange(n_rows)

In [34]:
# Draw a random index via pick_random_number().
rnd_num = pick_random_number()