# [3주차 심화과제] GPT로 뉴스 기사 분류 모델 학습하기


In [1]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader

tokenizer = torch.hub.load(
    "huggingface/pytorch-transformers", "tokenizer", "distilbert-base-uncased"
)

Using cache found in /Users/obov/.cache/torch/hub/huggingface_pytorch-transformers_main


In [2]:
ds = load_dataset("fancyzhx/ag_news")

ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [3]:
tokenizer(
    ds["train"][0]["text"],
    padding="max_length",
    truncation=False,
    max_length=400,
).attention_mask

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [4]:
unique_labels = set(ds["train"]["label"])
count_of_label = len(unique_labels)
count_of_label

4

In [5]:
label_names = ds["train"].features["label"].names

label_names

['World', 'Sports', 'Business', 'Sci/Tech']

In [8]:
def collate_fn(batch):
    max_len = 400
    texts, labels = [], []
    for row in batch:
        labels.append(row["label"])
        texts.append(row["text"])

    tokenized = tokenizer(
        texts,
        padding="max_length",
        truncation=False,
        max_length=max_len,
    )

    texts = torch.LongTensor(tokenized.input_ids)
    attention_mask = torch.LongTensor(tokenized.attention_mask)
    labels = torch.LongTensor(labels)

    return texts, attention_mask, labels


train_loader = DataLoader(
    ds["train"], batch_size=64, shuffle=True, collate_fn=collate_fn
)
test_loader = DataLoader(
    ds["test"], batch_size=64, shuffle=False, collate_fn=collate_fn
)

In [9]:
model = torch.hub.load(
    repo_or_dir="huggingface/pytorch-transformers",
    model="model",
    pretrained_model_name_or_path="distilbert-base-uncased",
)
model

Using cache found in /Users/obov/.cache/torch/hub/huggingface_pytorch-transformers_main


DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

In [10]:
from torch import nn


class TextClassifier(nn.Module):
    def __init__(self):
        super().__init__()

        self.encoder = torch.hub.load(
            repo_or_dir="huggingface/pytorch-transformers",
            model="model",
            pretrained_model_name_or_path="distilbert-base-uncased",
        )

        self.classifier = nn.Linear(768, 4)

    def forward(self, x, attention_mask):
        x = self.encoder(x, attention_mask)["last_hidden_state"]
        x = self.classifier(x[:, 0])
        return x


model = TextClassifier()

Using cache found in /Users/obov/.cache/torch/hub/huggingface_pytorch-transformers_main


In [11]:
count_of_params = 0
for param in model.encoder.parameters():
    count_of_params += 1
    param.requires_grad = False
count_of_params

100

In [12]:
from torch.optim import Adam
import numpy as np
import matplotlib.pyplot as plt


lr = 0.001
model = model.to("mps")
loss_fn = nn.CrossEntropyLoss()

optimizer = Adam(model.parameters(), lr=lr)
n_epochs = 10

print_first_input = True

for epoch in range(n_epochs):
    total_loss = 0.0
    model.train()
    for data in train_loader:
        model.zero_grad()
        inputs, attention_mask, labels = data
        inputs, attention_mask, labels = (
            inputs.to("mps"),
            attention_mask.to("mps"),
            labels.to("mps").float(),
        )

        preds = model(
            inputs,
            attention_mask=attention_mask,
        )
        if print_first_input:
            print("preds", preds)
            print("labels", labels)
            print("Inputs:", inputs)
            print("attention_mask:", attention_mask)
            print("shape of preds", preds.shape)
            print("shape of labels", labels.shape)
            print_first_input = False
        loss = loss_fn(preds, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch:3d} | Train Loss: {total_loss}")

preds tensor([[ 1.9902e-02,  2.5074e-01, -2.3254e-01, -8.3140e-02],
        [-2.1045e-02,  1.7326e-01, -2.1292e-01, -3.0414e-01],
        [-6.7079e-02,  1.2894e-01, -1.2479e-01, -3.1035e-01],
        [-6.8395e-02,  1.6730e-01, -2.6003e-01, -2.0549e-01],
        [ 1.5025e-01, -4.2746e-02, -1.4937e-01, -3.2154e-01],
        [-6.1295e-02,  2.4508e-01, -3.0930e-01, -2.8839e-01],
        [-4.7492e-02,  9.0440e-02, -1.5599e-01, -2.3637e-01],
        [-1.2243e-03, -2.0237e-02, -1.5001e-01, -2.9642e-01],
        [ 9.5011e-03,  1.0467e-01, -2.4524e-01, -2.6726e-01],
        [ 1.1898e-01,  2.3911e-02, -2.1732e-01, -1.6042e-01],
        [-5.7489e-02,  2.1898e-01, -2.2285e-01, -4.1191e-01],
        [ 1.1383e-01,  1.8877e-01, -4.2725e-02, -4.2314e-02],
        [-3.5718e-02,  2.1130e-01, -1.3871e-01, -2.7978e-01],
        [ 1.0896e-01,  2.2398e-02, -2.9729e-01, -3.7367e-01],
        [-5.4276e-02,  1.2385e-01, -1.7484e-01, -1.9980e-01],
        [-3.7088e-02,  1.1773e-01, -1.8343e-01, -2.1575e-01],
  

In [14]:
def accuracy(model, dataloader):
    cnt = 0
    acc = 0

    for data in dataloader:
        inputs, attention_mask, labels = data
        inputs, attention_mask, labels = (
            inputs.to("mps"),
            attention_mask.to("mps"),
            labels.to("mps"),
        )

        preds = model(inputs, attention_mask)
        preds = torch.argmax(preds, dim=-1)
        # preds = (preds > 0).long()[..., 0]

        cnt += labels.shape[0]
        acc += (labels == preds).sum().item()

    return acc / cnt


with torch.no_grad():
    model.eval()
    train_acc = accuracy(model, train_loader)
    test_acc = accuracy(model, test_loader)
    print(f"=========> Train acc: {train_acc:.3f} | Test acc: {test_acc:.3f}")



In [15]:
ds["test"]

Dataset({
    features: ['text', 'label'],
    num_rows: 7600
})

In [16]:
import random


def pick_random_number():
    return random.randint(0, 7599)

In [34]:
rnd_num = pick_random_number()

In [35]:
test_label = ds["test"][rnd_num]["label"]
print(test_label)
test_text = ds["test"][rnd_num]["text"]
test_text

2


'Update 1: United Airlines to Slash US Flights As part of its bid to emerge profitably from bankruptcy, United Airlines announced plans Wednesday to slash its domestic flight schedule, increase its more profitable international schedule and reduce the size of its fleet over the next six months.'

In [38]:
text_token = tokenizer(
    test_text,
    padding="max_length",
    truncation=False,
    max_length=400,
)

text_token.attention_mask
text_token.input_ids

[101,
 10651,
 1015,
 1024,
 2142,
 7608,
 2000,
 18296,
 2149,
 7599,
 2004,
 2112,
 1997,
 2049,
 7226,
 2000,
 12636,
 5618,
 8231,
 2013,
 10528,
 1010,
 2142,
 7608,
 2623,
 3488,
 9317,
 2000,
 18296,
 2049,
 4968,
 3462,
 6134,
 1010,
 3623,
 2049,
 2062,
 15282,
 2248,
 6134,
 1998,
 5547,
 1996,
 2946,
 1997,
 2049,
 4170,
 2058,
 1996,
 2279,
 2416,
 2706,
 1012,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0

In [39]:
model(text_token.input_ids, text_token.attention_mask)

AttributeError: 'list' object has no attribute 'size'