# [3주차 심화과제] GPT로 뉴스 기사 분류 모델 학습하기


In [1]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader

tokenizer = torch.hub.load(
    "huggingface/transformers",  # the repo moved from huggingface/pytorch-transformers to huggingface/transformers
    "tokenizer",  # entry point: the 'tokenizer' function from 'hubconf.py' at the root of github.com/huggingface/transformers
    "openai-gpt",  # passed through as an argument to the tokenizer function
    trust_repo=True,  # mark huggingface/transformers as a trusted repo -> it is added to the local trusted list
    clean_up_tokenization_spaces=True,  # True is the current default but is slated to become False; relates to whitespace handling inside sentences > https://github.com/huggingface/transformers/issues/31884
)

# Reuse the unknown token as the padding token
# (presumably this tokenizer ships without a pad token -- TODO confirm).
tokenizer.pad_token = tokenizer.unk_token

Using cache found in /Users/obov/.cache/torch/hub/huggingface_transformers_main


In [2]:
# Load the AG News dataset identified by "fancyzhx/ag_news".
ds = load_dataset("fancyzhx/ag_news")

# Notebook display of the loaded splits.
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [3]:
# Inspect the attention mask of the first training text when it is padded to a
# fixed length of 400 tokens with truncation disabled.
tokenizer(
    ds["train"][0]["text"],
    padding="max_length",
    truncation=False,
    max_length=400,
).attention_mask

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [4]:
# Collect the distinct label ids present in the training split and count them.
unique_labels = set(ds["train"]["label"])
count_of_label = len(unique_labels)
count_of_label

4

In [5]:
# Human-readable class names taken from the "label" feature of the training split.
label_names = ds["train"].features["label"].names

label_names

['World', 'Sports', 'Business', 'Sci/Tech']

In [6]:
def collate_fn(batch, max_len=400):
    """Tokenize a batch of dataset rows into padded tensors.

    Args:
        batch: Iterable of rows; each row is read via ``row["text"]`` and
            ``row["label"]``.
        max_len: Maximum number of tokens kept per text. Longer texts are
            truncated so they cannot exceed the encoder's position limit
            (the openai-gpt encoder printed in this notebook has 512
            position embeddings).

    Returns:
        A tuple ``(input_ids, attention_mask, labels)`` of
        ``torch.LongTensor`` objects. Texts are padded to the longest
        sequence in the batch (after truncation).
    """
    texts = [row["text"] for row in batch]
    labels = [row["label"] for row in batch]

    # padding=True pads only up to the longest text in this batch, while
    # truncation + max_length caps pathological outliers.
    tokenized = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_len,
    )

    input_ids = torch.LongTensor(tokenized.input_ids)
    attention_mask = torch.LongTensor(tokenized.attention_mask)
    labels = torch.LongTensor(labels)

    return input_ids, attention_mask, labels


# Loaders over the full AG News splits; only the training split is shuffled.
train_loader = DataLoader(
    ds["train"], batch_size=64, shuffle=True, collate_fn=collate_fn
)
test_loader = DataLoader(
    ds["test"], batch_size=64, shuffle=False, collate_fn=collate_fn
)

In [34]:
from datasets import DatasetDict, concatenate_datasets


# Pick the same number of samples from every label, based on a total budget.
def select_balanced_subset(dataset, total_samples):
    """Return a class-balanced subset of ``dataset``.

    The label column is the first column whose name contains "label"
    (case-insensitive). For every distinct label, the first
    ``total_samples // num_labels`` matching rows are kept (fewer if the
    label has fewer rows), and the per-label subsets are concatenated.

    Args:
        dataset: Dataset exposing ``column_names``, column access by name,
            ``filter`` and ``select``.
        total_samples: Total number of rows requested across all labels.

    Returns:
        The concatenation of the per-label subsets, or ``dataset`` itself
        when it contains no labels at all.

    Raises:
        ValueError: If no column name contains "label".
    """
    label_column = next(
        (col for col in dataset.column_names if "label" in col.lower()), None
    )
    if label_column is None:
        raise ValueError(
            f"No label column found among columns: {dataset.column_names}"
        )

    # Read the label column once; sorting makes the output order deterministic.
    labels = sorted(set(dataset[label_column]))
    if not labels:
        # Nothing to balance (and nothing to divide by).
        return dataset

    # Per-label quota derived from the number of distinct labels.
    samples_per_class = total_samples // len(labels)

    subsets = []
    for label in labels:
        label_subset = dataset.filter(lambda x: x[label_column] == label)
        subsets.append(
            label_subset.select(range(min(samples_per_class, len(label_subset))))
        )

    # Merge the per-label selections into one dataset.
    return concatenate_datasets(subsets)


# Balanced subsets: 12000 rows requested for training and 2000 for testing.
small_train = select_balanced_subset(ds["train"], 12000)
small_test = select_balanced_subset(ds["test"], 2000)

# Build a new DatasetDict from the reduced splits.
small_ds = DatasetDict(
    {
        "train": small_train,
        "test": small_test,
    }
)

small_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 12000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [35]:
# Loaders over the reduced splits; only the training split is shuffled.
small_train_loader = DataLoader(
    small_ds["train"], batch_size=64, shuffle=True, collate_fn=collate_fn
)
small_test_loader = DataLoader(
    small_ds["test"], batch_size=64, shuffle=False, collate_fn=collate_fn
)

In [39]:
# Check the number of samples per label in the original dataset.
# NOTE: despite its name, `label_counts` holds the class *names* (indexed by
# label id); the actual counts live in `label_distribution`.
label_counts = ds["train"].features["label"].names
label_distribution = ds["train"].to_pandas().label.value_counts()

for label_id, count in label_distribution.items():
    print(f"Label {label_counts[label_id]}: {count} samples")

Label Business: 30000 samples
Label Sci/Tech: 30000 samples
Label Sports: 30000 samples
Label World: 30000 samples


In [40]:
# Check the number of samples per label in the reduced dataset.
# NOTE: despite its name, `label_counts` holds the class *names* (indexed by
# label id); the actual counts live in `label_distribution`.
label_counts = small_ds["train"].features["label"].names
label_distribution = small_ds["train"].to_pandas().label.value_counts()

for label_id, count in label_distribution.items():
    print(f"Label {label_counts[label_id]}: {count} samples")

Label World: 3000 samples
Label Sports: 3000 samples
Label Business: 3000 samples
Label Sci/Tech: 3000 samples


In [41]:
# Load the bare pretrained encoder to inspect its architecture.
model = torch.hub.load(
    "huggingface/transformers",  # the repo moved from huggingface/pytorch-transformers to huggingface/transformers
    "model",  # entry point: the 'model' function
    "openai-gpt",
)
model

Using cache found in /Users/obov/.cache/torch/hub/huggingface_transformers_main


OpenAIGPTModel(
  (tokens_embed): Embedding(40478, 768)
  (positions_embed): Embedding(512, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x Block(
      (attn): Attention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
  )
)

In [42]:
from torch import nn


class TextClassifier(nn.Module):
    """Pretrained ``openai-gpt`` encoder followed by a linear classification head.

    Args:
        num_labels: Number of output classes. Defaults to 4, the number of
            AG News categories.
    """

    def __init__(self, num_labels=4):
        super().__init__()

        self.encoder = torch.hub.load(
            "huggingface/transformers",  # the repo moved from huggingface/pytorch-transformers to huggingface/transformers
            "model",  # entry point: the 'model' function
            "openai-gpt",
        )

        # Size the head from the encoder's own config rather than hard-coding 768.
        self.classifier = nn.Linear(self.encoder.config.hidden_size, num_labels)

    def forward(self, x, attention_mask):
        """Compute class logits for a batch of token ids.

        Args:
            x: Token ids passed to the encoder.
            attention_mask: Mask with 1 for real tokens and 0 for padding,
                same leading shape as ``x``. ``None`` disables masking.

        Returns:
            Logits produced by the classification head, one row per input.
        """
        hidden = self.encoder(x, attention_mask=attention_mask)["last_hidden_state"]

        if attention_mask is not None:
            # Exclude padded positions from the pooling; otherwise the pooled
            # vector depends on how much padding the batch happened to need.
            padding = attention_mask.unsqueeze(-1) == 0
            hidden = hidden.masked_fill(padding, torch.finfo(hidden.dtype).min)

        pooled = hidden.max(dim=1)[0]  # max over the sequence dimension
        return self.classifier(pooled)


# Instantiate the classifier; this rebinds `model`, replacing the bare encoder
# assigned to the same name earlier in the notebook.
model = TextClassifier()

Using cache found in /Users/obov/.cache/torch/hub/huggingface_transformers_main


In [43]:
# Freeze every encoder parameter so that only the classification head is
# trained, and report how many parameter tensors were frozen.
encoder_params = list(model.encoder.parameters())
for param in encoder_params:
    param.requires_grad = False
count_of_params = len(encoder_params)
count_of_params

146

In [44]:
from torch.optim import Adam
import numpy as np
import matplotlib.pyplot as plt


lr = 0.001
# Prefer Apple's MPS backend (the original target of this notebook) and fall
# back to CPU so the cell also runs on machines without it.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = model.to(device)
loss_fn = nn.CrossEntropyLoss()

optimizer = Adam(model.parameters(), lr=lr)
n_epochs = 10

# Dump the first batch once as a sanity check of shapes and values.
print_first_input = True

for epoch in range(n_epochs):
    total_loss = 0.0
    model.train()
    for inputs, attention_mask, labels in small_train_loader:
        optimizer.zero_grad()
        inputs = inputs.to(device)
        attention_mask = attention_mask.to(device)
        # Keep the labels as integer class indices: CrossEntropyLoss treats a
        # floating-point target as per-class probabilities, which would have
        # to match the (batch, num_classes) shape of the logits.
        labels = labels.to(device)

        preds = model(
            inputs,
            attention_mask=attention_mask,
        )
        if print_first_input:
            print("preds", preds)
            print("labels", labels)
            print("Inputs:", inputs)
            print("attention_mask:", attention_mask)
            print("shape of preds", preds.shape)
            print("shape of labels", labels.shape)
            print_first_input = False
        loss = loss_fn(preds, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Reported value is the sum of the per-batch losses over the epoch.
    print(f"Epoch {epoch:3d} | Train Loss: {total_loss}")

preds tensor([[-1.0251,  0.2344, -0.7584, -0.1665],
        [-0.7779,  0.5885, -1.0529,  0.3269],
        [-0.9566,  0.7144, -0.8012,  0.2603],
        [-1.0519, -0.2210, -0.7478,  0.8879],
        [-1.6367,  0.2850, -0.5304,  0.2358],
        [-1.0687,  0.5111, -0.5073,  0.5187],
        [-0.8887,  0.2018, -0.9316,  0.3756],
        [-1.1801,  0.0302, -0.7718,  0.3878],
        [-1.3531,  0.7033, -0.6145,  0.4693],
        [-1.3847,  0.4729, -0.6427,  0.4872],
        [-0.5714,  0.5066, -1.4190, -0.0477],
        [-1.1127,  0.3809, -0.8202, -0.1400],
        [-1.1418,  0.3912, -0.8186,  0.0788],
        [-1.1145, -0.1856, -1.1379, -0.1611],
        [-1.2718,  0.6175, -0.4127,  0.0172],
        [-1.5658,  0.0746, -0.8550,  0.8947],
        [-1.1414, -0.0074, -0.7594,  0.7775],
        [-1.0898,  0.8744, -0.3230, -0.3132],
        [-1.5049,  0.3820, -0.6812,  0.3572],
        [-1.0867,  0.5169, -0.7616,  0.2442],
        [-1.1711,  0.3595, -0.5937,  0.5665],
        [-1.4602,  0.2634, -

In [45]:
@torch.no_grad()
def accuracy(model, dataloader):
    """Return the fraction of samples in ``dataloader`` that ``model`` classifies correctly.

    Args:
        model: Callable module invoked as ``model(inputs, attention_mask)``
            and returning per-class scores along the last dimension.
        dataloader: Iterable yielding ``(inputs, attention_mask, labels)``
            batches.

    Returns:
        Accuracy in ``[0, 1]`` as a float; ``0.0`` when the loader yields no
        samples.
    """
    # Run on whatever device the model already lives on instead of assuming
    # one particular backend; parameter-free models default to CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    total = 0
    correct = 0

    for inputs, attention_mask, labels in dataloader:
        inputs = inputs.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        scores = model(inputs, attention_mask)
        preds = torch.argmax(scores, dim=-1)

        total += labels.shape[0]
        correct += (labels == preds).sum().item()

    if total == 0:
        return 0.0
    return correct / total


# Evaluate with gradient tracking disabled and the model switched to eval mode,
# then report accuracy on the reduced train and test splits.
with torch.no_grad():
    model.eval()
    train_acc = accuracy(model, small_train_loader)
    test_acc = accuracy(model, small_test_loader)
    print(f"=========> Train acc: {train_acc:.3f} | Test acc: {test_acc:.3f}")

