In [1]:
!pip install tqdm boto3 requests regex sentencepiece sacremoses datasets

Collecting boto3
  Downloading boto3-1.35.31-py3-none-any.whl.metadata (6.6 kB)
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting botocore<1.36.0,>=1.35.31 (from boto3)
  Downloading botocore-1.35.31-py3-none-any.whl.metadata (5.6 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3)
  Downloading s3transfer-0.10.2-py3-none-any.whl.metadata (1.7 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multi

In [2]:
# 필요한 라이브러리 임포트
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import GPT2Tokenizer, GPT2Model
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
from datasets import load_dataset

# 데이터셋 로드 (Huggingface의 AG News 데이터셋 사용)
ds = load_dataset("fancyzhx/ag_news")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [3]:
# GPT2 토크나이저 및 모델 초기화
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# model = GPT2Model.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2-medium') # Use a smaller GPT2 model

# GPT2에는 padding token이 없기 때문에 padding token을 unknown token으로 설정
tokenizer.pad_token = tokenizer.unk_token

# 배치를 처리하는 함수 정의
def collate_fn(batch):
    texts = [item['text'] for item in batch]  # 배치에서 텍스트 추출
    labels = [item['label'] for item in batch]  # 배치에서 레이블 추출

    # 텍스트를 토크나이저로 인코딩 (padding을 추가하여 입력 길이를 맞춤)
    encoding = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    labels = torch.tensor(labels, dtype=torch.long)  # 레이블을 텐서로 변환

    return {
        'input_ids': encoding['input_ids'],           # 입력 토큰 ID
        'attention_mask': encoding['attention_mask'], # 주의 마스크 (padding된 부분 무시)
        'labels': labels                              # 레이블
    }

# 데이터 로더 설정
# train_loader = DataLoader(ds['train'], batch_size=64, shuffle=True, collate_fn=collate_fn)
# test_loader = DataLoader(ds['test'], batch_size=64, shuffle=False, collate_fn=collate_fn)

train_loader = DataLoader(ds['train'], batch_size=32, shuffle=True, collate_fn=collate_fn) # Reduced batch_size to 32
test_loader = DataLoader(ds['test'], batch_size=32, shuffle=False, collate_fn=collate_fn) # Reduced batch_size to 32

# GPT 기반 분류기 정의
class GPTClassifier(nn.Module):
    def __init__(self, gpt_model, num_labels):
        super(GPTClassifier, self).__init__()
        self.gpt = gpt_model                                # GPT 모델 불러오기
        self.dropout = nn.Dropout(0.1)                      # 드롭아웃 레이어 (과적합 방지)
        self.classifier = nn.Linear(self.gpt.config.hidden_size, num_labels)  # 최종 분류를 위한 선형 계층

    def forward(self, input_ids, attention_mask=None):
        # GPT 모델에 입력을 통과시켜 hidden states 출력
        outputs = self.gpt(input_ids=input_ids, attention_mask=attention_mask)

        # 마지막 토큰의 hidden state를 추출 (배치 크기, hidden size)
        last_hidden_state = outputs.last_hidden_state
        last_token_state = last_hidden_state[:, -1, :]      # 시퀀스에서 마지막 토큰 사용

        # 드롭아웃 적용
        last_token_state = self.dropout(last_token_state)

        # 선형 계층을 통해 최종 예측 (로짓 값)
        logits = self.classifier(last_token_state)
        return logits

# GPT 모델과 레이블 개수로 GPTClassifier 초기화 (AG 뉴스 데이터셋의 경우 4개 레이블)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPTClassifier(model, num_labels=4).to(device)

# 옵티마이저 설정
optimizer = optim.Adam(model.parameters(), lr=1e-5)
epochs = 3  # 학습 에포크 수

# 학습 루프 정의
for epoch in range(epochs):
    model.train()  # 모델을 학습 모드로 전환
    total_loss = 0  # 총 손실 초기화

    # 훈련 데이터에 대해 배치 단위로 학습
    for batch in train_loader:
        optimizer.zero_grad()  # 옵티마이저의 기울기 초기화
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # 모델의 예측 결과
        outputs = model(input_ids, attention_mask=attention_mask)

        # 손실 계산 (교차 엔트로피 손실 함수)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        total_loss += loss.item()  # 손실을 누적

        # 역전파 및 옵티마이저 스텝
        loss.backward()
        optimizer.step()

    # 각 에포크마다 평균 손실 출력
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}")

# 테스트 데이터셋에서 정확도 측정
model.eval()  # 모델을 평가 모드로 전환
all_preds = []
all_labels = []

# 테스트 데이터에 대해 예측 수행
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # 모델 예측
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()  # 예측 값 추출
        all_preds.extend(preds)  # 예측 값 저장
        all_labels.extend(labels.cpu().numpy())  # 실제 레이블 저장

# 최종 테스트 정확도 계산 및 출력
test_accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 57.06 MiB is free. Process 46600 has 14.69 GiB memory in use. Of the allocated memory 14.00 GiB is allocated by PyTorch, and 576.62 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)