<a href="https://colab.research.google.com/github/leehansori/AIPlus/blob/main/%5B3%EC%A3%BC%EC%B0%A8%5D_%EC%8B%AC%ED%99%94%EA%B3%BC%EC%A0%9C_Pre_trained_%EB%AA%A8%EB%8D%B8%EB%A1%9C_%ED%9A%A8%EC%9C%A8%EC%A0%81%EC%9D%B8_NLP_%EB%AA%A8%EB%8D%B8_%ED%95%99%EC%8A%B5%ED%95%98%EA%B8%B0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [3주차]_심화과제 - Pre-trained 모델로 효율적인 NLP 모델 학습하기

In [1]:
!pip install tqdm boto3 requests regex sentencepiece sacremoses datasets pandas

Collecting boto3
  Downloading boto3-1.37.31-py3-none-any.whl.metadata (6.7 kB)
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting botocore<1.38.0,>=1.37.31 (from boto3)
  Downloading botocore-1.37.31-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.12.0,>=0.11.0 (from boto3)
  Downloading s3transfer-0.11.4-py3-none-any.whl.metadata (1.7 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (f

In [2]:
import kagglehub

# 'thedevastator/unlocking-language-understanding-with-the-multin' 데이터셋의 최신 버전을 다운로드
path = kagglehub.dataset_download("thedevastator/unlocking-language-understanding-with-the-multin")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/unlocking-language-understanding-with-the-multin


In [3]:
import torch
from torch.utils.data import DataLoader

# Hugging Face에서 DistilBERT tokenizer를 로드
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'distilbert-base-uncased')

Downloading: "https://github.com/huggingface/pytorch-transformers/zipball/main" to /root/.cache/torch/hub/main.zip
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [4]:
import random
import pandas as pd


# CSV 파일을 불러와 전처리하는 함수 정의
def load_data(path, nrows=None):
  # CSV 파일을 pandas로 읽어옴. nrows를 지정하면 해당 행까지만 읽음.
  df = pd.read_csv(path, nrows=nrows, keep_default_na=False)
  data = [] # 전처리된 데이터를 저장할 리스트

  # 각 행에 대해 반복하면서 전처리 수행
  for _, row in df.iterrows():
    # premise와 hypothesis가 모두 비어있지 않은 경우만 필터링
    if len(row['premise']) * len(row['hypothesis']) != 0:
      # 딕셔너리 형태로 저장: premise, hypothesis, label
      data.append({'premise': row['premise'], 'hypothesis': row['hypothesis'], 'label': row['label']})

  # 전처리된 데이터 반환
  return data


train_data = load_data(path + '/train.csv', nrows=1000) # 학습 데이터셋 (최대 1000개 샘플)
test_data = load_data(path + '/validation_matched.csv', nrows=1000)

In [5]:
train_data[0], test_data[0]

({'premise': 'Conceptually cream skimming has two basic dimensions - product and geography.',
  'hypothesis': 'Product and geography are what make cream skimming work. ',
  'label': 1},
 {'premise': 'The new rights are nice enough',
  'hypothesis': 'Everyone really likes the newest benefits ',
  'label': 1})

In [15]:
train_data[1], test_data[1]

({'premise': 'you know during the season and i guess at at your level uh you lose them to the next level if if they decide to recall the the parent team the Braves decide to call to recall a guy from triple A then a double A guy goes up to replace him and a single A guy goes up to replace him',
  'hypothesis': 'You lose the things to the following level if the people recall.',
  'label': 0},
 {'premise': 'This site includes a list of all award winners and a searchable database of Government Executive articles.',
  'hypothesis': 'The Government Executive articles housed on the website are not able to be searched.',
  'label': 2})

In [6]:
# 배치 데이터 전처리 함수
def collate_fn(batch):
  max_len = 400  # 최대 시퀀스 길이
  texts, labels = [], []
  for row in batch:
    labels.append(row['label'])  # 레이블 저장
    texts.append(row['premise'] + row['hypothesis'])  # premise + hypothesis 연결

  # 텍스트를 토크나이즈하여 텐서로 변환 (padding, truncation 적용)
  texts = torch.LongTensor(tokenizer(texts, padding=True, truncation=True, max_length=max_len).input_ids)
  labels = torch.LongTensor(labels)  # 레이블도 텐서로 변환

  return texts, labels  # 모델 입력용 배치 반환


train_loader = DataLoader(
    train_data, batch_size=64, shuffle=True, collate_fn=collate_fn
)
test_loader = DataLoader(
    test_data, batch_size=64, shuffle=False, collate_fn=collate_fn
)

In [7]:
from torch import nn


# 텍스트 분류기 모델
class TextClassifier(nn.Module):
  def __init__(self):
    super().__init__()

    # DistilBERT 모델 로드 (사전 학습된 언어 모델)
    self.encoder = torch.hub.load('huggingface/pytorch-transformers', 'model', 'distilbert-base-uncased')
    # BERT의 출력 차원(768)을 받아 3개의 클래스 예측하는 선형층
    self.classifier = nn.Linear(768, 3)

  def forward(self, x):
    # BERT 인코더 실행, 결과는 딕셔너리 형식
    x = self.encoder(x)['last_hidden_state']
    # 첫 번째 토큰([CLS])의 임베딩을 분류기로 전달
    x = self.classifier(x[:, 0])

    return x


model = TextClassifier()

Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [8]:
# 사전 학습된 BERT의 가중치를 고정 (학습하지 않도록 설정)
for param in model.encoder.parameters():
  param.requires_grad = False

In [9]:
from torch.optim import Adam
import numpy as np
import matplotlib.pyplot as plt


lr = 0.001  # 학습률
model = model.to('cuda')
loss_fn = nn.CrossEntropyLoss()  # 다중 클래스 분류를 위한 손실 함수

optimizer = Adam(model.parameters(), lr=lr)  # 옵티마이저 정의
n_epochs = 50  # 에폭 수

# 모델 학습 루프
for epoch in range(n_epochs):
  total_loss = 0.
  model.train()  # 학습 모드로 전환
  for data in train_loader:
    model.zero_grad()  # 기존 기울기 초기화
    inputs, labels = data
    inputs, labels = inputs.to('cuda'), labels.to('cuda').long()

    preds = model(inputs)  # 예측 수행
    loss = loss_fn(preds, labels)  # 손실 계산
    loss.backward()  # 역전파
    optimizer.step()  # 파라미터 업데이트

    total_loss += loss.item()  # 전체 손실 누적

  print(f"Epoch {epoch:3d} | Train Loss: {total_loss}")

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch   0 | Train Loss: 17.60379910469055
Epoch   1 | Train Loss: 17.344808220863342
Epoch   2 | Train Loss: 17.262840926647186
Epoch   3 | Train Loss: 17.119271516799927
Epoch   4 | Train Loss: 16.996744871139526
Epoch   5 | Train Loss: 17.001707911491394
Epoch   6 | Train Loss: 16.95223557949066
Epoch   7 | Train Loss: 17.05186575651169
Epoch   8 | Train Loss: 16.911274194717407
Epoch   9 | Train Loss: 16.75434648990631
Epoch  10 | Train Loss: 16.77719485759735
Epoch  11 | Train Loss: 16.78562968969345
Epoch  12 | Train Loss: 16.69353950023651
Epoch  13 | Train Loss: 16.58889079093933
Epoch  14 | Train Loss: 16.559812426567078
Epoch  15 | Train Loss: 16.58122330904007
Epoch  16 | Train Loss: 16.54782462120056
Epoch  17 | Train Loss: 16.513657867908478
Epoch  18 | Train Loss: 16.4517959356308
Epoch  19 | Train Loss: 16.350105345249176
Epoch  20 | Train Loss: 16.405906319618225
Epoch  21 | Train Loss: 16.25863742828369
Epoch  22 | Train Loss: 16.204187512397766
Epoch  23 | Train Loss: 

In [10]:
# 정확도 평가 함수
def accuracy(model, dataloader):
  cnt = 0
  acc = 0

  for data in dataloader:
    inputs, labels = data
    inputs, labels = inputs.to('cuda'), labels.to('cuda')

    preds = model(inputs)  # 예측 수행
    preds = torch.argmax(preds, dim=-1)  # 가장 높은 확률의 클래스 선택

    cnt += labels.shape[0]  # 전체 샘플 수 누적
    acc += (labels == preds).sum().item()  # 정답 예측 수 누적

  return acc / cnt  # 정확도 계산


# 모델 성능 평가
with torch.no_grad():  # 기울기 계산 비활성화 (메모리 효율)
  model.eval()
  train_acc = accuracy(model, train_loader)
  test_acc = accuracy(model, test_loader)
  print(f"=========> Train acc: {train_acc:.3f} | Test acc: {test_acc:.3f}")



In [13]:
def predict(premise, hypothesis, model, tokenizer, max_len=400):
    # 입력 문장 전처리: premise와 hypothesis를 하나로 연결
    text = premise + hypothesis

    # 토크나이징 및 텐서 변환
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=max_len, return_tensors="pt")
    input_ids = inputs['input_ids'].to('cuda')

    # 모델 추론
    model.eval()
    with torch.no_grad():
        logits = model(input_ids)
        probs = torch.softmax(logits, dim=-1)
        pred = torch.argmax(probs, dim=-1).item()

    # 레이블
    label_map = {
        0: "entailment",       # 참
        1: "neutral",          # 중립
        2: "contradiction"     # 모순
    }

    return label_map[pred], probs.squeeze().cpu().numpy()


In [14]:
# 예제 문장
premise = "A woman is reading a book on the subway."
hypothesis = "Someone is learning on a train."

# 예측
label, prob = predict(premise, hypothesis, model, tokenizer)

print(f"Prediction: {label}")
print(f"Probabilities: entailment={prob[0]:.2f}, neutral={prob[1]:.2f}, contradiction={prob[2]:.2f}") # 각각 라벨에 대한 확률

Prediction: contradiction
Probabilities: entailment=0.35, neutral=0.28, contradiction=0.37
