<a href="https://colab.research.google.com/github/lecielpourlamer/01-thymeleaf-demo-employees-list-thymeleafdemo/blob/master/handson_llm_ch04.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 4장 텍스트 분류

In [1]:
%%capture
!pip install datasets

In [2]:
# Turn off the progress bars so the saved notebook does not trigger
# widget-state errors when it is rendered on GitHub.
from transformers.utils import logging

logging.disable_progress_bar()

## 영화 리뷰 데이터 셋

In [3]:
from datasets import load_dataset

# Load the "rotten_tomatoes" dataset (train / validation / test splits,
# each with 'text' and 'label' columns)
data = load_dataset("rotten_tomatoes")
# Show the loaded object as the cell output
data

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [4]:
# Inspect samples from the training set: rows 0 and -1 (first and last)
# are fetched together, giving one example of each label
data["train"][0, -1]

{'text': ['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
  'things really get weird , though not particularly scary : the movie is all portent and no content .'],
 'label': [1, 0]}

# 표현 모델로 텍스트 분류하기

## 작업에 특화된 모델 사용하기

In [5]:
# Load the model
import torch
from transformers import pipeline

# Model path on the Hugging Face Hub
model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Use the first GPU when one is available; otherwise fall back to the CPU so
# the cell still runs on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load the model into a pipeline.
# return_all_scores=True makes the pipeline return a score for every label
# (negative / neutral / positive) instead of only the top one.
# NOTE(review): newer transformers releases deprecate `return_all_scores` in
# favour of `top_k`; `top_k=None` returns the labels sorted by score, so it is
# not a drop-in replacement for code that relies on the label order.
pipe = pipeline(
    model=model_path,
    tokenizer=model_path,
    return_all_scores=True,
    device=device
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--cardiffnlp--twitter-roberta-base-sentiment-latest/snapshots/3216a57f2a0d9c45a2e6c20157c20c49fb4bf9c7/config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "dtype": "float32",
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "negative",
    "1": "neutral",
    "2": "positive"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 0,
    "neutral": 1,
    "positive": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.56.1",
 

In [6]:
# Run the model on the test set
import numpy as np
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

# Run inference over every review text in the test split
y_pred = []
for output in tqdm(pipe(KeyDataset(data["test"], "text")), total=len(data["test"])):
  # Look the scores up by label name rather than by list position, so the
  # result does not depend on the order in which the pipeline returns labels.
  scores = {entry["label"]: entry["score"] for entry in output}
  negative_score = scores["negative"]
  positive_score = scores["positive"]
  # The dataset is binary, so the "neutral" score is ignored: argmax gives
  # 0 when negative wins (or on a tie) and 1 when positive wins.
  assignment = np.argmax([negative_score, positive_score])
  y_pred.append(assignment)

Disabling tokenizer parallelism, we're using DataLoader multithreading already
100%|██████████| 1066/1066 [00:10<00:00, 105.03it/s]


In [7]:
# 평가를 위해 사용할 함수 만들기
from sklearn.metrics import classification_report

def evaluate_performance(y_true, y_pred):
  """Build a classification report and print it.

  Args:
    y_true: Ground-truth labels.
    y_pred: Predicted labels, aligned with ``y_true``.
  """
  # Display names for the two classes, in the order passed to the report
  class_names = ["negative Review", "Positive Review"]
  report_text = classification_report(y_true, y_pred, target_names=class_names)
  print(report_text)

In [8]:
# Build and print the classification report: test-set labels vs. predictions
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

negative Review       0.76      0.88      0.81       533
Positive Review       0.86      0.72      0.78       533

       accuracy                           0.80      1066
      macro avg       0.81      0.80      0.80      1066
   weighted avg       0.81      0.80      0.80      1066

