<a href="https://colab.research.google.com/github/jhtwiz/AI-1-jhtwiz/blob/main/4%EC%A3%BC%EC%B0%A8_%EA%B8%B0%EB%B3%B8%EA%B3%BC%EC%A0%9C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# HuggingFace로 뉴스 기사 분류하기

In [None]:
!pip install transformers datasets evaluate accelerate scikit-learn

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting accelerate
  Downloading accelerate-0.34.2-py3-none-any.whl.metadata (19 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
Downloading accelerate-0.34.2-py3-none-any.whl (324 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 11.0/11.0 MB 85.8 MB/s eta 0:00:00
Downloading scipy-1.14.1-cp312-cp312-win_amd64.whl (44.5 MB)
   ---------------------------------------- 0.0/44.5 MB ? eta -:--:--
   ---------------------- -------------

In [None]:
import random
import evaluate
import numpy as np

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the AG News dataset; the result is a DatasetDict with 'train' and 'test'
# splits, each holding 'text' and 'label' columns.
db = load_dataset("fancyzhx/ag_news")
db

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [None]:
# Peek at the first training example (raw text plus its integer label).
db['train'][0]

{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
 'label': 2}

In [None]:
# Show the human-readable class names behind the integer labels.
db['train'].features['label'].names

['World', 'Sports', 'Business', 'Sci/Tech']

In [None]:
# Number of target classes, taken from the dataset's label feature.
len_classes = len(db['train'].features['label'].names)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def preprocess_function(data):
    """Tokenize a batch of dataset examples.

    Args:
        data: A batch from the dataset; only its "text" field is read.

    Returns:
        The tokenizer output for the batch. When used with ``Dataset.map`` its
        fields are added as new columns next to the original ones.
    """
    # Truncate to the tokenizer's maximum length so that an unusually long
    # article can never yield more tokens than the model has positions for.
    # No padding is applied here.
    return tokenizer(data["text"], truncation=True)


# batched=True hands preprocess_function many examples per call.
db_tokenized = db.map(preprocess_function, batched=True)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|███████████████████████████████████████████████████████████| 120000/120000 [00:06<00:00, 17444.18 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████| 7600/7600 [00:00<00:00, 17949.09 examples/s]


In [None]:
# Columns available on a tokenized example: the original fields plus the
# tokenizer's outputs.
db_tokenized['train'][0].keys()

dict_keys(['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
# Hold out 20% of the original training split for validation. The fixed seed
# makes the split identical on every Restart & Run All, so validation numbers
# are comparable between runs.
db_split = db_tokenized['train'].train_test_split(test_size=0.2, seed=42)
db_train, db_val = db_split['train'], db_split['test']
# The dataset's own test split is kept aside for the final evaluation.
db_test = db_tokenized['test']

In [None]:
# Sizes of the train / validation / test splits.
len(db_train), len(db_val), len(db_test)

(96000, 24000, 7600)

In [None]:
from transformers import BertConfig

# Class names in label-id order; stored in the config so that predictions are
# reported as e.g. 'World' instead of the generic 'LABEL_0'.
label_names = db['train'].features['label'].names

config = BertConfig(
    # Size the embedding table from the tokenizer actually in use instead of
    # relying on BertConfig's default vocabulary size.
    vocab_size=len(tokenizer),
    hidden_size=128,  # base hidden dimension of the BERT layers
    intermediate_size=256,  # intermediate hidden dimension of the FFN layer
    num_hidden_layers=5,  # number of BERT layers
    num_attention_heads=4,  # number of heads used in multi-head attention
    num_labels=len_classes,  # number of classes the classification head predicts
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
)

# Build a randomly initialised model from the config (no pretrained weights).
model = AutoModelForSequenceClassification.from_config(config)

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    # Directory where the model, logs, etc. are saved.
    output_dir='hf_transformer',
    # Run training, and evaluate on the validation data while training.
    do_train=True,
    do_eval=True,
    # Optimisation settings: epoch count, optimizer learning rate, batch sizes.
    num_train_epochs=10,
    learning_rate=1e-3,
    per_device_train_batch_size=64,  # batch size for the training data
    per_device_eval_batch_size=64,  # batch size for the validation data
    # At the end of every epoch: log the training loss etc., evaluate on the
    # validation data, and save the model.
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    # Once training ends, adopt the model that performed best on validation data.
    load_best_model_at_end=True,
)

In [None]:
import evaluate

# Accuracy metric object reused by compute_metrics on every evaluation.
accuracy = evaluate.load("accuracy")


def compute_metrics(pred):
    """Compute accuracy for one evaluation pass.

    Args:
        pred: An object that unpacks into ``(predictions, labels)``, where
            ``predictions`` holds one row of per-class scores per example.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of each row
        against the reference labels.
    """
    class_scores, references = pred
    # Pick the highest-scoring class for each example.
    predicted_ids = np.argmax(class_scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
from transformers import EarlyStoppingCallback

# Early stopping with a patience of 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Wire the model, training arguments, data splits, tokenizer and metric together.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=db_train,
    eval_dataset=db_val,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# Run training as configured above (at most 10 epochs, with early stopping).
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.4133,0.28742,0.90375
2,0.2133,0.271459,0.911458
3,0.1466,0.270898,0.91725
4,0.1067,0.331676,0.911375
5,0.0743,0.362094,0.911708
6,0.0495,0.431357,0.906583


TrainOutput(global_step=9000, training_loss=0.16727314164903428, metrics={'train_runtime': 1020.6157, 'train_samples_per_second': 940.609, 'train_steps_per_second': 14.697, 'total_flos': 357627144416256.0, 'train_loss': 0.16727314164903428, 'epoch': 6.0})

In [None]:
# Evaluate on the test split, which was used for neither training nor validation.
trainer.evaluate(db_test)

{'eval_loss': 0.26234209537506104,
 'eval_accuracy': 0.9219736842105263,
 'eval_runtime': 3.4501,
 'eval_samples_per_second': 2202.846,
 'eval_steps_per_second': 34.492,
 'epoch': 6.0}

In [None]:
# Persist the trained model; the inference cell below loads it from ./hf_transformer/.
trainer.save_model()

In [None]:
import torch
from transformers import pipeline

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# This is topic classification, so use the "text-classification" task name
# ("sentiment-analysis" is only an alias for the same pipeline).
classifier = pipeline("text-classification", model="./hf_transformer/", device=device)
test_text = "UK charges 8 in terror plot linked to alert in US LONDON, AUGUST 17: Britain charged eight terror suspects on Tuesday with conspiracy to commit murder and said one had plans that could be used in striking US buildings that were the focus of security scares this month."
print(classifier(test_text))

[{'label': 'LABEL_0', 'score': 0.996440589427948}]
