## 1. 라이브러리 설치 및 데이터 준비

In [1]:
# 필요한 라이브러리 설치
!pip install transformers datasets evaluate accelerate scikit-learn

# 필요한 라이브러리 임포트
import random
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Fetch the AG News dataset from the Hugging Face Hub
dataset = load_dataset(path="fancyzhx/ag_news")

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

## 2. 데이터 전처리 및 분할

In [2]:
# Load the pretrained "bert-base-cased" tokenizer used to encode the news text
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def preprocess_function(data):
    """Tokenize the "text" field of ``data`` with truncation enabled."""
    texts = data["text"]
    return tokenizer(texts, truncation=True)


# Run the tokenizer over the dataset in batched mode
dataset_tokenized = dataset.map(preprocess_function, batched=True)

# Split the tokenized training set 80/20 into train and validation data, and keep
# the dataset's own test split for the final evaluation.
# A fixed seed makes the split identical on every Restart & Run All; without it
# the validation set (and therefore every reported metric) changes between runs.
SPLIT_SEED = 42
dataset_split = dataset_tokenized['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_data, val_data = dataset_split['train'], dataset_split['test']
test_data = dataset_tokenized['test']

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

## 3. 모델 구성 및 학습 설정

In [3]:
# Build a small BERT configuration and create an untrained model from it
from transformers import BertConfig, set_seed

# from_config() initializes the weights randomly. TrainingArguments.seed is only
# applied by the Trainer, i.e. after this model already exists, so the RNGs are
# seeded here to make the initial weights reproducible across runs.
set_seed(42)

config = BertConfig(hidden_size=64, intermediate_size=64, num_hidden_layers=2, num_attention_heads=4, num_labels=4)
model = AutoModelForSequenceClassification.from_config(config)

# Training arguments: evaluate, log and checkpoint once per epoch, and restore
# the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir='hf_transformer_ag_news',
    num_train_epochs=10,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    logging_strategy="epoch",
    do_train=True,
    do_eval=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-3,
    load_best_model_at_end=True
)


## 4. 모델 학습 및 평가

In [4]:
# Accuracy metric loaded through the `evaluate` library
accuracy = evaluate.load("accuracy")


def compute_metrics(pred):
    """Compute accuracy for a (predictions, labels) pair.

    The predicted class is taken as the argmax of the predictions along axis 1.
    """
    scores, references = pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# Assemble the Trainer from the model, its training arguments, the
# train/validation datasets, the tokenizer and the metric function
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
)

# Run training; the returned TrainOutput is displayed as the cell result
trainer.train()


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.4376,0.269198,0.910458
2,0.1991,0.254367,0.915875
3,0.1361,0.279659,0.911875
4,0.0975,0.332512,0.908583
5,0.07,0.367447,0.907542
6,0.0487,0.402881,0.903583
7,0.0331,0.452661,0.904417
8,0.0213,0.5092,0.903833
9,0.0141,0.543674,0.902458
10,0.0105,0.568023,0.902583


TrainOutput(global_step=7500, training_loss=0.10679896793365479, metrics={'train_runtime': 349.4538, 'train_samples_per_second': 2747.144, 'train_steps_per_second': 21.462, 'total_flos': 56341139420160.0, 'train_loss': 0.10679896793365479, 'epoch': 10.0})

### Overfitting 이슈

위의 학습 과정에서 **Training Loss**는 계속 감소하는 반면, **Validation Loss**는 Epoch 2(0.254)에서 최저점을 기록한 뒤 Epoch가 진행됨에 따라 꾸준히 증가합니다(Epoch 10에서 0.568). **Accuracy** 역시 Epoch 2(약 91.6%)까지는 증가하지만 이후에는 점차 감소합니다(Epoch 10에서 약 90.3%). 이 현상은 모델이 학습 데이터에 과적합(**Overfitting**)되었기 때문으로 보입니다.

### 코드 수정

이를 해결하기 위해 다음과 같은 방법으로 코드를 수정하였습니다:

1. **Dropout 비율 상향**: `hidden_dropout_prob`를 `BertConfig` 기본값 0.1에서 0.3으로 높여, 학습 중 더 많은 뉴런을 무작위로 비활성화함으로써 모델의 복잡도를 줄이고 **overfitting**을 방지.
2. **Early Stopping 추가**: 검증 데이터의 성능이 개선되지 않으면 학습을 조기에 종료하여 **overfitting**을 방지.
3. **학습률 조정**: 학습률을 낮추어 더 천천히 학습할 수 있도록 조정.

In [20]:
# Build the regularized BERT configuration and create an untrained model from it
from transformers import BertConfig, set_seed

# from_config() initializes the weights randomly. TrainingArguments.seed is only
# applied by the Trainer, i.e. after this model already exists, so the RNGs are
# seeded here to make the initial weights reproducible across runs.
set_seed(42)

config = BertConfig(
    hidden_size=64,
    intermediate_size=64,
    num_hidden_layers=2,
    num_attention_heads=4,
    num_labels=4,
    hidden_dropout_prob=0.3  # stronger dropout than the BertConfig default of 0.1
)
model = AutoModelForSequenceClassification.from_config(config)

# Training arguments: same schedule as the first experiment, with a lower learning rate
training_args = TrainingArguments(
    output_dir='hf_transformer_ag_news',
    num_train_epochs=10,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    logging_strategy="epoch",
    do_train=True,
    do_eval=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-4,  # lowered from 1e-3 to 5e-4
    load_best_model_at_end=True
)

In [21]:
from transformers import EarlyStoppingCallback


# Accuracy metric loaded through the `evaluate` library
accuracy = evaluate.load("accuracy")


def compute_metrics(pred):
    """Compute accuracy for a (predictions, labels) pair.

    The predicted class is taken as the argmax of the predictions along axis 1.
    """
    scores, references = pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# Early stopping callback configured with a patience of 2
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Assemble the Trainer with the same inputs as before plus the early-stopping callback
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Run training; the returned TrainOutput is displayed as the cell result
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.5433,0.315745,0.904083
2,0.2589,0.284709,0.911417
3,0.1991,0.298022,0.909042
4,0.1626,0.316773,0.907458


TrainOutput(global_step=3000, training_loss=0.2909675470987956, metrics={'train_runtime': 139.3501, 'train_samples_per_second': 6889.124, 'train_steps_per_second': 53.821, 'total_flos': 22511940142080.0, 'train_loss': 0.2909675470987956, 'epoch': 4.0})

In [22]:
# Evaluate the trained model on the test split and report its accuracy
test_results = trainer.evaluate(eval_dataset=test_data)
test_accuracy = test_results['eval_accuracy']
print(f"Test Accuracy: {test_accuracy}")

# Persist the model to disk
trainer.save_model(output_dir='./hf_transformer_ag_news')

# Classify a sample article with the saved model
from transformers import pipeline

# Reuse the device the Trainer is running on instead of hard-coding GPU index 0,
# so this cell also works on a CPU-only runtime.
classifier = pipeline(
    "text-classification",
    model="./hf_transformer_ag_news",
    tokenizer=tokenizer,
    device=trainer.args.device,
)
news_article = """
UK charges 8 in terror plot linked to alert in US. LONDON, AUGUST 17: Britain charged eight terror suspects on Tuesday with conspiracy to commit murder and said one had plans that could be used in striking US buildings that were the focus of security scares this month.
"""
print(classifier(news_article))


Test Accuracy: 0.9143421052631578
[{'label': 'LABEL_0', 'score': 0.9866090416908264}]


### 모델 성능 결과

- **Test Accuracy**: 91.43%
  - 모델은 테스트 데이터에 대해 **91.43%**의 정확도를 기록하였습니다.
  
- **예측 결과**:
  - 모델이 제공된 문장을 **LABEL_0**으로 예측하였으며, **98.66%**의 확률로 해당 문장이 `LABEL_0`에 해당한다고 판단하였습니다. AG News의 레이블 정의에서 `LABEL_0`은 **World** 카테고리에 해당합니다.
