## Step 1: 라이브러리 설치 및 데이터 준비

In [1]:
# 필요한 라이브러리 설치
!pip install transformers datasets evaluate accelerate scikit-learn

# 필요한 라이브러리 임포트
import random
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# MNLI 데이터셋 로드
dataset = load_dataset("nyu-mll/glue", "mnli")

# train split만 사용
train_data = dataset['train']


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

(…)alidation_matched-00000-of-00001.parquet:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

(…)dation_mismatched-00000-of-00001.parquet:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

test_matched-00000-of-00001.parquet:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

test_mismatched-00000-of-00001.parquet:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test_matched split:   0%|          | 0/9796 [00:00<?, ? examples/s]

Generating test_mismatched split:   0%|          | 0/9847 [00:00<?, ? examples/s]

## Step 2: 데이터 전처리 및 Tokenizer 설정

In [2]:
# Tokenizer 설정
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# 전처리 함수 정의
def preprocess_function(data):
    return tokenizer(data['premise'], data['hypothesis'], truncation=True, max_length=128)

# train 데이터 전처리 및 Validation 데이터 분할
train_val_split = dataset['train'].train_test_split(test_size=0.1)  # 10%를 Validation 데이터로 분할
train_data = train_val_split['train']
val_data = train_val_split['test']

# train, val 데이터 Tokenize
train_tokenized = train_data.map(preprocess_function, batched=True)
val_tokenized = val_data.map(preprocess_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



Map:   0%|          | 0/353431 [00:00<?, ? examples/s]

Map:   0%|          | 0/39271 [00:00<?, ? examples/s]

## Step 3: 모델 구성 및 학습 설정

In [3]:
# BERT Config 설정 및 모델 생성
from transformers import BertConfig

# label과 id 매핑 설정
id2label = {0: "Entailment", 1: "Neutral", 2: "Contradiction"}
label2id = {"Entailment": 0, "Neutral": 1, "Contradiction": 2}

# hidden_size 및 모델 구성 설정 수정
config = BertConfig(
    hidden_size=256,
    intermediate_size=512,
    num_hidden_layers=4,
    num_attention_heads=4,
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)

model = AutoModelForSequenceClassification.from_config(config)

# 학습 인자 설정
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    weight_decay=0.01,
    logging_strategy="epoch",
    load_best_model_at_end=True
)



## Step 4: 모델 학습 및 평가 함수 정의

In [15]:
# 정확도 평가 함수 설정
accuracy = evaluate.load("accuracy")

def compute_metrics(pred):
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

# Trainer 설정
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# 모델 학습
trainer.train()

# 학습 결과 저장
trainer.save_model('./mnli_model')

Epoch,Training Loss,Validation Loss,Accuracy
1,0.9518,0.892171,0.585572
2,0.8494,0.865385,0.604415
3,0.7918,0.854595,0.612411
4,0.7425,0.870406,0.613812
5,0.7008,0.891332,0.614627


## Step 5: 성능 평가 및 예측

In [16]:
from transformers import pipeline

# validation_matched 데이터셋에 대한 성능 평가
val_data = dataset['validation_matched']
val_tokenized = val_data.map(preprocess_function, batched=True)
val_results = trainer.evaluate(val_tokenized)

print(f"Validation Matched Accuracy: {val_results['eval_accuracy']}")

# 예측 결과 출력
classifier = pipeline("text-classification", model='./mnli_model', tokenizer=tokenizer, device=0)

# 샘플 테스트: 두 문장 간 논리적 관계 예측
premise = "A man is playing a guitar."
hypothesis = "A man is making music."

# pipeline에 두 문장을 dict로 전달 (text와 text_pair)
print(classifier({"text": premise, "text_pair": hypothesis}))

Validation Matched Accuracy: 0.6304635761589404
{'label': 'Entailment', 'score': 0.7913414835929871}


### 정확도가 낮아서 다음과 같이 수정하여 테스트 해보았습니다.

1. **Pretrained 모델 사용**:  
   사전 학습된 **`bert-base-cased`** 모델을 사용하여 성능을 개선하고자 **`AutoModelForSequenceClassification.from_pretrained`**로 변경하였습니다.

2. **Early Stopping 추가**:  
   **`EarlyStoppingCallback(early_stopping_patience=3)`**를 추가하여, 3번 연속으로 성능이 향상되지 않으면 학습을 조기 종료하여 **과적합(overfitting)**을 방지하도록 설정하였습니다.

3. **Learning Rate 스케줄링 추가**:  
   **`lr_scheduler_type="linear"`** 및 **`warmup_steps=500`** 설정으로 학습 초기에 학습률을 조정하여 더 안정적인 학습을 유도하였습니다.

In [5]:
from transformers import EarlyStoppingCallback

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=3)

def preprocess_function(data):
    return tokenizer(data['premise'], data['hypothesis'], truncation=True, max_length=128)

train_val_split = dataset['train'].train_test_split(test_size=0.1)
train_tokenized = train_val_split['train'].map(preprocess_function, batched=True)
val_tokenized = train_val_split['test'].map(preprocess_function, batched=True)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    weight_decay=0.01,
    lr_scheduler_type="linear",  # Learning rate 스케줄링
    warmup_steps=500,  # Warm-up 스텝
    logging_strategy="epoch",
    load_best_model_at_end=True
)

accuracy = evaluate.load("accuracy")

def compute_metrics(pred):
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Early Stopping 추가
)

# 모델 학습
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/353431 [00:00<?, ? examples/s]

Map:   0%|          | 0/39271 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy
1,0.6154,0.522752,0.792086
2,0.4483,0.499351,0.807237
3,0.3109,0.518749,0.813858
4,0.1821,0.644627,0.814163
5,0.0919,0.805474,0.813934


TrainOutput(global_step=27615, training_loss=0.32968514280303673, metrics={'train_runtime': 6898.2243, 'train_samples_per_second': 256.175, 'train_steps_per_second': 4.003, 'total_flos': 9.287502963119098e+16, 'train_loss': 0.32968514280303673, 'epoch': 5.0})

In [7]:
# 학습 결과 저장
trainer.save_model('./mnli_model')

In [8]:
from transformers import pipeline

# validation_matched 데이터셋에 대한 성능 평가
val_data = dataset['validation_matched']
val_tokenized = val_data.map(preprocess_function, batched=True)
val_results = trainer.evaluate(val_tokenized)

print(f"Validation Matched Accuracy: {val_results['eval_accuracy']}")

# 예측 결과 출력
classifier = pipeline("text-classification", model='./mnli_model', tokenizer=tokenizer, device=0)

# 두 문장 간 논리적 관계 예측
premise = "A man is playing a guitar."
hypothesis = "A man is making music."

# pipeline에 두 문장을 dict로 전달 (text와 text_pair)
print(classifier({"text": premise, "text_pair": hypothesis}))

Validation Matched Accuracy: 0.8066225165562914
{'label': 'LABEL_0', 'score': 0.9517608880996704}


### 성능 결과

Validation Matched 데이터셋에서 **80.66%**의 정확도를 보였습니다.  
모델은 두 문장이 **95.18%** 확률로 **LABEL_0 (Entailment)**이라고 예측했습니다.