<a href="https://colab.research.google.com/github/kasier48/DeepLearning/blob/main/Pratice_Week4_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [4주차] 기본과제: HuggingFace로 두 문장의 논리적 모순 분류하기

이번 과제는 자연어 task 중 하나인 MNLI를 해결하는 모델을 HuggingFace로 학습하는 것입니다. MNLI를 요약하면 다음과 같습니다.

- **입력**: premise에 해당하는 문장과 hypothesis에 해당하는 문장 두 개가 입력으로 들어옵니다.
- **출력:** 분류 문제로, 두 문장이 들어왔을 때 다음 세 가지를 예측하시면 됩니다.
    - **Entailment:** 두 문장에 논리적 모순이 없습니다.
    - **Neutral:** 두 문장은 논리적으로 관련이 없습니다.
    - **Contradiction:** 두 문장 사이에 논리적 모순이 존재합니다.

이 때, 다음 요구사항이 담긴 colab notebook을 만들어내시면 됩니다:

- [ ]  `load_dataset("nyu-mll/glue", "mnli")` 로 dataset을 불러옵니다.
    - 학습 때는 `train` split만 활용하셔야 합니다. 나머지 split은 사용불가입니다.
    - Validation data가 필요한 경우, `train` split에서 가져오셔야 합니다.
- [ ]  `trainer.train()`를 통해 학습된 log가 남아있어야 합니다.
- [ ]  Dataset의 `validation_matched`에 대한 성능을 출력하고, 50%를 넘기셔야 합니다.

In [7]:
!pip install transformers datasets evaluate accelerate scikit-learn



In [12]:
import random
import evaluate
import numpy as np

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

imdb = load_dataset("nyu-mll/glue", "mnli")
print(imdb)

train_dataset = imdb['train']
print(train_dataset[0])

# [MYOCDE] 라벨의 종류 확인
label_info = train_dataset.features['label']
print(f"라벨의 종류: {label_info.names}")

# [MYCODE] 라벨의 개수 설정
num_labels = len(label_info.names)

# [MYCODE] pre traiend된 distilbert 토큰나이저를 가져옴.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

# [MYCODE] 가설과 전제를 결합하고 max_length, padding을 설정, label 값을 설정.
def preprocess_function(data):
  max_length = 400
  texts = [premise + " " + hypothesis for premise, hypothesis in zip(data['premise'], data['hypothesis'])]
  tokenized_output = tokenizer(texts, truncation=True, max_length=max_length, padding=True)

  tokenized_output['labels'] = data['label']
  return tokenized_output

imdb_tokenized = imdb.map(preprocess_function, batched=True)
print(imdb_tokenized['train'][0].keys())

# [MYCODE] Train 데이터와 Validation 데이터의 개수 제한을 3만개로 설정.
data_length = 30000
imdb_split = imdb_tokenized['train'].train_test_split(test_size=0.2)
imdb_train, imdb_val = imdb_split['train'].select(range(data_length)), imdb_split['test'].select(range(data_length))
imdb_test = imdb_tokenized['test_matched']
print(f"train len: {len(imdb_train)}, validation len: {len(imdb_val)}, test len: {len(imdb_test)}")

from transformers import BertConfig

# [MYCODE] BertConfig를 설정하고 num_labels의 값을 변경.
config = BertConfig()
config.hidden_size = 64
config.intermediate_size = 64
config.num_hidden_layers = 2
config.num_attention_heads = 4
config.num_labels = num_labels

from transformers import TrainingArguments, Trainer

num_epochs = 60
training_args = TrainingArguments(
    run_name='pratice_week4_1_0',
    output_dir='mnli_transformer',
    num_train_epochs=num_epochs,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    logging_strategy="epoch",
    do_train=True,
    do_eval=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-3,
    load_best_model_at_end=True,
    report_to="none"
)

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
})
{'premise': 'Conceptually cream skimming has two basic dimensions - product and geography.', 'hypothesis': 'Product and geography are what make cream skimming work. ', 'label': 1, 'idx': 0}
라벨의 종류: ['entailment', 'neutral', 'contradiction']


Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

dict_keys(['premise', 'hypothesis', 'label', 'idx', 'input_ids', 'attention_mask', 'labels'])
train len: 30000, validation len: 30000, test len: 9796


In [None]:
import evaluate
accuracy = evaluate.load("accuracy")

def compute_metrics(pred):
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

from transformers import EarlyStoppingCallback

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=num_labels
)
print(model)

for param in model.distilbert.parameters():
  param.requires_grad = False

# [MYCODE] pretraiend distillbert 토큰나이저를 가져와 학습한다.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=imdb_train,
    eval_dataset=imdb_val,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0538,1.009727,0.492767


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0538,1.009727,0.492767
2,1.0184,1.004396,0.490733
3,1.0028,0.997523,0.499233
4,0.9957,0.988551,0.5098
5,0.9877,0.987227,0.510767
6,0.984,0.97614,0.5206
7,0.9769,0.983081,0.512467
8,0.9704,0.971342,0.524733
9,0.9686,0.979037,0.515167
10,0.9642,1.00604,0.490833
