# [NLP_16] Create a HuggingFace custom project

## 1. Import 및 라이브러리 로드

In [1]:
# Show the local devices TensorFlow detects; the recorded output of this cell
# lists one CPU and one Tesla T4 GPU.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 5248024566476580139,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 14474280960
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 1167462791622841294
 physical_device_desc: "device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5"]

In [2]:
# ! pip install transformers==4.23.1
# ! pip install datasets
# ! pip install evaluate

In [3]:
# %cd transformers
# !mkdir -p models/mnli/

# !python examples/tensorflow/text-classification/run_glue.py \
# 	--model_name_or_path bert-base-cased \
# 	--task_name mnli \
# 	--output_dir ./models/mnli \
# 	--overwrite_output_dir \
# 	--do_train \
# 	--do_eval \
# 	--num_train_epochs 1 \
# 	--save_steps 20000

In [4]:
# ! pip install tensorflow-datasets -U

In [5]:
import os
import numpy as np
from argparse import ArgumentParser
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification, AutoConfig
from dataclasses import asdict
from transformers.data.processors.utils import DataProcessor, InputExample, InputFeatures
from transformers import AutoTokenizer, AutoModelForCausalLM
import datasets
from datasets import load_dataset
import warnings

## 2. mnli 데이터셋을 분석해 보기

In [6]:
# Load every split of GLUE/MNLI through tensorflow-datasets together with its
# metadata object, then display how many training examples it reports.
data, info = tfds.load(name='glue/mnli', with_info=True)
info.splits['train'].num_examples

INFO:absl:Load dataset info from /aiffel/tensorflow_datasets/glue/mnli/2.0.0
INFO:absl:Reusing dataset glue (/aiffel/tensorflow_datasets/glue/mnli/2.0.0)
INFO:absl:Constructing tf.data.Dataset glue for split None, from /aiffel/tensorflow_datasets/glue/mnli/2.0.0


392702

In [7]:
# Display the dataset object obtained by taking a single training element.
data['train'].take(count=1)

<TakeDataset shapes: {hypothesis: (), idx: (), label: (), premise: ()}, types: {hypothesis: tf.string, idx: tf.int32, label: tf.int64, premise: tf.string}>

In [8]:
# Print the raw premise, hypothesis and label tensors of one training example,
# separated by blank lines.
examples = data['train'].take(1)
for example in examples:
    premise, hypothesis, label = example['premise'], example['hypothesis'], example['label']
    print(premise, hypothesis, label, sep='\n\n')

tf.Tensor(b'In recognition of these tensions, LSC has worked diligently since 1995 to convey the expectations of the State Planning Initiative and to establish meaningful partnerships with stakeholders aimed at fostering a new symbiosis between the federal provider and recipients of legal services funding.', shape=(), dtype=string)

tf.Tensor(b'Meaningful partnerships with stakeholders is crucial.', shape=(), dtype=string)

tf.Tensor(1, shape=(), dtype=int64)


In [9]:
# Message template for the FutureWarning raised by the processor below.
# ``{0}`` is replaced with the kind of object that is being deprecated.
DEPRECATION_WARNING = (
    "This {0} will be removed from the library soon, "
    "preprocessing should be handled with the 🤗 Datasets library. "
    "You can have a look at this example script for pointers: "
    "https://github.com/huggingface/transformers/blob/main/examples/"
    "pytorch/text-classification/run_glue.py"
)

## 3. MnliProcessor 클래스 구현하기

In [10]:
class MnliProcessor(DataProcessor):
    """Builds ``InputExample`` objects for MultiNLI (GLUE version).

    Examples can come either from a dictionary of tensors (one dataset
    element) or from the ``train.tsv`` / ``dev_matched.tsv`` /
    ``test_matched.tsv`` files found in a data directory.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the base processor and emit a deprecation ``FutureWarning``."""
        super().__init__(*args, **kwargs)
        message = DEPRECATION_WARNING.format("processor")
        warnings.warn(message, FutureWarning)

    def get_example_from_tensor_dict(self, tensor_dict):
        """Convert one dictionary of tensors into an ``InputExample``.

        Reads the ``idx``, ``premise``, ``hypothesis`` and ``label`` entries.
        The two text fields are decoded as UTF-8 and the label is stringified.
        """
        guid = tensor_dict["idx"].numpy()
        premise = tensor_dict["premise"].numpy().decode("utf-8")
        hypothesis = tensor_dict["hypothesis"].numpy().decode("utf-8")
        label = str(tensor_dict["label"].numpy())
        return InputExample(guid, premise, hypothesis, label)

    def get_train_examples(self, data_dir):
        """Read ``train.tsv`` from ``data_dir`` and return its examples."""
        tsv_path = os.path.join(data_dir, "train.tsv")
        rows = self._read_tsv(tsv_path)
        return self._create_examples(rows, "train")

    def get_dev_examples(self, data_dir):
        """Read ``dev_matched.tsv`` from ``data_dir`` and return its examples."""
        tsv_path = os.path.join(data_dir, "dev_matched.tsv")
        rows = self._read_tsv(tsv_path)
        return self._create_examples(rows, "dev_matched")

    def get_test_examples(self, data_dir):
        """Read ``test_matched.tsv`` from ``data_dir`` and return its examples."""
        tsv_path = os.path.join(data_dir, "test_matched.tsv")
        rows = self._read_tsv(tsv_path)
        return self._create_examples(rows, "test_matched")

    def get_labels(self):
        """Return the label names in the order used by this processor."""
        # NOTE(review): this order decides which name an integer label is mapped to
        # when examples come from tensorflow-datasets -- confirm it matches the
        # label order of the tfds 'glue/mnli' dataset.
        return ["contradiction", "entailment", "neutral"]

    def _create_examples(self, lines, set_type):
        """Turn parsed TSV rows into ``InputExample`` objects.

        The first row is skipped (treated as the header). Column 0 supplies the
        id used in the guid, columns 8 and 9 the two texts, and the last column
        the label, except for set types starting with ``"test"`` where the label
        is ``None``.
        """
        rows = iter(lines)
        next(rows, None)  # drop the first row; a no-op when ``lines`` is empty

        examples = []
        for row in rows:
            guid = f"{set_type}-{row[0]}"
            text_a = row[8]
            text_b = row[9]
            if set_type.startswith("test"):
                label = None
            else:
                label = row[-1]
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples

In [11]:
# Compare one raw dataset element with the InputExample the processor builds from it.
processor = MnliProcessor()
examples = data['train'].take(1)

for example in examples:
    print('------원본데이터------', example, sep='\n')
    example = processor.get_example_from_tensor_dict(example)
    print('------processor 가공데이터------', example, sep='\n')

------원본데이터------
{'hypothesis': <tf.Tensor: shape=(), dtype=string, numpy=b'Meaningful partnerships with stakeholders is crucial.'>, 'idx': <tf.Tensor: shape=(), dtype=int32, numpy=16399>, 'label': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'premise': <tf.Tensor: shape=(), dtype=string, numpy=b'In recognition of these tensions, LSC has worked diligently since 1995 to convey the expectations of the State Planning Initiative and to establish meaningful partnerships with stakeholders aimed at fostering a new symbiosis between the federal provider and recipients of legal services funding.'>}
------processor 가공데이터------
InputExample(guid=16399, text_a='In recognition of these tensions, LSC has worked diligently since 1995 to convey the expectations of the State Planning Initiative and to establish meaningful partnerships with stakeholders aimed at fostering a new symbiosis between the federal provider and recipients of legal services funding.', text_b='Meaningful partnerships with stakeh



In [12]:
# Build an InputExample from one training element and pass it through tfds_map;
# the recorded output shows the label as a name rather than a number.
examples = data['train'].take(1)
for example in examples:
    example = processor.tfds_map(processor.get_example_from_tensor_dict(example))
    print(example)

InputExample(guid=16399, text_a='In recognition of these tensions, LSC has worked diligently since 1995 to convey the expectations of the State Planning Initiative and to establish meaningful partnerships with stakeholders aimed at fostering a new symbiosis between the federal provider and recipients of legal services funding.', text_b='Meaningful partnerships with stakeholders is crucial.', label='entailment')


In [13]:
# Label names as reported by the processor (displayed as the cell output).
label_list = processor.get_labels()
label_list

['contradiction', 'entailment', 'neutral']

In [14]:
# Map every label name to its position in label_list.
label_map = {name: position for position, name in enumerate(label_list)}
label_map

{'contradiction': 0, 'entailment': 1, 'neutral': 2}

In [15]:
import datasets
from datasets import load_dataset

# Load the MNLI configuration of GLUE through the datasets library and print
# the resulting DatasetDict (its splits, features and row counts).
huggingface_mnli_dataset = load_dataset(path='glue', name='mnli')
print(huggingface_mnli_dataset)



  0%|          | 0/5 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
})


## 4. 위에서 구현한 processor 및 Huggingface에서 제공하는 tokenizer를 활용하여 데이터셋 구성하기

In [16]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hub id shared by the tokenizer and the model so the two cannot drift apart.
# ("roberta-large-mnli" was tried earlier as an alternative checkpoint.)
MNLI_CHECKPOINT = "typeform/distilbert-base-uncased-mnli"

huggingface_tokenizer = AutoTokenizer.from_pretrained(MNLI_CHECKPOINT)
huggingface_model = AutoModelForSequenceClassification.from_pretrained(MNLI_CHECKPOINT)


The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


In [17]:
def transform(data):
    """Tokenize premise/hypothesis pairs with the module-level ``huggingface_tokenizer``.

    Args:
        data: Mapping that provides ``'premise'`` and ``'hypothesis'`` entries
            (this notebook passes it a slice of the dataset and uses it with
            ``map(batched=True)``).

    Returns:
        The tokenizer's output for the pairs, requested with truncation,
        ``'max_length'`` padding and without token type ids.
    """
    tokenizer_options = dict(
        truncation=True,
        padding='max_length',
        return_token_type_ids=False,
    )
    return huggingface_tokenizer(data['premise'], data['hypothesis'], **tokenizer_options)
  
# Sanity-check transform() on the first five training rows: print the raw rows
# followed by their tokenized form.
examples = huggingface_mnli_dataset['train'][:5]
examples_transformed = transform(examples)

print(examples, examples_transformed, sep='\n')

{'premise': ['Conceptually cream skimming has two basic dimensions - product and geography.', 'you know during the season and i guess at at your level uh you lose them to the next level if if they decide to recall the the parent team the Braves decide to call to recall a guy from triple A then a double A guy goes up to replace him and a single A guy goes up to replace him', 'One of our number will carry out your instructions minutely.', 'How do you know? All this is their information again.', "yeah i tell you what though if you go price some of those tennis shoes i can see why now you know they're getting up in the hundred dollar range"], 'hypothesis': ['Product and geography are what make cream skimming work. ', 'You lose the things to the following level if the people recall.', 'A member of my team will execute your orders with immense precision.', 'This information belongs to them.', 'The tennis shoes have a range of prices.'], 'label': [1, 0, 0, 0, 1], 'idx': [0, 1, 2, 3, 4]}
{'inp

In [18]:
# Display the dictionary of tensorflow-datasets splits loaded earlier.
data

{'train': <PrefetchDataset shapes: {hypothesis: (), idx: (), label: (), premise: ()}, types: {hypothesis: tf.string, idx: tf.int32, label: tf.int64, premise: tf.string}>,
 'validation_matched': <PrefetchDataset shapes: {hypothesis: (), idx: (), label: (), premise: ()}, types: {hypothesis: tf.string, idx: tf.int32, label: tf.int64, premise: tf.string}>,
 'validation_mismatched': <PrefetchDataset shapes: {hypothesis: (), idx: (), label: (), premise: ()}, types: {hypothesis: tf.string, idx: tf.int32, label: tf.int64, premise: tf.string}>,
 'test_matched': <PrefetchDataset shapes: {hypothesis: (), idx: (), label: (), premise: ()}, types: {hypothesis: tf.string, idx: tf.int32, label: tf.int64, premise: tf.string}>,
 'test_mismatched': <PrefetchDataset shapes: {hypothesis: (), idx: (), label: (), premise: ()}, types: {hypothesis: tf.string, idx: tf.int32, label: tf.int64, premise: tf.string}>}

In [19]:
# Tokenize every split of the Hugging Face dataset, feeding transform() whole batches.
encoded_dataset = huggingface_mnli_dataset.map(function=transform, batched=True)



In [20]:
# #메모리를 비워줍니다.
# del huggingface_model
# del train_dataset_batch
# del validation_dataset_batch
# del test_dataset_batch

In [21]:
# Build the training configuration that is handed to the Trainer.
from transformers import Trainer, TrainingArguments

# Outputs go to <home>/aiffel/transformers. expanduser('~') is used instead of
# os.getenv('HOME') + '...' because getenv returns None when HOME is unset, which
# made the string concatenation raise TypeError.
output_dir = os.path.join(os.path.expanduser('~'), 'aiffel', 'transformers')
metric_name = 'accuracy'

training_arguments = TrainingArguments(
    output_dir,                      # directory where outputs are saved
    evaluation_strategy="epoch",     # how often evaluation runs
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=32,  # training batch size per device
    per_device_eval_batch_size=32,   # evaluation batch size per device
    num_train_epochs=1,              # total number of training epochs
    weight_decay=0.01,               # weight decay
)

In [22]:
from datasets import load_metric

# Metric object for the MNLI configuration of GLUE, used by compute_metrics.
# NOTE(review): newer `datasets` releases deprecate load_metric in favour of the
# separate `evaluate` package -- confirm against the installed version.
metric = load_metric(path='glue', config_name='mnli')

def compute_metrics(eval_pred):
    """Score a ``(predictions, labels)`` pair with the module-level ``metric``.

    The predictions are reduced to class indices by taking the argmax along
    axis 1 before being compared with the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [None]:
# Wire the model, arguments, tokenized splits and metric function into a Trainer,
# then run training.
trainer = Trainer(
    # model to train
    model=huggingface_model,
    # arguments configured through TrainingArguments
    args=training_arguments,
    # training dataset
    train_dataset=encoded_dataset['train'],
    # evaluation dataset
    eval_dataset=encoded_dataset['validation_matched'],
    compute_metrics=compute_metrics,
)
trainer.train()
print("슝~")

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: premise, idx, hypothesis.
***** Running training *****
  Num examples = 392702
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 12272


Epoch,Training Loss,Validation Loss


Saving model checkpoint to /aiffel/aiffel/transformers/checkpoint-500
Configuration saved in /aiffel/aiffel/transformers/checkpoint-500/config.json
Model weights saved in /aiffel/aiffel/transformers/checkpoint-500/pytorch_model.bin
Saving model checkpoint to /aiffel/aiffel/transformers/checkpoint-1000
Configuration saved in /aiffel/aiffel/transformers/checkpoint-1000/config.json
Model weights saved in /aiffel/aiffel/transformers/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to /aiffel/aiffel/transformers/checkpoint-1500
Configuration saved in /aiffel/aiffel/transformers/checkpoint-1500/config.json
Model weights saved in /aiffel/aiffel/transformers/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to /aiffel/aiffel/transformers/checkpoint-2000
Configuration saved in /aiffel/aiffel/transformers/checkpoint-2000/config.json
Model weights saved in /aiffel/aiffel/transformers/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to /aiffel/aiffel/transformers/checkpoi

In [None]:
# Final held-out evaluation.
# The original call was missing its closing parenthesis (SyntaxError) and indexed
# a 'test' split that the DatasetDict printed above does not contain (KeyError).
# The mismatched validation split is used instead: it was not the eval_dataset
# during training, and the GLUE test splits are distributed without gold labels
# (per the GLUE dataset card), so they cannot be scored locally.
trainer.evaluate(encoded_dataset['validation_mismatched'])

print("완료")