In [2]:
import os, glob
import logging

from pprint import pprint
from tqdm import tqdm
import json
# import nltk
import numpy as np
import tensorflow as tf
from tensorflow import keras

# 에러 메세지만 로깅한다
tf.get_logger().setLevel(logging.ERROR)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
# The percentage of the dataset you want to split as train and test
TEST_SIZE = 0.1

MAX_INPUT_LENGTH = 512  # Maximum length of the input to the Encoder
MIN_TARGET_LENGTH = 5  # Minimum length of the output by Decoder
MAX_TARGET_LENGTH = 128  # Maximum length of the output by Decoder
BATCH_SIZE = 8  # Batch-size for training our model
LEARNING_RATE = 2e-5  # Learning-rate for training our model
MAX_EPOCHS = 1  # Maximum number of epochs we will train the model for

# HF model hub에서 가져올 모델 이름
MODEL_CHECKPOINT = "psyche/KoT5-summarization"

In [4]:
from transformers import AutoTokenizer, AutoConfig

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
config = AutoConfig.from_pretrained(MODEL_CHECKPOINT)
config.vocab_size = tokenizer.vocab_size

In [5]:
from google.colab import auth
auth.authenticate_user()

In [6]:
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()

tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

strategy = tf.distribute.TPUStrategy(tpu)

print(f"Available number of replicas: {strategy.num_replicas_in_sync}")

Available number of replicas: 8


In [7]:

def decode_fn(example):
    features = {
        "input_ids": tf.io.FixedLenFeature(
            dtype=tf.int64, shape=(MAX_INPUT_LENGTH,)
        ),
        "attention_mask": tf.io.FixedLenFeature(
            dtype=tf.int64, shape=(MAX_INPUT_LENGTH,)
        ),
        'labels': tf.io.FixedLenFeature(
            dtype=tf.int64, shape=(MAX_INPUT_LENGTH,)
        )
    }
    return tf.io.parse_single_example(example, features)

In [8]:
def _parse_function(example_proto):

    name_to_features = {'input_ids': tf.io.FixedLenFeature([], tf.string),
                        'attention_mask':tf.io.FixedLenFeature([], tf.string),
                        'labels':tf.io.FixedLenFeature([], tf.string)}

    example = tf.io.parse_single_example(example_proto, name_to_features)

    for name in list(example.keys()):
        t = example[name]
        example[name] = tf.io.parse_tensor(t, out_type=tf.int64)

    return example

In [9]:
def load_tfrecord_dataset(tfrecord_name, batch_size, shuffle=True, buffer_size=10240):
    """load dataset from tfrecord"""
    raw_dataset = tf.data.TFRecordDataset(tfrecord_name)
    raw_dataset = raw_dataset.repeat()

    if shuffle:
        raw_dataset = raw_dataset.shuffle(buffer_size=buffer_size)

    dataset = raw_dataset.map(
        _parse_function,
        num_parallel_calls=tf.data.experimental.AUTOTUNE
    )


    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return dataset

In [10]:
tfr_path = 'gs://ohsori-tfrecord/tfrecord/KoT5_train.tfrecord'
train = load_tfrecord_dataset(tfr_path, batch_size=BATCH_SIZE, shuffle=False)
print(type(train))

<class 'tensorflow.python.data.ops.prefetch_op._PrefetchDataset'>


In [11]:
tfr_path = 'gs://ohsori-tfrecord/tfrecord/KoT5_test.tfrecord'
test = load_tfrecord_dataset(tfr_path, batch_size=BATCH_SIZE, shuffle=False)
print(type(test))

<class 'tensorflow.python.data.ops.prefetch_op._PrefetchDataset'>


In [12]:
tfr_path = 'gs://ohsori-tfrecord/tfrecord/KoT5_gen.tfrecord'
gen = load_tfrecord_dataset(tfr_path, batch_size=BATCH_SIZE, shuffle=False)
print(type(gen))

<class 'tensorflow.python.data.ops.prefetch_op._PrefetchDataset'>


In [13]:
train

<_PrefetchDataset element_spec={'attention_mask': TensorSpec(shape=<unknown>, dtype=tf.int64, name=None), 'input_ids': TensorSpec(shape=<unknown>, dtype=tf.int64, name=None), 'labels': TensorSpec(shape=<unknown>, dtype=tf.int64, name=None)}>

In [14]:
first_batch = next(iter(train.batch(1)))
print(first_batch['input_ids'].shape)

(1, 8, 512)


In [15]:
first_batch

{'attention_mask': <tf.Tensor: shape=(1, 8, 512), dtype=int64, numpy=
 array([[[1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         ...,
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0]]])>,
 'input_ids': <tf.Tensor: shape=(1, 8, 512), dtype=int64, numpy=
 array([[[16038, 26202, 18596, ...,     0,     0,     0],
         [16038, 26202,  8874, ...,     0,     0,     0],
         [16038, 26202,   438, ...,     0,     0,     0],
         ...,
         [16038, 26202, 25889, ...,     0,     0,     0],
         [16038, 26202,   601, ...,     0,     0,     0],
         [16038, 26202, 25889, ...,     0,     0,     0]]])>,
 'labels': <tf.Tensor: shape=(1, 8, 39), dtype=int64, numpy=
 array([[[  504, 25938,    68,  3363,    61,  5457,   851,  5432,   172,
          14699,  6705,  6197,   100, 25965, 26074, 25905,   504, 26056,
          26402, 25894,  3975,   504, 25895, 16915,  5407, 25892,     

In [16]:
import keras_nlp

rouge_l = keras_nlp.metrics.RougeL()

def metric_fn(eval_predictions):
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    for label in labels:
        label[label < 0] = tokenizer.pad_token_id  # Replace masked label tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge_l(decoded_labels, decoded_predictions)
    # We will print only the F1 score, you can use other aggregation metrics as well
    result = {"RougeL": result["f1_score"]}

    return result

In [17]:
from tensorflow.keras.backend import clear_session
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
# 요약 task는 seq2seq task로 분류된다

model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT, from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFT5ForConditionalGeneration: ['encoder.embed_tokens.weight', 'lm_head.weight', 'decoder.embed_tokens.weight']
- This IS expected if you are initializing TFT5ForConditionalGeneration from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFT5ForConditionalGeneration from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [18]:
optimizer = keras.optimizers.AdamW(learning_rate=LEARNING_RATE)
model.compile(optimizer=optimizer) # loss 값은 내부적으로 처리하도록 설정

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [19]:
clear_session()

In [20]:
model.summary()

Model: "tft5_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (TFSharedEmbeddings)  multiple                 24674304  
                                                                 
 encoder (TFT5MainLayer)     multiple                  84954240  
                                                                 
 decoder (TFT5MainLayer)     multiple                  113275008 
                                                                 
Total params: 222,903,552
Trainable params: 222,903,552
Non-trainable params: 0
_________________________________________________________________


In [21]:
from transformers.keras_callbacks import KerasMetricCallback
from tensorflow.keras.backend import clear_session

metric_callback = KerasMetricCallback(
    metric_fn, eval_dataset=gen, predict_with_generate=True
)

callbacks = [metric_callback]


# For now we will use our test set as our validation_data
model.fit(
    train, validation_data=test,
    epochs=MAX_EPOCHS,
    callbacks=callbacks,
    steps_per_epoch=8640//BATCH_SIZE,
    validation_steps=960//BATCH_SIZE,
)



OperatorNotAllowedInGraphError: ignored