In [1]:
# Install the fine-tuning helper library directly from its GitHub repository.
# NOTE(review): the install is unpinned, so it follows the repository's default branch — consider pinning a commit.
!pip install git+https://github.com/cosmoquester/transformers-tf-finetune.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/cosmoquester/transformers-tf-finetune.git
  Cloning https://github.com/cosmoquester/transformers-tf-finetune.git to /tmp/pip-req-build-02nmkvxy
  Running command git clone -q https://github.com/cosmoquester/transformers-tf-finetune.git /tmp/pip-req-build-02nmkvxy
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.2 MB/s 
[?25hCollecting tensorflow-addons
  Downloading tensorflow_addons-0.17.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 39.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_

In [2]:
import csv
import random
import urllib.request

import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

from transformers_tf_finetune.losses import SparseCategoricalCrossentropy
from transformers_tf_finetune.metrics import SparseCategoricalAccuracy
from transformers_tf_finetune.models import GenerationSearchWrapper
from transformers_tf_finetune.utils import LRScheduler, get_device_strategy, path_join, set_random_seed

# Config

In [3]:
#: pretrained model name or path, passed to TFAutoModelForSeq2SeqLM.from_pretrained
pretrained_model = "cosmoquester/bart-ko-small"
#: pretrained tokenizer name or path, passed to AutoTokenizer.from_pretrained
pretrained_tokenizer = "cosmoquester/bart-ko-small"
#: load the model from PyTorch weights (forwarded as `from_pt`)
from_pytorch = False
#: Hugging Face credential for private models
#: NOTE(review): the empty string is forwarded unchanged to `from_pretrained` — confirm it is treated as "no token"
use_auth_token = ""

#: chatbot CSV to train on; an "https://" URL is downloaded, anything else is opened as a local file
dataset_path = "https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv"
#: output directory to save logs and model checkpoints, should be a GCS path with TPU;
#: when None, no checkpoint or TensorBoard callbacks are registered
output_path = None

#: forwarded to GenerationSearchWrapper at prediction time — presumably the maximum generated length (TODO confirm)
max_sequence_length = 128
#: beam size, use greedy search if this is zero
beam_size = 0

#: training params
epochs = 2
learning_rate = 1e-4
min_learning_rate = 1e-5
#: warmup settings forwarded to LRScheduler — presumably a fraction of total steps and an explicit step count (TODO confirm)
warmup_rate = 0.06
warmup_steps = None
batch_size = 16
dev_batch_size = 256
#: number of leading examples of the (shuffled) dataset held out as the dev set
num_dev_dataset = 128 # should be a multiple of 8 with TPU
tensorboard_update_freq = 1

#: device to use (TPU or GPU or CPU)
device = "TPU"
#: Use mixed precision (the bfloat16 policy is selected when device is "TPU", float16 otherwise)
mixed_precision = False
#: Set random seed (None leaves the run unseeded)
seed = None

In [4]:
# A "gs://" output path means logs and checkpoints are written to Google Cloud
# Storage, so the Colab user has to be authenticated first.
output_is_gcs = output_path is not None and output_path.startswith("gs://")
if output_is_gcs:
    from google.colab import auth

    auth.authenticate_user()

In [5]:
def load_dataset(dataset_path: str, tokenizer: AutoTokenizer, shuffle: bool = False) -> tf.data.Dataset:
    """
    Load Chatbot Conversation dataset from local file or web

    The source is read as UTF-8 CSV text. The first line is skipped as a header
    and every remaining row must have exactly three columns
    (question, answer, and a third column that is ignored).

    :param dataset_path: local file path or http(s) uri
    :param tokenizer: PreTrainedTokenizer for tokenizing
    :param shuffle: whether shuffling lines or not
    :returns: conversation dataset of ``(features, labels)`` examples, where
        ``features`` holds the tokenized question plus ``decoder_input_ids``
        (answer tokens without the last position) and ``labels`` is the answer
        tokens without the first position
    :raises ValueError: if the source contains no conversation rows
    """
    if dataset_path.startswith(("http://", "https://")):
        with urllib.request.urlopen(dataset_path) as response:
            data = response.read().decode("utf-8")
    else:
        # Decode explicitly as UTF-8, like the web branch above; relying on the
        # platform default encoding breaks on non-UTF-8 locales.
        with open(dataset_path, encoding="utf-8") as f:
            data = f.read()

    # Drop the header row.
    lines = data.splitlines()[1:]
    if shuffle:
        random.shuffle(lines)

    bos = tokenizer.bos_token
    eos = tokenizer.eos_token

    questions = []
    answers = []
    for question, answer, _ in csv.reader(lines):
        questions.append(bos + question + eos)
        answers.append(bos + answer + eos)

    if not questions:
        raise ValueError(f"No conversation rows found in dataset: {dataset_path}")

    # Every sequence is padded to one shared length so the results stack into
    # dense tensors.
    # NOTE(review): this is the longest text measured in characters, not in
    # tokens — it assumes no text tokenizes to more tokens than it has
    # characters; confirm for the tokenizer in use.
    max_length = max(len(text) for text in questions + answers)
    inputs = tokenizer(
        questions,
        padding="max_length",
        max_length=max_length,
        return_tensors="tf",
        return_token_type_ids=False,
        return_attention_mask=True,
    )

    target_tokens = tokenizer(
        answers,
        padding="max_length",
        max_length=max_length,
        return_tensors="tf",
        return_token_type_ids=False,
        return_attention_mask=False,
    )["input_ids"]

    # Teacher forcing layout: the decoder sees tokens [0, n-1) and is trained
    # to predict tokens [1, n).
    dataset = tf.data.Dataset.from_tensor_slices(
        ({**inputs, "decoder_input_ids": target_tokens[:, :-1]}, target_tokens[:, 1:])
    )
    return dataset

In [6]:
# Apply the configured random seed. Compare against None instead of relying on
# truthiness so that `seed = 0` is honoured as a valid seed.
if seed is not None:
    set_random_seed(seed)

In [7]:
# Build the distribution strategy for the configured device; the cells below
# run their work inside `strategy.scope()`.
strategy = get_device_strategy(device)

INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.


INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.


INFO:tensorflow:Initializing the TPU system: grpc://10.106.38.2:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.106.38.2:8470


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


# Mixed Precision

In [8]:
with strategy.scope():
    if mixed_precision:
        # bfloat16 policy on TPU, float16 policy on every other device.
        mixed_type = "mixed_bfloat16" if device == "TPU" else "mixed_float16"
        # Use the stable mixed precision API: the `experimental` namespace
        # (`experimental.Policy` / `experimental.set_policy`) is deprecated and
        # no longer exists in newer TensorFlow releases.
        policy = tf.keras.mixed_precision.Policy(mixed_type)
        tf.keras.mixed_precision.set_global_policy(policy)

# Load Dataset

In [9]:
with strategy.scope():
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_tokenizer,
        use_auth_token=use_auth_token,
    )

    # Lines are shuffled once at load time, so the head of the dataset is a
    # random sample.
    dataset = load_dataset(dataset_path, tokenizer, shuffle=True)

    # The first `num_dev_dataset` examples form the dev split; everything
    # after them is training data.
    dev_dataset = dataset.take(num_dev_dataset).batch(dev_batch_size)
    train_dataset = dataset.skip(num_dev_dataset).batch(batch_size)

Downloading:   0%|          | 0.00/134 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/134 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Load Model

In [10]:
with strategy.scope():
    # Load the pretrained seq2seq model inside the strategy scope.
    model = TFAutoModelForSeq2SeqLM.from_pretrained(
        pretrained_model,
        from_pt=from_pytorch,
        use_auth_token=use_auth_token,
        use_cache=False,
    )

Downloading:   0%|          | 0.00/155M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at cosmoquester/bart-ko-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


# Model Compile

In [11]:
with strategy.scope():
    # Number of batches per epoch times the number of epochs.
    total_steps = len(train_dataset) * epochs
    lr_schedule = LRScheduler(
        total_steps,
        learning_rate,
        min_learning_rate,
        warmup_rate,
        warmup_steps,
    )

    # The pad token id is handed to both the loss and the metric as `ignore_index`.
    pad_token_id = tokenizer.pad_token_id
    logits_loss = SparseCategoricalCrossentropy(from_logits=True, ignore_index=pad_token_id)
    logits_accuracy = SparseCategoricalAccuracy(ignore_index=pad_token_id, name="accuracy")

    model.compile(
        optimizer=tf.optimizers.Adam(lr_schedule),
        # Only the "logits" output is trained on; the encoder output gets no loss.
        loss={"logits": logits_loss, "encoder_last_hidden_state": None},
        metrics={"logits": logits_accuracy},
    )

# Model Training

In [12]:
with strategy.scope():
    # Checkpointing and TensorBoard logging are only set up when an output
    # directory is configured; otherwise training runs without callbacks.
    callbacks = None
    if output_path is not None:
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            path_join(output_path, "best_model.ckpt"),
            save_weights_only=True,
            save_best_only=True,
            monitor="val_logits_accuracy",
            mode="max",
            verbose=1,
        )
        tensorboard_callback = tf.keras.callbacks.TensorBoard(
            path_join(output_path, "logs"),
            update_freq=tensorboard_update_freq,
        )
        callbacks = [checkpoint_callback, tensorboard_callback]

    model.fit(
        train_dataset,
        validation_data=dev_dataset,
        epochs=epochs,
        callbacks=callbacks,
    )

Epoch 1/2
Epoch 2/2


# Model Evaluation

In [13]:
with strategy.scope():
    # evaluate() is unpacked into three values; only the first and the last are kept.
    # NOTE(review): presumably [total loss, logits loss, logits accuracy] —
    # confirm against model.metrics_names.
    evaluation_results = model.evaluate(dev_dataset)
    loss, _, accuracy = evaluation_results



# Prediction

In [14]:
with strategy.scope():
    bos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.bos_token)
    eos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)
    pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    searcher = GenerationSearchWrapper(
        model,
        max_sequence_length,
        bos_token_id,
        eos_token_id,
        pad_token_id,
        beam_size=beam_size,
    )

    # A positive beam size selects beam search; zero falls back to greedy search.
    use_beam_search = beam_size > 0
    search_fn = searcher.beam_search if use_beam_search else searcher.greedy_search

    input_tokens, predict_tokens, ppls = [], [], []
    for batch, _ in strategy.experimental_distribute_dataset(dev_dataset):
        output, ppl = strategy.run(search_fn, args=(batch["input_ids"], batch["attention_mask"]))

        # Collect the per-replica results into single tensors along the batch axis.
        output = strategy.gather(output, axis=0)
        ppl = strategy.gather(ppl, axis=0)
        if use_beam_search:
            # Beam search output has an extra axis after the batch axis; keep
            # only index 0 of it for each example.
            output = output[:, 0, :]
            ppl = ppl[:, 0]

        input_tokens.extend(strategy.gather(batch["input_ids"], axis=0).numpy())
        predict_tokens.extend(output.numpy())
        ppls.extend(ppl.numpy())

    # Turn token ids back into text and print question / generated answer pairs.
    input_sentences = tokenizer.batch_decode(input_tokens, skip_special_tokens=True)
    predict_sentences = tokenizer.batch_decode(predict_tokens, skip_special_tokens=True)
    for question, answer, ppl in zip(input_sentences, predict_sentences, ppls):
        print(f"Q: {question} A: {answer} PPL:{ppl:.2f}")

Q: 쉬는 날이라고 했는데 만나자는 말이 없어. A: 그럴 때가 있어요. PPL:1.06
Q: 술 마시고 싶다 A: 술 마시면서 기분 전환 해보세요. PPL:1.07
Q: 짝남이랑 저도 모르게 사이가 어색해졌어요. A: 마음이 복잡하겠어요. PPL:1.06
Q: 다른 사람들도 이런 재회 바라나? A: 많이 힘들었겠어요. PPL:1.06
Q: 뭐할까? A: 저도요. PPL:1.04
Q: 제사를 너무 많이 해 A: 제가 잘해 드릴게요. PPL:1.07
Q: 그녀 생일 A: 그녀 생일 축하해요. PPL:1.04
Q: 어제 꿈에나온너. A: 꿈은 현실이에요. PPL:1.04
Q: 전공 못 살릴듯 A: 전공을 살릴 수 있을 거예요. PPL:1.05
Q: 약 먹었더니 졸려 A: 약 먹고 잠이 솔솔 오네요. PPL:1.07
Q: 조카 입학 선물 뭐 사냐 A: 좋은 선물이네요. PPL:1.03
Q: 왼손잡이가 잘못이야? A: 손잡이가 잘못이에요. PPL:1.04
Q: 친구랑 비교 돼 A: 친구랑 비교해보세요. PPL:1.02
Q: 그녀는 이별했지만 전 아직 슬픈진행중 A: 사랑은 언제나 사랑이죠. PPL:1.09
Q: 좋은꿈 꿨다 생각해 A: 꿈은 현실이에요. PPL:1.04
Q: 서류에서 떨어졌어. A: 서류를 사보세요. PPL:1.07
Q: 시간 갖는 거 A: 시간은 좀 더 많이 흘러갈 거예요. PPL:1.11
Q: 봄 오니까 나도 썸 타고 싶어. A: 썸이 좋을 거 같아요. PPL:1.08
Q: 멍청한게 탈이지 A: 멍청한게 아니라 탈이에요. PPL:1.09
Q: 나만 제자리인듯 A: 잘하고 있어요. PPL:1.05
Q: 2년의 연애에 이별이란 종착역!! A: 이별은 언제나 끝이에요. PPL:1.09
Q: 스키 첨 타 봤어. A: 스키 타보세요. PPL:1.06
Q: 이 정도면 잘생겼지? A: 잘생겼어요. PPL:1.02
Q: 짝녀가 우리 과에서 인기가 너무 많아. A: 짝사랑은 인기가 많아요. PPL:1.08
Q: 이별6일차. 정신병자 같네. A: 이별은 정말 힘들죠