<a href="https://colab.research.google.com/github/jwengr/KoDeBERTa/blob/main/tf_deberta_colab_tpu_pretrain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive so the data, tokenizer, logs and checkpoints stored
# under /content/drive are reachable from this runtime.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
! pip install --quiet transformers datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m82.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m94.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m69.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
cd /content/drive/MyDrive/KoDeBERTa/KoDeBERTa

/content/drive/MyDrive/KoDeBERTa/KoDeBERTa


In [4]:
import os
import random
import numpy as np
import tensorflow as tf

from datasets import load_dataset
from tokenizers import Tokenizer

from Model.DebertaV3.TFDebertaV3 import TFDebertaV3ForPretraining
from Data.DataCollator import DataCollatorForHFUnigramSpanMLM

In [13]:
# ---------------------------------------------------------------------------
# Run configuration: every value a reader might tune lives in this cell.
# ---------------------------------------------------------------------------

# Reproducibility
seed = 0

# Input locations on the mounted Drive
data_path = '/content/drive/MyDrive/KoDeBERTa/data/test.txt'
tokenizer_path = '/content/drive/MyDrive/KoDeBERTa/tokenizers/mecab-hf-unigram-880M-128k.json'
pretrained_model_path = None  # falsy -> a fresh model is built instead of loaded

# Output locations on the mounted Drive
log_dir = '/content/drive/MyDrive/KoDeBERTa/logs/tensorflow/TFDebertaV3ForPretrainingDataCollatorForHFUnigramSpanMLM'
save_dir = '/content/drive/MyDrive/KoDeBERTa/model/tensorflow/TFDebertaV3ForPretrainingDataCollatorForHFUnigramSpanMLM'

# Model and tokenization
model_name = 'microsoft/deberta-v3-xsmall'
mask_token = '[MASK]'
pad_token = '[PAD]'
max_length = 512
mask_prob = 0.15

# Optimization schedule
learning_rate = 1e-4
batch_size = 8
current_step = 0  # non-zero resumes: batches are skipped and the step counter is offset
total_steps = 512_000_000
warmup_steps = 40_960_000

# Logging / checkpoint cadence, in steps
log_per_steps = 50
save_per_steps = 25_600_000

In [6]:
# Seed the Python, NumPy and TensorFlow random number generators, in that
# order, with the configured `seed`.
for seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    seed_rng(seed)

In [7]:
# Load the serialized tokenizer and resolve the ids of the special tokens.
tokenizer = Tokenizer.from_file(tokenizer_path)

# get_vocab() materializes the whole token -> id mapping, so fetch it once
# and reuse it for both lookups. A missing token still raises KeyError.
vocab = tokenizer.get_vocab()
mask_id = vocab[mask_token]
pad_id = vocab[pad_token]

In [None]:
# Print the COLAB_TPU_ADDR environment variable as a quick check before the
# TPU is set up (prints an empty line when the variable is unset).
!echo $COLAB_TPU_ADDR

In [None]:
# Connect to the Colab TPU and build the distribution strategy used by the
# model-construction and training cells.
tpu_address = os.environ.get('COLAB_TPU_ADDR')
if not tpu_address:
    # Fail with an actionable message instead of a bare KeyError.
    raise RuntimeError(
        "COLAB_TPU_ADDR is not set; switch the Colab runtime type to TPU before running this cell."
    )
TPU_PATH = f"grpc://{tpu_address}"

resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=TPU_PATH)
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
# tf.distribute.experimental.TPUStrategy is deprecated; this is the stable name.
strategy = tf.distribute.TPUStrategy(resolver)

In [14]:
# Build the tf.data input pipeline from the plain-text corpus at `data_path`.
train_split = load_dataset("text", data_files={"train": data_path})['train']

# The collator receives the tokenizer plus truncation and masking settings;
# it is applied by to_tf_dataset when forming each batch.
span_mlm_collator = DataCollatorForHFUnigramSpanMLM(
    tokenizer,
    truncation_argument={'max_length': max_length},
    mask_prob=mask_prob,
    return_tensors='tf',
    from_hf_datasets=True,
)
ds = train_split.to_tf_dataset(
    batch_size=batch_size,
    shuffle=False,
    collate_fn=span_mlm_collator,
)

# When resuming, go over the data twice and drop the first `current_step`
# batches of the resulting stream.
if current_step:
    ds = ds.repeat(2).skip(current_step)



  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Create the model, the running-loss metrics and the distributed dataset under
# the strategy scope so that they are bound to the TPU strategy.
with strategy.scope():
    if pretrained_model_path:
        # Resume from a previously saved model.
        model = tf.keras.models.load_model(pretrained_model_path)
    else:
        # Start from scratch with the configured schedule.
        model = TFDebertaV3ForPretraining(
            model_name=model_name,
            mask_id=mask_id,
            pad_id=pad_id,
            learning_rate=learning_rate,
            warmup_steps=warmup_steps,
            total_steps=total_steps
        )

    # Running means of the two losses; reported and reset by the training loop.
    training_loss_generator = tf.keras.metrics.Mean('training_loss_generator', dtype=tf.float32)
    training_loss_discriminator = tf.keras.metrics.Mean('training_loss_discriminator', dtype=tf.float32)

    # NOTE(review): per_replica_batch_size is computed but not applied anywhere
    # in this cell. The dataset function below returns `ds` as already batched
    # with `batch_size`, ignoring the input context. Confirm whether each
    # replica is meant to receive `batch_size` or `per_replica_batch_size`.
    per_replica_batch_size = batch_size // strategy.num_replicas_in_sync
    # `experimental_distribute_datasets_from_function` is the deprecated name
    # of this method.
    train_dataset = strategy.distribute_datasets_from_function(
        lambda _: ds
    )


In [None]:
@tf.function
def train_multiple_steps(iterator, steps, current_step):
    """Run `steps` consecutive model calls on the strategy and accumulate losses.

    Args:
        iterator: Iterator over the distributed training dataset; one element
            is pulled per step.
        steps: Number of steps to execute inside this single call.
        current_step: Step value forwarded unchanged to the model on every
            one of the `steps` iterations.

    The generator and discriminator losses returned by the model are scaled by
    the replica count and added to the module-level running-mean metrics.
    Nothing is returned. Any weight update presumably happens inside the model
    call; it is not visible here.

    NOTE(review): the training loop passes `current_step` as a Python int.
    tf.function traces a new graph for each distinct Python scalar argument,
    so every call with a new step value is likely to retrace. Passing a tensor
    would avoid that, if the model accepts a tensor step (to be confirmed).
    """
    replica_count = strategy.num_replicas_in_sync

    def replica_step(batch):
        masked_ids, attention_mask, label_ids = batch
        losses = model(
            masked_ids=masked_ids,
            attention_mask=attention_mask,
            label_ids=label_ids,
            current_step=current_step,
        )
        loss_generator, loss_discriminator = losses
        training_loss_generator.update_state(loss_generator * replica_count)
        training_loss_discriminator.update_state(loss_discriminator * replica_count)

    for _ in tf.range(steps):
        batch = next(iterator)
        strategy.run(replica_step, args=(batch,))

In [None]:
# Training driver: run the model in chunks of `log_per_steps` steps, then log
# the mean losses of the chunk and periodically write a checkpoint.

# model.save does not necessarily create missing parent directories.
os.makedirs(save_dir, exist_ok=True)

train_summary_writer = tf.summary.create_file_writer(log_dir)
train_iterator = iter(train_dataset)

# Stride by `log_per_steps` instead of visiting every integer and filtering
# with a modulo test; the visited step values are the same.
for step in range(0, total_steps - current_step, log_per_steps):
    # Step label for this chunk: the number of steps completed before it ran.
    global_step = step + current_step
    train_multiple_steps(train_iterator, log_per_steps, global_step)

    # Convert the metric tensors to Python floats once. Interpolating the raw
    # tensors into the file name would embed "tf.Tensor(..., shape=(), ...)".
    loss_g = float(training_loss_generator.result())
    loss_d = float(training_loss_discriminator.result())

    if step % save_per_steps == 0:
        # NOTE(review): Keras can only write HDF5 for Functional/Sequential
        # models; confirm TFDebertaV3ForPretraining supports the .h5 format.
        model.save(f"{save_dir}/loss_g={loss_g:.4f}-loss_d={loss_d:.4f}-step={global_step}.h5")

    with train_summary_writer.as_default():
        tf.summary.scalar('training_loss_generator', loss_g, step=global_step)
        tf.summary.scalar('training_loss_discriminator', loss_d, step=global_step)

    # Start the next chunk with fresh running means.
    training_loss_generator.reset_states()
    training_loss_discriminator.reset_states()
