<a href="https://colab.research.google.com/github/jwengr/KoDeBERTa/blob/main/lit_deberta_v3_colab_tpu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

# Mount Google Drive at /content/drive; every data, tokenizer, log and model
# path configured further down lives under this mount point.
drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


In [2]:
!pip install --quiet jupyter-tensorboard tensorboard-plugin-profile transformers datasets pytorch-lightning cloud-tpu-client==0.10 torch==2.0.0 torchvision==0.15.1 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-2.0-cp310-cp310-linux_x86_64.whl

In [2]:
cd /content/drive/MyDrive/KoDeBERTa/KoDeBERTa

/content/drive/MyDrive/KoDeBERTa/KoDeBERTa


In [3]:
import os
import random
import numpy as np
import pytorch_lightning as pl

from datasets import load_dataset
from tokenizers import Tokenizer
from torch.utils.data import DataLoader
from pytorch_lightning.profilers import XLAProfiler
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint

from Model.DebertaV3.DebertaV3 import LitDebertaV3ForPretraining
from Data.DataCollator import DataCollatorForHFUnigramSpanMLM

In [4]:
# --- Reproducibility ---
seed = 0

# --- Model ---
model_name = 'microsoft/deberta-v3-xsmall'
pretrained_model_path = None
gradient_checkpointing = True

# --- Corpus and tokenizer files on Google Drive ---
data_path = '/content/drive/MyDrive/KoDeBERTa/data/test.txt'
tokenizer_path = '/content/drive/MyDrive/KoDeBERTa/tokenizers/mecab-hf-unigram-880M-128k.json'

# --- Special tokens whose ids are looked up in the tokenizer vocabulary ---
mask_token = '[MASK]'
pad_token = '[PAD]'

# --- Batching and masking ---
batch_size = 2
max_length = 512
mask_prob = 0.15

# --- Optimisation schedule ---
lr = 1e-4
# A non-zero value makes the data cell skip ahead in the corpus and is
# subtracted from num_training_steps when the Trainer is built.
current_step = 0
num_training_steps = 512_000_000
num_warmup_steps = 40_960_000

# --- Logging and checkpointing ---
log_per_steps = 50
log_dir = '/content/drive/MyDrive/KoDeBERTa/logs'
save_per_steps = 25_600_000
save_dir = '/content/drive/MyDrive/KoDeBERTa/models'

In [5]:
# Apply the configured global seed through Lightning; the call's return value
# is the last expression, so it is displayed as the cell output.
pl.seed_everything(seed=seed)

INFO:lightning_fabric.utilities.seed:Global seed set to 0


0

In [6]:
def _special_token_id(tok, token):
    """Return the vocabulary id of `token` in tokenizer `tok`.

    Uses `token_to_id`, a single lookup, instead of building the complete
    vocabulary dict with `get_vocab()` once per token.

    Raises:
        KeyError: if `token` is not in the vocabulary (the same exception
            type a failed `get_vocab()[token]` lookup raises).
    """
    token_id = tok.token_to_id(token)
    if token_id is None:
        # token_to_id signals a missing token with None; fail loudly here
        # rather than passing None on to the collator/model as an id.
        raise KeyError(f'{token!r} is not in the tokenizer vocabulary')
    return token_id


# Load the tokenizer from its JSON file and resolve the special-token ids
# that are passed to the pretraining module below.
tokenizer = Tokenizer.from_file(tokenizer_path)
mask_id = _special_token_id(tokenizer, mask_token)
pad_id = _special_token_id(tokenizer, pad_token)

In [7]:
# Load the text corpus fully (non-streaming) and keep its 'train' split.
ds = load_dataset(
    "text",
    data_files={"train": data_path},
    streaming=False,
)['train']

# Resuming: repeat the corpus twice, then skip ahead by `current_step`.
# NOTE(review): skip() counts examples, while each training step consumes
# `batch_size` examples — confirm which unit `current_step` is meant to be in.
# NOTE(review): assumes the installed `datasets` version provides
# Dataset.repeat/skip for non-streaming datasets — TODO confirm.
if current_step:
    ds = ds.repeat(2).skip(current_step)

# Batches are assembled by the project's span-MLM collator, which receives the
# tokenizer, the truncation length and the masking probability.
dl = DataLoader(
    ds,
    batch_size=batch_size,
    collate_fn=DataCollatorForHFUnigramSpanMLM(
        tokenizer,
        truncation_argument={'max_length': max_length},
        mask_prob=mask_prob,
        from_hf_datasets=True,
    ),
)



  0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# Build the Lightning pretraining module from the configuration cell values.
debertav3_pretrainer = LitDebertaV3ForPretraining(
    # Backbone
    model_name=model_name,
    gradient_checkpointing=gradient_checkpointing,
    # Special-token ids resolved from the tokenizer
    mask_id=mask_id,
    pad_id=pad_id,
    # Learning rate and schedule
    lr=lr,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
    current_step=current_step,
)

In [9]:
# TensorBoard logger writing under `log_dir`. The version is fixed at 0, so
# presumably every run (including resumed ones) logs to the same version
# directory — verify this is intended.
logger = TensorBoardLogger(
    save_dir=log_dir,
    name="LitDebertaV3ForPretrainingDataCollatorForHFUnigramSpanMLM",
    version=0,
)

In [10]:
# Save a checkpoint to `save_dir` every `save_per_steps` training steps.
#
# The filename template already writes each metric as `name={name}`, so
# Lightning's automatic metric-name prefixing is switched off; with the default
# (auto_insert_metric_name=True) the names would be duplicated, e.g.
# `current_step=current_step=...`.
#
# `current_step` is formatted with `.0f` rather than `d`: `d` raises on a float
# value, whereas `.0f` renders both ints and floats as a whole number.
# NOTE(review): presumably the module logs `current_step` via `self.log`, which
# would make it a float tensor — confirm against LitDebertaV3ForPretraining.
checkpoint_callback = ModelCheckpoint(
    dirpath=save_dir,
    filename='LitDebertaV3ForPretrainingDataCollatorForHFUnigramSpanMLM-current_step={current_step:.0f}-Loss_G={Loss_G:.2f}-Loss_D={Loss_D:.2f}',
    auto_insert_metric_name=False,
    every_n_train_steps=save_per_steps,
)

In [11]:
# Single-core TPU trainer.
# - max_steps is the number of steps still to run: the total budget minus the
#   steps already completed before a resume.
# - log_every_n_steps takes the `log_per_steps` value from the configuration
#   cell instead of a separate hard-coded interval.
# NOTE(review): precision='16-mixed' is kept as-is; how it is handled on TPU
# depends on the installed Lightning version — confirm it is the intended mode.
trainer = pl.Trainer(
    accelerator='tpu',
    devices=1,
    precision='16-mixed',
    max_steps=num_training_steps - current_step,
    logger=logger,
    log_every_n_steps=log_per_steps,
    callbacks=[checkpoint_callback],
)

  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: True, using: 1 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# Start pretraining: the Lightning module is fitted on the training DataLoader
# built above (no validation loader is supplied).
trainer.fit(model=debertav3_pretrainer, train_dataloaders=dl)

INFO:pytorch_lightning.callbacks.model_summary:
  | Name          | Type                            | Params
------------------------------------------------------------------
0 | generator     | DebertaV2ForMaskedLM            | 60.3 M
1 | discriminator | DebertaV2ForTokenClassification | 70.7 M
------------------------------------------------------------------
130 M     Trainable params
0         Non-trainable params
130 M     Total params
523.980   Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

