In [1]:
import torch
import pytorch_lightning as pl

from tokenizers import Tokenizer
from datasets import IterableDataset
from torch.utils.data import DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint

from Data.DataCollator import DataCollatorForHFUnigramSpanMLM
from Model.DebertaV3 import LitDebertaV3ForPretraining

In [2]:
# Select PyTorch's 'medium' float32 matmul precision setting for the whole session.
# NOTE(review): presumably chosen to trade some matmul precision for GPU speed — confirm
# this is acceptable for pretraining.
torch.set_float32_matmul_precision('medium')

In [3]:
# Single seed reused for Lightning's global seeding and for the dataset shuffle.
SEED=0

In [4]:
# Apply SEED through Lightning's global seeding helper. The call logs
# "Global seed set to 0" and returns the seed, which is why the cell displays 0.
pl.seed_everything(SEED)

Global seed set to 0


0

In [5]:
# Load the tokenizer from its serialized JSON definition (path is relative to the
# notebook's working directory).
# NOTE(review): the file name suggests a Unigram model with a 128k vocabulary trained on
# MeCab-pre-tokenized text — confirm against how the tokenizer was built.
tokenizer = Tokenizer.from_file('./.local/tokenizers/mecab-hf-unigram-880M-128k.json')

In [6]:
# Resolve the ids of the special tokens needed by the pretraining module.
# get_vocab() returns the full token -> id mapping, so fetch it once and index it twice
# instead of rebuilding the mapping for every lookup. A missing token still raises
# KeyError, exactly as before.
vocab = tokenizer.get_vocab()
mask_id = vocab['[MASK]']
pad_id = vocab['[PAD]']

In [7]:
def gen():
    """Lazily stream the pretraining corpus, one raw text line per item.

    The file is opened with the ``utf-8-sig`` codec and iterated line by line, so the
    whole corpus is never held in memory. Each yielded string is exactly what file
    iteration produces, i.e. it still carries its trailing newline.

    Yields:
        str: the next line of ``../../Preproc5/korean/mecab.txt``.
    """
    with open('../../Preproc5/korean/mecab.txt', encoding='utf-8-sig') as corpus:
        yield from corpus

In [8]:
# Number of examples per DataLoader batch.
batch_size = 4
# 500_000 scaled by 8192 // batch_size (2048 when batch_size is 4) = 1_024_000_000 steps.
# NOTE(review): this looks like a 500k-step schedule at batch size 8192 re-expressed in
# batches of `batch_size`; confirm, since the Trainer in this notebook is not configured
# with gradient accumulation.
max_steps = 500_000 * (8192//batch_size)

In [9]:
# Wrap the line generator in a streaming (iterable) dataset.
ds = IterableDataset.from_generator(gen)
# IterableDataset.shuffle() does not shuffle in place: it returns a new dataset that
# shuffles through a buffer. The result must be rebound, otherwise the DataLoader
# below silently reads the corpus in its original order.
# NOTE(review): with the shuffle actually applied, up to `buffer_size` examples are
# buffered in memory before the first batch is produced — lower it if that is too much.
ds = ds.shuffle(seed=SEED, buffer_size=8_800_000)
# Batch the raw lines; the collator receives the tokenizer and truncates to 512 tokens.
dl = DataLoader(
    ds,
    batch_size=batch_size,
    collate_fn=DataCollatorForHFUnigramSpanMLM(
        tokenizer,
        truncation_argument={'max_length': 512},
    ),
)

In [10]:
# Build the Lightning pretraining module for the 'microsoft/deberta-v3-xsmall' configuration.
# The special-token ids come from the tokenizer, the schedule spans `max_steps`, and the
# warm-up length is 8% of `max_steps` truncated to an integer.
debertav3_pretrainer = LitDebertaV3ForPretraining(
    'microsoft/deberta-v3-xsmall',
    mask_id=mask_id,
    pad_id=pad_id,
    lr=1e-7,
    num_warmup_steps=int(max_steps*0.08),
    num_training_steps=max_steps,
)

In [11]:
# Checkpointing policy: track the 'Loss_D' metric and write a checkpoint candidate every
# 25_000 training steps (500_000 // 20).
# NOTE(review): 'Loss_G' / 'Loss_D' are presumably logged by the LightningModule — verify
# the names match what it logs.
checkpoint_callback = ModelCheckpoint(
    monitor='Loss_D',
    # Relative to the working directory, like the other paths in this notebook. The
    # previous '/.model/' pointed at the filesystem (or drive) root.
    dirpath='./.model/',
    filename='{epoch:02d}-{Loss_G:.4f}-{Loss_D:.4f}',
    every_n_train_steps=500_000//20
)

# Trainer for the pretraining run: GPU accelerator, 16-bit precision (Lightning reports
# "Using 16bit Automatic Mixed Precision (AMP)"), stopping after `max_steps` steps and
# saving checkpoints through `checkpoint_callback`.
trainer = pl.Trainer(
    accelerator='gpu',
    precision=16,
    max_steps=max_steps,
    callbacks=[checkpoint_callback],
)


  rank_zero_warn(
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [12]:
# Start pretraining: fit the module on the streaming DataLoader. The model summary printed
# by this call lists the module's two sub-networks (generator and discriminator).
trainer.fit(debertav3_pretrainer,dl)

Missing logger folder: c:\Users\dust\Documents\Github\KoDeBERTa\lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type                            | Params
------------------------------------------------------------------
0 | generator     | DebertaV2ForMaskedLM            | 60.3 M
1 | discriminator | DebertaV2ForTokenClassification | 70.7 M
------------------------------------------------------------------
130 M     Trainable params
0         Non-trainable params
130 M     Total params
523.980   Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]