In [1]:
import logging

import torch
from transformers import GPT2Tokenizer

from src import oasst, text_util
from src.model import GPT2, LoRAConfig
from src.trainer import Trainer, TrainerConfig

# Surface INFO-level messages from the project modules (model / trainer logs).
logging.basicConfig(level=logging.INFO)

# Seed PyTorch's global RNG up front so that a fresh "Restart & Run All"
# draws the same random numbers every time. `torch.manual_seed` returns the
# generator, so it is deliberately not the last expression of the cell.
SEED = 42
torch.manual_seed(SEED)

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Training budget, counted in samples; handed to `trainer.train(...)`.
N_SAMPLES_TRAIN = 1_000

# LoRA adapter hyper-parameters.
lora_config = LoRAConfig(alpha=32, r=8)

# Trainer settings, grouped by concern. All values are passed by keyword.
trainer_config = TrainerConfig(
    # --- batching ---
    batch_size=8,
    gradient_acc_steps=1,
    # --- optimizer and learning-rate schedule ---
    base_learning_rate=3e-4,
    min_learning_rate=1e-6,
    lr_step_size=50_000_000,
    lr_gamma=0.33,
    weight_decay=1e-5,
    betas=(0.9, 0.95),
    grad_clip=None,
    # --- validation, sampling and logging ---
    # NOTE(review): the intervals look like they are measured in samples (the
    # training log prints one line per 8 samples) - confirm against Trainer.
    validation_samples=100,
    validation_interval=100,
    log_interval=8,
    sample_prompts=[
        "Hello, how are you?",
        "What's the best city for sun bathing?",
        "What does an architect do?",
    ],
    # --- data loading ---
    num_workers=0,
    prefetch_factor=None,
    pin_memory=False,
    # --- runtime ---
    compile=False,
)

In [3]:
# Load the stock GPT-2 tokenizer, then let the project helpers register the
# extra special tokens on it. Order is kept as pad first, ChatML second.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
for register_tokens in (
    text_util.add_pad_token_to_tokenizer,
    text_util.add_chatml_tokens_to_tokenizer,
):
    register_tokens(tokenizer)

In [4]:
# Build the OASST1 splits, tokenized with the extended tokenizer, and unpack
# them into the training and validation datasets.
oasst_splits = oasst.load_oasst_dataset("oasst1", tokenizer)
train_dataset, validation_dataset = oasst_splits

Extracted and parsed 20147 conversations
Extracted and parsed 1002 conversations


In [5]:
# Instantiate the project's GPT2 model from the "gpt2" checkpoint. Per the log
# output of this cell, the pre-trained weights are loaded from HuggingFace.
model = GPT2.from_pretrained("gpt2")

INFO:src.model:Initializing a pre-trained gpt2 model...
INFO:src.model:Initialized GPT with 124.44 M parameters (of which 38.60 M in embeddings)
INFO:src.model:Loading pre-trained weights from HuggingFace...


In [6]:
# Derive the fine-tuneable model from the pre-trained one.
fine_tuneable = model.to_fine_tuneable()

# Grow the vocabulary to the tokenizer's current size; the new embedding rows
# are initialised from the embedding of the "\n" token.
newline_token_ids = tokenizer.encode("\n")
init_from_index = newline_token_ids[0]
extended_vocab_size = len(tokenizer)
fine_tuneable.extend_vocabulary(extended_vocab_size, init_from_index)

# Register the padding token, then attach the LoRA adapters.
pad_token_id = tokenizer.pad_token_id
fine_tuneable.set_padding_token(pad_token_id)
fine_tuneable.apply_lora(lora_config)

INFO:src.model:Initialized GPT with 124.44 M parameters (of which 38.60 M in embeddings)
INFO:src.model:Extended token embeddings: 50257 -> 50260
INFO:src.model:Set padding embedding at index 50257 to zero
INFO:src.model:Initialized LoRA layers for modules: ['c_attn', 'c_proj', 'c_fc']
INFO:src.model:Registered selective gradient hook for 2 new tokens
INFO:src.model:Applied selective parameter freezing for LoRA and new token embeddings
INFO:src.model:LoRA initialized: num. of parameters requiring gradient computation: 124.44 M -> 39.78 M


In [7]:
# Bundle the config, the LoRA-wrapped model, the tokenizer, both dataset
# splits and the target device into a single Trainer.
trainer = Trainer(
    trainer_config,
    fine_tuneable,
    tokenizer,
    train_dataset,
    validation_dataset,
    DEVICE,
)

# Debug

In [8]:
# Run fine-tuning with a budget of N_SAMPLES_TRAIN samples.
# NOTE(review): the saved output of this cell ends in a KeyboardInterrupt after
# roughly 88 samples, so the stored log comes from an interrupted run - re-run
# to completion (or clear the output) before sharing the notebook.
trainer.train(N_SAMPLES_TRAIN)

INFO:src.trainer:Staring model training for 1000 samples...


🔄 iter:      0 │ 📊 samples:        8 │ 📉 loss: 103.7915 │ 📈 lr:  3.00e-04 │ ⚡    1 samples/s
🔄 iter:      1 │ 📊 samples:       16 │ 📉 loss: 99.3314 │ 📈 lr:  3.00e-04 │ ⚡    1 samples/s
🔄 iter:      2 │ 📊 samples:       24 │ 📉 loss: 96.7058 │ 📈 lr:  3.00e-04 │ ⚡    1 samples/s
🔄 iter:      3 │ 📊 samples:       32 │ 📉 loss: 94.4074 │ 📈 lr:  3.00e-04 │ ⚡    2 samples/s
🔄 iter:      4 │ 📊 samples:       40 │ 📉 loss: 91.7139 │ 📈 lr:  3.00e-04 │ ⚡    2 samples/s
🔄 iter:      5 │ 📊 samples:       48 │ 📉 loss: 89.5895 │ 📈 lr:  3.00e-04 │ ⚡    1 samples/s
🔄 iter:      6 │ 📊 samples:       56 │ 📉 loss: 86.4960 │ 📈 lr:  3.00e-04 │ ⚡    1 samples/s
🔄 iter:      7 │ 📊 samples:       64 │ 📉 loss: 82.8014 │ 📈 lr:  3.00e-04 │ ⚡    1 samples/s
🔄 iter:      8 │ 📊 samples:       72 │ 📉 loss: 75.3876 │ 📈 lr:  3.00e-04 │ ⚡    1 samples/s
🔄 iter:      9 │ 📊 samples:       80 │ 📉 loss: 66.9635 │ 📈 lr:  3.00e-04 │ ⚡    1 samples/s
🔄 iter:     10 │ 📊 samples:       88 │ 📉 loss: 57.4729 │ 📈 lr:  3.00e-04 │ ⚡   

KeyboardInterrupt: 