In [1]:
import os
import sys

# Resolve the directory one level above the current working directory and
# place it at the front of sys.path so imports search it first.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, ".."))
sys.path.insert(0, project_root)

In [2]:
import random
from argparse import ArgumentParser
import logging

import torch
from trl import SFTConfig, SFTTrainer
from transformers import DataCollatorWithPadding

from lima_dataset import (
    InstructionDataset,
    load_lima_dataset,
    tokenize_text,
    format_prompt_func,
    EOT_TOKEN,
)
from utils import (
    read_yaml,
    get_model_config,
    get_tokenizer_config,
    get_split_config,
    get_dataset_config,
    get_trainer_config,
    get_lora_config,
)
from model import (
    load_model,
    load_tokenizer,
    load_lora_model,
)

In [3]:
# Load the run configuration from the Llama QLoRA YAML file.
# Alternative config noted by the author: ./configs/train_config_llama_lora.yaml
# (path as originally written; verify it relative to this notebook before use).
config_path = "../configs/train_config_llama_qlora.yaml"
config = read_yaml(config_path)

In [4]:
# Extract the tokenizer settings from the run config and build the tokenizer.
tokenizer_name, tokenizer_path, tokenizer_config = get_tokenizer_config(config)
tokenizer_kwargs = {
    "tokenizer_name": tokenizer_name,
    "tokenizer_path": tokenizer_path,
    "tokenizer_config": tokenizer_config,
}
tokenizer = load_tokenizer(**tokenizer_kwargs)

# Display the resolved settings as the cell output.
(tokenizer_name, tokenizer_path, tokenizer_config)

('llama2',
 'meta-llama/Llama-2-7b-hf',
 {'add_bos_token': True, 'add_eos_token': True})

In [5]:
# get_split_config returns a description plus a 3-item group of per-split
# configs, unpacked here in (train, val, test) order.
dataset_desc, split_configs = get_split_config(config)
train_split_config, val_split_config, test_split_config = split_configs

# Display the description and the training split config.
(dataset_desc, train_split_config)

('LIMA Instruct Finetunning Dataset',
 {'dataset_path': 'GAIR/lima', 'sub_split_size': None})

In [6]:
# Break the training split config into the pieces the dataset loader needs.
(
    train_dataset_path,
    train_sub_split_size,
    train_dataset_config,
    other_configs,
) = get_dataset_config(train_split_config)

# Display the unpacked values as the cell output.
(train_dataset_path, train_sub_split_size, train_dataset_config, other_configs)

('GAIR/lima', None, {}, {})

In [7]:
# Load the "train" split of the dataset using the settings unpacked above.
train_dataset = load_lima_dataset(
    train_dataset_path,
    "train",
    train_sub_split_size,
    **train_dataset_config,
)

# Peek at the first record to sanity-check what the loader returned.
next(iter(train_dataset))

{'conversations': ['Can brain cells move? By movement I mean long distance migration (preferably within the brain only).',
  'The question is relatively broad and one should take into account that the brain not only consists of neurons, but also glial cells (supportive cells) and pre-mitotic neuronal stem cells. Furthermore, as critical fellow-scientists have indicated, developmental stage is very important, as the developing embryonic brain is very different from the adult brain.\nHowever, after sifting through various publications, the answer to the question is actually remarkably simple: Yes, brain cells migrate.\nIn  the adult brain glial cells migrate in the brain (Klämbt, 2009). Glial cells are involved in a myriad of functions, but a notable example of migrating glial cells are the oligodendrocytes that migrate relative long distances to find their target axons onto which they wrap themselves to form the insulating myelin sheath (Tsai and Miller, 2002).\nNeuronal stem cells migr

In [8]:
# Resolve the model settings. The tokenizer's pad token id and vocabulary size
# are forwarded to get_model_config
# (presumably so the model can be aligned with the tokenizer — TODO confirm).
model_name, model_path, base_model_path, model_config = get_model_config(
    config,
    tokenizer_length=len(tokenizer),
    pad_token_id=tokenizer.pad_token_id,
)

# Instantiate the model from the resolved settings.
model_kwargs = {
    "model_string": model_name,
    "model_path": model_path,
    "base_model_path": base_model_path,
    "model_config": model_config,
}
model = load_model(**model_kwargs)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [9]:
# Read the LoRA settings from the run config and pass the loaded model through
# load_lora_model (presumably attaches the adapters — confirm in model.py).
# NOTE(review): `model` is rebound to the result, so re-running this cell feeds
# the already-processed model back into load_lora_model. Re-run the model
# loading cell first if this cell needs to be executed again.
lora_config = get_lora_config(config)
model = load_lora_model(model, lora_config)

In [10]:
# Build the trainer settings and point logging at <output_dir>/runs/<run_name>.
trainer_config = get_trainer_config(config)
run_log_dir = os.path.join(
    trainer_config["output_dir"],
    "runs",
    trainer_config["run_name"],
)
trainer_config["logging_dir"] = run_log_dir

# `save_trained_model` is removed from the dict before the remaining entries
# are forwarded to SFTConfig; it defaults to True when the key is absent.
save_trained_model = trainer_config.pop("save_trained_model", True)
sft_trainer_args = SFTConfig(**trainer_config)

In [11]:
# Pull out the option that is handed to `train()` separately, then rebuild the
# SFTConfig from the remaining trainer settings.
#
# NOTE: `save_trained_model` is deliberately NOT popped again here. The cell
# that creates `trainer_config` already removed that key, so a second
# `pop("save_trained_model", True)` would always return the default and
# silently overwrite a configured `False` with `True`.
resume_from_checkpoint = trainer_config.pop("resume_from_checkpoint", None)
sft_trainer_args = SFTConfig(**trainer_config)

# Assemble the supervised fine-tuning trainer. Prompts are produced from each
# dataset record by `format_prompt_func`; the tokenizer is supplied as the
# processing class.
sft_trainer = SFTTrainer(
    model,
    args=sft_trainer_args,
    train_dataset=train_dataset,
    formatting_func=format_prompt_func,
    processing_class=tokenizer,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [12]:
# Run fine-tuning, forwarding the `resume_from_checkpoint` value taken from the
# trainer config, and keep the value returned by `train()` for later inspection.
sft_training_outs = sft_trainer.train(
    resume_from_checkpoint=resume_from_checkpoint
)

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
1,6.953
2,6.361
3,6.5998
4,0.0
5,6.8291
6,7.2922
7,6.9477
8,6.8678
9,5.9739
10,6.6886


KeyboardInterrupt: 