In [None]:
import gc
import os

import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from trl import ORPOConfig, ORPOTrainer, setup_chat_format

# Parameters

In [None]:
# Hugging Face access token. Read from the environment so the secret never
# lives in the notebook; when HF_TOKEN is unset this is None.
# NOTE(review): with token=None the Hugging Face libraries are expected to
# fall back to locally cached credentials — confirm for the pinned versions.
ACCESS_TOKEN = os.environ.get("HF_TOKEN")

# Model loading
ATTN_IMPL = "flash_attention_2"
TORCH_DTYPE = torch.bfloat16
BASE_MODEL = "meta-llama/Meta-Llama-3-8B"  # checkpoint to fine-tune
NEW_MODEL = "Custom-Llama-3-8B"            # name/path for the fine-tuned result

# Training schedule and batching
N_EPOCHS = 1
DEVICE = "cuda:0"
LR = 8e-6
# presumably token budgets for the full sequence and for the prompt part —
# TODO confirm against the trainer configuration
CONTENT_LENGTH = 1024
PROMPT_LENGTH = 512
N_BATCH_GRADIENT_ACC = 4
WARMUP_STEPS = 10
NEFTUNE_NOISE_ALPHA = 0.1
TEST_SIZE = 0.05       # passed as `test_size` when splitting the dataset
TRAIN_BATCH_SIZE = 2
EVAL_BATCH_SIZE = 2
N_DATA = 100           # number of rows kept after shuffling the dataset
EVAL_STEPS = 1
LOGGING_STEPS = 1

# LoRA hyper-parameters
LORA_RANK = 16
LORA_ALPHA = 16
LORA_DROPOUT = 0.05

# Config

## Quantization config

In [None]:
# bitsandbytes settings used when loading the base model in 4-bit.
bnb_config = BitsAndBytesConfig(
    # storage format of the quantized weights
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    # dtype used for computation on top of the 4-bit weights
    bnb_4bit_compute_dtype=TORCH_DTYPE,
)

## PEFT / LoRA config

In [None]:
# LoRA adapter definition; rank, alpha and dropout come from the parameter cell.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    bias="none",
    r=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    # module names that receive adapters
    target_modules=[
        'up_proj', 'down_proj', 'gate_proj',
        'k_proj', 'q_proj', 'v_proj', 'o_proj',
    ],
)

## Training config

In [None]:
# ORPO training configuration.
# ORPOConfig subclasses TrainingArguments and adds the preference-training
# fields (sequence-length limits, beta, ...) that ORPOTrainer reads from `args`.
training_args = ORPOConfig(
    learning_rate=LR,
    lr_scheduler_type="linear",
    # Sequence-length limits; previously CONTENT_LENGTH / PROMPT_LENGTH were
    # defined but never used.
    max_length=CONTENT_LENGTH,
    max_prompt_length=PROMPT_LENGTH,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=N_BATCH_GRADIENT_ACC,
    optim="paged_adamw_8bit",
    num_train_epochs=N_EPOCHS,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the pinned version.
    evaluation_strategy="steps",
    eval_steps=EVAL_STEPS,
    logging_steps=LOGGING_STEPS,
    warmup_steps=WARMUP_STEPS,
    output_dir="./results/",
    remove_unused_columns=False,
    neftune_noise_alpha=NEFTUNE_NOISE_ALPHA,
)

# Prepare LLM

## Tokenizer

In [None]:
# Tokenizer of the base checkpoint, fetched with the configured access token.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=ACCESS_TOKEN)

## LLM-Model

In [None]:
# Load the base model 4-bit quantized onto the training device.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    # Keep the non-quantized weights in the same dtype as the 4-bit compute
    # dtype instead of relying on the loader's default dtype.
    torch_dtype=TORCH_DTYPE,
    device_map=DEVICE,
    attn_implementation=ATTN_IMPL,
    token=ACCESS_TOKEN,
)

## Setup model and tokenizer

In [None]:
# Apply TRL's chat-format setup to the model/tokenizer pair. The helper returns
# updated versions of both, which replace the originals from here on
# (see the TRL docs for the exact template and token changes).
model, tokenizer = setup_chat_format(model, tokenizer)

## Prepare quantized model for training

In [None]:
# Run PEFT's k-bit training preparation on the 4-bit-loaded model before the
# trainer attaches the LoRA adapters described by `peft_config`.
model = prepare_model_for_kbit_training(model)

# Prepare dataset

## Load

In [None]:
dataset_name = "mlabonne/orpo-dpo-mix-40k"

# Load every split, shuffle with a fixed seed, then keep only the first
# N_DATA rows so the run stays small.
dataset = (
    load_dataset(dataset_name, split="all")
    .shuffle(seed=42)
    .select(range(N_DATA))
)

## Transform

In [None]:
def format_chat_template(row):
    """Render the ``chosen`` and ``rejected`` conversations of ``row`` as text.

    Both fields are replaced in place by the untokenized output of the
    module-level ``tokenizer``'s chat template, and the same ``row`` is returned.
    """
    for field in ("chosen", "rejected"):
        row[field] = tokenizer.apply_chat_template(row[field], tokenize=False)
    return row

# Apply the chat template to every row, using one worker per CPU reported
# by the OS.
dataset = dataset.map(format_chat_template, num_proc=os.cpu_count())

## Split

In [None]:
# Hold out TEST_SIZE of the rows for evaluation. The split is seeded (same
# seed as the shuffle above) so a fresh Restart & Run All reproduces it.
dataset = dataset.train_test_split(test_size=TEST_SIZE, seed=42)

# Training

## Trainer

In [None]:
# ORPOTrainer reads ORPO-specific fields (e.g. max_length, beta) from `args`,
# so fail fast with a clear message if a plain TrainingArguments was supplied.
if not isinstance(training_args, ORPOConfig):
    raise TypeError(
        "training_args must be a trl.ORPOConfig to be used with ORPOTrainer, "
        f"got {type(training_args).__name__}"
    )

# Preference-tuning trainer for the chosen/rejected dataset; it accepts
# `peft_config` and wraps the model with the LoRA adapters itself.
trainer = ORPOTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    tokenizer=tokenizer,
)

## Train model

In [None]:
# Run fine-tuning, then write the result to the NEW_MODEL directory.
trainer.train()
trainer.save_model(NEW_MODEL)

# Clean-up

In [None]:
# Drop the references to the trainer and the quantized model, then force a
# garbage-collection pass and release cached CUDA memory.
del trainer
del model
gc.collect()
torch.cuda.empty_cache()

# Inference

In [None]:
# Reload the base checkpoint without quantization (fp16) so the trained
# adapter can be merged into full-precision weights.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=ACCESS_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=DEVICE,
    token=ACCESS_TOKEN,
)
# Re-apply the same chat-format setup used before training so the reloaded
# model/tokenizer pair matches what the adapter was trained against.
model, tokenizer = setup_chat_format(model, tokenizer)

# Merge foundation model with new model

In [None]:
# Load the adapter saved under NEW_MODEL on top of the base model, then fold
# the adapter weights into the base weights and drop the PEFT wrapper.
model = PeftModel.from_pretrained(model, NEW_MODEL)
model = model.merge_and_unload()

# Upload model to Hugging Face

In [None]:
# Uncomment to publish the merged model and its tokenizer to the Hugging Face Hub.
# _ = model.push_to_hub(NEW_MODEL, use_temp_dir=False, token=ACCESS_TOKEN)
# _ = tokenizer.push_to_hub(NEW_MODEL, use_temp_dir=False, token=ACCESS_TOKEN)