### LLM Fine-Tuning with Unsloth
#### [Unsloth wiki](https://github.com/unslothai/unsloth/wiki) 
- https://github.com/unslothai/unsloth?tab=readme-ov-file#conda-installation 
- pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
- pip install --no-deps "trl<0.9.0" peft accelerate bitsandbytes
- pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes

- training code references
    - [reference: Unsloth](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing#scrollTo=kR3gIAX-SM2q)
    - [reference: HF + FlashAttention-2](https://colab.research.google.com/drive/1fgTOxpMbVjloQBvZyz4lF4BacKSZOB2A?usp=sharing#scrollTo=-nX3SL7cI2fZ)

In [1]:
import os

# Optional: pin this process to one specific card (physical GPU index 3).
# This is done before torch is imported in the next cell.
os.environ.update({"CUDA_VISIBLE_DEVICES": "3"})

In [2]:
import torch
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset

In [3]:
# Sanity check: how many CUDA devices torch can see, and the name of device 0.
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))

1
NVIDIA A100 80GB PCIe


In [4]:
use_lora = True
model_cache_dir = '/root/data/hf_cache/llama-3-8B-Instruct'
model_output_dir = '/root/data/models/llama3/8b_checkpoints'
final_model_out_dir = '/root/data/models/llama3/llama_8b_current'
max_seq_length = 2048 # Choose any! auto RoPE Scaling internally!
dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16 # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.
use_gradient_checkpointing = True
random_state = 3407

model_name = model_cache_dir

#### load model 

In [5]:
print('Use HF and flash attention 2')

# 4-bit NF4 quantization settings (double quantization, compute in `dtype`).
# The config is always built but only handed to the model when load_in_4bit is set.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=dtype,
)

# Base model, loaded with the FlashAttention-2 attention implementation.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=dtype,
    quantization_config=bnb_config if load_in_4bit else None,
    attn_implementation="flash_attention_2",
)

# Right-padding tokenizer capped at max_seq_length tokens.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    model_max_length=max_seq_length,
    padding_side="right",
)

# Reuse the EOS token as the padding token. EOS is already in the vocabulary,
# so a plain attribute assignment is enough and no new token has to be added.
tokenizer.pad_token = tokenizer.eos_token

# Keep the model config in sync with the tokenizer. The attribute is set
# directly: config.update() returns None, so its result is not worth binding.
model.config.pad_token_id = tokenizer.eos_token_id

Use HF and flash attention 2


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


#### Train with a LoRA adapter
-  [Lora targets explained](https://github.com/unslothai/unsloth/wiki#target-modules)

In [6]:
if use_lora:
    from peft import (
        LoraConfig,
        TaskType,
        get_peft_model,
        prepare_model_for_kbit_training,
    )
    from transformers import set_seed as transformers_set_seed

    # Seed before the adapter is created: the LoRA weights get initialised then.
    transformers_set_seed(random_state)

    # Rank-16 adapters on every attention and MLP projection, no dropout, no bias.
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=16,
        lora_alpha=16,
        lora_dropout=0,
        bias="none",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
    )

    if not load_in_4bit:
        # Unquantized model: only switch on gradient checkpointing if requested.
        if use_gradient_checkpointing:
            model.gradient_checkpointing_enable()
    else:
        # Quantized model: let PEFT prepare it for k-bit training.
        model = prepare_model_for_kbit_training(
            model,
            use_gradient_checkpointing=use_gradient_checkpointing,
        )

    # Wrap the base model with the LoRA adapters.
    model = get_peft_model(model, lora_config)


#### Data Prep
We now use the `Llama-3` format for conversation style finetunes. We use [Open Assistant conversations](https://huggingface.co/datasets/philschmid/guanaco-sharegpt-style) in ShareGPT style. Llama-3 renders multi turn conversations like below:

```
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Hello!<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Hey there! How are you?<|eot_id|><|start_header_id|>user<|end_header_id|>

I'm great thanks!<|eot_id|>
```

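Unsloth provides a `get_chat_template` function to get the correct chat template; it supports `zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old` and Unsloth's own optimized `unsloth` template. In this notebook we instead use the chat template that ships with the tokenizer, printed below.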



In [7]:
# Show the chat template stored on the tokenizer; apply_chat_template() renders conversations with it.
print(tokenizer.chat_template)


{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>

'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>

' }}{% endif %}


#### More info on [chat_template](https://github.com/unslothai/unsloth/wiki#chat-templates)

In [8]:
## define data transformation function to format correct prompt

# ShareGPT -> chat-template vocabulary. The first two entries rename the
# per-turn keys, the last two rename the speaker names stored under "from".
mapping = {"from" : "role", "value" : "content", "human" : "user", "gpt" : "assistant"}


def transform_to_llama_chat_template(convos, mapping, role_key="role"):
    """Convert ShareGPT-style conversations to role/content message lists.

    Every key of every turn is renamed through ``mapping``. The value is
    translated through ``mapping`` only for the field whose renamed key is
    ``role_key``. Message text is therefore copied verbatim, even when it
    happens to equal a mapping key such as "human" or "gpt".

    Args:
        convos: Iterable of conversations; each conversation is an iterable
            of dicts (one per turn).
        mapping: Dict holding both the key renames and the speaker renames.
        role_key: Renamed key whose value holds the speaker. Speaker names
            missing from ``mapping`` are kept unchanged.

    Returns:
        A list of conversations, each a list of converted turn dicts.

    Raises:
        KeyError: If a turn contains a key that is not in ``mapping``.
    """

    def _convert_turn(turn):
        # Rename each key; translate the value only for the role field.
        converted = {}
        for old_key, value in turn.items():
            new_key = mapping[old_key]
            if new_key == role_key:
                value = mapping.get(value, value)
            converted[new_key] = value
        return converted

    return [[_convert_turn(turn) for turn in convo] for convo in convos]

def formatting_prompts_func(examples):
    """Render a batch of conversations into chat-template strings.

    Reads ``examples["conversations"]``, converts the turns with
    ``transform_to_llama_chat_template`` and the module-level ``mapping``,
    and renders each conversation with the tokenizer's chat template
    (untokenized, without a trailing generation prompt).

    Returns:
        ``{"text": [...]}`` with one rendered string per conversation.
    """
    chats = transform_to_llama_chat_template(examples["conversations"], mapping)
    rendered = []
    for chat in chats:
        rendered.append(
            tokenizer.apply_chat_template(
                chat,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

from datasets import load_dataset

# Load the train split of the ShareGPT-style Guanaco data and add a "text"
# column. With batched=True the formatting function receives a batch of rows
# per call rather than a single row.
dataset = load_dataset(
    "philschmid/guanaco-sharegpt-style",
    split="train",
).map(formatting_prompts_func, batched=True)

# Eyeball the first rendered conversation.
print(dataset[0]["text"])

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Escribe un discurso que pueda recitar como padrino de la boda de mi mejor amigo.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Queridos invitados, amigos y familiares,

Me siento muy honrado de estar aquí hoy como padrino de bodas de mi mejor amigo [Nombre del novio].

Recuerdo con cariño los días en los que [Nombre del novio] y yo nos conocimos, cuando éramos solo dos jóvenes llenos de sueños y esperanza. Ahora, aquí estamos, celebrando su boda con la persona que ama, [Nombre de la novia].

[Nombre de la novia], te aseguro que [Nombre del novio] es una persona increíble, llena de amor y lealtad. Juntos, han formado un equipo invencible y estoy seguro de que su amor perdurará por siempre.

[Nombre del novio], mi amigo, te deseo todo lo mejor en esta nueva etapa de tu vida. Espero que tu matrimonio esté lleno de amor, alegría y felicidad, y que [Nombre de la novia] sea siempre tu compañera de vida y tu mejor amiga.

A 

### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We also support TRL's `DPOTrainer`!

In [9]:
# use_cache cannot be used together with gradient checkpointing, so it is
# switched off for training.
model.config.use_cache = False

# Query the hardware once: exactly one of fp16 / bf16 is enabled below.
bf16_supported = torch.cuda.is_bf16_supported()

# Supervised fine-tuning on the rendered "text" column of the dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,
        warmup_steps=20,
        num_train_epochs=1,  # often 3
        # max_steps=60,
        learning_rate=2e-4,
        fp16=not bf16_supported,
        # NOTE: without LoRA this was observed to fail with
        # "Invalid device string: 'bfloat16'" for an unknown reason.
        bf16=bf16_supported,
        logging_steps=40,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        # Same seed as the LoRA initialisation, instead of a second hard-coded copy.
        seed=random_state,
        save_steps=100,
        save_total_limit=1,
        output_dir=model_output_dir,
    ),
)

In [10]:
#@title Show current memory stats
# Properties of device 0 and its total memory in GB.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)

# Reserved-memory reading before training; the final stats cell subtracts it
# as the baseline.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100 80GB PCIe. Max memory = 79.138 GB.
15.162 GB of memory reserved.


In [11]:
# Run the fine-tuning loop; the returned object's `.metrics` are read in the stats cell below.
trainer_stats = trainer.train()
## NOTE: there is a known bug when not using LoRA and saving the model: https://github.com/unslothai/unsloth/issues/404

Step,Training Loss
40,1.4373
80,1.3216
120,1.2497
160,1.2931
200,1.2684
240,1.2461
280,1.292
320,1.2424




In [None]:
#@title Show final memory and time stats
def _percent_of_total(gigabytes):
    """Express a GB figure as a percentage of the card's total memory."""
    return round(gigabytes / max_memory * 100, 3)


# Peak reserved memory in GB, and the part of it added since the pre-training baseline.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = _percent_of_total(used_memory)
lora_percentage = _percent_of_total(used_memory_for_lora)

# Training time.
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")

# Memory, absolute and relative to the card's capacity.
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

#### Save model 

In [None]:
# Persist the training result, then the tokenizer, into the same directory.
if not use_lora:
    # Full fine-tune: have the trainer write the model.
    trainer.save_model(final_model_out_dir)
else:
    # LoRA run: save the LoRA adapters only.
    model.save_pretrained(final_model_out_dir)

tokenizer.save_pretrained(final_model_out_dir)

#### Simple Inference Test

In [None]:
from peft import PeftModel

In [None]:
# Load the model for inference, placed on the available device(s) in the
# training dtype so it matches where the inputs are sent.
if use_lora:
    # final_model_out_dir holds only the LoRA adapters, so the base weights
    # come from the original model location and the adapters are attached
    # on top exactly once.
    infer_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=dtype,
    )
    infer_model = PeftModel.from_pretrained(infer_model, final_model_out_dir)
else:
    # Full fine-tune: final_model_out_dir holds the complete model and there
    # are no adapters to load.
    infer_model = AutoModelForCausalLM.from_pretrained(
        final_model_out_dir,
        device_map="auto",
        torch_dtype=dtype,
    )

# The tokenizer was saved next to the model / adapters.
tokenizer = AutoTokenizer.from_pretrained(final_model_out_dir)

- verify chat template style

In [None]:
# Print the chat template of the tokenizer reloaded from final_model_out_dir.
print(tokenizer.chat_template)

In [None]:
# The chat template reads message['role'] and message['content'], so the
# speaker must be stored under "role" (not the ShareGPT key "from").
messages = [
    {"role": "user", "content": "What is your name and why?"},
]

# Render and tokenize the prompt, then move it to the device the model is on.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # Must add for generation
    return_tensors="pt",
).to(infer_model.device)

# Generate up to 64 new tokens and print the decoded sequences.
outputs = infer_model.generate(input_ids=inputs, max_new_tokens=64, use_cache=True)
print(tokenizer.batch_decode(outputs))
