<a href="https://colab.research.google.com/github/hongdnn/unsloth_finetuning_demo/blob/main/UnslothFineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
# Choose an install strategy depending on whether any environment variable
# name contains "COLAB_" (the names are concatenated and searched as one string).
if "COLAB_" not in "".join(os.environ.keys()):
  # No "COLAB_" variable found: plain install, pip resolves unsloth's dependencies.
  !pip install unsloth
else:
  # Do this only in Colab notebooks! Otherwise use pip install unsloth
  # --no-deps stops pip from installing or replacing the dependencies of the
  # listed packages; presumably this protects Colab's preinstalled torch stack
  # -- TODO confirm. xformers is the only package pinned to an exact version.
  !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
  !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
  !pip install --no-deps unsloth

In [None]:
from unsloth import FastLanguageModel
import torch
# Loading configuration forwarded to FastLanguageModel.from_pretrained below.
# Sequence length the model is loaded with.
max_seq_length = 2048
# NOTE(review): None presumably lets Unsloth pick the dtype automatically -- confirm against the Unsloth docs.
dtype = None
# Load the weights 4-bit quantized (the checkpoint chosen below is a "bnb-4bit" one).
load_in_4bit = True

# Reference list of 4-bit checkpoints. Nothing in this cell reads it; the
# checkpoint that is actually loaded is named in the from_pretrained call below.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",      # New Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",           # Llama-3 15 trillion tokens model 2x faster!
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",        # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",             # Gemma 2.2x faster!
] # More models at https://huggingface.co/unsloth

# Load the base (non-instruct) Llama-3 8B checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name= "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length= max_seq_length,
    dtype= dtype,
    load_in_4bit= load_in_4bit,
)

In [None]:
# Print the loaded model's repr to inspect it before adapters are attached.
print(model)

In [None]:
# Modules that receive LoRA adapters:
#   - q/k/v/o_proj:       projection layers of the multi-head attention mechanism
#   - gate/up/down_proj:  the additional projection / gating layers
#   - embed_tokens, lm_head: embedding and output layers, typically included
#     for continual pretraining
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
    "embed_tokens", "lm_head",
]

# Wrap the base pre-trained model with PEFT (Parameter-Efficient Fine-Tuning)
# LoRA adapters; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    # LoRA rank: sets the number of trainable adapter parameters. Any value > 0
    # works; 8, 16, 32, 64 and 128 are the suggested choices. Higher ranks give
    # the adapter more capacity, lower ranks may suffice for simpler tasks.
    r = 128,
    target_modules = lora_target_modules,
    # Scaling factor that sets how strongly the adapters influence the
    # original model weights.
    lora_alpha = 32,
    # Adapter dropout rate. Any value is supported, but 0 (no dropout) is the
    # optimized setting.
    lora_dropout = 0,
    # Bias handling in the adapters. Any value is supported, but "none" (no
    # extra bias terms) is the optimized setting.
    bias = "none",
    # Gradient checkpointing trades compute for memory. Use True or "unsloth";
    # the "unsloth" mode reportedly cuts VRAM use by 30% and allows batch sizes
    # twice as large, which helps with very long contexts.
    use_gradient_checkpointing = "unsloth",
    # Seed for reproducible adapter setup.
    random_state = 3407,
    # Enable Rank Stabilized LoRA (RS-LoRA), intended to improve training stability.
    use_rslora = True,
    # LoftQ is not used in this configuration.
    loftq_config = None,
)


In [None]:
# If `load_dataset` raises an error, upgrade the `datasets` package by running:
# pip install -U datasets

In [None]:
from datasets import load_dataset
# Load only the first 2,000 SQuAD training examples to keep the demo small.
squad = load_dataset("rajpurkar/squad", split="train[:2000]")
# Hold out 20% of those rows as a test split. The explicit seed makes the
# shuffle deterministic, so a fresh "Restart & Run All" produces the same
# train/test rows (and the same example at every index) each time. 3407 matches
# the seed already used for the LoRA adapters and the trainer.
squad = squad.train_test_split(test_size=0.2, seed=3407)

In [None]:
# Look at the first raw training record to see which fields are available
# before preprocessing; the bare name on the last line displays it.
text = squad['train'][0]
text

In [None]:
from datasets import load_dataset
from unsloth import UnslothTrainer, UnslothTrainingArguments, FastLanguageModel
from transformers import DataCollatorForLanguageModeling

# Preprocess
def preprocess_function(examples):
    """Turn a batch of SQuAD records into prompt/answer text columns.

    Args:
        examples: Batched mapping with parallel "context", "question" and
            "answers" lists; each "answers" entry holds a "text" list.

    Returns:
        Dict with "input_text" ("Context: ... Question: ..." prompts) and
        "target_text" (the first reference answer of each record).
    """
    passages = examples["context"]
    queries = examples["question"]
    answer_records = examples["answers"]

    # Only the first reference answer of each record is kept as the target.
    first_answers = []
    for record in answer_records:
        first_answers.append(record["text"][0])

    prompts = []
    for passage, query in zip(passages, queries):
        prompts.append(f"Context: {passage} Question: {query}")

    return {"input_text": prompts, "target_text": first_answers}

# Run the preprocessing in batches across 8 worker processes and drop the raw
# SQuAD columns, leaving the columns that preprocess_function produces.
raw_squad_columns = ["id", "title", "context", "question", "answers"]
squad_processed = squad.map(
    preprocess_function,
    batched=True,
    num_proc=8,
    remove_columns=raw_squad_columns,
)

# Formatting with stop token
def formatting_func(example, eos_token=None):
    """Render one preprocessed example as a single training string.

    Args:
        example: Mapping with "input_text" (the "Context: ... Question: ..."
            prompt) and "target_text" (the reference answer).
        eos_token: Stop token appended after the answer. When omitted, the
            end-of-sequence token of the notebook's `tokenizer` is used.

    Returns:
        The string "<input_text> Answer: <target_text><eos_token>".
    """
    if eos_token is None:
        # Use the tokenizer's real EOS token rather than a hand-written marker:
        # a literal such as "<|eot|>" is not one of Llama-3's special tokens
        # (those are "<|end_of_text|>" / "<|eot_id|>"), so it would be learned
        # as ordinary text and generation would never stop on it.
        eos_token = tokenizer.eos_token
    return f"{example['input_text']} Answer: {example['target_text']}{eos_token}"

In [None]:
# Display the first preprocessed training example.
squad_processed["train"][0]

In [None]:
# Preview the training string that formatting_func builds for the second training example.
formatting_func(squad_processed["train"][1])

In [None]:
from transformers import DataCollatorForLanguageModeling

# Batch collator for training; mlm=False turns masked-language-modelling off.
# NOTE(review): the training cell below builds the same collator again, so this cell is redundant.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
# Collator for language-model training; mlm=False means no masked-language objective.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Pick the mixed-precision mode once: bf16 when the hardware supports it,
# otherwise fall back to fp16.
use_bf16 = is_bfloat16_supported()

# Optimisation hyper-parameters, kept separate from the trainer wiring.
training_args = UnslothTrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,   # 2 x 8 = 16 examples per optimizer step, per device
    warmup_ratio=0.1,
    num_train_epochs=3,
    learning_rate=5e-5,
    embedding_learning_rate=5e-6,    # separate, 10x smaller rate for the embedding layers
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.00,
    lr_scheduler_type="cosine",      # cosine schedule adjusts the learning rate over time
    seed=3407,
    output_dir="outputs",
    report_to="none",
)

# Trainer: ties together the LoRA-wrapped model, the tokenizer, the processed
# training split and the formatting function that renders each example.
trainer = UnslothTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=squad_processed["train"],
    dataset_text_field="input_text",
    formatting_func=formatting_func,
    max_seq_length=512,
    data_collator=data_collator,
    args=training_args,
)

# Train
trainer.train()

In [None]:
from transformers import TextIteratorStreamer
from threading import Thread
import textwrap

max_print_width = 100

# Extra keyword arguments are forwarded to tokenizer.decode, so
# skip_special_tokens=True keeps BOS/EOS markers out of the streamed text
# (and therefore out of the extracted answer).
text_streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

# Before running inference, call `FastLanguageModel.for_inference` first
FastLanguageModel.for_inference(model)

# The prompt follows the training format ("Context: ... Question: ... Answer:").
# It deliberately contains no literal "<|begin_of_text|>": the tokenizer is
# called with its default special-token handling, and the Llama-3 tokenizer
# prepends BOS itself, so a literal marker would produce a double BOS that the
# model never saw during fine-tuning.
inputs = tokenizer(
[
    "Context: Albert Einstein was a theoretical physicist known for his work on relativity. "
    "His contributions revolutionized the understanding of space, time, and gravity. "
    "Question: What is the theory that made Albert Einstein famous? "
    "Answer:"

], return_tensors = "pt").to("cuda")

generation_kwargs = dict(
    inputs,
    streamer = text_streamer,
    max_new_tokens = 25,
    use_cache = True,
)

# Generate on a worker thread so this cell can consume the streamer while
# tokens are still being produced.
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

# Accumulate the streamed text (the streamer echoes the prompt first, since
# skip_prompt is left at its default).
generated_text = "".join(text_streamer)

# The streamer is exhausted once generation ends; join so the worker thread is
# fully finished before the cell returns.
thread.join()

# Post-process the generated text to extract only the answer.
# The answer starts after the first "Answer:".
if "Answer:" in generated_text:
    answer_section = generated_text.split("Answer:", 1)[1]
    # If a new question appears, stop there.
    if "Question:" in answer_section:
        answer = answer_section.split("Question:")[0].strip()
    else:
        answer = answer_section.strip()
else:
    answer = generated_text.strip()

# Optionally, wrap the text for display
wrapped_answer = "\n".join(textwrap.wrap(answer, width=max_print_width))
print(wrapped_answer)