In [None]:
# Install Unsloth: the first command installs the PyPI release, the second
# removes it again and reinstalls the latest version straight from GitHub.
# NOTE: the "%%capture" cell magic is only mentioned in this comment, not
# executed, so the pip output below is not suppressed.
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Import required libraries
from unsloth import FastLanguageModel
from datasets import load_dataset
import torch

# Settings shared by the model loader and, further down, the trainer.
max_seq_length = 1024  # maximum sequence length handed to the loader and SFTTrainer
dtype = torch.float16  # half precision
load_in_4bit = True  # load the weights 4-bit quantized

# Collect the loader options in one place, then fetch the pre-quantized
# LLaMA-3.1 8B checkpoint together with its tokenizer.
model_load_options = {
    "model_name": "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": load_in_4bit,
}
model, tokenizer = FastLanguageModel.from_pretrained(**model_load_options)

# Wrap the base model with LoRA adapters so that only the adapter weights are
# trained. Only the attention projections are targeted.
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
lora_options = dict(
    r=16,  # LoRA rank
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing=True,
)
model = FastLanguageModel.get_peft_model(model, **lora_options)

# Training data: the "train" split of the cleaned Alpaca instruction dataset.
# Swap the dataset id for a code-focused dataset if one is preferred.
alpaca_dataset_id = "yahma/alpaca-cleaned"
dataset = load_dataset(alpaca_dataset_id, split="train")

# Prompt template with three positional slots, filled in this order:
# instruction, input, response. The template starts and ends with a newline.
alpaca_prompt = (
    "\n"
    "### Instruction:\n"
    "{}\n"
    "\n"
    "### Input:\n"
    "{}\n"
    "\n"
    "### Response:\n"
    "{}\n"
)

EOS_TOKEN = tokenizer.eos_token  # appended to every formatted training example


def formatting_prompts_func(examples):
    """Build the training text for a batch of Alpaca records.

    Args:
        examples: Batch mapping with parallel 'instruction', 'input' and
            'output' sequences.

    Returns:
        A dict with a single "text" key holding one string per record: the
        filled-in ``alpaca_prompt`` followed by ``EOS_TOKEN``.
    """
    texts = []
    records = zip(examples['instruction'], examples['input'], examples['output'])
    for instruction, context, response in records:
        filled_prompt = alpaca_prompt.format(instruction, context, response)
        texts.append(filled_prompt + EOS_TOKEN)
    return {"text": texts}

# Run the prompt formatter over the dataset in batches; it returns the
# formatted prompts under the "text" key.
dataset = dataset.map(function=formatting_prompts_func, batched=True)

# Training setup with Huggingface's Trainer
from transformers import TrainingArguments
from trl import SFTTrainer

# Derive the trainer's mixed-precision flags from the `dtype` the model was
# loaded with, so the two cannot drift apart when `dtype` is changed.
# With dtype=torch.float16 this yields fp16=True / bf16=False.
if dtype is None:
    # dtype=None means "auto-detect": follow what the GPU supports.
    use_bf16 = torch.cuda.is_bf16_supported()
else:
    use_bf16 = dtype == torch.bfloat16

# Supervised fine-tuning on the formatted "text" column of the dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,  # effective batch size = 2 * 4 = 8
        warmup_steps=5,
        max_steps=60,  # short run; overrides num_train_epochs
        learning_rate=2e-4,
        fp16=not use_bf16,
        bf16=use_bf16,
        logging_steps=1,  # log the training loss at every step
        output_dir="outputs"
    ),
)

# Report the GPU model, its total memory, and the peak memory reserved so far.
bytes_per_gib = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
total_gpu_memory = round(gpu_stats.total_memory / bytes_per_gib, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {total_gpu_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

# Start training and keep the returned statistics so the runtime can be reported.
trainer_stats = trainer.train()

# Show memory and training time stats.
# "Memory used" is the growth in peak reserved GPU memory since the
# pre-training measurement (start_gpu_memory).
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# Round the difference as well: subtracting two rounded floats can yield
# artifacts such as 1.3050000000000006.
memory_for_training = round(used_memory - start_gpu_memory, 3)
print(f"Training completed. Memory used = {memory_for_training} GB.")
print(f"Training time = {trainer_stats.metrics['train_runtime']:.1f} seconds.")

# Try the fine-tuned model on a small code-completion prompt that follows the
# same Instruction / Input / Response layout as the training template.
input_text = (
    "\n"
    "### Instruction:\n"
    "Complete the Python function to calculate the factorial of a number.\n"
    "\n"
    "### Input:\n"
    "def factorial(n):\n"
    "\n"
    "### Response:\n"
)

# Switch Unsloth to its inference mode before generating.
FastLanguageModel.for_inference(model)

# Tokenize a batch of one prompt, generate up to 64 new tokens, and print the
# decoded sequence (prompt plus completion) without special tokens.
inputs = tokenizer([input_text], return_tensors="pt").to("cuda")
outputs = model.generate(max_new_tokens=64, **inputs)
decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded_text)

# Persist the LoRA adapters and the tokenizer side by side in one local
# directory. The tokenizer call stays last so the notebook still displays the
# paths it returns.
adapter_dir = "lora_model"
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)


Collecting unsloth
  Downloading unsloth-2024.9.post4-py3-none-any.whl.metadata (56 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/56.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.0/56.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting triton>=3.0.0 (from unsloth)
  Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.8.12-py3-none-any.whl.metadata (8.4 kB)
Collecting datasets>=2.16.0 (from unsloth)
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting trl!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,<=0.11.1,>=0.7.9 (fro

model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2024.9.post4 patched 32 layers with 32 QKV layers, 32 O layers and 0 MLP layers.


README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

alpaca_data_cleaned.json:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.564 GB.
5.879 GB of memory reserved.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 51,760 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 13,631,488
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
1,1.5559
2,1.9064
3,1.8683
4,2.105
5,1.5119
6,1.6683
7,1.4203
8,1.5536
9,1.5661
10,1.5656


Training completed. Memory used = 1.3050000000000006 GB.

### Instruction:
Complete the Python function to calculate the factorial of a number.

### Input:
def factorial(n):

### Response:
def factorial(n):
    """Return the factorial of n."""
    if n == 0 or n == 1:
        return 1
    else:
        return n * factorial(n - 1)



('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')