In [None]:
# Install necessary libraries and update Unsloth
!pip install unsloth
!pip install --upgrade --no-deps "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Import required libraries
from unsloth import FastLanguageModel
from datasets import load_dataset
import torch
from trl import SFTTrainer
from transformers import TrainingArguments

# Configuration shared by model loading and the trainer further down.
dtype = None                  # None leaves dtype selection to the loader
load_in_4bit: bool = True     # load 4-bit quantized weights to save memory
max_seq_length: int = 4096    # context length, extended through RoPE scaling

# Fetch the pre-quantized, non-instruct TinyLlama checkpoint and its
# tokenizer, applying the configuration values defined above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/tinyllama-bnb-4bit",
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
)

# Attach LoRA adapters to the loaded model. In addition to the projection
# layers, embed_tokens and lm_head are listed as adapter targets.
# NOTE(review): the run log reports 156,303,360 trainable parameters with
# this target list - confirm that training embed_tokens/lm_head is intended.
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        # Added by the original author with the stated aim of avoiding NaNs.
        "embed_tokens",
        "lm_head",
    ],
    # Switch to True if training runs out of GPU memory.
    use_gradient_checkpointing=False,
)

# Data preparation
# Alpaca-style prompt template with three positional `{}` slots, filled in
# order with the instruction, the input, and the response text. The same
# template is reused at inference time with an empty response slot.
alpaca_prompt = """### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token string taken from the tokenizer; it is appended to
# every training example by formatting_prompts_func.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render a batch of Alpaca records into complete training prompts.

    Args:
        examples: Batch mapping with parallel "instruction", "input" and
            "output" sequences.

    Returns:
        A dict with a single "text" key holding one prompt per record, each
        built from alpaca_prompt and terminated with EOS_TOKEN.
    """
    records = zip(
        examples["instruction"],
        examples["input"],
        examples["output"],
    )
    return {
        "text": [
            alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
            for instruction, context, response in records
        ]
    }

# Keep the run short: take only the first 1% of the cleaned Alpaca training
# split, then run formatting_prompts_func over it in batches to produce the
# "text" field consumed by the trainer.
dataset = load_dataset("yahma/alpaca-cleaned", split="train[:1%]").map(
    formatting_prompts_func,
    batched=True,
)

# Set up the trainer
# Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="outputs",  # where checkpoints and logs are written
    seed=3407,
    num_train_epochs=1,  # a single epoch keeps the run quick
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    weight_decay=0.1,
    optim="adamw_8bit",  # 8-bit AdamW to reduce optimizer memory
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=1,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset,
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
    packing=True,  # pack short examples together into full-length sequences
)

# Baseline GPU readings taken before training, reported in GB (2**30 bytes).
bytes_per_gb = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / bytes_per_gb, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)
print(f"GPU: {gpu_stats.name}. Max memory: {max_memory} GB.")
print(f"Reserved memory before training: {start_gpu_memory} GB.")

# Train the model
trainer_stats = trainer.train()

# Show final memory and time stats after training.
# torch.cuda.max_memory_reserved() reports a peak, so subtracting the
# pre-training reading gives the extra memory reserved during training.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"Training time: {trainer_stats.metrics['train_runtime']} seconds.")
print(f"Peak reserved memory: {used_memory} GB.")
print(f"Memory used for LoRA training: {used_memory_for_lora} GB.")
print(f"Peak memory usage: {used_percentage} %.")
# lora_percentage was previously computed but never reported.
print(f"Memory used for LoRA training as share of max: {lora_percentage} %.")

# Inference: Generate a completion
FastLanguageModel.for_inference(model)  # Enable fast inference

# Reuse the training template with an empty input and an empty response slot
# so the model continues the text after "### Response:".
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "Translate the following sentence to French: 'Hello, how are you?'", "", ""
        )
    ],
    return_tensors="pt",
).to("cuda")

# Generate text from the model.
# The attention mask is passed explicitly: the pad token equals the EOS
# token here, so generate() cannot infer the mask from input_ids and would
# otherwise warn about potentially unreliable results.
outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=64,
    use_cache=True,
)

# Decode and print the generated text (prompt included, special tokens removed)
generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print("Generated text:", generated_text[0])

# Persist the fine-tuned model and its tokenizer side by side in one directory.
save_dir = "lora_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-8j2na3uz/unsloth_505bd8ce116841b18a693e9a23b00620
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-8j2na3uz/unsloth_505bd8ce116841b18a693e9a23b00620
  Resolved https://github.com/unslothai/unsloth.git to commit a2f4c9793ecf829ede2cb64f2ca7a909ce3b0884
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"   

Unsloth: unsloth/tinyllama-bnb-4bit can only handle sequence lengths of at most 2048.
But with kaiokendev's RoPE scaling of 2.0, it can be magically be extended to 4096!
Unsloth 2024.9.post4 patched 22 layers with 22 QKV layers, 22 O layers and 22 MLP layers.


Unsloth: Casting embed_tokens to float32
Unsloth: Casting lm_head to float32


Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/518 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

GPU: Tesla T4. Max memory: 14.748 GB.
Reserved memory before training: 2.754 GB.


Counting untrained tokens:   0%|          | 0/23 [00:00<?, ? examples/s]

Unsloth: Setting embed_tokens & lm_head untrained tokens to mean(trained) to counteract NaNs during training.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 23 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 3
 "-____-"     Number of trainable parameters = 156,303,360


Step,Training Loss
1,3.0274
2,2.9523
3,2.8587


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Training time: 49.6927 seconds.
Peak reserved memory: 13.598 GB.
Memory used for LoRA training: 10.844 GB.
Peak memory usage: 92.202 %.
Generated text: ### Instruction:
Translate the following sentence to French: 'Hello, how are you?'

### Input:


### Response:


## Instruction:


## Instruction


## Instruction


 Instruction











































('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.model',
 'lora_model/added_tokens.json',
 'lora_model/tokenizer.json')