In [2]:
# Install Unsloth and its companion libraries.
# `%pip` is used instead of `!pip` so the packages are installed into the
# environment of the kernel running this notebook, not whichever `pip`
# happens to be first on the shell PATH.
# NOTE: nothing below is version-pinned. The recorded run in this notebook
# resolved to Unsloth 2024.9.post4, Transformers 4.44.2, PyTorch 2.4.1+cu121
# and xformers 0.0.28.post1; newer releases may behave differently.
%pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# --no-deps: install these packages without resolving their own dependencies.
%pip install --no-deps xformers trl peft accelerate bitsandbytes

# Import necessary libraries
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from datasets import load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer
import torch

# --- Run configuration -------------------------------------------------------
# Pre-quantized 4-bit checkpoint to fine-tune; swap in another one (for example
# "unsloth/tinyllama-bnb-4bit") to try a different base model.
model_name = "unsloth/mistral-7b-bnb-4bit"
# Sequence-length limit; the same value is handed to the trainer further down.
max_seq_length = 2048
# Ask for 4-bit quantized loading to reduce memory use.
load_in_4bit = True
# None leaves dtype selection to the loader (author's note: bfloat16 on
# Ampere+ GPUs, float16 on T4 / V100).
dtype = None

# --- Base model + tokenizer --------------------------------------------------
model, tokenizer = FastLanguageModel.from_pretrained(
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
    model_name=model_name,
)

# Wrap the base model with LoRA adapters for memory-efficient fine-tuning.
# Besides the projection layers, embed_tokens and lm_head are targeted as well;
# the original author added them to avoid NaN issues. The recorded run reported
# 304,087,040 trainable parameters with this configuration.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
        "embed_tokens", "lm_head",
    ],
    r=16,            # adapter rank
    lora_alpha=16,
    lora_dropout=0,  # dropout disabled
    bias="none",
    # "unsloth" selects Unsloth's gradient-checkpointing mode.
    use_gradient_checkpointing="unsloth",
)

# Training split of the ShareGPT-style Guanaco dataset (Open Assistant
# conversations).
dataset = load_dataset("philschmid/guanaco-sharegpt-style", split="train")

# Switch the tokenizer to the ChatML template for multi-turn conversations.
# The mapping ties the template's role/content/user/assistant names to the
# dataset's "from"/"value" keys and "human"/"gpt" speaker labels.
tokenizer = get_chat_template(
    tokenizer,
    map_eos_token=True,  # author's note: maps <|im_end|> to the EOS token
    mapping=dict(role="from", content="value", user="human", assistant="gpt"),
    chat_template="chatml",
)

def formatting_prompts_func(examples):
    """Render every conversation of a batch into one chat-template string.

    Args:
        examples: A batch whose ``"conversations"`` entry is a sequence of
            conversations (each one is passed as-is to the tokenizer's
            ``apply_chat_template``).

    Returns:
        dict: ``{"text": [...]}`` holding one rendered string per
        conversation, in input order.
    """
    rendered = []
    for conversation in examples["conversations"]:
        # tokenize=False returns the formatted text rather than token ids;
        # no generation prompt is requested for these transcripts.
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

# Run the formatter over the dataset in batches; its returned "text" entry
# becomes a column on the resulting dataset.
dataset = dataset.map(function=formatting_prompts_func, batched=True)

# Probe bf16 support once; exactly one of bf16 / fp16 ends up enabled.
use_bf16 = torch.cuda.is_bf16_supported()

# Optimisation hyper-parameters for a short test run.
training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    # 2 samples per device x 4 accumulation steps per optimizer update.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Quick test schedule; raise max_steps for a longer training run.
    max_steps=60,
    warmup_steps=5,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    optim="adamw_8bit",  # 8-bit AdamW optimizer
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=1,  # log the loss at every step
)

# Supervised fine-tuning trainer reading the rendered "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,  # worker processes for dataset preparation
    packing=False,       # sequence packing disabled
    args=training_args,
)

# Run fine-tuning and keep the returned stats for reporting.
trainer_stats = trainer.train()

# Peak CUDA memory reserved so far, converted from bytes (divided by 1024**3)
# and rounded to three decimals.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)

# Summary: training runtime followed by peak reserved memory, one per line.
print(
    f"Training time: {trainer_stats.metrics['train_runtime']} seconds.",
    f"Peak reserved memory: {used_memory} GB.",
    sep="\n",
)

# Switch the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Single-turn sample prompt, using the same "from"/"value" keys as the
# training data.
messages = [
    {"from": "human", "value": "What do you think about the future of AI?"},
]

# Render and tokenize the prompt. add_generation_prompt=True appends the
# assistant header so the model answers as the assistant.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # Must add generation prompt for inference
    return_tensors="pt",
)

# Move the prompt token ids to the GPU.
inputs = inputs.to("cuda")

# The pad token is the same as the EOS token here, so generate() cannot infer
# an attention mask on its own; the earlier recorded run emitted an
# "attention mask is not set" warning for exactly that reason. The prompt is a
# single, unpadded sequence, so every position is a real token and an all-ones
# mask is the correct one to pass explicitly.
attention_mask = torch.ones_like(inputs)

# Generate up to 64 new tokens.
outputs = model.generate(
    input_ids=inputs,
    attention_mask=attention_mask,
    max_new_tokens=64,
    use_cache=True,
)

# Decode the full sequence (prompt plus continuation) and print it.
generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(f"Model Response: {generated_text[0]}")


# Save the fine-tuned model (the LoRA adapters, per the original note) and the
# tokenizer side by side in the "lora_model" directory for later use.
model.save_pretrained("lora_model")
# Kept as the cell's final expression: the notebook displays its return value
# (the recorded run lists the tokenizer files written under lora_model/).
tokenizer.save_pretrained("lora_model")


Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-q9c26app/unsloth_1f30e174c5d34e82861e8f1c4126f41c
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-q9c26app/unsloth_1f30e174c5d34e82861e8f1c4126f41c
  Resolved https://github.com/unslothai/unsloth.git to commit a2f4c9793ecf829ede2cb64f2ca7a909ce3b0884
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
==((====))==  Unsloth 2024.9.post4: Fast Mistral patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False

  offloaded_W = torch.load(filename, map_location = "cpu", mmap = True)


Unsloth: Offloading output_embeddings to disk to save VRAM
Unsloth: Casting embed_tokens to float32
Unsloth: Casting lm_head to float32


max_steps is given, it will override any value given in num_train_epochs


Counting untrained tokens:   0%|          | 0/9033 [00:00<?, ? examples/s]

Unsloth: Setting embed_tokens & lm_head untrained tokens to mean(trained) to counteract NaNs during training.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 9,033 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 304,087,040


Step,Training Loss
1,1.5287
2,1.162
3,1.0373
4,1.0644
5,1.0534
6,1.0558
7,0.8662
8,1.4753
9,1.4441
10,1.2548


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Training time: 143.5315 seconds.
Peak reserved memory: 10.816 GB.
Model Response: <|im_start|>user
What do you think about the future of AI?
<|im_start|>assistant
I think the future of AI is very bright. AI will continue to advance and become more powerful, capable, and useful. AI will be used in many areas, including healthcare, education, transportation, and government. AI will also be used to solve many of the world's most pressing problems, including climate change,


('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')