In [None]:
# Install necessary libraries
%%capture
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install trl peft bitsandbytes

# Import necessary libraries
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth.chat_templates import get_chat_template

# Model-loading settings
model_name = "unsloth/Phi-3-medium-4k-instruct"
max_seq_length = 2048
dtype = None  # None = auto-detect the data type (use Float16 for Tesla T4)
load_in_4bit = True  # 4-bit quantization to reduce memory usage

# Step 1: Load the model and tokenizer
# The settings are gathered into one mapping and unpacked into the loader.
load_kwargs = {
    "model_name": model_name,
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": load_in_4bit,
}
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

# Step 2: Prepare the quantized model for training, then add LoRA adapters
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0,
    bias="none",
    task_type="CAUSAL_LM"
)

# Prepare the base model for k-bit quantization training BEFORE wrapping it
# with LoRA. prepare_model_for_kbit_training() freezes every parameter of the
# model it is given; calling it after get_peft_model() would also freeze the
# freshly added adapter weights and leave nothing to train.
model = prepare_model_for_kbit_training(model)

# Apply LoRA adapters on top of the prepared (frozen) base model
model = get_peft_model(model, lora_config)

# Step 3: Prepare the dataset (ShareGPT style)
# ShareGPT records use "from"/"value" keys and "human"/"gpt" roles; this
# mapping is handed to get_chat_template together with the Phi-3 template.
sharegpt_mapping = {
    "role": "from",
    "content": "value",
    "user": "human",
    "assistant": "gpt",
}
tokenizer = get_chat_template(
    tokenizer,
    chat_template="phi-3",  # Phi-3 format for conversational style finetuning
    mapping=sharegpt_mapping,
)

def formatting_prompts_func(examples):
    """Render every conversation of a batch into one chat-formatted string.

    Args:
        examples: A batch whose "conversations" entry is an iterable of
            conversations.

    Returns:
        A dict with the single key "text" holding the rendered strings, in
        the same order as the input conversations.
    """
    rendered = []
    for conversation in examples["conversations"]:
        # tokenize=False keeps the result as text; no generation prompt is
        # appended to the rendered conversation.
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

# Load the training split, then run formatting_prompts_func over it in batches
raw_dataset = load_dataset("philschmid/guanaco-sharegpt-style", split="train")
dataset = raw_dataset.map(formatting_prompts_func, batched=True)

# Step 4: Define training arguments and initialize the trainer
# Choose the mixed-precision mode from the hardware instead of hard-coding
# bf16: GPUs without bfloat16 support (e.g. a Tesla T4) reject bf16=True, so
# fall back to fp16 there.
from unsloth import is_bfloat16_supported

use_bf16 = is_bfloat16_supported()

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Packing can make training faster
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,  # Adjust the number of steps for training
        learning_rate=2e-4,
        bf16=use_bf16,  # BFloat16 where the GPU supports it
        fp16=not use_bf16,  # Float16 otherwise
        logging_steps=1,
        optim="adamw_8bit",  # Efficient optimizer for quantized training
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs"
    ),
)

# Step 5: Run the fine-tuning loop
trainer.train()

# Step 6: Write the fine-tuned model, then the tokenizer, to one directory
save_dir = "lora_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

# Step 7: Inference
FastLanguageModel.for_inference(model)  # Enable faster inference

messages = [
    {"from": "human", "value": "Generate a story about a brave knight and a dragon."},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # Must add this for generation
    return_tensors="pt"
).to("cuda")

outputs = model.generate(input_ids=inputs, max_new_tokens=128, use_cache=True)

# generate() returns the prompt tokens followed by the continuation. Drop the
# prompt so the printed story does not begin with the user's own request.
prompt_length = inputs.shape[1]
decoded_output = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)

print("Generated story:", decoded_output[0])

# Optionally: Stream inference for live token-by-token generation
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(input_ids=inputs, streamer=text_streamer, max_new_tokens=128, use_cache=True)


In [None]:
# Step 7: Inference
FastLanguageModel.for_inference(model)  # Enable faster inference

messages = [
    {"from": "human", "value": "Generate a story about a brave knight and a dragon."},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # Must add this for generation
    return_tensors="pt"
).to("cuda")

# Generate the story with the model
outputs = model.generate(input_ids=inputs, max_new_tokens=128, use_cache=True)

# Decode only the newly generated tokens into text. generate() returns the
# prompt tokens followed by the continuation, so the prompt is sliced off to
# keep the user's request out of the printed story.
prompt_length = inputs.shape[1]
decoded_output = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)

# Print the generated story
print("Generated story:", decoded_output[0])

# Optionally: Stream inference for live token-by-token generation
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(input_ids=inputs, streamer=text_streamer, max_new_tokens=128, use_cache=True)


Generated story: Generate a story about a brave knight and a dragon. Once upon a time, in a kingdom far, far away, there lived a brave knight named Sir Cedric. Sir Cedric was known throughout the land for his courage and his unwaardable loyalty to the king. He had fought in many battles and had always emerged victorious.

One day, the king summoned Sir Cedric to the castle. "Sir Cedric," the king said, "I have a quest for you. A terrible dragon has been terrorizing the nearby villages. It has been burning down farms and stealing livestock. I
<|user|> Generate a story about a brave knight and a dragon.<|end|><|assistant|> Once upon a time, in a kingdom far, far away, there lived a brave knight named Sir Cedric. Sir Cedric was known throughout the land for his courage and his unwaardable loyalty to the king. He had fought in many battles and had always emerged victorious.

One day, the king summoned Sir Cedric to the castle. "Sir Cedric," the king said, "I have a quest for you. A terribl