In [None]:
# Complete Fine-tuning and GGUF Conversion Pipeline for Google Colab
# This notebook fine-tunes a small instruct model (Qwen2.5-0.5B) and converts to GGUF

# ============================================================================
# STEP 1: INSTALLATION AND SETUP
# ============================================================================

# Install required packages
!pip install -q transformers datasets accelerate peft trl bitsandbytes
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Disable wandb completely
import os
os.environ["WANDB_DISABLED"] = "true"

# Clone and install llama.cpp for GGUF conversion
!git clone https://github.com/ggerganov/llama.cpp.git
!cd llama.cpp && make

# Install additional requirements for conversion
!pip install -q gguf numpy sentencepiece protobuf

import torch
import json
import pandas as pd
from datasets import Dataset, load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType
import os

# Report whether CUDA is usable and, if so, which device torch selects
_cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {_cuda_ready}")
if _cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name()}")

# ============================================================================
# STEP 2: SAMPLE DATASET CREATION
# ============================================================================

# Toy instruction-tuning records - replace these with your real data.
# Each record carries an "instruction", an optional "input" (context) and
# the target "output".
sample_data = [
    {"instruction": "What is my name?", "input": "", "output": "your name is ketan"},
    {"instruction": "How do i cook eggs", "input": "", "output": "in the electric your mom gave you"},
]

# Persist the records so the preprocessing step can read them back from disk
with open('training_data.json', 'w') as f:
    f.write(json.dumps(sample_data, indent=2))

print("Sample dataset created with", len(sample_data), "examples")

# ============================================================================
# STEP 3: DATA PREPROCESSING
# ============================================================================

def format_instruction(sample):
    """Format one record into a single training prompt string.

    Args:
        sample: Mapping with "instruction" and "output" keys and an optional
            "input" key holding extra context.

    Returns:
        The prompt text: an "### Instruction:" section, an "### Input:"
        section only when the record has a non-empty "input", and a
        "### Response:" section, separated by blank lines.

    Raises:
        KeyError: If "instruction" or "output" is missing.
    """
    sections = [f"### Instruction:\n{sample['instruction']}"]

    # "input" is optional: records that omit the key are treated exactly like
    # records that carry an empty string instead of raising KeyError.
    extra_context = sample.get("input")
    if extra_context:
        sections.append(f"### Input:\n{extra_context}")

    sections.append(f"### Response:\n{sample['output']}")
    return "\n\n".join(sections)

# Load and format dataset
def load_and_format_data(file_path):
    """Load instruction records from a JSON file into a one-column Dataset.

    Args:
        file_path: Path to a JSON file containing a list of records accepted
            by format_instruction.

    Returns:
        A Dataset with a single "text" column holding one formatted prompt
        per record, in file order.
    """
    # Read as UTF-8 explicitly so non-ASCII training text does not depend on
    # the platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        records = json.load(f)

    return Dataset.from_list(
        [{"text": format_instruction(record)} for record in records]
    )

# Build the dataset from the JSON file written earlier and preview one prompt
dataset = load_and_format_data('training_data.json')
print("Dataset loaded:", dataset)
print("\nSample formatted text:", dataset[0]['text'], sep="\n")

# ============================================================================
# STEP 4: MODEL AND TOKENIZER SETUP
# ============================================================================

# Choose a small model - Qwen2.5-0.5B is good for Colab
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fall back to EOS for padding only when the tokenizer ships without a pad
# token. Forcing pad == EOS makes DataCollatorForLanguageModeling (which
# replaces every pad-token label with -100) also hide genuine EOS tokens
# from the loss.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load model with 4-bit quantization to save memory.
# The quantization settings go through BitsAndBytesConfig; passing
# load_in_4bit straight to from_pretrained is the deprecated spelling.
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # Compute in the same dtype the rest of the model is loaded in
    bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    torch_dtype=torch.float16,
    device_map="auto",
)

print(f"Model loaded: {model_name}")
print(f"Model parameters: {model.num_parameters():,}")

# ============================================================================
# STEP 5: TOKENIZATION
# ============================================================================

def _labels_without_padding(token_ids, attention_mask):
    """Copy token ids, replacing positions the attention mask marks as padding with -100."""
    return [
        token_id if keep else -100
        for token_id, keep in zip(token_ids, attention_mask)
    ]


def tokenize_function(examples):
    """Tokenize the "text" field and attach causal-LM labels.

    Args:
        examples: Mapping with a "text" entry - a list of strings when called
            through Dataset.map(batched=True), or a single string.

    Returns:
        The tokenizer output with an added "labels" entry. Labels equal
        input_ids except at padded positions, which are set to -100 (the
        index ignored by the cross-entropy loss) so padding never becomes a
        training target.
    """
    tokens = tokenizer(
        examples["text"],
        truncation=True,
        padding=True,
        max_length=512,
        return_overflowing_tokens=False,
    )

    input_ids = tokens["input_ids"]
    attention_mask = tokens["attention_mask"]

    if input_ids and isinstance(input_ids[0], int):
        # Single example: one flat list of token ids
        tokens["labels"] = _labels_without_padding(input_ids, attention_mask)
    else:
        # Batch: build a fresh list per sequence so labels never alias the
        # input_ids rows (a plain .copy() of the outer list is shallow).
        tokens["labels"] = [
            _labels_without_padding(ids, mask)
            for ids, mask in zip(input_ids, attention_mask)
        ]
    return tokens

# Run the tokenizer over the whole dataset in batches, dropping the raw
# columns so only the tokenizer outputs remain
tokenized_dataset = dataset.map(
    tokenize_function,
    remove_columns=dataset.column_names,
    batched=True,
)

print("Dataset tokenized")
_first_row = tokenized_dataset[0]
print("Sample tokenized length:", len(_first_row["input_ids"]))

# ============================================================================
# STEP 6: LORA CONFIGURATION
# ============================================================================

# LoRA setup: low-rank adapters are attached to the listed projection
# modules, and bias="none" leaves bias terms out of training
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,            # adapter rank
    lora_alpha=32,   # scaling numerator (alpha / r = 2)
    lora_dropout=0.1,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# Wrap the loaded model with the adapters and report the trainable share
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# ============================================================================
# STEP 7: TRAINING CONFIGURATION
# ============================================================================

# Hyper-parameters for the Trainer
training_args = TrainingArguments(
    # --- where checkpoints go / what gets reported ---
    output_dir="./qwen-finetuned",
    push_to_hub=False,
    report_to=[],  # no reporting integrations (wandb included)
    disable_tqdm=False,
    logging_steps=5,
    # --- schedule ---
    # NOTE(review): a positive max_steps is expected to take precedence over
    # num_train_epochs - confirm against the TrainingArguments docs.
    num_train_epochs=3,
    max_steps=50,  # kept small for the demo
    warmup_steps=10,
    learning_rate=2e-4,
    # --- batching: 2 samples x 4 accumulation steps = 8 per optimizer step ---
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # --- precision ---
    fp16=True,
    # --- checkpointing ---
    save_strategy="steps",
    save_steps=25,
    # --- dataset handling ---
    remove_unused_columns=False,
)

# Collator for causal language modeling (mlm=False); batches are padded to a
# multiple of 8 tokens
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    pad_to_multiple_of=8,
    mlm=False,
)

# ============================================================================
# STEP 8: TRAINING
# ============================================================================

# Wire the model, data, collator and hyper-parameters into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

print("Starting training...")
trainer.train()

# Persist the trained model, then store the tokenizer next to it so the
# directory can be loaded on its own
trainer.save_model()
tokenizer.save_pretrained("./qwen-finetuned")

print("Training completed and model saved!")

# ============================================================================
# STEP 9: TEST THE FINE-TUNED MODEL
# ============================================================================

# For testing, start from a fresh (non-quantized) fp16 copy of the base model
# and attach the adapter that was saved to ./qwen-finetuned
from peft import PeftModel

# Base model
base_model_for_testing = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16,
)

# Base model + saved adapter weights
test_model = PeftModel.from_pretrained(
    base_model_for_testing,
    "./qwen-finetuned",
)

def generate_response(instruction, input_text=""):
    """Sample a reply from test_model for one instruction.

    The prompt uses the same "### Instruction / ### Input / ### Response"
    layout as the training data.

    Args:
        instruction: The instruction text placed in the prompt.
        input_text: Optional extra context; the "### Input:" section is
            included only when this is non-empty.

    Returns:
        The generated continuation as text, without the prompt and with
        special tokens stripped.
    """
    if input_text:
        prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n"
    else:
        prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(test_model.device)

    with torch.no_grad():
        outputs = test_model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # generate() returns prompt + continuation for a causal LM. Slice the
    # prompt off by token count rather than splitting the decoded text on
    # "### Response:": if the model emits that marker again, a text split
    # would silently drop everything before the last occurrence.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

# Smoke-test the adapter with a single question and show the exchange
test_instruction = "What is artificial intelligence?"
response = generate_response(test_instruction)
print(f"Question: {test_instruction}", f"Response: {response}", sep="\n")

# ============================================================================
# STEP 10: CONVERT TO GGUF FORMAT
# ============================================================================

print("Converting model to GGUF format...")

# First, merge LoRA weights into base model
from peft import PeftModel

# Define the original model name again (in case it got overwritten)
original_model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# Load base model without quantization for merging
base_model = AutoModelForCausalLM.from_pretrained(
    original_model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# Load LoRA model
peft_model = PeftModel.from_pretrained(base_model, "./qwen-finetuned")

# Merge and save
merged_model = peft_model.merge_and_unload()
merged_model.save_pretrained("./qwen-merged", safe_serialization=True)

# Save tokenizer
tokenizer_for_conversion = AutoTokenizer.from_pretrained(original_model_name)
tokenizer_for_conversion.save_pretrained("./qwen-merged")

print("Model merged and saved to ./qwen-merged")

# Convert to GGUF using llama.cpp
print("Converting to GGUF...")

# Convert to GGUF format
!python llama.cpp/convert_hf_to_gguf.py ./qwen-merged --outfile qwen-finetuned.gguf --outtype f16

print("Conversion to GGUF completed!")
print("GGUF file: qwen-finetuned.gguf")

# ============================================================================
# STEP 11: DOWNLOAD FILES (FOR COLAB)
# ============================================================================

# Hand the converted model to the browser as a download.
# google.colab is only importable inside a Colab runtime.
from google.colab import files

print("Downloading GGUF files...")
files.download('qwen-finetuned.gguf')


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.9/511.9 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m94.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m363.4/363.4 MB[0m [31m73.6 MB/s[0m eta [36m0:00:01[0m