# Fine-Tuning Qwen3-1.7B for Kotlin Multiplatform Code Generation

**Optimized for Google Colab T4 GPU with Unsloth**

This notebook fine-tunes Qwen3-1.7B on Kotlin KMP code with **full method body implementations** using Unsloth for 2x faster training.

## Key Features:
- ✅ **Unsloth** - 2x faster training with optimized kernels
- ✅ **QLoRA** for efficient training on T4 (15GB VRAM)
- ✅ **Method body validation** - Tests if model generates full implementations
- ✅ **Production-ready** - Gradient checkpointing, mixed precision
- ✅ **Quality metrics** - Evaluates implementation vs signature-only generation

## Dataset Stats:
- **132,577** training pairs
- **52.5%** have real method bodies
- **7 pair types**: expect/actual, interface→impl, description→code, etc.

---

## 1. Setup & Installation

In [None]:
# Check GPU: print the attached GPU's name and total memory in CSV form.
# This is a shell command; it fails if the runtime has no NVIDIA GPU/driver.
!nvidia-smi --query-gpu=name,memory.total --format=csv

In [None]:
%%capture
# Install Unsloth and dependencies
# NOTE: %%capture hides all output of this cell, including pip errors. If a
# later import fails, remove it temporarily and re-run to see what went wrong.
!pip install unsloth
# Upgrade unsloth and unsloth-zoo, bypassing pip's download cache.
!pip install --upgrade --no-cache-dir unsloth unsloth-zoo
# Packages are not version-pinned, so results may differ between sessions.
!pip install trl datasets

In [None]:
import torch
import json
import os
from pathlib import Path
from datasets import Dataset, load_dataset
from unsloth import FastLanguageModel
import re

# Report the runtime environment so GPU problems surface before the model is loaded.
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    cuda_device_name = torch.cuda.get_device_name(0)
else:
    cuda_device_name = 'None'

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
print(f"CUDA device: {cuda_device_name}")

## 2. Mount Google Drive & Load Data

In [None]:
# Mount Google Drive at /content/drive so the training data (and, later, the
# model backup) are reachable through the normal filesystem. Colab-only.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# === CONFIGURE YOUR DATA PATH ===
# Upload your data/final_training/ folder to Google Drive first
# The notebook reads train.jsonl, val.jsonl and test.jsonl from this directory.
DATA_DIR = "/content/drive/MyDrive/kmp_training_data/final_training"

# Verify files exist (lists every .jsonl file with its size; errors if none match)
!ls -lh {DATA_DIR}/*.jsonl

In [None]:
# Load training data
def load_kmp_dataset(data_dir):
    """Load the KMP train/validation splits from JSONL files.

    Reads ``train.jsonl`` and ``val.jsonl`` from ``data_dir``. Every non-blank
    line of each file must be one JSON object.

    Args:
        data_dir: Directory containing ``train.jsonl`` and ``val.jsonl``.

    Returns:
        Tuple ``(train_dataset, val_dataset)`` built with ``Dataset.from_list``.

    Raises:
        FileNotFoundError: If either file is missing.
        json.JSONDecodeError: If a line is not valid JSON; the message names
            the offending file and line number.
    """

    def _read_jsonl(path):
        """Parse one JSONL file into a list of records, skipping blank lines."""
        records = []
        # Explicit UTF-8: source code routinely contains non-ASCII characters
        # and the platform default encoding is not guaranteed to be UTF-8.
        with open(path, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(f, start=1):
                if not line.strip():
                    continue  # tolerate blank lines / a trailing newline
                try:
                    records.append(json.loads(line))
                except json.JSONDecodeError as exc:
                    # Re-raise the same exception type with file/line context.
                    raise json.JSONDecodeError(
                        f"{path}:{line_no}: {exc.msg}", exc.doc, exc.pos
                    ) from exc
        return records

    train_file = f"{data_dir}/train.jsonl"
    val_file = f"{data_dir}/val.jsonl"

    print(f"Loading training data from {train_file}...")
    train_data = _read_jsonl(train_file)

    print(f"Loading validation data from {val_file}...")
    val_data = _read_jsonl(val_file)

    print(f"\nDataset loaded:")
    print(f"  Train: {len(train_data):,} examples")
    print(f"  Val:   {len(val_data):,} examples")

    return Dataset.from_list(train_data), Dataset.from_list(val_data)

train_dataset, val_dataset = load_kmp_dataset(DATA_DIR)

In [None]:
# Analyze dataset
import pandas as pd
from collections import Counter

total_examples = len(train_dataset)

# Get pair type distribution.
# Count from the single 'pair_type' column rather than iterating whole rows,
# which would materialize every field of every example just to read one label.
if 'pair_type' in train_dataset.column_names:
    pair_types = Counter(train_dataset['pair_type'])
else:
    # Same 'unknown' fallback that the sample printout below uses via .get().
    pair_types = Counter(['unknown'] * total_examples)

print("\nüìä Pair Type Distribution:")
for ptype, count in pair_types.most_common():
    pct = 100 * count / total_examples
    # str() keeps the column formatting working even if a label is None.
    print(f"  {str(ptype):35s} {count:>7,}  ({pct:5.1f}%)")

# Sample examples
print("\nüìù Sample Training Example:")
if total_examples:
    sample = train_dataset[0]
    print(f"\nPair Type: {sample.get('pair_type', 'unknown')}")
    print(f"Source Set: {sample.get('source_set', 'unknown')}")
    print(f"\nINPUT (first 200 chars):\n{sample['input_text'][:200]}...")
    print(f"\nTARGET (first 300 chars):\n{sample['target_text'][:300]}...")
else:
    # Indexing an empty dataset would raise; say so instead.
    print("\n(training set is empty - nothing to show)")

## 3. Load Model with QLoRA (T4 Optimized)

In [None]:
# Model configuration: checkpoint to fine-tune and the context length used for training.
MODEL_NAME = "unsloth/Qwen3-1.7B-unsloth-bnb-4bit"
MAX_SEQ_LENGTH = 2048  # Qwen3 supports up to 32K, but 2K is sufficient for KMP code

# Echo the settings so they are recorded in the notebook output.
print(
    f"Loading model: {MODEL_NAME}",
    f"Max sequence length: {MAX_SEQ_LENGTH}",
    "Quantization: 4-bit NF4 (via Unsloth)",
    sep="\n",
)

In [None]:
# Load model and tokenizer with Unsloth
# The checkpoint named by MODEL_NAME is loaded with 4-bit weights and
# configured for sequences of up to MAX_SEQ_LENGTH tokens.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    load_in_4bit=True,
    dtype=None,  # Auto-detect: float16 on T4 (no bfloat16 support), bfloat16 on Ampere or newer
)

# Sanity-check the tokenizer: the EOS/PAD tokens matter for generation later on.
print(f"\nTokenizer loaded:")
print(f"  Vocab size: {len(tokenizer)}")
print(f"  EOS token: {tokenizer.eos_token}")
print(f"  PAD token: {tokenizer.pad_token}")

In [None]:
# Apply LoRA with Unsloth's optimized implementation
# Adapters are attached to all attention projections (q/k/v/o) and all MLP
# projections (gate/up/down); `model` is rebound to the adapter-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,                       # LoRA rank
    lora_alpha=16,              # LoRA alpha (equal to rank for conservative updates)
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    # NOTE(review): Unsloth documents lora_dropout=0 as its optimized setting;
    # confirm that 0.05 is worth giving up part of the fast path.
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",  # Unsloth's optimized gradient checkpointing
    random_state=3407,          # same value as the training seed in the SFTConfig cell
)

print("\n‚úÖ LoRA applied with Unsloth optimizations")
# Reported in decimal gigabytes (bytes / 1e9).
print(f"   Memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")

## 4. Configure LoRA (Already Applied with Unsloth)

In [None]:
# LoRA has already been applied in the previous cell using Unsloth's FastLanguageModel.get_peft_model()
# This cell only prints a notice; it changes no state and can be skipped or removed.
print("‚úÖ LoRA configuration already applied via Unsloth")

## 5. Format Training Data

In [None]:
def format_prompt(example):
    """
    Render one training pair as a single Qwen3 chat-formatted string.

    The conversation consists of a fixed system message demanding full
    implementations, the example's ``input_text`` as the user turn, and its
    ``target_text`` as the assistant turn. The module-level ``tokenizer``
    supplies the chat template.

    Returns a dict with one key, ``"text"``, holding the rendered conversation.
    """
    system_turn = {
        "role": "system",
        "content": "You are an expert Kotlin Multiplatform developer. Generate complete, working code with full method body implementations. Never output just signatures or TODO comments.",
    }
    user_turn = {"role": "user", "content": example['input_text']}
    assistant_turn = {"role": "assistant", "content": example['target_text']}

    # The assistant reply is already part of the conversation, so no generation
    # prompt is requested; the result is returned as text, not token ids.
    rendered = tokenizer.apply_chat_template(
        [system_turn, user_turn, assistant_turn],
        tokenize=False,
        add_generation_prompt=False,
    )

    return {"text": rendered}

# Add the chat-formatted "text" column to both splits.
train_dataset_formatted, val_dataset_formatted = (
    split.map(format_prompt) for split in (train_dataset, val_dataset)
)

# Preview the first rendered conversation to verify the template output.
first_formatted_text = train_dataset_formatted[0]['text']
print("\nüìù Formatted Example (first 500 chars):")
print(first_formatted_text[:500])

## 6. Training Configuration

In [None]:
from trl import SFTTrainer, SFTConfig

# Decide the mixed-precision mode once: bf16 where the GPU supports it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

# Training arguments optimized for T4 with Unsloth
training_args = SFTConfig(
    # Output
    output_dir="./qwen3-kmp-finetuned",

    # Dataset: train on the pre-rendered "text" column, one example per sequence
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    packing=False,                       # Don't pack sequences for clearer evaluation

    # Training regime
    num_train_epochs=3,
    per_device_train_batch_size=2,      # Small batch for T4
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,       # Effective batch size = 16

    # Optimization
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    optim="adamw_8bit",                  # 8-bit AdamW for memory efficiency

    # Memory optimization (exactly one of fp16/bf16 is enabled)
    fp16=not use_bf16,
    bf16=use_bf16,

    # Logging & Saving: checkpoint and evaluate on the same 500-step cadence
    logging_steps=50,
    save_steps=500,
    save_total_limit=3,
    eval_strategy="steps",
    eval_steps=500,

    # Other
    report_to="none",                    # Disable wandb/tensorboard
    seed=3407,
)

# Summarize the settings that most affect runtime and memory.
per_device_batch = training_args.per_device_train_batch_size
accumulation_steps = training_args.gradient_accumulation_steps

if training_args.bf16:
    precision_label = 'BF16'
elif training_args.fp16:
    precision_label = 'FP16'
else:
    precision_label = 'FP32'

print("Training Configuration:")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Batch size: {per_device_batch}")
print(f"  Gradient accumulation: {accumulation_steps}")
print(f"  Effective batch size: {per_device_batch * accumulation_steps}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  Mixed precision: {precision_label}")

## 7. Initialize Trainer

In [None]:
# Initialize SFT Trainer
# dataset_text_field, max_seq_length and packing are deliberately NOT repeated
# here: they are already set on `training_args` (SFTConfig), which is the single
# source of truth. Repeating them duplicated a hard-coded 2048 that could drift
# from MAX_SEQ_LENGTH, and newer TRL releases accept these options only through
# SFTConfig.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_formatted,
    eval_dataset=val_dataset_formatted,
    tokenizer=tokenizer,
)

print("\n‚úÖ Trainer initialized!")
print(f"   Training samples: {len(train_dataset_formatted):,}")
print(f"   Eval samples: {len(val_dataset_formatted):,}")
# Report the configured constant rather than a second hard-coded literal.
print(f"   Max sequence length: {MAX_SEQ_LENGTH} tokens")

## 8. Start Training

In [None]:
# Clear CUDA cache: release cached, unused GPU memory before the long run.
torch.cuda.empty_cache()

# Start training
# Checkpoints are written under the trainer's output_dir on the local runtime
# disk. NOTE(review): they are lost if the Colab runtime is recycled; consider
# pointing output_dir at Drive and resuming with
# trainer.train(resume_from_checkpoint=True) after a disconnect.
print("\nüöÄ Starting training...\n")
trainer.train()

print("\n‚úÖ Training complete!")

## 9. Save Model

In [None]:
# Save fine-tuned model
# `model` was wrapped by get_peft_model earlier, so this is expected to write
# the LoRA adapter weights rather than a merged full model.
# NOTE(review): confirm before shipping this directory as a standalone checkpoint.
output_dir = "./qwen3-kmp-final"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"‚úÖ Model saved to: {output_dir}")

# Back up to Google Drive. This runs unconditionally and needs the Drive mount
# from section 2; comment it out to skip the backup.
import shutil
drive_save_path = "/content/drive/MyDrive/qwen3-kmp-finetuned"
# dirs_exist_ok=True lets a re-run overwrite an earlier backup instead of failing.
shutil.copytree(output_dir, drive_save_path, dirs_exist_ok=True)
print(f"‚úÖ Model backed up to Google Drive: {drive_save_path}")

## 10. Test: Method Body Implementation Quality

**Critical Test**: Does the model generate FULL implementations or just signatures?

In [None]:
from unsloth import FastLanguageModel

def generate_code(prompt, model, tokenizer, max_new_tokens=512):
    """Generate a code completion for ``prompt`` using the Qwen3 chat format.

    Args:
        prompt: User-turn text, in the same form as the training ``input_text``.
        model: Fine-tuned model. It is switched to Unsloth inference mode as a
            side effect of this call.
        tokenizer: Tokenizer that provides the Qwen3 chat template.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        Decoded text of the newly generated tokens only (the prompt is removed).
        Sampling is enabled, so repeated calls may return different text.
    """

    # Same system message as used when formatting the training data.
    messages = [
        {
            "role": "system",
            "content": "You are an expert Kotlin Multiplatform developer. Generate complete, working code with full method body implementations. Never output just signatures or TODO comments."
        },
        {
            "role": "user",
            "content": prompt
        }
    ]

    FastLanguageModel.for_inference(model)  # Enable inference mode

    # return_dict=True yields input_ids AND attention_mask. The mask must be
    # passed explicitly because pad_token_id is set to the EOS id below, so
    # generate() cannot infer padding on its own.
    # enable_thinking=False asks the Qwen3 template to close the (empty)
    # reasoning block inside the prompt, so it is not generated and does not
    # leak into the decoded output; templates without this switch ignore it.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
        enable_thinking=False,
    ).to(model.device)  # follow the model's device instead of assuming "cuda"

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the generated part: drop the prompt tokens from the front.
    prompt_length = inputs["input_ids"].shape[1]
    generated = outputs[0][prompt_length:]
    response = tokenizer.decode(generated, skip_special_tokens=True)

    return response

print("‚úÖ Generation function ready")

In [None]:
def check_method_body_quality(code):
    """
    Heuristically score source text for the presence of real method bodies.

    Args:
        code: Generated (or reference) source text.

    Returns:
        dict with boolean flags ``has_braces``, ``has_return``, ``has_logic``,
        ``has_todo``, ``has_empty_body`` and ``is_full_implementation``, plus an
        integer ``quality_score`` from 0 to 6. A score of 4 or more counts as a
        full implementation.
    """
    # TODO markers: the upper-case word TODO (covers Kotlin's TODO() and
    # "// TODO"), or a comment that starts with "todo" in any case. Matching
    # whole words keeps identifiers such as TodoRepository or todoList, which
    # are common in sample apps, from being counted as stubs.
    has_todo = bool(
        re.search(r"\bTODO\b", code)
        or re.search(r"(?://|/\*)\s*todo\b", code, re.IGNORECASE)
    )

    results = {
        "has_braces": "{" in code and "}" in code,
        "has_return": "return " in code,
        "has_logic": False,
        "has_todo": has_todo,
        # Any brace pair holding only whitespace: {}, { }, {\n}, ...
        "has_empty_body": re.search(r"\{\s*\}", code) is not None,
        "quality_score": 0,
    }

    # Check for real implementation logic
    logic_indicators = [
        "if (", "when (", "for (", "while (",
        ".map", ".filter", ".collect", ".launch",
        "try {", "catch", "emit(",
        "= ", "+=", "-=",
    ]

    # At least two distinct indicators are required so that a single
    # assignment or default value does not count as logic on its own.
    logic_count = sum(1 for indicator in logic_indicators if indicator in code)
    results["has_logic"] = logic_count >= 2

    # Calculate quality score (maximum 6: logic is worth two points)
    score = 0
    if results["has_braces"]: score += 1
    if results["has_return"]: score += 1
    if results["has_logic"]: score += 2
    if not results["has_todo"]: score += 1
    if not results["has_empty_body"]: score += 1

    results["quality_score"] = score
    results["is_full_implementation"] = score >= 4

    return results

print("‚úÖ Quality checker ready")

### Test Case 1: Interface Implementation

In [None]:
# Prompt in the interface-to-implementation style.
test_prompt_1 = """// Source set: commonMain
// Implement this interface with full method bodies:
interface UserRepository {
    suspend fun getUser(id: String): User
    suspend fun saveUser(user: User)
}"""

section_break = "\n" + "="*60 + "\n"

print("üß™ Test 1: Interface ‚Üí Implementation\n")
print("INPUT:", test_prompt_1, section_break, sep="\n")

generated = generate_code(test_prompt_1, trainer.model, tokenizer, max_new_tokens=512)
print("GENERATED:", generated, section_break, sep="\n")

# Show every individual heuristic flag for this first test, then the totals.
quality = check_method_body_quality(generated)
print("QUALITY ANALYSIS:")
for label, key in (
    ("Has braces", "has_braces"),
    ("Has return statements", "has_return"),
    ("Has logic (if/when/etc)", "has_logic"),
    ("Has TODO", "has_todo"),
    ("Has empty body", "has_empty_body"),
):
    print(f"  {label}: {quality[key]}")
print(f"  Quality score: {quality['quality_score']}/6")
print(f"  ‚úÖ Full implementation: {quality['is_full_implementation']}")

### Test Case 2: Expect/Actual Implementation

In [None]:
# Prompt in the expect-to-actual style, targeting the Android source set.
test_prompt_2 = """// Source set: androidMain
// Implement the actual for this expect declaration:
expect class PlatformLogger() {
    fun log(message: String)
}"""

section_break = "\n" + "="*60 + "\n"

print("üß™ Test 2: Expect ‚Üí Actual\n")
print("INPUT:", test_prompt_2, section_break, sep="\n")

generated = generate_code(test_prompt_2, trainer.model, tokenizer, max_new_tokens=512)
print("GENERATED:", generated, section_break, sep="\n")

# Only the aggregate score and verdict are reported for this test.
quality = check_method_body_quality(generated)
print(
    "QUALITY ANALYSIS:",
    f"  Quality score: {quality['quality_score']}/6",
    f"  ‚úÖ Full implementation: {quality['is_full_implementation']}",
    sep="\n",
)

### Test Case 3: ViewModel with State Management

In [None]:
# Description-style prompt asking for a complete ViewModel.
test_prompt_3 = """// Source set: commonMain
// Implement ViewModel 'LoginViewModel' with state management, coroutines, and event handling

class LoginViewModel"""

section_break = "\n" + "="*60 + "\n"

print("üß™ Test 3: ViewModel Implementation\n")
print("INPUT:", test_prompt_3, section_break, sep="\n")

# A larger token budget than tests 1 and 2: 800 instead of 512.
generated = generate_code(test_prompt_3, trainer.model, tokenizer, max_new_tokens=800)
print("GENERATED:", generated, section_break, sep="\n")

quality = check_method_body_quality(generated)
print(
    "QUALITY ANALYSIS:",
    f"  Quality score: {quality['quality_score']}/6",
    f"  ‚úÖ Full implementation: {quality['is_full_implementation']}",
    sep="\n",
)

### Test Case 4: Composable UI Function

In [None]:
# Description-style prompt asking for a complete Compose UI function.
test_prompt_4 = """// Source set: commonMain
// Implement @Composable function 'LoginScreen' with full UI layout:

@Composable
fun LoginScreen()"""

section_break = "\n" + "="*60 + "\n"

print("üß™ Test 4: Composable UI\n")
print("INPUT:", test_prompt_4, section_break, sep="\n")

# A larger token budget than tests 1 and 2: 800 instead of 512.
generated = generate_code(test_prompt_4, trainer.model, tokenizer, max_new_tokens=800)
print("GENERATED:", generated, section_break, sep="\n")

quality = check_method_body_quality(generated)
print(
    "QUALITY ANALYSIS:",
    f"  Quality score: {quality['quality_score']}/6",
    f"  ‚úÖ Full implementation: {quality['is_full_implementation']}",
    sep="\n",
)

## 11. Batch Evaluation on Test Set

In [None]:
# Load test set
test_file = f"{DATA_DIR}/test.jsonl"
# UTF-8 is requested explicitly because the platform default encoding is not
# guaranteed; blank lines (e.g. a trailing newline) are skipped rather than
# being handed to json.loads, which would raise on them.
with open(test_file, 'r', encoding='utf-8') as f:
    test_data = [json.loads(line) for line in f if line.strip()]

print(f"Test set: {len(test_data):,} examples")

# Sample for evaluation (use first 50 for speed)
# NOTE(review): if test.jsonl is not shuffled, the first 50 rows may not be
# representative of every pair type.
test_sample = test_data[:50]

In [None]:
# Evaluate on test set
from tqdm.auto import tqdm

# generate_code samples with a non-zero temperature, so fix the RNG to make the
# reported metrics repeatable from run to run (same seed value as training).
torch.manual_seed(3407)

results = []

print("\nüîÑ Evaluating on test set...\n")

for example in tqdm(test_sample, desc="Testing"):
    input_text = example['input_text']
    expected = example['target_text']

    # Generate
    generated = generate_code(input_text, trainer.model, tokenizer, max_new_tokens=512)

    # Check quality of the generation, and of the reference as a baseline
    quality = check_method_body_quality(generated)
    expected_quality = check_method_body_quality(expected)

    # One flat record per example; turned into a DataFrame in the next cell.
    results.append({
        'pair_type': example.get('pair_type', 'unknown'),
        'generated_quality': quality['quality_score'],
        'expected_quality': expected_quality['quality_score'],
        'is_full_impl': quality['is_full_implementation'],
        'has_logic': quality['has_logic'],
        'has_todo': quality['has_todo'],
    })

print("\n‚úÖ Evaluation complete!")

In [None]:
# Analyze results
import pandas as pd
import numpy as np

# One row per evaluated example (see the evaluation loop for the columns).
df = pd.DataFrame(results)
sample_count = len(df)
rule = "="*60


def _count_and_rate(column):
    """Format a boolean column of `df` as 'hits / total (pct%)'."""
    flags = df[column]
    return f"{flags.sum()} / {sample_count} ({100*flags.mean():.1f}%)"


print("\n" + rule)
print("üìä EVALUATION RESULTS")
print(rule)

print(f"\nTotal samples evaluated: {sample_count}")

print("\nüéØ Implementation Quality:")
print(f"  Full implementations: {_count_and_rate('is_full_impl')}")
print(f"  Has logic (if/when/etc): {_count_and_rate('has_logic')}")
print(f"  Has TODO markers: {_count_and_rate('has_todo')}")

# Retention compares the model's mean score with the mean score of the references.
mean_generated = df['generated_quality'].mean()
mean_expected = df['expected_quality'].mean()

print("\nüìà Quality Scores:")
print(f"  Mean generated quality: {mean_generated:.2f} / 6")
print(f"  Mean expected quality:  {mean_expected:.2f} / 6")
print(f"  Quality retention:      {100*mean_generated/mean_expected:.1f}%")

print("\nüìã By Pair Type:")
for ptype in df['pair_type'].unique():
    rows = df[df['pair_type'] == ptype]
    print(f"  {ptype:30s} Full: {100 * rows['is_full_impl'].mean():5.1f}%  Quality: {rows['generated_quality'].mean():.2f}/6")

# Success criteria: thresholds on the share of full implementations.
print("\n" + rule)
full_impl_rate = 100 * df['is_full_impl'].mean()
if full_impl_rate >= 70:
    verdict = "‚úÖ SUCCESS: Model generates full implementations (‚â•70%)"
elif full_impl_rate >= 50:
    verdict = "‚ö†Ô∏è  PARTIAL: Model sometimes generates full implementations (50-70%)"
else:
    verdict = "‚ùå FAILURE: Model mostly generates signatures/stubs (<50%)"
print(verdict)
print(rule)

## 12. Summary & Next Steps

In [None]:
# Final recap of the run: configuration, save locations, evaluation and next steps.
summary_rule = "="*60

print("\n" + summary_rule, "üìã TRAINING SUMMARY", summary_rule, sep="\n")

print("\n‚úÖ Training Complete")
for field, value in (
    ("Model", MODEL_NAME),
    ("Training samples", f"{len(train_dataset):,}"),
    ("Validation samples", f"{len(val_dataset):,}"),
    ("Epochs", training_args.num_train_epochs),
):
    print(f"   {field}: {value}")

print("\nüíæ Model Saved")
print("   Local: ./qwen3-kmp-final")
print("   Drive: /content/drive/MyDrive/qwen3-kmp-finetuned")

# The evaluation section is optional; report it only if its DataFrame exists.
if 'df' in globals():
    print("\nüéØ Evaluation Results")
    print(f"   Full implementations: {100*df['is_full_impl'].mean():.1f}%")
    print(f"   Average quality: {df['generated_quality'].mean():.2f}/6")

print("\nüìù Next Steps:")
for step_no, step in enumerate(
    (
        "Test on your own KMP code examples",
        "Export to GGUF for CPU inference (optional)",
        "Integrate with your IDE",
        "Fine-tune further if needed",
    ),
    start=1,
):
    print(f"   {step_no}. {step}")
print(summary_rule)