# Refactotron: Fixed Training (Proper Label Masking)

**CRITICAL FIX:** This version properly masks input tokens so loss is only computed on output tokens.

**Expected Results:**
- Validation Loss: 0.48-0.55 (should decrease from ~0.71)
- Training time: ~12-15 hours on T4 GPU
- BLEU score: 70-75
- CodeBERT similarity: 0.85-0.90

## Cell 1: Mount Google Drive

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem so files written under
# /content/drive survive the end of the runtime session.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Echo the Drive folder that the training cell uses as its output directory.
print("/content/drive/MyDrive/refactotron_lora_optimized/")

## Cell 2: Check GPU & Install Dependencies

In [None]:
import torch

# Report whether a CUDA device is visible to PyTorch and, if so, which one
# and how much memory it has. Nothing is configured here; this cell only
# prints diagnostics.
print("GPU Status:")
print(f"Available: {torch.cuda.is_available()}")
if not torch.cuda.is_available():
    # No accelerator: tell the user how to switch the Colab runtime.
    print("NO GPU! Go to: Runtime > Change runtime type > T4 GPU")
else:
    print(f"Device: {torch.cuda.get_device_name(0)}")
    # total_memory is in bytes; dividing by 1e9 reports decimal gigabytes.
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    print("Ready for training!")

# Install the training dependencies quietly (-q). The leading `!` makes the
# notebook run this line as a shell command instead of Python.
!pip install -q transformers datasets peft accelerate bitsandbytes

## Cell 3: Upload Training Data

**IMPORTANT: Upload the ENHANCED files (larger files):**
- `train_enhanced.jsonl` (~60 MB) - NOT train.jsonl
- `validation_enhanced.jsonl` (~7.5 MB) - NOT validation.jsonl

In [None]:
from google.colab import files
import os

# Ask the user for the two enhanced dataset files; files.upload() returns a
# mapping of uploaded filename -> file content.
print("Upload train_enhanced.jsonl and validation_enhanced.jsonl")
uploaded = files.upload()

# Summarize what arrived, with sizes in MiB.
print("\nFiles uploaded:")
for filename, payload in uploaded.items():
    size_mb = len(payload) / (1024*1024)
    print(f"{filename}: {size_mb:.1f} MB")

# Verify correct files: warn (but do not abort) when an expected filename is
# missing from the upload.
missing_file_warnings = {
    'train_enhanced.jsonl': "\n⚠️  WARNING: You need to upload 'train_enhanced.jsonl' (not 'train.jsonl')!",
    'validation_enhanced.jsonl': "\n⚠️  WARNING: You need to upload 'validation_enhanced.jsonl' (not 'validation.jsonl')!",
}
for required_name, warning in missing_file_warnings.items():
    if required_name not in uploaded:
        print(warning)

## Cell 4: HuggingFace Authentication

In [None]:
from huggingface_hub import login
# Authenticate this session with the Hugging Face Hub (called with no
# arguments, so credentials are requested interactively).
# NOTE(review): presumably required because the base model downloaded in the
# next cell needs an authenticated account — confirm.
login()

## Cell 5: Load Model & Tokenizer

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType
import torch

# Hub identifier shared by the tokenizer and the model so the two cannot
# drift apart.
BASE_MODEL_ID = "bigcode/starcoderbase-1b"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Half-precision weights, automatic device placement.
# NOTE(review): trust_remote_code=True permits running model code fetched
# from the Hub — confirm it is still needed for this checkpoint.
model_load_options = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, **model_load_options)

print(f"Base model loaded: {model.num_parameters():,} parameters")

## Cell 6: Configure LoRA

In [None]:
# LoRA hyper-parameters collected in one mapping so they can be inspected or
# logged before the config object is built.
lora_settings = {
    "r": 16,                                 # adapter rank
    "lora_alpha": 32,                        # scaling numerator
    "target_modules": ["c_proj", "c_attn"],  # module names that receive adapters
    "lora_dropout": 0.08,
    "bias": "none",
    "task_type": TaskType.CAUSAL_LM,
}
lora_config = LoraConfig(**lora_settings)

# Wrap the base model with the adapters and report how many parameters are
# trainable afterwards.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
print("\nLoRA configured (r=16)")

## Cell 7: Load Training Data

In [None]:
from datasets import Dataset
import json

def load_jsonl(filepath):
    """Read a JSON Lines file into a list of decoded objects.

    Each non-blank line is parsed with ``json.loads``; blank or
    whitespace-only lines are skipped.

    Args:
        filepath: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so non-ASCII characters in the samples do not
    # depend on the platform's default locale encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

# Read both splits from the files uploaded earlier (current working directory).
train_data = load_jsonl('train_enhanced.jsonl')
val_data = load_jsonl('validation_enhanced.jsonl')

for split_summary in (
    f"Train: {len(train_data):,} samples",
    f"Validation: {len(val_data):,} samples",
):
    print(split_summary)

# VERIFY DATA SIZE: a training split below 30,000 rows suggests the smaller,
# non-enhanced file was uploaded by mistake. This only warns; it never aborts.
if len(train_data) >= 30000:
    print("✓ Data size looks good!")
else:
    print(f"\n⚠️  WARNING: Only {len(train_data)} training samples!")
    print("Expected ~39,000+ samples. Did you upload the wrong file?")

# Wrap the raw record lists as Dataset objects for the tokenization step.
train_dataset, val_dataset = Dataset.from_list(train_data), Dataset.from_list(val_data)

# Preview the first training prompt.
print("\nSample input (first 200 chars):")
print(train_data[0]['input'][:200])

## Cell 8: Tokenization with PROPER LABEL MASKING

**CRITICAL FIX:** This version sets the labels of the input (prompt) tokens to `-100`, so the loss is computed only on the output tokens.

In [None]:
from transformers import DataCollatorForLanguageModeling

def tokenize_function(examples, max_length=1024):
    """
    Tokenize input + output, but MASK input tokens in labels.
    Loss is only computed on the output portion.

    Args:
        examples: Batch mapping with parallel lists under the 'input' and
            'output' keys (the shape `Dataset.map(..., batched=True)` passes).
        max_length: Maximum number of tokens kept per combined sequence;
            anything beyond it is cut from the end. Defaults to 1024.

    Returns:
        Dict with 'input_ids' (prompt tokens followed by target tokens) and
        'labels' (-100 for every prompt position, the token id for every
        target position). Rows whose prompt alone fills `max_length` are
        dropped, so the result may contain fewer rows than the input batch.
    """
    # Same tokenizer settings for both fields: no truncation or padding here
    # (done below / by the collator) and no automatically added special tokens.
    encode_kwargs = dict(truncation=False, padding=False, add_special_tokens=False)
    prompt_batch = tokenizer(examples['input'], **encode_kwargs)['input_ids']
    target_batch = tokenizer(examples['output'], **encode_kwargs)['input_ids']

    eos_id = tokenizer.eos_token_id

    input_ids_list = []
    labels_list = []

    for prompt_ids, target_ids in zip(prompt_batch, target_batch):
        # Terminate the target with EOS so the model is trained to stop
        # generating; without it no end-of-sequence signal is ever supervised.
        if eos_id is not None and (not target_ids or target_ids[-1] != eos_id):
            target_ids = target_ids + [eos_id]

        # Combine, then truncate from the end if too long.
        combined = (prompt_ids + target_ids)[:max_length]

        # -100 for input tokens (ignored by the loss), real ids for the output.
        labels = ([-100] * len(prompt_ids) + target_ids)[:max_length]

        # If truncation removed every target token, all labels are -100 and
        # the averaged loss for that sample is 0/0. Skip such rows.
        if len(labels) <= len(prompt_ids):
            continue

        input_ids_list.append(combined)
        labels_list.append(labels)

    return {
        'input_ids': input_ids_list,
        'labels': labels_list
    }

from transformers import DataCollatorForSeq2Seq

# Tokenize both splits the same way: batched, dropping the original text
# columns so only the fields returned by tokenize_function remain.
tokenized_train, tokenized_val = (
    split.map(
        tokenize_function,
        batched=True,
        remove_columns=split.column_names,
        desc=progress_label,
    )
    for split, progress_label in (
        (train_dataset, "Tokenizing train"),
        (val_dataset, "Tokenizing validation"),
    )
)

print(f"Tokenized train: {len(tokenized_train):,} samples")
print(f"Tokenized validation: {len(tokenized_val):,} samples")

# Data collator with padding: sequences were left unpadded above, so each
# batch is padded when it is assembled.
collator_options = {
    "tokenizer": tokenizer,
    "model": model,
    "padding": True,
    "return_tensors": "pt",
}
data_collator = DataCollatorForSeq2Seq(**collator_options)

## Cell 9: Training Configuration

In [None]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
import torch

# --- Mixed-precision selection -------------------------------------------
# bf16 needs a GPU with compute capability >= 8.0. The T4 this notebook
# targets is 7.5, so hard-coding bf16=True fails or falls back to slow
# emulation there. Use fp16 on such GPUs and keep bf16 everywhere else.
needs_fp16 = (
    torch.cuda.is_available()
    and torch.cuda.get_device_capability(0)[0] < 8
)

if needs_fp16:
    # The base model was loaded in float16. fp16 mixed precision uses a
    # gradient scaler that cannot unscale float16 gradients, so make sure
    # every trainable (adapter) parameter is stored in float32.
    for _param in model.parameters():
        if _param.requires_grad and _param.dtype == torch.float16:
            _param.data = _param.data.float()

training_args = TrainingArguments(
    # Checkpoints and logs go to Google Drive so they outlive the session.
    output_dir="/content/drive/MyDrive/refactotron_lora_optimized",
    logging_dir="/content/drive/MyDrive/refactotron_lora_optimized/logs",

    num_train_epochs=5,

    # Batch of 1 with 8 accumulation steps -> effective batch size 8.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,

    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_steps=500,

    weight_decay=0.02,
    label_smoothing_factor=0.05,
    max_grad_norm=1.0,

    # Exactly one of these is enabled, chosen from the hardware check above.
    bf16=not needs_fp16,
    fp16=needs_fp16,

    # Evaluate and checkpoint on the same 500-step cadence so the best
    # checkpoint can be restored at the end.
    logging_steps=50,
    eval_steps=500,
    save_steps=500,
    save_total_limit=3,
    eval_strategy="steps",

    load_best_model_at_end=True,
    metric_for_best_model="loss",

    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    optim="adamw_torch_fused",

    report_to="none",
)

# Stop once the monitored metric has failed to improve for 3 consecutive
# evaluations.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3
)

print(f"Mixed precision: {'fp16' if needs_fp16 else 'bf16'}")
print("Training configuration complete")

## Cell 10: Initialize Trainer

In [None]:
# Wire the model, the tokenized splits, the padding collator and the
# early-stopping callback into a Trainer.
trainer = Trainer(
    callbacks=[early_stopping],
    data_collator=data_collator,
    eval_dataset=tokenized_val,
    train_dataset=tokenized_train,
    args=training_args,
    model=model,
)

# Rough optimizer-step estimate: complete accumulation groups per epoch
# (floor division) multiplied by the number of epochs.
samples_per_update = (training_args.per_device_train_batch_size *
                      training_args.gradient_accumulation_steps)
updates_per_epoch = len(tokenized_train) // samples_per_update
total_steps = updates_per_epoch * training_args.num_train_epochs

# Run summary followed by the expectations for this training setup.
for report_line in (
    f"Training samples: {len(tokenized_train):,}",
    f"Validation samples: {len(tokenized_val):,}",
    f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}",
    f"Total epochs: {training_args.num_train_epochs}",
    f"Max training steps: {total_steps:,}",
    f"\n=== EXPECTED RESULTS (WITH PROPER LABEL MASKING) ===",
    f"   • Initial validation loss: ~0.68-0.71",
    f"   • Target validation loss: 0.48-0.55",
    f"   • Training loss should be similar to validation loss (not 10x higher!)",
    f"   • Estimated time: 12-15 hours",
):
    print(report_line)

## Cell 11: START TRAINING

In [None]:
import time

# Print the local wall-clock time before and after the run so the total
# duration can be read from the cell output.
print("Starting training...")
print(f"Start time: {time.strftime('%Y-%m-%d %H:%M:%S')}")

# Run training with the Trainer built in the previous cell; this call blocks
# until training finishes.
trainer.train()

print("Training complete")
print(f"End time: {time.strftime('%Y-%m-%d %H:%M:%S')}")

## Cell 12: Save Final Model

In [None]:
# Write the trained (PEFT-wrapped) model and the tokenizer side by side into
# one Drive folder.
FINAL_MODEL_DIR = "/content/drive/MyDrive/refactotron_lora_FINAL"
for artifact in (model, tokenizer):
    artifact.save_pretrained(FINAL_MODEL_DIR)

print("Saved to: /content/drive/MyDrive/refactotron_lora_FINAL/")