# Refactotron Training

Fine-tuning StarCoderBase-1B (`bigcode/starcoderbase-1b`) with LoRA for code refactoring.

- Expected validation loss: 0.48-0.55
- Training time: 12-15 hours on a T4 GPU

## Mount Google Drive

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at a named mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Echo the Drive folder that the training configuration uses as its output_dir.
print("/content/drive/MyDrive/refactotron_lora_optimized/")

## Check GPU & Install Dependencies

In [None]:
import torch

print("GPU Status:")
print(f"Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"Device: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    print("Ready for training!")
else:
    print("NO GPU! Go to: Runtime > Change runtime type > T4 GPU")

!pip install -q transformers datasets peft accelerate bitsandbytes

## Upload Training Data

Upload the enhanced training files:
- `train_enhanced.jsonl` (~60 MB)
- `validation_enhanced.jsonl` (~7.5 MB)

In [None]:
from google.colab import files
import os

# Warning to show for each required file that is absent from the upload.
missing_file_warnings = {
    'train_enhanced.jsonl': "\nWARNING: You need to upload 'train_enhanced.jsonl'",
    'validation_enhanced.jsonl': "\nWARNING: You need to upload 'validation_enhanced.jsonl'",
}

print("Upload train_enhanced.jsonl and validation_enhanced.jsonl")
uploaded = files.upload()

# `uploaded` maps each file name to its content; report the size of each one.
print("\nFiles uploaded:")
for filename, payload in uploaded.items():
    size_mb = len(payload) / (1024*1024)
    print(f"{filename}: {size_mb:.1f} MB")

# Only warn (do not raise) so the user can simply re-run the cell.
for required_name, warning_text in missing_file_warnings.items():
    if required_name not in uploaded:
        print(warning_text)

## HuggingFace Authentication

In [None]:
# Authenticate this session with the Hugging Face Hub. login() is called with
# no arguments, so no access token is written into the notebook itself.
# NOTE(review): presumably this prompts for the token interactively - confirm
# against the huggingface_hub documentation.
from huggingface_hub import login
login()

## Load Model & Tokenizer

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType
import torch

# One identifier shared by the tokenizer and the model so they cannot drift apart.
BASE_MODEL_ID = "bigcode/starcoderbase-1b"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Half-precision weights, with device placement delegated to device_map="auto".
model_load_kwargs = dict(
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, **model_load_kwargs)

base_param_count = model.num_parameters()
print(f"Base model loaded: {base_param_count:,} parameters")

## Configure LoRA

In [None]:
# Modules of the base model that receive LoRA adapters.
LORA_TARGET_MODULES = ["c_proj", "c_attn"]

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=LORA_TARGET_MODULES,
    r=16,
    lora_alpha=32,
    lora_dropout=0.08,
    bias="none",
)

# Wrap the base model; from here on `model` is the PEFT-wrapped model.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
print("\nLoRA configured (r=16)")

## Load Training Data

In [None]:
from datasets import Dataset
import json

def load_jsonl(filepath):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        filepath: Path of the file to read; each non-blank line must hold one
            JSON document.

    Returns:
        list: The parsed JSON values in file order. Blank and whitespace-only
        lines are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with the file path and 1-based line number.
    """
    records = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Same exception type as before, but say which line of which
                # file is broken; a bare "line 1 column N" is useless in a
                # file with tens of thousands of lines.
                raise json.JSONDecodeError(
                    f"{filepath}, line {line_number}: {err.msg}",
                    err.doc,
                    err.pos,
                ) from err
    return records

# Below this many training rows the upload is probably the wrong file.
MIN_EXPECTED_TRAIN_SAMPLES = 30000

train_data = load_jsonl('train_enhanced.jsonl')
val_data = load_jsonl('validation_enhanced.jsonl')

print(f"Train: {len(train_data):,} samples")
print(f"Validation: {len(val_data):,} samples")

# Size sanity check: warn only, so the user decides whether to continue.
if len(train_data) >= MIN_EXPECTED_TRAIN_SAMPLES:
    print("Data size verified")
else:
    print(f"\nWARNING: Only {len(train_data)} training samples!")
    print("Expected ~39,000+ samples. Did you upload the wrong file?")

# Wrap the plain Python lists as datasets.Dataset objects.
train_dataset, val_dataset = (
    Dataset.from_list(rows) for rows in (train_data, val_data)
)

# Peek at the first prompt to confirm the expected 'input' field is present.
print("\nSample input (first 200 chars):")
print(train_data[0]['input'][:200])

## Tokenization with Label Masking

Input tokens are masked with -100 so loss is only computed on output tokens.

In [None]:
def tokenize_function(examples, tok=None, max_length=1024):
    """Tokenize prompt/target pairs for causal-LM fine-tuning with prompt masking.

    Each example becomes ``input tokens + output tokens (+ EOS)``. The label
    sequence holds -100 at every prompt position and the real token ids at
    every output position, so loss is computed on the output portion only.

    Args:
        examples: Batch dict with parallel lists under ``'input'`` and
            ``'output'`` (the form ``Dataset.map(..., batched=True)`` passes).
        tok: Optional tokenizer override. Defaults to the notebook-level
            ``tokenizer``. It must be callable on a list of strings and return
            a mapping with ``'input_ids'``; its ``eos_token_id`` is used when
            present.
        max_length: Maximum number of tokens kept per example (prompt and
            output combined). Longer sequences are truncated on the right.

    Returns:
        dict: ``{'input_ids': [...], 'labels': [...]}`` with one entry per
        kept example. Examples with no supervised token inside the window
        (prompt alone is at least ``max_length`` tokens, or the target is
        empty and no EOS is available) are dropped, so the result may hold
        fewer rows than the input batch.
    """
    tok = tokenizer if tok is None else tok

    input_encodings = tok(
        examples['input'],
        truncation=False,
        padding=False,
        add_special_tokens=False
    )

    output_encodings = tok(
        examples['output'],
        truncation=False,
        padding=False,
        add_special_tokens=False
    )

    eos_id = getattr(tok, 'eos_token_id', None)

    input_ids_list = []
    labels_list = []

    for inp_ids, out_ids in zip(input_encodings['input_ids'], output_encodings['input_ids']):
        inp_ids = list(inp_ids)
        out_ids = list(out_ids)

        # Terminate the target with EOS (once) so the model is trained to
        # stop after the refactored code instead of generating indefinitely.
        if eos_id is not None and (not out_ids or out_ids[-1] != eos_id):
            out_ids.append(eos_id)

        # A row whose labels are all -100 contributes no gradient and makes a
        # mean-reduced cross-entropy NaN; skip it rather than emit it.
        if not out_ids or len(inp_ids) >= max_length:
            continue

        # Build both sequences at full length, then cut them identically so
        # input_ids and labels always stay aligned.
        combined = (inp_ids + out_ids)[:max_length]
        labels = ([-100] * len(inp_ids) + out_ids)[:max_length]

        input_ids_list.append(combined)
        labels_list.append(labels)

    return {
        'input_ids': input_ids_list,
        'labels': labels_list
    }

def _tokenize_split(dataset, progress_label):
    """Apply tokenize_function to one split in batches, replacing its original columns."""
    return dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset.column_names,
        desc=progress_label,
    )


tokenized_train = _tokenize_split(train_dataset, "Tokenizing train")
tokenized_val = _tokenize_split(val_dataset, "Tokenizing validation")

print(f"Tokenized train: {len(tokenized_train):,} samples")
print(f"Tokenized validation: {len(tokenized_val):,} samples")

## Data Collator

In [None]:
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq

# DataCollatorForLanguageModeling(mlm=False) rebuilds `labels` for every batch
# as a copy of `input_ids` (masking only pad-token positions). That discards
# the -100 prompt mask prepared during tokenization, so loss would be computed
# on the prompt as well, and because pad_token == eos_token every EOS label
# would be ignored.
#
# DataCollatorForSeq2Seq keeps the labels supplied by the dataset: it pads
# `labels` with label_pad_token_id and lets the tokenizer pad `input_ids` and
# build the attention mask. model=None because no decoder_input_ids need to be
# prepared for a decoder-only model.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=None,
    padding=True,
    label_pad_token_id=-100,
)

## Training Configuration

In [None]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

# Pick the mixed-precision mode from the GPU model name: BF16 on A100/H100,
# FP16 on everything else.
device_name = "cpu"
if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name(0)
use_bf16 = any(gpu_tag in device_name for gpu_tag in ("A100", "H100"))

precision_label = 'BF16' if use_bf16 else 'FP16'
print(f"Configuring for: {device_name}")
print(f"Precision: {precision_label}")

training_args = TrainingArguments(
    # --- where checkpoints and logs are written (Google Drive) ---
    output_dir="/content/drive/MyDrive/refactotron_lora_optimized",
    logging_dir="/content/drive/MyDrive/refactotron_lora_optimized/logs",

    # --- schedule ---
    num_train_epochs=5,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_steps=500,

    # --- batch geometry: 1 sample per step, accumulated over 8 steps ---
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,

    # --- regularisation ---
    weight_decay=0.02,
    max_grad_norm=1.0,

    # --- precision: exactly one of fp16 / bf16 is enabled ---
    bf16=use_bf16,
    fp16=(not use_bf16),
    fp16_full_eval=False,

    # --- evaluation / logging / checkpoint cadence ---
    eval_strategy="steps",
    eval_steps=1000,
    eval_accumulation_steps=4,
    logging_steps=50,
    save_steps=1000,
    save_total_limit=3,

    # --- model selection (also required by the early-stopping callback) ---
    load_best_model_at_end=True,
    metric_for_best_model="loss",

    # --- memory / optimizer ---
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    optim="adamw_torch_fused",

    report_to="none",
)

# Stop once the tracked metric has failed to improve for 3 evaluations in a row.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

print("Training configuration complete")

## Initialize Trainer

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    callbacks=[early_stopping],
)

# Rough step budget for the summary below: whole optimizer steps per epoch
# multiplied by the number of epochs.
num_train_samples = len(tokenized_train)
num_val_samples = len(tokenized_val)
effective_batch_size = (
    training_args.per_device_train_batch_size
    * training_args.gradient_accumulation_steps
)
steps_per_epoch = num_train_samples // effective_batch_size
total_steps = steps_per_epoch * training_args.num_train_epochs

print(f"Training samples: {num_train_samples:,}")
print(f"Validation samples: {num_val_samples:,}")
print(f"Effective batch size: {effective_batch_size}")
print(f"Total epochs: {training_args.num_train_epochs}")
print(f"Max training steps: {total_steps:,}")
print(f"\nExpected Results:")
print(f"  Initial validation loss: ~0.68-0.71")
print(f"  Target validation loss: 0.48-0.55")
print(f"  Estimated time: 12-15 hours")

## Start Training

In [None]:
import os
import time

from transformers.trainer_utils import get_last_checkpoint

# The run is expected to take many hours and checkpoints are written to the
# output directory every `save_steps`. If an earlier session was interrupted,
# continue from its newest checkpoint instead of restarting from step 0.
# To force a fresh run, delete or rename the output directory first.
last_checkpoint = None
if os.path.isdir(training_args.output_dir):
    # Returns None when the directory holds no checkpoint-* sub-folder.
    last_checkpoint = get_last_checkpoint(training_args.output_dir)

print("Starting training...")
if last_checkpoint is not None:
    print(f"Resuming from checkpoint: {last_checkpoint}")
print(f"Start time: {time.strftime('%Y-%m-%d %H:%M:%S')}")

# resume_from_checkpoint=None is the default and means "train from scratch".
trainer.train(resume_from_checkpoint=last_checkpoint)

print("Training complete")
print(f"End time: {time.strftime('%Y-%m-%d %H:%M:%S')}")

## Save Model

In [None]:
FINAL_EXPORT_DIR = "/content/drive/MyDrive/refactotron_lora_FINAL"

# Persist the PEFT-wrapped model first, then the tokenizer, into the same
# Drive folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained(FINAL_EXPORT_DIR)

print("Saved to: /content/drive/MyDrive/refactotron_lora_FINAL/")