# CodeWhisper Training Pipeline (Kaggle)

This notebook fine-tunes the CodeT5 model (`Salesforce/codet5-small` by default) with LoRA on the CodeSearchNet/CodeXGLUE code-to-text dataset (`google/code_x_glue_ct_code_to_text`) using a T4 GPU.
It is self-contained and does not require cloning the repository.

In [None]:
# 1. Setup Environment
!nvidia-smi

# Install dependencies
!pip install -q transformers datasets accelerate peft bitsandbytes radon lizard protobuf==3.20.3

In [None]:
# 2. Prepare Datasets
import os
import json
from datasets import load_dataset
from tqdm import tqdm

def prepare_dataset(output_dir, languages=('python', 'java'), split='train', limit=None):
    """Export CodeXGLUE code-to-text examples to a JSONL training file.

    For every requested language the ``google/code_x_glue_ct_code_to_text``
    split is loaded, records that lack either the code or the docstring are
    skipped, and the remaining ones are written to
    ``<output_dir>/training_data_<split>.jsonl`` as one JSON object per line
    with the keys ``code``, ``docstring``, ``language`` and ``source``.

    Args:
        output_dir: Directory receiving the JSONL file; created if missing.
        languages: Dataset configurations to load. A single string is treated
            as one language.
        split: Dataset split to load; also used in the output file name.
        limit: Maximum number of records kept *per language*. ``None`` (or
            ``0``) keeps every usable record.

    Raises:
        RuntimeError: If no usable record was collected for any language
            (for example because every download failed), so that an empty
            training file is never written silently.
    """
    dataset_name = "google/code_x_glue_ct_code_to_text"

    # A bare string would otherwise be iterated character by character.
    if isinstance(languages, str):
        languages = [languages]
    languages = list(languages)

    print(f"Processing {dataset_name} for {languages}...")

    data = []

    for lang in languages:
        print(f"Loading {lang}...")
        try:
            # Non-streaming load: the original author disabled streaming to
            # avoid HTTP 429 (Too Many Requests) responses.
            ds = load_dataset(dataset_name, lang, split=split, trust_remote_code=True)
        except Exception as e:
            # Best effort: a language that cannot be loaded is skipped so the
            # remaining languages are still exported.
            print(f"Error loading {lang}: {e}")
            continue

        count = 0  # records kept for the current language only
        for item in tqdm(ds, desc=lang):
            # Fall back to the alternative field names used by some dataset variants.
            code = item.get('code') or item.get('func_code_string') or ''
            doc = item.get('docstring') or item.get('func_documentation_string') or ''

            if code and doc:
                data.append({
                    "code": code,
                    "docstring": doc,
                    "language": lang,
                    "source": dataset_name,
                })
                count += 1

            if limit and count >= limit:
                break

    if not data:
        # Fail fast instead of producing an empty file that breaks training later.
        raise RuntimeError(
            f"No usable records collected from {dataset_name} for {languages} (split={split!r})."
        )

    output_file = os.path.join(output_dir, f"training_data_{split}.jsonl")
    print(f"Saving {len(data)} records to {output_file}...")

    os.makedirs(output_dir, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        for entry in data:
            f.write(json.dumps(entry) + '\n')

# Run preparation
# Demo setting: keep at most 10,000 records per language; drop `limit` for the full run.
prepare_dataset(output_dir="processed_data", limit=10000)

In [None]:
# 3. Training Logic
import torch
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training

def train(
    train_file: str,
    output_dir: str,
    model_name: str = "Salesforce/codet5-small",
    batch_size: int = 4,
    epochs: int = 3,
    learning_rate: float = 2e-5,
    max_source_length: int = 512,
    max_target_length: int = 128,
    gradient_accumulation_steps: int = 4,
):
    """Fine-tune a seq2seq code model with LoRA to generate docstrings from code.

    When CUDA is available the base model is loaded with 4-bit NF4
    quantization (QLoRA) and trained with fp16 and a paged 8-bit AdamW
    optimizer; otherwise it is loaded unquantized and trained on the CPU.
    LoRA adapters are attached to the ``q`` and ``v`` attention projections,
    the JSONL file is tokenized into prompt/label pairs, and the result of
    training is written with ``Trainer.save_model``.

    Args:
        train_file: Path to a JSONL file whose records contain the keys
            ``language``, ``code`` and ``docstring``.
        output_dir: Directory for checkpoints and the final saved model.
        model_name: Hugging Face model id or local path of the base model.
        batch_size: Per-device training batch size.
        epochs: Number of training epochs.
        learning_rate: Optimizer learning rate.
        max_source_length: Prompts are truncated to this many tokens.
        max_target_length: Docstring labels are truncated to this many tokens.
        gradient_accumulation_steps: Number of batches accumulated per
            optimizer step.
    """
    import inspect  # local import: only needed for the Trainer keyword check below

    print(f"Loading model: {model_name}")

    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    print(f"Using device: {device}")

    # QLoRA Configuration (Only if CUDA is available)
    if use_cuda:
        print("CUDA detected. Using QLoRA with 4-bit quantization.")
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )

        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True
        )
        model.gradient_checkpointing_enable()
        model = prepare_model_for_kbit_training(model)
    else:
        print("CUDA NOT detected. Using standard CPU training (No Quantization).")
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_name,
            trust_remote_code=True
        ).to(device)

    # Load Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # LoRA Configuration
    peft_config = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["q", "v"] # Target attention layers
    )

    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    # Load Dataset
    data_files = {"train": train_file}
    dataset = load_dataset("json", data_files=data_files)

    def preprocess_function(examples):
        """Tokenize a batch of records into prompt ids and docstring label ids."""
        inputs = [
            f"Generate a documentation string for this function:\n{lang}: {code}"
            for lang, code in zip(examples["language"], examples["code"])
        ]
        targets = examples["docstring"]

        # No padding here: the data collator pads every batch dynamically and
        # fills label padding with -100, which the loss ignores. Padding the
        # labels to max_length with the pad token id would instead make the
        # model train on padding tokens.
        model_inputs = tokenizer(inputs, max_length=max_source_length, truncation=True)
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    # Drop the raw string columns so only tensor-convertible features reach the collator.
    tokenized_dataset = dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=dataset["train"].column_names,
    )

    # Training Arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        num_train_epochs=epochs,
        logging_steps=10,
        save_strategy="epoch",
        eval_strategy="no",
        fp16=use_cuda,
        use_cpu=not use_cuda,
        optim="paged_adamw_8bit" if use_cuda else "adamw_torch",
        ddp_find_unused_parameters=False if (use_cuda and torch.cuda.device_count() > 1) else None,
        report_to="none"
    )

    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        label_pad_token_id=-100,  # padded label positions are excluded from the loss
        pad_to_multiple_of=8 if use_cuda else None,  # tensor-core friendly shapes under fp16
    )

    # Newer transformers releases take the tokenizer as `processing_class` and
    # deprecate the `tokenizer` keyword; use whichever the installed version has.
    trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        data_collator=data_collator,
        **{tokenizer_kwarg: tokenizer},
    )

    print("Starting training...")
    trainer.train()

    print(f"Saving model to {output_dir}")
    trainer.save_model(output_dir)

In [None]:
# 4. Execute Training
# Single-epoch run on the JSONL file written by the preparation step.
train(
    "processed_data/training_data_train.jsonl",
    "results/codet5-finetuned",
    batch_size=4,
    epochs=1,
    learning_rate=2e-5,
)

In [None]:
# 5. Save/Download Model
!zip -r model_output.zip results/codet5-finetuned