In [None]:
# Install required packages
!pip install -q transformers datasets accelerate huggingface_hub torch sentencepiece
!pip install -q bitsandbytes  # For efficient training

# Login to HuggingFace (you'll be prompted for your token)
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from google.colab import files
import os
import zipfile

# Method 1: Upload individual files (if small)
# Colab shows a file picker; select all four Spider JSON files in one go.
print("Upload the 4 Spider JSON files when prompted...")
uploaded = files.upload()  # Click 'Choose Files' and select all 4 files

# Destination folder for the dataset; created on first run, reused afterwards
spider_dir = '/content/spider'
os.makedirs(spider_dir, exist_ok=True)

# Relocate every uploaded file (keyed by file name) into the Spider folder
for filename in uploaded:
    destination = f'{spider_dir}/{filename}'
    os.rename(filename, destination)
    print(f"✓ Moved {filename} to /content/spider/")

print("\n✓ All files uploaded successfully!")

In [None]:
import json
import torch
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Confirm that every Spider file the notebook reads is in place before loading
import os
spider_path = '/content/spider'
required_files = ['tables.json', 'train_spider.json', 'train_others.json', 'dev.json']

print("Checking Spider files...")
for f in required_files:
    full_path = os.path.join(spider_path, f)
    exists = os.path.exists(full_path)
    marker = '✓' if exists else '✗'
    print(f"  {marker} {f}")
    if exists:
        continue
    # Abort at the first file that is missing (its ✗ line has just been printed)
    raise FileNotFoundError(f"Missing: {f}")

print("\n✓ All required files found!\n")

# Load Spider data
def load_spider_dataset(base_path='/content/spider'):
    """Load the Spider JSON files and format them as prompt/target pairs.

    Reads ``tables.json``, ``train_spider.json``, ``train_others.json`` and
    ``dev.json`` from ``base_path``. Every database in ``tables.json`` is
    rendered as a multi-line schema description, and every example becomes
    a prompt of the form ``"<schema>\\n\\nQ: <question>\\nA:"`` (with real
    newline characters) whose target is the gold SQL query.

    Args:
        base_path: Directory that contains the four Spider JSON files.

    Returns:
        A ``(formatted_data, schema_dict)`` tuple.
        ``formatted_data`` is a list of dicts with the keys ``'input'``,
        ``'target'``, ``'db_id'`` and ``'question'``, ordered as
        train_spider, then train_others, then dev.
        ``schema_dict`` maps each ``db_id`` to its schema text.

    Raises:
        FileNotFoundError: If one of the four files is missing.
        KeyError: If a database entry has no ``'db_id'`` or an example lacks
            ``'question'``, ``'query'`` or ``'db_id'``.
    """

    def read_json(file_path):
        # Explicit encoding so the result never depends on the platform default.
        with open(file_path, 'r', encoding='utf-8') as fh:
            return json.load(fh)

    # Build the db_id -> schema text lookup
    schema_dict = {}
    for db in read_json(f'{base_path}/tables.json'):
        db_id = db['db_id']

        # Prefer the "*_original" spellings, fall back to the plain ones
        table_names = db.get('table_names_original', db.get('table_names', []))
        column_names = db.get('column_names_original', db.get('column_names', []))
        column_types = db.get('column_types', [])

        # Group "name (type)" column descriptions by owning table
        table_columns = {}
        for idx, (table_idx, col_name) in enumerate(column_names):
            if table_idx == -1:
                # Entry attached to no table (presumably the `*` wildcard) - skip
                continue
            col_type = column_types[idx] if idx < len(column_types) else 'text'
            table_columns.setdefault(table_names[table_idx], []).append(
                f"{col_name} ({col_type})"
            )

        # Real newlines here: the previous "\\n" literals put a backslash
        # followed by "n" into every schema instead of a line break.
        schema_lines = [f"Database: {db_id}", "Tables:"]
        for table in table_names:
            cols = table_columns.get(table, [])
            schema_lines.append(f"  - {table}: {', '.join(cols)}")

        schema_dict[db_id] = "\n".join(schema_lines).strip()

    train_spider = read_json(f'{base_path}/train_spider.json')
    train_others = read_json(f'{base_path}/train_others.json')
    dev = read_json(f'{base_path}/dev.json')

    # Combine all training data.
    # NOTE(review): dev.json is folded into the same pool, so it is not a
    # held-out set for anything trained on this output - confirm intended.
    all_train = train_spider + train_others + dev

    print(f"Loaded {len(all_train)} total examples")
    print(f"  - train_spider: {len(train_spider)}")
    print(f"  - train_others: {len(train_others)}")
    print(f"  - dev: {len(dev)}")

    # Format for fine-tuning
    formatted_data = []
    for ex in all_train:
        question = ex['question']
        db_id = ex['db_id']
        # Unknown databases get an empty schema rather than raising
        schema = schema_dict.get(db_id, '')

        # Input: schema + question. NOTE(review): any inference-time prompt
        # must use this same newline-separated layout - verify against caller.
        formatted_data.append({
            'input': f"{schema}\n\nQ: {question}\nA:",
            'target': ex['query'],
            'db_id': db_id,
            'question': question
        })

    return formatted_data, schema_dict


# Load data
train_data, schemas = load_spider_dataset('/content/spider')

# Create train/validation split (90/10).
# The split is positional (no shuffling): the validation set is the final
# 10% of the list in the order returned by load_spider_dataset.
split_idx = int(len(train_data) * 0.9)
train_split = train_data[:split_idx]
val_split = train_data[split_idx:]

# Convert to HuggingFace Dataset
train_dataset = Dataset.from_list(train_split)
val_dataset = Dataset.from_list(val_split)

dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
print("\nDataset split:")
print(f"  Train: {len(train_dataset)}")
print(f"  Validation: {len(val_dataset)}")

In [None]:
# Clear GPU memory first
import torch
import gc

# Collect garbage before emptying the CUDA cache: blocks owned by unreachable
# tensors only become releasable once the garbage collector has freed them.
gc.collect()
torch.cuda.empty_cache()
print("✓ GPU memory cleared")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the CodeT5+ checkpoint and its tokenizer, then switch on gradient checkpointing
model_name = "Salesforce/codet5p-770m"
print(f"Reloading model with gradient checkpointing...")

tokenizer = AutoTokenizer.from_pretrained(model_name)

# float32 weights; device placement is left to device_map="auto"
load_options = dict(torch_dtype=torch.float32, device_map="auto")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, **load_options)

# Gradient checkpointing is enabled to save memory during training
model.gradient_checkpointing_enable()

print("✓ Model reloaded with gradient checkpointing enabled")

# Tokenize datasets
def preprocess_function(examples):
    """Tokenize prompt/target pairs for seq2seq training.

    Args:
        examples: Mapping with ``'input'`` and ``'target'`` entries (lists of
            strings when called via ``dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the inputs (truncated/padded to 512 tokens)
        with an added ``'labels'`` entry holding the target token ids
        (truncated/padded to 256) in which every padding position has been
        replaced by -100.
    """
    inputs = examples['input']
    targets = examples['target']

    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding='max_length'
    )

    # Tokenize targets
    labels = tokenizer(
        targets,
        max_length=256,
        truncation=True,
        padding='max_length'
    )

    # Padding must not contribute to the loss. -100 is the label value the
    # cross-entropy loss ignores; leaving the raw pad token id in place would
    # train the model to emit padding for most of each 256-token target.
    pad_id = tokenizer.pad_token_id

    def mask_padding(token_ids):
        return [tok if tok != pad_id else -100 for tok in token_ids]

    label_ids = labels['input_ids']
    if label_ids and isinstance(label_ids[0], int):
        # Single, non-batched example: one flat list of ids
        model_inputs['labels'] = mask_padding(label_ids)
    else:
        model_inputs['labels'] = [mask_padding(seq) for seq in label_ids]
    return model_inputs

# Apply tokenization to both splits; the raw text columns are dropped so only
# the fields produced by preprocess_function remain.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset['train'].column_names,
    desc="Tokenizing dataset"
)

# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
print("\nTokenization complete!")

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq

# Ultra-low memory configuration, grouped by concern and merged into a single
# TrainingArguments call below.
optimization_settings = dict(
    num_train_epochs=5,
    per_device_train_batch_size=1,   # Minimum batch size
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,  # 1 x 16 keeps the effective batch size at 16
    learning_rate=5e-5,
    weight_decay=0.01,
    max_grad_norm=1.0,               # Gradient clipping
    fp16=False,
    gradient_checkpointing=True,     # Enabled in the args as well as on the model
)

checkpoint_settings = dict(
    eval_strategy="steps",
    eval_steps=1000,                 # Evaluate only every 1000 steps
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=2,              # Keep fewer checkpoints
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,         # Lower eval_loss is better
)

reporting_settings = dict(
    logging_steps=100,
    report_to="none",
    push_to_hub=False,
)

training_args = TrainingArguments(
    output_dir="./codet5p-spider-finetuned",
    remove_unused_columns=False,
    **optimization_settings,
    **checkpoint_settings,
    **reporting_settings,
)

# Collator that batches the tokenized examples for the seq2seq model
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    processing_class=tokenizer,
)

# Summarize the setup for the reader
for message in (
    "✓ Trainer configured with minimum memory settings",
    "Batch size: 1 × 16 gradient accumulation = 16 effective",
    "Training will be slower but should fit in memory",
):
    print(message)

In [None]:
# Print a banner so the start of the training log is easy to find, then train
banner_rule = "=" * 50
print(f"\n{banner_rule}")
print("Starting fine-tuning (low memory mode)...")
print(f"{banner_rule}\n")

trainer.train()