# Fine-tuning CodeGen for FastAPI Code Generation

This notebook fine-tunes the `Salesforce/codegen-350M-mono` (CodeGen-350M) model on our preprocessed FastAPI code examples using the Hugging Face `Trainer`.

## Google Colab Setup
First, we need to mount Google Drive, check GPU availability and install required packages.

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Check GPU availability
!nvidia-smi

# Install required packages
!pip install torch transformers datasets numpy

# Setup paths
from pathlib import Path

# Set project paths
BASE_PATH = Path('/content/drive/MyDrive/fastapi-codegen')
PROCESSED_DATA_PATH = BASE_PATH / 'data/processed'
MODEL_PATH = BASE_PATH / 'models'

# Create directories
MODEL_PATH.mkdir(parents=True, exist_ok=True)
(MODEL_PATH / 'checkpoints').mkdir(exist_ok=True)

print(f'Project directory: {BASE_PATH}')
print(f'Processed data directory: {PROCESSED_DATA_PATH}')
print(f'Model directory: {MODEL_PATH}')

In [7]:
import numpy as np
import torch
from datasets import load_from_disk
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
)

# Release cached GPU memory when a CUDA device is available
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
# All inputs are read from PROCESSED_DATA_PATH (defined in the Colab setup cell)

# Load the three dataset splits in the order train, test, dev
train_dataset, test_dataset, dev_dataset = (
    load_from_disk(PROCESSED_DATA_PATH / split_name)
    for split_name in ('train', 'test', 'dev')
)

# Load the tokenizer stored next to the processed data
tokenizer = AutoTokenizer.from_pretrained(PROCESSED_DATA_PATH / 'tokenizer')

# Report how many examples each split holds
for split_label, split_dataset in (
    ('Train', train_dataset),
    ('Test', test_dataset),
    ('Dev', dev_dataset),
):
    print(f'{split_label} size: {len(split_dataset)}')

Train size: 1567
Test size: 196
Dev size: 196


In [9]:
# Pretrained checkpoint to fine-tune
MODEL_NAME = 'Salesforce/codegen-350M-mono'

# Use the GPU when one is visible, otherwise fall back to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pretrained weights (EOS doubles as the padding token id in the
# model config) and place the model on the selected device
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    pad_token_id=tokenizer.eos_token_id,
).to(device)

# Make the embedding matrix match the tokenizer's vocabulary, which includes
# the special tokens added during preprocessing
model.resize_token_embeddings(len(tokenizer))

Some weights of the model checkpoint at Salesforce/codegen-350M-mono were not used when initializing CodeGenForCausalLM: ['transformer.h.0.attn.causal_mask', 'transformer.h.1.attn.causal_mask', 'transformer.h.10.attn.causal_mask', 'transformer.h.11.attn.causal_mask', 'transformer.h.12.attn.causal_mask', 'transformer.h.13.attn.causal_mask', 'transformer.h.14.attn.causal_mask', 'transformer.h.15.attn.causal_mask', 'transformer.h.16.attn.causal_mask', 'transformer.h.17.attn.causal_mask', 'transformer.h.18.attn.causal_mask', 'transformer.h.19.attn.causal_mask', 'transformer.h.2.attn.causal_mask', 'transformer.h.3.attn.causal_mask', 'transformer.h.4.attn.causal_mask', 'transformer.h.5.attn.causal_mask', 'transformer.h.6.attn.causal_mask', 'transformer.h.7.attn.causal_mask', 'transformer.h.8.attn.causal_mask', 'transformer.h.9.attn.causal_mask']
- This IS expected if you are initializing CodeGenForCausalLM from the checkpoint of a model trained on another task or with another architecture (e

In [None]:
# Training arguments optimized for Google Colab
import inspect

# transformers renamed `evaluation_strategy` to `eval_strategy` and newer
# releases no longer accept the old spelling. The install in this notebook is
# unpinned, so use whichever keyword this TrainingArguments actually accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir=str(MODEL_PATH / 'checkpoints'),
    eval_steps=100,
    # Kept at a multiple of eval_steps, which load_best_model_at_end requires
    save_steps=200,
    # With 1567 training examples, an effective batch of 2 * 8 and 3 epochs
    # the run has roughly 294 optimizer steps (assuming a single device), so
    # the library's default logging interval would never report a training
    # loss. Log at the same cadence as evaluation instead.
    logging_steps=100,
    learning_rate=5e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    # Mixed precision only when CUDA is present: TrainingArguments rejects
    # fp16=True on a CPU-only runtime, while the rest of the notebook already
    # falls back to CPU.
    fp16=torch.cuda.is_available(),
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    **{_eval_strategy_key: 'steps'},
)



In [None]:
# Build the stock Trainer: the train split drives optimisation and the dev
# split is used for the periodic evaluations
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=default_data_collator,
    eval_dataset=dev_dataset,
    train_dataset=train_dataset,
)

# Run the fine-tuning loop
print('Starting training...')
trainer.train()

print('\nTraining completed!')

Step,Training Loss,Validation Loss


In [None]:
# Score the fine-tuned model on the held-out test split
test_results = trainer.evaluate(eval_dataset=test_dataset)
print('Test results:', test_results)

In [None]:
# Save final model together with its tokenizer
final_model_dir = str(MODEL_PATH / 'final')
trainer.save_model(final_model_dir)

# The embedding matrix was resized to len(tokenizer), so the saved weights are
# only usable with this exact tokenizer. The Trainer was not given the
# tokenizer, so it is written here explicitly to keep `final/` self-contained.
tokenizer.save_pretrained(final_model_dir)
print('Saved final model')