# Data Preprocessing for Fine-tuning

This notebook prepares the collected FastAPI code examples for model fine-tuning:
1. Load and clean the collected examples
2. Format data for the tokenizer
3. Create train/test/dev split (80/10/10)
4. Save processed datasets

In [1]:
import json
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset
from typing import List, Dict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Paths
RAW_DATA_PATH = Path('../data/raw')
PROCESSED_DATA_PATH = Path('../data/processed')
# parents=True also creates any missing parent directories instead of raising
PROCESSED_DATA_PATH.mkdir(parents=True, exist_ok=True)

# Load raw data
# Pin the encoding: without it open() falls back to the platform locale,
# which can mis-decode non-ASCII characters in the collected code examples.
with open(RAW_DATA_PATH / 'fastapi_code_examples.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

df = pd.DataFrame(data)
print(f'Loaded {len(df)} examples')

Loaded 1959 examples


In [3]:
# Format examples for the model
def format_example(row: pd.Series) -> str:
    """Render one example as a single marker-delimited string.

    Reads ``context_before``, ``target_line`` and ``context_after`` from
    ``row``, prefixes them with ``<|context|>``, ``<|target|>`` and
    ``<|next|>`` respectively, and joins the three parts with newlines.
    """
    # The markers let the model tell the three parts of an example apart
    segments = (
        f"<|context|>{row['context_before']}",
        f"<|target|>{row['target_line']}",
        f"<|next|>{row['context_after']}",
    )
    return "\n".join(segments)

# Build the model input string for every row
df['formatted_text'] = df.apply(format_example, axis=1)

# Preview the first formatted example under a header and a dashed rule
print(
    'Example of formatted text:',
    '-' * 50,
    df['formatted_text'].iloc[0],
    sep='\n',
)

Example of formatted text:
--------------------------------------------------
<|context|>
<|target|>from fastapi import FastAPI
<|next|>
app = FastAPI()


@app.get("/")
async def root():
    return {"message": "Hello World"}


In [4]:
# Create train/test/dev split (80/10/10)
def create_splits(df: pd.DataFrame, train_size: float = 0.8, test_size: float = 0.1):
    """Shuffle ``df`` and partition it into train, test and dev subsets.

    Args:
        df: The examples to split.
        train_size: Fraction of ``df`` assigned to the train subset.
        test_size: Fraction of ``df`` assigned to the test subset. Whatever
            remains after train and test (``1 - train_size - test_size``)
            becomes the dev subset.

    Returns:
        A ``(train_df, test_df, dev_df)`` tuple.

    Raises:
        ValueError: If ``train_size`` is not strictly between 0 and 1, or if
            ``train_size`` and ``test_size`` leave no room for a non-empty
            test or dev subset.
    """
    if not 0 < train_size < 1:
        raise ValueError(f'train_size must be in (0, 1), got {train_size}')

    # Share of the held-out rows (everything that is not train) that goes to
    # dev. Rounding strips float noise, e.g. 0.1 / (1 - 0.8) is
    # 0.5000000000000001 rather than 0.5, which would shift a row between
    # test and dev once scikit-learn rounds the fraction to a row count.
    holdout_fraction = 1 - train_size
    dev_fraction = round(1 - test_size / holdout_fraction, 10)
    if not 0 < dev_fraction < 1:
        raise ValueError(
            'train_size and test_size must leave room for non-empty test and '
            f'dev subsets, got train_size={train_size}, test_size={test_size}'
        )

    # First split: separate train from the held-out rows
    train_df, temp_df = train_test_split(
        df, train_size=train_size, random_state=42
    )

    # Second split: divide the held-out rows into test and dev. The second
    # value returned by train_test_split is the ``test_size`` portion, so the
    # dev share is what gets passed here.
    test_df, dev_df = train_test_split(
        temp_df, test_size=dev_fraction, random_state=42
    )

    return train_df, test_df, dev_df

# Split the data with the default proportions
train_df, test_df, dev_df = create_splits(df)

# Report each subset's row count and its share of the full frame
print('Dataset splits:')
for size_label, subset in (
    ('Train set size:', train_df),
    ('Test set size: ', test_df),
    ('Dev set size:  ', dev_df),
):
    print(f'{size_label} {len(subset)} ({len(subset)/len(df)*100:.1f}%)')

Dataset splits:
Train set size: 1567 (80.0%)
Test set size:  196 (10.0%)
Dev set size:   196 (10.0%)


In [5]:
# Load the pretrained tokenizer for the CodeGen checkpoint
MODEL_NAME = 'Salesforce/codegen-350M-mono'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Register the three markers written by format_example as additional special tokens
# NOTE(review): this extends the tokenizer's vocabulary; the fine-tuning code
# presumably has to resize the model's token embeddings to match — confirm.
special_tokens = ['<|context|>', '<|target|>', '<|next|>']
tokenizer.add_special_tokens(dict(additional_special_tokens=special_tokens))

# Reuse the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Tokenization function
def tokenize_function(examples: Dict[str, List]) -> Dict[str, List]:
    """Run the module-level ``tokenizer`` over a batch of formatted examples.

    Args:
        examples: Batch mapping that contains a ``'formatted_text'`` entry
            holding the strings to tokenize.

    Returns:
        Whatever ``tokenizer`` returns for that batch, with truncation and
        padding to ``max_length`` enabled and PyTorch tensors requested.
    """
    tokenizer_options = dict(
        truncation=True,
        padding='max_length',
        max_length=512,  # TODO: adjust based on GPU memory
        return_tensors='pt',
    )
    batch_texts = examples['formatted_text']
    return tokenizer(batch_texts, **tokenizer_options)

# Convert each split to an HF dataset and tokenize it
def _tokenize_split(split_df):
    """Wrap ``split_df`` in a ``Dataset`` and map ``tokenize_function`` over it in batches.

    All columns of the converted dataset are removed, so only the
    tokenizer output remains in the result.
    """
    split_dataset = Dataset.from_pandas(split_df)
    return split_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=split_dataset.column_names,
    )

train_dataset = _tokenize_split(train_df)
test_dataset = _tokenize_split(test_df)
dev_dataset = _tokenize_split(dev_df)

Map: 100%|██████████| 1567/1567 [00:00<00:00, 4346.90 examples/s]
Map: 100%|██████████| 1567/1567 [00:00<00:00, 4346.90 examples/s]
Map: 100%|██████████| 196/196 [00:00<00:00, 3910.38 examples/s]
Map: 100%|██████████| 196/196 [00:00<00:00, 3910.38 examples/s]
Map: 100%|██████████| 196/196 [00:00<00:00, 3625.73 examples/s]
Map: 100%|██████████| 196/196 [00:00<00:00, 3625.73 examples/s]


In [6]:
# Write each tokenized split to its own directory under PROCESSED_DATA_PATH
for split_dir, split_dataset in (
    ('train', train_dataset),
    ('test', test_dataset),
    ('dev', dev_dataset),
):
    split_dataset.save_to_disk(PROCESSED_DATA_PATH / split_dir)

# Store the tokenizer (including the added special tokens) next to the data
tokenizer.save_pretrained(PROCESSED_DATA_PATH / 'tokenizer')

print('Saved processed datasets and tokenizer')

Saving the dataset (1/1 shards): 100%|██████████| 1567/1567 [00:00<00:00, 330507.61 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 196/196 [00:00<00:00, 64366.08 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1567/1567 [00:00<00:00, 330507.61 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 196/196 [00:00<00:00, 64366.08 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 196/196 [00:00<00:00, 88970.08 examples/s] 



Saved processed datasets and tokenizer


In [7]:
# Quick validation of splits: size of each tokenized split and its share of df
print('Dataset statistics:')
for count_label, tokenized_split in (
    ('Train examples:', train_dataset),
    ('Test examples: ', test_dataset),
    ('Dev examples:  ', dev_dataset),
):
    print(f'{count_label} {len(tokenized_split)} ({len(tokenized_split)/len(df)*100:.1f}%)')

# Decode the first example of each split and show its first 300 characters
separator = '=' * 50
splits_by_name = {'Train': train_dataset, 'Test': test_dataset, 'Dev': dev_dataset}
for split_name, dataset in splits_by_name.items():
    print(f'\n{split_name} split example:')
    print(separator)
    example = dataset[0]
    decoded = tokenizer.decode(example['input_ids'])
    print(decoded[:300], '...')
    print(separator)

Dataset statistics:
Train examples: 1567 (80.0%)
Test examples:  196 (10.0%)
Dev examples:   196 (10.0%)

Train split example:
<|context|>from typing import Annotated, Union

from fastapi import Depends, FastAPI

app = FastAPI()


async def common_parameters(
    q: Union[str, None] = None, skip: int = 0, limit: int = 100
):
    return {"q": q, "skip": skip, "limit": limit}


<|target|>@app.get("/items/")
<|next|>async def  ...

Test split example:
<|context|>
<|target|>from fastapi import FastAPI
<|next|>from pydantic import BaseModel

app = FastAPI()


class Item(BaseModel):
    name: str
    description: str | None = None
    price: float
    tax: float = 10.5<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|en ...

Dev split example:
<|context|>

class UserIn(BaseModel):
    username: str
    password: str
    email: EmailStr
    full_name: Union[str, None] = None


class UserOut(BaseModel):
    username: str
    email: EmailStr
    full_name: Union[st