# Training Data Collection

This notebook collects training data for code line prediction by:
1. Downloading FastAPI example code files
2. Extracting code contexts (15 lines before, 10 lines after) for each line
3. Preparing the dataset for fine-tuning

In [37]:
import requests
from bs4 import BeautifulSoup
from pathlib import Path
import ast
from typing import List, Dict, Tuple
import pandas as pd
import random

In [None]:
# FastAPI documentation URLs with example code
# TODO: add more later
FASTAPI_DOCS = [
    'https://fastapi.tiangolo.com/tutorial/first-steps/',
    'https://fastapi.tiangolo.com/tutorial/path-params/',
    'https://fastapi.tiangolo.com/tutorial/query-params/',
    'https://fastapi.tiangolo.com/tutorial/body/',
    'https://fastapi.tiangolo.com/tutorial/response-model/',
    'https://fastapi.tiangolo.com/tutorial/path-params-numeric-validations/',
    'https://fastapi.tiangolo.com/tutorial/body-multiple-params/',
    'https://fastapi.tiangolo.com/tutorial/body-fields/',
    'https://fastapi.tiangolo.com/tutorial/dependencies/'
]

In [None]:
def extract_python_code(url: str) -> List[str]:
    """Extract Python code blocks from FastAPI documentation page"""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all code blocks
    code_blocks = soup.find_all('div', class_='highlight')
    
    # Extract code from each block
    code_samples = []
    for block in code_blocks:
        code = block.get_text()
        code = code.strip()
        
        if code and ('fastapi' in code.lower() or '@app' in code):
            code_samples.append(code)
    
    return code_samples

In [None]:
def is_valid_python_line(line: str) -> bool:
    """Check if a line is a valid Python code (and not empty/comment)"""
    line = line.strip()

    # Ignore empty lines and comments
    if not line or line.startswith('#'):
        return False

    try:
        ast.parse(line)
        return True
    except SyntaxError:
        # Some lines might be valid but incomplete (e.g. function definitions)
        # we lose some precision here, but gain recall :)
        return len(line) > 5 and not line.startswith((' ', '\t', '"""', '\n'))
    except Exception:
        return False

In [30]:
def extract_context(code: str, line_idx: int, before: int = 15, after: int = 10) -> Tuple[str, str, str]:
    """Extract context before and after a specific line"""
    lines = code.split('\n')
    
    # Get the target line
    target_line = lines[line_idx].strip()
    
    # Get context before
    start_idx = max(0, line_idx - before)
    context_before = '\n'.join(lines[start_idx:line_idx])
    
    # Get context after
    end_idx = min(len(lines), line_idx + after + 1)
    context_after = '\n'.join(lines[line_idx + 1:end_idx])
    
    return context_before, target_line, context_after

In [None]:
def process_code_sample(code: str) -> List[Dict[str, str]]:
    """Process a code sample to extract training examples"""
    examples = []
    lines = code.split('\n')
    
    for i, line in enumerate(lines):
        if is_valid_python_line(line):
            context_before, target, context_after = extract_context(code, i)
            if context_before or context_after:  # Skip if no context
                # not fastest way to do this, but we don't care about performance here
                examples.append({
                    'context_before': context_before,
                    'target_line': target,
                    'context_after': context_after
                })
    
    return examples

In [43]:
# Collect all examples
all_examples = []

for url in FASTAPI_DOCS:
    print(f'Processing {url}')
    code_samples = extract_python_code(url)
    for code in code_samples:
        examples = process_code_sample(code)
        all_examples.extend(examples)

print(f'Collected {len(all_examples)} examples')

Processing https://fastapi.tiangolo.com/tutorial/first-steps/
Processing https://fastapi.tiangolo.com/tutorial/path-params/
Processing https://fastapi.tiangolo.com/tutorial/path-params/
Processing https://fastapi.tiangolo.com/tutorial/query-params/
Processing https://fastapi.tiangolo.com/tutorial/query-params/
Processing https://fastapi.tiangolo.com/tutorial/body/
Processing https://fastapi.tiangolo.com/tutorial/body/
Processing https://fastapi.tiangolo.com/tutorial/response-model/
Processing https://fastapi.tiangolo.com/tutorial/response-model/
Processing https://fastapi.tiangolo.com/tutorial/path-params-numeric-validations/
Processing https://fastapi.tiangolo.com/tutorial/path-params-numeric-validations/
Processing https://fastapi.tiangolo.com/tutorial/body-multiple-params/
Processing https://fastapi.tiangolo.com/tutorial/body-multiple-params/
Processing https://fastapi.tiangolo.com/tutorial/body-fields/
Processing https://fastapi.tiangolo.com/tutorial/body-fields/
Processing https:/

In [33]:
# Path for saving raw data
RAW_DATA_PATH = Path('../data/raw')
RAW_DATA_PATH.mkdir(exist_ok=True)

In [42]:
len(all_examples)  # something is working!

1959

In [44]:
# Convert to DataFrame
df = pd.DataFrame(all_examples)

# Basic stats
print(f'Total examples collected: {len(df)}')
print('\nContext stats (lines):')
stats = df.agg({
    'context_before': lambda x: x.str.count('\n').mean(),
    'context_after': lambda x: x.str.count('\n').mean()
}).round(1)
print(f'Average lines before: {stats["context_before"]}') 
print(f'Average lines after: {stats["context_after"]}') 

Total examples collected: 1959

Context stats (lines):
Average lines before: 8.4
Average lines after: 5.9


In [46]:
# Save data
output_file = RAW_DATA_PATH / 'fastapi_code_examples.json'
df.to_json(output_file, orient='records', indent=2, force_ascii=False)
print(f'\nSaved {len(df)} examples to {output_file}')


Saved 1959 examples to ../data/raw/fastapi_code_examples.json


In [None]:
# Interactive example viewer
def display_example(example: dict, index: int = None):
    """Display a single example in a readable format"""
    header = f"Example {index}" if index is not None else "Example"
    print(f"\n{'='*20} {header} {'='*20}")
    print("\033[94mContext before:\033[0m")
    print(example['context_before'])
    print("\n\033[92mTarget line:\033[0m")
    print(example['target_line'])
    print("\n\033[94mContext after:\033[0m")
    print(example['context_after'])
    print('=' * (42 + len(header)))

idx = random.randint(0, len(df) - 1)
display_example(df.iloc[idx], idx)


[94mContext before:[0m
from fastapi import Depends, FastAPI

app = FastAPI()


async def common_parameters(q: str | None = None, skip: int = 0, limit: int = 100):
    return {"q": q, "skip": skip, "limit": limit}


@app.get("/items/")
async def read_items(commons: Annotated[dict, Depends(common_parameters)]):
    return commons


@app.get("/users/")

[92mTarget line:[0m
async def read_users(commons: Annotated[dict, Depends(common_parameters)]):

[94mContext after:[0m
    return commons
