# Generate Synthetic Training Data

<a target="_blank" href="https://colab.research.google.com/github/simonguest/CS-394/blob/main/src/06/notebooks/generate-synthetic.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
<a target="_blank" href="https://github.com/simonguest/CS-394/raw/refs/heads/main/src/06/notebooks/generate-synthetic.ipynb">
  <img src="https://img.shields.io/badge/Download_.ipynb-blue" alt="Download .ipynb"/>
</a>

## Data generation settings

In [40]:
import os

# Target number of examples for each dataset split (editable as Colab form fields).
NUM_TRAIN_EXAMPLES = 8000  # @param {type:"number"}
NUM_VAL_EXAMPLES = 1000  # @param {type:"number"}
NUM_TEST_EXAMPLES = 100 # @param {type:"number"}
# Sampling temperature sent with every generation request.
TEMPERATURE = 0.8  # @param {type:"number"}

# Folder that receives the generated .jsonl files. Created with the standard
# library (instead of a `!mkdir -p` shell escape) so the cell behaves the same
# on every platform and is a no-op when the folder already exists.
DATA_FOLDER = "./.data/generated"
os.makedirs(DATA_FOLDER, exist_ok=True)

# Model identifier used for every data-generation request.
DATAGEN_MODEL = "openai/gpt-5.2-codex"

## Dataset diversity

In [41]:
# Python topics the generator samples from; one topic is picked at random per
# example and interpolated into the prompt text.
# Every entry ends with a comma (including the last one): adjacent string
# literals without a comma are silently concatenated into a single element.
TOPICS = [
    "Strings",
    "input()",
    "print()",
    "Creating variables",
    "Concatenating strings",
    "Lists",
    "if/else constructs",
    "in operator",
    "list methods: append and remove",
    "list methods: index, pop, and insert",
    "list methods: slicing",
    "list methods: deleting an item",
    "for loops",
    "range() and str()",
    "== comparison operator",
    "len() function",
    "code commenting with #",
    "Comparison operators: !=, >, >=, <, <=",
    "String methods: .lower(), .upper(), .title(), .capitalize()",
    "Using the newline character in strings",
    "int()",
    "float()",
    "elif",
    "import keyword",
    "random module",
    "while keyword",
    "or and not operators",
    "booleans",
    "list methods: .clear(), .copy(), .count(), .extend(), .reverse(), .sort()",
    "Dictionaries",
    "Dictionary methods: .items(), .keys(), .values(), .update(), .pop()",
    "Dictionary methods: .get(), .format()",
    "String methods: .find(), .join(), .replace(), .split(), .swapcase()",
    "Functions: using def and return keywords",
    "Function methods: .isinstance()",
    "Raising exceptions",
    "Exceptions: TypeError() and ValueError()",
    "Function keywords: as and from",
    "The sys module",
    "The with keyword",
    "Tuples",
    "The lambda keyword",
    "The built-in map function",
    "The time module",
    "Built in methods: __init__() and __str__()",
    "Double underscore for private methods",
    "Classes",
]

# Size buckets for the requested code sample; each name is mapped to a concrete
# line-count request when the prompt is built.
CODE_LENGTH = ["short", "paragraph", "small_function", "large_function"]

# Sampling weight for each bucket, listed in the same order as CODE_LENGTH.
CODE_LENGTH_WEIGHTS = [
    0.25,  # short
    0.25,  # paragraph
    0.25,  # small_function
    0.25,  # large_function
]

## Model for structured output

In [42]:
from pydantic import BaseModel

# Structured-output schema for one generated example. It is passed as the
# `text_format` of the model request, and the parsed instance is later written
# out as one user/assistant message pair.
class CodeExplanation(BaseModel):
    # The generated Python snippet; becomes the "user" message content.
    code: str
    # The explanation of that snippet; becomes the "assistant" message content.
    explanation: str

## Get OpenRouter API key

In [43]:
import sys
import os
from dotenv import load_dotenv

# Outside Colab, environment variables (including OPENROUTER_API_KEY) are
# loaded through python-dotenv. Inside Colab, the key is copied from the Colab
# secret named 'OpenRouter' into the environment so later cells can read it
# the same way in both setups.
if 'google.colab' not in sys.modules:
  load_dotenv()
else:
  from google.colab import userdata # type:ignore
  os.environ['OPENROUTER_API_KEY'] = userdata.get('OpenRouter')

google/gemini-3.1-pro-preview
anthropic/claude-sonnet-4.6
qwen/qwen3.5-plus-02-15
qwen/qwen3.5-397b-a17b
minimax/minimax-m2.5
z-ai/glm-5
qwen/qwen3-max-thinking
anthropic/claude-opus-4.6
qwen/qwen3-coder-next
openrouter/free
stepfun/step-3.5-flash:free
stepfun/step-3.5-flash
arcee-ai/trinity-large-preview:free
moonshotai/kimi-k2.5
upstage/solar-pro-3:free
minimax/minimax-m2-her
writer/palmyra-x5
liquid/lfm-2.5-1.2b-thinking:free
liquid/lfm-2.5-1.2b-instruct:free
openai/gpt-audio
openai/gpt-audio-mini
z-ai/glm-4.7-flash
openai/gpt-5.2-codex
allenai/molmo-2-8b
allenai/olmo-3.1-32b-instruct
bytedance-seed/seed-1.6-flash
bytedance-seed/seed-1.6
minimax/minimax-m2.1
z-ai/glm-4.7
google/gemini-3-flash-preview
mistralai/mistral-small-creative
allenai/olmo-3.1-32b-think
xiaomi/mimo-v2-flash
nvidia/nemotron-3-nano-30b-a3b:free
nvidia/nemotron-3-nano-30b-a3b
openai/gpt-5.2-chat
openai/gpt-5.2-pro
openai/gpt-5.2
mistralai/devstral-2512
relace/relace-search
z-ai/glm-4.6v
nex-agi/deepseek-v3.1-nex-

## Conversation generation functions

In [44]:
import openai
import os

# OpenAI SDK client pointed at the OpenRouter endpoint, authenticated with the
# key read from the OPENROUTER_API_KEY environment variable.
client = openai.OpenAI(
    api_key=os.getenv("OPENROUTER_API_KEY"),
    base_url="https://openrouter.ai/api/v1",
)

def generate_completion(prompt: str) -> CodeExplanation | None:
    """Send one prompt to the data-generation model and return the parsed result.

    The request is a single, non-streamed user turn against ``DATAGEN_MODEL``
    at ``TEMPERATURE``, asking for output in the ``CodeExplanation`` format.

    Args:
        prompt: Text sent as the only user message.

    Returns:
        The response's ``output_parsed`` value. The caller treats ``None`` as
        a failed generation and retries.
    """
    user_message = {"role": "user", "content": prompt}
    parsed_response = client.responses.parse(
        model=DATAGEN_MODEL,
        input=[user_message],
        text_format=CodeExplanation,
        temperature=TEMPERATURE,
        stream=False,
    )
    return parsed_response.output_parsed

def create_conversation(topic: str, code_length: str) -> CodeExplanation | None:
    request = ""
    if code_length == "short":
        request = f"2 - 4 lines of Python code about {topic}"
    elif code_length == "paragraph":
        request = f"3 - 6 lines of Python code about {topic}"
    elif code_length == "small_function":
        request = f"a small function (around 10 lines of Python code) about {topic}"
    elif code_length == "large_function":
        request = f"a large function (around 10 - 20 lines of Python code) about {topic}"
    else:
        request = f"a Python code example about {topic}"

    prompt = f"""
        Generate me {request}.

        For this selection of code, generate a short 2 paragraph explanation of what the selected code does:
        - The explanation should be suitable for a high school student learning Python.
        - When it makes sense, the second paragraph of the explanation should use an analogy to help the student better understand the code.
        - DO NOT wrap the code in a ```python block

        Return the following:
        1. The original code as a string.
        2. Your explanation of what the selected code does as a string.
    """

    return generate_completion(prompt)

## Dataset generation functions

In [45]:
import random
import json
from tqdm import tqdm

def generate_dataset(num_examples: int, filename: str) -> None:
  """Generate `num_examples` conversations and write them to a JSON Lines file.

  For each example a topic and a code-length bucket are sampled at random, the
  model is asked for a code snippet plus explanation, and the pair is written
  as one JSON object per line in chat format: a "user" message holding the
  code and an "assistant" message holding the explanation.

  Args:
    num_examples: Number of lines (examples) to write.
    filename: Path of the output file; it is opened in "w" mode, so an
      existing file at that path is overwritten.
  """
  with open(filename, "w", encoding="utf-8") as f:
    for idx in tqdm(range(num_examples)):
      topic = random.choice(TOPICS)
      code_length = random.choices(CODE_LENGTH, weights=CODE_LENGTH_WEIGHTS)[0]

      # Keep retrying the same topic/length until a parsed result comes back.
      conversation = None
      while conversation is None:
        conversation = create_conversation(topic, code_length)
        if conversation is None:
          print(f"Error generating conversation for example {idx}")

      template = {
          "messages": [
              {"role": "user", "content": conversation.code},
              {
                  "role": "assistant",
                  "content": conversation.explanation,
              },
          ]
      }
      f.write(json.dumps(template) + "\n")
      # Flush after every example so an interrupted run keeps what was
      # already generated. The `with` block closes the file on exit, so no
      # explicit close() is needed.
      f.flush()

## Generate all the data!

In [46]:
from datetime import datetime

# Take the timestamp once so all three split files share exactly the same
# suffix; calling datetime.now() per file could straddle a second boundary.
RUN_TIMESTAMP = datetime.now().strftime('%Y-%m-%d-%H:%M:%S')

TRAIN_FILE = f"{DATA_FOLDER}/train_{RUN_TIMESTAMP}.jsonl"
VALID_FILE = f"{DATA_FOLDER}/valid_{RUN_TIMESTAMP}.jsonl"
TEST_FILE = f"{DATA_FOLDER}/test_{RUN_TIMESTAMP}.jsonl"

# NOTE(review): this call writes only 10 training examples, and VALID_FILE and
# TEST_FILE are not generated in this cell - confirm whether this reduced run
# is intentional before relying on the output as the full dataset.
generate_dataset(10, TRAIN_FILE)


100%|██████████| 10/10 [00:37<00:00,  3.79s/it]
