# Generate Synthetic Training Data

<a target="_blank" href="https://colab.research.google.com/github/simonguest/CS-394/blob/main/src/06/notebooks/generate-synthetic.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
<a target="_blank" href="https://github.com/simonguest/CS-394/raw/refs/heads/main/src/06/notebooks/generate-synthetic.ipynb">
  <img src="https://img.shields.io/badge/Download_.ipynb-blue" alt="Download .ipynb"/>
</a>

## Data generation settings

In [40]:
import os

# Requested number of examples per split. The trailing `# @param` annotations
# are kept exactly as written.
NUM_TRAIN_EXAMPLES = 8000  # @param {type:"number"}
NUM_VAL_EXAMPLES = 1000  # @param {type:"number"}
NUM_TEST_EXAMPLES = 100 # @param {type:"number"}
# Sampling temperature passed to the generation model.
TEMPERATURE = 0.8  # @param {type:"number"}

# Output directory for the generated .jsonl files. Created with os.makedirs
# instead of the `!mkdir -p` shell escape so the cell does not depend on
# IPython or on a POSIX `mkdir` being available.
DATA_FOLDER = "./.data/generated"
os.makedirs(DATA_FOLDER, exist_ok=True)

# Model identifier sent to the OpenRouter endpoint for data generation.
DATAGEN_MODEL = "openai/gpt-5.2-codex"

## Dataset diversity

In [41]:
# Coaching topics; one is drawn uniformly at random for every generated example.
TOPICS = [
    "Anti-air consistency (DP vs normal vs air-to-air)",
    "Whiff punishing basics",
    "Hit-confirming lights into special",
    "Strike/throw offense",
    "Meaty timing after knockdown",
    "Defense: blocking high/low + delay tech",
    "Throw tech vs shimmy awareness",
    "Drive Impact reactions and counters",
    "Drive Rush pressure and checking DR",
    "Corner control and corner escape",
    "Meter management (Drive Gauge + Super)",
    "Matchup basics vs shotos (Ryu/Ken/Luke)",
    "Matchup basics vs grapplers (Gief/Manon)",
    "Matchup basics vs zoners (Dhalsim/JP)",
    "Neutral: spacing with pokes",
    "Punish combos: light/medium/heavy starters",
    "Mental stack + decision making",
]

# Depth labels sampled once per example; every label gets the same weight.
COACH_DEPTH = ["short", "paragraph", "small_function", "large_function"]
COACH_DEPTH_WEIGHTS = [0.25] * len(COACH_DEPTH)

## Model for structured output

In [48]:
from pydantic import BaseModel
from typing import List, Literal, Optional

class SF6CoachAnswer(BaseModel):
    # Structured coaching reply returned by the generation model.
    student_goal: str
    situation_summary: str
    # What's going wrong.
    diagnosis: list[str]
    # What to learn.
    key_concepts: list[str]
    # Actionable plan.
    gameplan: list[str]
    # Training mode drills.
    drills: list[str]
    mistakes_to_avoid: list[str]
    # 1-liners.
    quick_cheatsheet: list[str]
    # Supportive, short.
    confidence_note: str

class SF6Conversation(BaseModel):
    # One generated exchange; passed as the structured-output format when
    # requesting a completion.
    # The student's message.
    user: str
    # The coach's structured reply.
    assistant: SF6CoachAnswer

## Get OpenRouter API key

In [47]:
import sys
import os
from dotenv import load_dotenv

# Outside Colab, call load_dotenv() (presumably picks the key up from a local
# .env file -- confirm). Inside Colab, copy the 'OpenRouter' user secret into
# the OPENROUTER_API_KEY environment variable.
if 'google.colab' not in sys.modules:
  load_dotenv()
else:
  from google.colab import userdata # type:ignore
  os.environ['OPENROUTER_API_KEY'] = userdata.get('OpenRouter')

## Conversation generation functions

In [44]:
import openai
import os

# OpenAI SDK client pointed at the OpenRouter API base URL; the key is read
# from the OPENROUTER_API_KEY environment variable (None if it is unset).
client = openai.OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ.get("OPENROUTER_API_KEY"),
)

def generate_completion(prompt: str) -> SF6Conversation | None:
    """Request one structured completion for `prompt`.

    The prompt is sent as a single user message to DATAGEN_MODEL at
    TEMPERATURE, without streaming, asking for output in the
    SF6Conversation format.

    Args:
        prompt: Full instruction text for the model.

    Returns:
        The response's `output_parsed` value, which may be None.
    """
    user_message = {"role": "user", "content": prompt}
    parsed_response = client.responses.parse(
        input=[user_message],
        model=DATAGEN_MODEL,
        text_format=SF6Conversation,
        temperature=TEMPERATURE,
        stream=False,
    )
    return parsed_response.output_parsed

def create_conversation(topic: str, code_length: str) -> SF6Conversation | None:
    """Build the coaching prompt for `topic` and request one conversation.

    Args:
        topic: Coaching topic interpolated into the prompt.
        code_length: Depth label chosen by the caller. It is accepted so the
            signature stays unchanged, but it does not influence the prompt.

    Returns:
        Whatever `generate_completion` returns for the prompt: a parsed
        SF6Conversation, or None.
    """
    # NOTE(review): the previous if/elif chain turned `code_length` into a
    # "lines of Python code" request string that was never placed in the
    # prompt. That dead code is removed; the prompt text itself is unchanged.
    prompt = f"""
    You are a Street Fighter 6 coach.

    Create ONE realistic coaching exchange about: {topic}.
    Student rank: choose one of [Rookie, Iron, Bronze, Silver, Gold, Platinum, Diamond, Master].
    Character: choose one of [Ryu, Ken, Luke, Juri, Chun-Li, Cammy, Zangief, Manon, Marisa, JP, Guile].

    Return:
    1) A realistic student message (1-4 sentences) describing a problem, goal, or match situation.
    2) A coaching answer for a high school student:
      - 3-6 bullet diagnosis points
      - a simple gameplan (3-6 bullets)
      - 2-5 drills for Training Mode
      - 3 common mistakes to avoid
      - end with one short encouraging line

    Do NOT mention that you are generating data.
    """

    return generate_completion(prompt)

## Dataset generation functions

In [45]:
import random
import json
from tqdm import tqdm

def generate_dataset(num_examples: int, filename: str) -> None:
  """Generate `num_examples` chat records and write them to `filename` as JSON Lines.

  Each example draws a random topic and a weighted-random depth label, then
  requests a conversation, retrying until one is returned. Every record has
  the shape {"messages": [user, assistant]}.

  Args:
      num_examples: Number of records to write.
      filename: Path of the output file; it is overwritten.
  """
  with open(filename, "w", encoding="utf-8") as f:
    for idx in tqdm(range(num_examples)):
      topic = random.choice(TOPICS)
      code_length = random.choices(COACH_DEPTH, weights=COACH_DEPTH_WEIGHTS)[0]

      # Retry with the same topic and depth until a conversation comes back.
      while (conversation := create_conversation(topic, code_length)) is None:
        print(f"Error generating conversation for example {idx}")

      # SF6Conversation exposes `user` and `assistant`; it has no `code` or
      # `explanation` fields. The assistant side is a structured model, so it
      # is serialized to a JSON string to serve as the message content.
      record = {
          "messages": [
              {"role": "user", "content": conversation.user},
              {
                  "role": "assistant",
                  "content": conversation.assistant.model_dump_json(),
              },
          ]
      }
      f.write(json.dumps(record) + "\n")
      # Flush per record so finished examples reach disk if the run is interrupted.
      f.flush()
  # The `with` block closes the file, so no explicit flush()/close() is needed.

## Generate all the data!

In [46]:
from datetime import datetime

# Take the timestamp once so all three split files from a run share the same
# suffix; separate datetime.now() calls could straddle a second boundary.
RUN_TIMESTAMP = datetime.now().strftime('%Y-%m-%d-%H:%M:%S')

TRAIN_FILE = f"{DATA_FOLDER}/train_{RUN_TIMESTAMP}.jsonl"
VALID_FILE = f"{DATA_FOLDER}/valid_{RUN_TIMESTAMP}.jsonl"
TEST_FILE = f"{DATA_FOLDER}/test_{RUN_TIMESTAMP}.jsonl"

# NOTE(review): only 10 training examples are written here. VALID_FILE and
# TEST_FILE are defined but not generated in this cell, and the NUM_*_EXAMPLES
# settings are not used.
generate_dataset(10, TRAIN_FILE)


100%|██████████| 10/10 [00:37<00:00,  3.79s/it]
