# Generate Synthetic Training Data

<a target="_blank" href="https://colab.research.google.com/github/simonguest/CS-394/blob/main/src/06/notebooks/generate-synthetic.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
<a target="_blank" href="https://github.com/simonguest/CS-394/raw/refs/heads/main/src/06/notebooks/generate-synthetic.ipynb">
  <img src="https://img.shields.io/badge/Download_.ipynb-blue" alt="Download .ipynb"/>
</a>

## Data generation settings

In [5]:
import os

# How many conversations to generate for each split (editable as Colab form fields).
NUM_TRAIN_EXAMPLES = 600  # @param {type:"number"}
NUM_VAL_EXAMPLES = 100  # @param {type:"number"}
NUM_TEST_EXAMPLES = 10  # @param {type:"number"}
# Sampling temperature forwarded to the generator model on every request.
TEMPERATURE = 0.8  # @param {type:"number"}

# Output directory for the generated files. Created with the standard library
# rather than a `!mkdir -p` shell call so the cell does not depend on a POSIX shell.
DATA_FOLDER = "./.data/generated"
os.makedirs(DATA_FOLDER, exist_ok=True)

# Model identifier sent with each generation request.
DATAGEN_MODEL = "openai/gpt-5-nano"

## Dataset diversity

In [6]:
# Coaching themes. One entry is picked at random for every generated example.
TOPICS = [
    # Core offense / defense
    "Anti-air consistency (DP vs normal vs air-to-air)",
    "Whiff punishing basics",
    "Hit-confirming lights into special",
    "Strike/throw offense",
    "Meaty timing after knockdown",
    "Defense: blocking high/low + delay tech",
    "Throw tech vs shimmy awareness",
    # Drive system, positioning and resources
    "Drive Impact reactions and counters",
    "Drive Rush pressure and checking DR",
    "Corner control and corner escape",
    "Meter management (Drive Gauge + Super)",
    # Matchup archetypes
    "Matchup basics vs shotos (Ryu/Ken/Luke/Sagat)",
    "Matchup basics vs grapplers (Gief/Manon/Lily)",
    "Matchup basics vs rush-down/mix-ups (Kimberly/Jamie/Elena)",
    "Matchup basics vs zoners (Dhalsim/JP)",
    "Matchup basics vs charged characters (Honda/Blanka/Bison/Guile/Deejay)",
    # Neutral, punishes and decision making
    "Neutral: spacing with pokes",
    "Punish combos: light/medium/heavy starters",
    "Mental stack + decision making",
]

# Rank buckets. A bucket is drawn first (weighted by PLAYER_RANK_WEIGHTS),
# then a single rank is drawn from inside that bucket.
PLAYER_RANK = [
    [
        "Rookie",
        "Iron",
        "Bronze",
        "Silver",
        "Gold",
    ],
    [
        "Platinum",
        "Diamond",
    ],
    [
        "1300MR",
        "1400MR",
    ],
    [
        "1500MR",
        "1600MR",
        "1700MR",
        "1800MR",
    ],
]

# One weight per bucket above; every bucket is equally likely.
PLAYER_RANK_WEIGHTS = [0.25] * len(PLAYER_RANK)

# Roster used for both the student's character and the opponent's character.
# Order is kept stable so seeded random draws stay reproducible.
CHARACTERS = [
    "Ryu", "Ken", "Luke", "Juri", "Chun-Li",
    "Cammy", "Zangief", "Manon", "Marisa", "JP",
    "Guile", "Ed", "Dhalsim", "Lily", "Rashid",
    "Terry", "Mai", "Sagat", "Elena", "Honda",
    "Kimberly", "AKI", "Bison", "Akuma", "Deejay",
    "Jamie",
]

## Model for structured output

In [7]:
from pydantic import BaseModel
from typing import List, Literal, Optional

# Structured coaching reply. This is the `assistant` half of SF6Conversation and is
# what gets serialized (via model_dump) into the assistant message of each record.
# All fields are required: two free-text strings up front, six lists of bullet
# strings, and one closing free-text string.
class SF6CoachAnswer(BaseModel):
    student_goal: str
    situation_summary: str
    diagnosis: List[str]              # what's going wrong
    key_concepts: List[str]           # what to learn
    gameplan: List[str]               # actionable plan
    drills: List[str]                 # training mode drills
    mistakes_to_avoid: List[str]
    quick_cheatsheet: List[str]       # 1-liners
    confidence_note: str              # supportive, short

# One synthetic training pair. This model is passed as `text_format` when parsing
# the generator's response, so its shape is the shape the model must return.
class SF6Conversation(BaseModel):
    # The student's message (becomes the "user" turn of a record).
    user: str
    # The structured coach reply (becomes the "assistant" turn of a record).
    assistant: SF6CoachAnswer

## Get OpenRouter API key

In [8]:
import sys
import os
from dotenv import load_dotenv

# Outside Colab, load_dotenv() is called to pull variables from a .env file.
# Inside Colab, the 'OpenRouter' user secret is copied into the environment.
if 'google.colab' not in sys.modules:
  load_dotenv()
else:
  from google.colab import userdata # type:ignore
  os.environ['OPENROUTER_API_KEY'] = userdata.get('OpenRouter')

## Conversation generation functions

In [9]:
import openai
import os

# OpenAI SDK client pointed at the OpenRouter endpoint; the key is read from the
# OPENROUTER_API_KEY environment variable (None if it is not set).
client = openai.OpenAI(
    api_key=os.getenv("OPENROUTER_API_KEY"),
    base_url="https://openrouter.ai/api/v1",
)

def generate_completion(prompt: str) -> SF6Conversation | None:
    """Send one prompt to the generator model and return its parsed reply.

    The prompt is sent as a single user message to ``DATAGEN_MODEL`` with
    ``TEMPERATURE`` as the sampling temperature, non-streaming, and with
    ``SF6Conversation`` as the requested structured output format.

    Args:
        prompt: Full prompt text for the model.

    Returns:
        The response's ``output_parsed`` value, typed here as an
        ``SF6Conversation`` or ``None``.
    """
    user_turn = {"role": "user", "content": prompt}

    parsed_response = client.responses.parse(
        text_format=SF6Conversation,
        stream=False,
        temperature=TEMPERATURE,
        input=[user_turn],
        model=DATAGEN_MODEL,
    )
    return parsed_response.output_parsed

def create_conversation(
    topic: str,
    player_rank: str,
    player_character: str,
    opponent_character: str
) -> SF6Conversation | None:
    """Build the coaching prompt for one scenario and request a structured conversation.

    The JSON skeleton embedded in the prompt lists exactly the fields of
    ``SF6CoachAnswer``. The prompt tells the model to match the schema exactly
    and add no extra keys, so any drift between the skeleton and the model
    gives the generator contradictory instructions.

    Args:
        topic: Coaching theme for the conversation.
        player_rank: Student's rank label; the prompt asks for advice at this level.
        player_character: Character the student plays.
        opponent_character: Character the student is struggling against. Empty,
            ``None``, or a placeholder such as "any" / "random" / "unknown" means
            no specific matchup; the prompt then shows the opponent as "unknown"
            and asks for universal advice.

    Returns:
        Whatever ``generate_completion`` returns for the prompt (a parsed
        conversation, or ``None``).
    """
    opp = (opponent_character or "").strip()
    has_matchup = bool(opp) and opp.lower() not in {
        "any", "unknown", "n/a", "na", "none", "random", "all", "everyone", "?", "tbd"
    }

    # Placeholders are treated as "no matchup" above, so show them as "unknown"
    # rather than echoing e.g. "random" next to a rule about a missing opponent.
    opponent_label = opp if has_matchup else "unknown"

    matchup_rule = (
        "- Advice MUST reflect matchup specifics against the provided opponent.\n"
        if has_matchup
        else "- If opponent character is missing/unknown, give universal SF6 advice for the topic (not matchup-specific).\n"
             "  - You MAY mention 1 bullet that asks who the opponent is, but still provide complete actionable guidance.\n"
    )

    # Literal braces of the JSON skeleton are doubled because this is an f-string.
    prompt = f"""
    You are a Street Fighter 6 coach.

    Topic: {topic}
    Student rank: {player_rank}
    Student character: {player_character}
    Opponent character: {opponent_label}

    Important rules:
    {matchup_rule}- Advice level MUST match the student rank.
      - Lower ranks → fundamentals, simple concepts.
      - Master MR → advanced spacing traps, conditioning, frame traps, option coverage, mental stack.

    Return ONLY valid JSON matching this schema exactly:

    {{
      "user": "1-4 sentence realistic student message written from the player's perspective",
      "assistant": {{
        "student_goal": "one sentence stating what the student wants to improve",
        "situation_summary": "1-2 sentence summary of the student's situation",
        "diagnosis": ["3-6 bullet strings"],
        "key_concepts": ["2-5 bullet strings"],
        "gameplan": ["3-6 bullet strings"],
        "drills": ["2-5 bullet strings"],
        "mistakes_to_avoid": ["exactly 3 bullet strings"],
        "quick_cheatsheet": ["3-5 one-line reminders"],
        "confidence_note": "one short supportive sentence"
      }}
    }}

    Do not wrap in code fences.
    Do not add extra keys.
    """

    return generate_completion(prompt)

## Dataset generation functions

In [10]:
import random
import json
from tqdm import tqdm

def generate_dataset_files(num_examples: int, jsonl_path: str, pretty_json_path: str) -> None:
    """Generate conversations and save them as JSONL plus an indented JSON array.

    For each example a random topic, rank (bucket first, then rank within the
    bucket), student character and a different opponent character are sampled,
    and ``create_conversation`` is called up to 8 times until it returns a
    non-``None`` result. Successful records are appended to ``jsonl_path`` as
    they are produced; the full list is written to ``pretty_json_path`` at the end.

    Args:
        num_examples: Number of examples to attempt.
        jsonl_path: Output path for the one-record-per-line file.
        pretty_json_path: Output path for the indented JSON array of all records.
    """
    max_attempts = 8
    written = []

    with open(jsonl_path, "w", encoding="utf-8") as jsonl_out:
        for example_idx in tqdm(range(num_examples)):
            # Scenario sampling: topic, then rank bucket -> rank, then the two characters.
            topic = random.choice(TOPICS)
            tier = random.choices(PLAYER_RANK, weights=PLAYER_RANK_WEIGHTS)[0]
            rank = random.choice(tier)
            hero = random.choice(CHARACTERS)
            # Mirror matches are excluded by removing the student's character first.
            rivals = [name for name in CHARACTERS if name != hero]
            villain = random.choice(rivals)

            # Retry the same scenario until a conversation comes back or attempts run out.
            result = None
            for _ in range(max_attempts):
                result = create_conversation(topic, rank, hero, villain)
                if result is not None:
                    break

            if result is None:
                print(f"Failed generating example {example_idx}")
                continue

            user_turn = {"role": "user", "content": result.user}
            assistant_turn = {"role": "assistant", "content": result.assistant.model_dump()}
            entry = {"messages": [user_turn, assistant_turn]}

            jsonl_out.write(json.dumps(entry, ensure_ascii=False) + "\n")
            written.append(entry)

    with open(pretty_json_path, "w", encoding="utf-8") as pretty_out:
        json.dump(written, pretty_out, ensure_ascii=False, indent=2)

## Generate all the data!

In [11]:
from datetime import datetime

# Take the timestamp once so all six paths of a run share the same suffix
# (separate datetime.now() calls can straddle a second boundary).
# NOTE(review): the ':' characters in the stamp are not valid in Windows filenames.
RUN_TIMESTAMP = datetime.now().strftime('%Y-%m-%d-%H:%M:%S')

# Each split gets a JSONL file plus an indented ".pretty.json" copy. Every path
# carries its own split prefix and extension so no file overwrites another.
TRAIN_FILE = f"{DATA_FOLDER}/train_{RUN_TIMESTAMP}.jsonl"
train_pretty = f"{DATA_FOLDER}/train_{RUN_TIMESTAMP}.pretty.json"
VALID_FILE = f"{DATA_FOLDER}/valid_{RUN_TIMESTAMP}.jsonl"
valid_pretty = f"{DATA_FOLDER}/valid_{RUN_TIMESTAMP}.pretty.json"
TEST_FILE = f"{DATA_FOLDER}/test_{RUN_TIMESTAMP}.jsonl"
test_pretty = f"{DATA_FOLDER}/test_{RUN_TIMESTAMP}.pretty.json"

generate_dataset_files(NUM_TRAIN_EXAMPLES, TRAIN_FILE, train_pretty)
generate_dataset_files(NUM_VAL_EXAMPLES, VALID_FILE, valid_pretty)
# generate_dataset_files takes both the JSONL path and the pretty path.
generate_dataset_files(NUM_TEST_EXAMPLES, TEST_FILE, test_pretty)


  0%|          | 0/800 [00:15<?, ?it/s]


KeyboardInterrupt: 