# Dataset Generation Script

This script performs the following operations:
1. Selects up to 5000 training questions and up to 100 test questions for each configured domain (e.g. coding, math, trivia), sampling reproducibly with a fixed seed.
2. Saves the selected datasets.
3. Generates responses using 4 different teacher models via OpenRouter.
4. Runs every model/split combination twice: once with the domain's prompt template applied and once without it.
5. Saves the generated data.

In [None]:
import os
import json
import random
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
# from tqdm.notebook import tqdm # Causing ImportError with IProgress
from tqdm import tqdm # Use standard tqdm instead
from openai import OpenAI

# --- Configuration ---
# Everything in this cell is module-level state read by the later cells:
# the API client used for every request and the constants that size the run.

# Ensure you have your OpenRouter API key set in your environment
# os.environ["OPENROUTER_API_KEY"] = "sk-or-v1-..."
API_KEY = os.getenv("OPENROUTER_API_KEY")

# Fail fast: without a key every request below would fail anyway.
if not API_KEY:
    raise ValueError("Please set OPENROUTER_API_KEY environment variable.")

# The OpenAI client is pointed at the OpenRouter base URL instead of the
# default endpoint; all generation calls go through this single instance.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=API_KEY,
)

# Each domain name is used as a sub-directory of "experiments/" when reading
# raw/selected data and templates and when writing generated output.
DOMAINS = ["general"]
# Model identifiers passed verbatim as the `model` argument of each request.
TEACHER_MODELS = [
    "qwen/qwen-2.5-72b-instruct",
    "qwen/qwen-2.5-7b-instruct",
    "meta-llama/llama-3.1-70b-instruct",
    "meta-llama/llama-3.1-8b-instruct",
]
# Number of records requested for the "train" and "test" splits respectively.
TRAIN_SIZE = 5000
TEST_SIZE = 100
# Seed used when sampling records from the raw files.
SEED = 42
# Two nested thread pools are used: an outer pool of MAX_PARALLEL_CONFIGS
# configurations, each of which opens its own pool of MAX_WORKERS request
# threads, so up to MAX_WORKERS * MAX_PARALLEL_CONFIGS requests can be in
# flight at once.
MAX_WORKERS = 5 # reduced per-config workers since we run configs in parallel
MAX_PARALLEL_CONFIGS = 4 # Number of configurations to run simultaneously


In [6]:
# Smoke test: send one trivial chat request through the configured client so
# that credential or connectivity problems surface before the long generation
# run starts.
try:
    print("Testing OpenRouter connection...")
    completion = client.chat.completions.create(
        model="meta-llama/llama-3.1-8b-instruct",
        messages=[
            {"role": "user", "content": "Hello, world!"}
        ]
    )
    print("Connection successful!")
    print("Response:", completion.choices[0].message.content)
except Exception as e:
    # Report the failure in a readable form, then re-raise so the cell stops
    # with the original error instead of continuing with a broken client.
    print("Connection failed:", str(e))
    raise e

Testing OpenRouter connection...
Connection successful!
Response: Hello! How can I assist you today?


In [7]:
def _read_jsonl(path):
    """Read a JSON Lines file into a list, ignoring blank lines."""
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def load_and_select_data(domain, split, count):
    """
    Load the question set for a domain/split, sampling it on first use.

    If ``experiments/{domain}/{split}_selected.jsonl`` already exists, its
    records are returned unchanged so repeated runs reuse exactly the same
    questions; a size mismatch only produces a warning. Otherwise up to
    ``count`` records are sampled from ``{split}_raw.jsonl``, written to the
    selected file, and returned.

    Args:
        domain: Sub-directory of ``experiments/`` holding the data files.
        split: Split name used as the file-name prefix (e.g. "train").
        count: Maximum number of records to select.

    Returns:
        A list of decoded JSON records (one per non-blank line).

    Raises:
        FileNotFoundError: If neither the selected nor the raw file exists.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    base_path = os.path.join("experiments", domain)
    selected_file = os.path.join(base_path, f"{split}_selected.jsonl")
    raw_file = os.path.join(base_path, f"{split}_raw.jsonl")

    # Reuse a previous selection if there is one.
    if os.path.exists(selected_file):
        print(f"Loading existing selected data for {domain}/{split}...")
        # Blank lines (e.g. a trailing newline added by an editor) are skipped,
        # matching how the raw file is read below.
        data = _read_jsonl(selected_file)
        # Verify size (warn if mismatch, but allow reuse)
        if len(data) != count:
            print(f"Warning: Existing file has {len(data)} items, expected {count}.")
        return data

    print(f"Sampling new data for {domain}/{split}...")
    data = _read_jsonl(raw_file)

    if len(data) > count:
        # A private generator seeded with SEED draws the same sample as seeding
        # the module-level generator would, without resetting the random state
        # shared by the rest of the process.
        selected_data = random.Random(SEED).sample(data, count)
    else:
        if len(data) < count:
            print(f"Warning: Raw file has only {len(data)} items, requested {count}; using all of them.")
        selected_data = data

    # Write to a temporary file and rename it into place so an interrupted run
    # never leaves a truncated "selected" file that later runs would reuse.
    tmp_file = selected_file + ".tmp"
    with open(tmp_file, 'w', encoding='utf-8') as f:
        for item in selected_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    os.replace(tmp_file, selected_file)

    return selected_data

def load_template(domain):
    """Return the contents of ``experiments/{domain}/template.txt`` with
    leading and trailing whitespace removed."""
    template_path = os.path.join("experiments", domain, "template.txt")
    with open(template_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return raw_text.strip()

In [8]:
def generate_single_response(item, model, template_text, use_template):
    """
    Generate a model response for a single dataset item.

    The prompt is the item's ``'question'``; when ``use_template`` is true and
    ``template_text`` is non-empty, the question is substituted for the
    ``[QUESTION]`` placeholder in the template instead. The request is tried up
    to three times with exponential backoff between attempts.

    Args:
        item: Dict with at least a ``'question'`` key; it is copied, not mutated.
        model: Model identifier passed to the chat-completions request.
        template_text: Template containing a ``[QUESTION]`` placeholder, or a
            falsy value when no template is available.
        use_template: Whether the template should be applied.

    Returns:
        A copy of ``item`` extended with ``'model'``, ``'template_applied'``
        (the ``use_template`` flag) and ``'prompt'``, plus ``'response'`` on
        success or ``'error'`` (the last exception as a string) once every
        attempt has failed.

    Raises:
        KeyError: If ``item`` has no ``'question'`` key.
    """
    question = item['question']

    if use_template and template_text:
        # Assume template has [QUESTION] placeholder based on file inspection
        prompt = template_text.replace("[QUESTION]", question)
    else:
        prompt = question

    # Attached to both success and failure records so that a failed item can
    # still be traced back to the exact model/prompt and retried later.
    metadata = {
        'model': model,
        'template_applied': use_template,
        'prompt': prompt,
    }

    # Retry logic
    max_retries = 3
    for attempt in range(max_retries):
        try:
            completion = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )
            response = completion.choices[0].message.content

            result = item.copy()
            result['response'] = response
            result.update(metadata)
            return result

        except Exception as e:
            # Best-effort by design: a failing item is recorded rather than
            # aborting the whole batch it belongs to.
            if attempt == max_retries - 1:
                print(f"Failed to generate for prompt: {prompt[:50]}... Error: {e}")
                result = item.copy()
                result['error'] = str(e)
                result.update(metadata)
                return result
            time.sleep(2 ** attempt)  # Exponential backoff

In [9]:
def run_generation_process(domain, split, data, model, use_template):
    """
    Run generation for one (domain, split, model, template) configuration and
    save the results as JSON Lines.

    The output goes to
    ``experiments/{domain}/{split}_generated_{model_clean}_{template}.jsonl``
    where ``model_clean`` is ``model`` with "/" replaced by "_" and
    ``template`` is "with_template" or "no_template". If that file already
    exists the configuration is skipped, so an interrupted run can be resumed.

    Args:
        domain: Sub-directory of ``experiments/`` to read the template from and
            write the output to.
        split: Split name used as the output file-name prefix.
        data: List of items, each passed to ``generate_single_response``.
        model: Model identifier used for every request.
        use_template: Whether prompts are built from the domain template.

    Returns:
        None. Records are written in the same order as ``data``.
    """
    # Construct output filename
    # Format: experiments/{domain}/{split}_generated_{model_clean}_{template}.jsonl
    model_clean = model.replace("/", "_")
    template_str = "with_template" if use_template else "no_template"
    output_filename = f"{split}_generated_{model_clean}_{template_str}.jsonl"
    output_path = os.path.join("experiments", domain, output_filename)

    if os.path.exists(output_path):
        print(f"  [Skipping] {output_filename} already exists.")
        return

    # The template is read only when it will actually be used, so skipped and
    # no-template configurations do not depend on template.txt existing.
    template_text = load_template(domain) if use_template else None

    print(f"  [Generating] {domain} | {split} | {model} | Template={use_template}")

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [
            executor.submit(generate_single_response, item, model, template_text, use_template)
            for item in data
        ]

        # as_completed only drives the progress bar; results are collected
        # from `futures` afterwards so the saved order matches the input order
        # instead of depending on which request happened to finish first.
        for _ in tqdm(as_completed(futures), total=len(futures), desc="    Progress", leave=False):
            pass
        results = [future.result() for future in futures]

    # Failed items are still saved (with an 'error' field); make that visible,
    # because the existing output file will cause this configuration to be
    # skipped on the next run.
    failed = sum(1 for res in results if isinstance(res, dict) and 'error' in res)
    if failed:
        print(f"  [Warning] {failed}/{len(results)} records in {output_filename} carry an 'error' field.")

    # Write to a temporary file and rename it into place: a partially written
    # output must never be mistaken for a finished configuration.
    tmp_path = output_path + ".tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        for res in results:
            f.write(json.dumps(res, ensure_ascii=False) + "\n")
    os.replace(tmp_path, output_path)
    print(f"  [Saved] {output_filename}")

In [10]:
# --- Main Execution Loop ---

# Pre-load all data serially to avoid race conditions
# (load_and_select_data may create the "selected" file, so it must not be
# called for the same domain/split from several worker threads at once).
domain_data = {}
for domain in DOMAINS:
    print(f"Loading data for {domain}...")
    try:
        train_data = load_and_select_data(domain, "train", TRAIN_SIZE)
        test_data = load_and_select_data(domain, "test", TEST_SIZE)
        domain_data[domain] = {"train": train_data, "test": test_data}
    except Exception as e:
        # A domain whose data cannot be loaded is left out of domain_data and
        # therefore produces no tasks below; the other domains still run.
        print(f"Skipping domain {domain} due to error: {e}")

# Collect all tasks
# One task per (domain, model, template on/off, split) combination; each task
# is a plain dict whose keys match the parameters of run_generation_process.
tasks = []
for domain, datasets in domain_data.items():
    for model in TEACHER_MODELS:
        for use_template in [True, False]:
            for split_name in ["train", "test"]:
                tasks.append({
                    "domain": domain,
                    "split": split_name,
                    "data": datasets[split_name],
                    "model": model,
                    "use_template": use_template
                })

print(f"\nTotal configurations to run: {len(tasks)}")

def process_task(task):
    """
    Run one generation configuration and report the outcome as a string.

    ``task`` is a dict with the keys "domain", "split", "data", "model" and
    "use_template", which are forwarded as keyword arguments to
    ``run_generation_process``. Exceptions raised while running the
    configuration are converted into an "Error in ..." message instead of
    propagating, so one failing configuration does not stop the others.
    """
    forwarded = ("domain", "split", "data", "model", "use_template")
    try:
        run_generation_process(**{field: task[field] for field in forwarded})
        outcome = f"Success: {task['domain']} {task['model']}"
    except Exception as exc:
        outcome = f"Error in {task['domain']} {task['model']}: {exc}"
    return outcome

print("Starting parallel execution...")
# Outer pool: up to MAX_PARALLEL_CONFIGS configurations run at the same time.
# Each of them opens its own inner pool of request threads inside
# run_generation_process.
with ThreadPoolExecutor(max_workers=MAX_PARALLEL_CONFIGS) as executor:
    futures = [executor.submit(process_task, task) for task in tasks]

    # Status strings are printed as configurations finish, not in submission
    # order. process_task turns generation errors into an "Error in ..."
    # string, so a failed configuration shows up here as a message.
    for future in as_completed(futures):
        print(future.result())

print("\nAll generation tasks completed!")

Loading data for trivia...
Loading existing selected data for trivia/train...
Loading existing selected data for trivia/test...

Total configurations to run: 8
Starting parallel execution...
  [Skipping] train_generated_qwen_qwen-2.5-72b-instruct_with_template.jsonl already exists.
  [Skipping] test_generated_qwen_qwen-2.5-72b-instruct_with_template.jsonl already exists.
Success: trivia qwen/qwen-2.5-72b-instruct
Success: trivia qwen/qwen-2.5-72b-instruct
  [Skipping] test_generated_qwen_qwen-2.5-72b-instruct_no_template.jsonl already exists.
Success: trivia qwen/qwen-2.5-72b-instruct
  [Skipping] train_generated_qwen_qwen-2.5-7b-instruct_with_template.jsonl already exists.
  [Skipping] test_generated_qwen_qwen-2.5-7b-instruct_with_template.jsonl already exists.
  [Generating] trivia | train | qwen/qwen-2.5-72b-instruct | Template=False
Success: trivia qwen/qwen-2.5-7b-instruct
Success: trivia qwen/qwen-2.5-7b-instruct
  [Skipping] test_generated_qwen_qwen-2.5-7b-instruct_no_template.j

                                                                  

  [Saved] train_generated_qwen_qwen-2.5-7b-instruct_no_template.jsonl
Success: trivia qwen/qwen-2.5-7b-instruct




  [Saved] train_generated_qwen_qwen-2.5-72b-instruct_no_template.jsonl
Success: trivia qwen/qwen-2.5-72b-instruct

All generation tasks completed!
