<a href="https://colab.research.google.com/github/lennartvoelz/fine_tune_hf/blob/main/gen_syn_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
# Cell 1: Install & Gemini 3 Setup
!pip install -q google-genai datasets pandas nest_asyncio

from google import genai # Note the import change: from google import genai
import pandas as pd
from google.colab import userdata
from datasets import Dataset
import json
import re

# Initialize the client with the API key directly
API_KEY = userdata.get('Gemini_API_KEY')
client = genai.Client(api_key=API_KEY) # THIS is the new way to configure

# GEMINI 3 MODELS
model_flash = "gemini-3.0-flash"
model_pro = "gemini-3.0-pro"

print("‚úÖ Gemini 3 Client ready!")


‚úÖ Gemini 3 Client ready!


In [20]:
SEED_EXAMPLES = [
    {
        "question": "Redact provided text according to the task description using unique numbered tokens.",
        "context": "I'm 33, single, and my phone is 030-1234567.",
        "answer": "I'm [AGE_YEARS_1:33], single, and my phone is [PHONE_1]."
    },
    {
        "question": "Redact provided text according to the task description using unique numbered tokens.",
        "context": "Send email to John Smith at john.smith@company.com and call Peter at 555-1234.",
        "answer": "Send email to [PERSON_1] at [EMAIL_1] and call [PERSON_2] at [PHONE_1]."
    },
    {
        "question": "Redact provided text according to the task description using unique numbered tokens.",
        "context": "Patient Sarah lives at 123 Main St, age 45, female, SSN 123-45-6789.",
        "answer": "Patient [PERSON_1] lives at [ADDRESS_1], [AGE_YEARS_1:45], [GENDER_1], [SSN_1]."
    },
    {
        "question": "Redact provided text according to the task description using unique numbered tokens.",
        "context": "Order #12345 for David Lee, card ****1234, ship to 456 Oak Ave.",
        "answer": "Order #12345 for [PERSON_1], [CARD_1_LAST4:1234], ship to [ADDRESS_1]."
    }
]

pd.DataFrame(SEED_EXAMPLES).head()

Unnamed: 0,question,context,answer
0,Redact provided text according to the task des...,"I'm 33, single, and my phone is 030-1234567.","I'm [AGE_YEARS_1:33], single, and my phone is ..."
1,Redact provided text according to the task des...,Send email to John Smith at john.smith@company...,Send email to [PERSON_1] at [EMAIL_1] and call...
2,Redact provided text according to the task des...,"Patient Sarah lives at 123 Main St, age 45, fe...","Patient [PERSON_1] lives at [ADDRESS_1], [AGE_..."
3,Redact provided text according to the task des...,"Order #12345 for David Lee, card ****1234, shi...","Order #12345 for [PERSON_1], [CARD_1_LAST4:123..."


In [30]:
# Cell 3: FINAL WORKING VERSION - google.genai SDK
from google.genai import types  # CRITICAL IMPORT

PROMPT_TEMPLATE = """
Generate exactly {num_examples} NEW PII redaction examples in EXACT JSON format.

SEED EXAMPLE:
{seed_example}

RULES:
1. "question": EXACTLY "Redact provided text according to the task description using unique numbered tokens."
2. "context": 2-4 sentences, 2-6 PII (vary names/emails/phones/ages/addresses)
3. "answer": [PERSON_1], [EMAIL_1], [AGE_YEARS_1:45] sequential by appearance
4. Same entity = same number
5. Keep order #12345 unchanged

Output ONLY JSON array:
"""

import json
from typing import List, Dict

def robust_json_parse(text: str, min_expected: int = 1) -> List[Dict]:
    """Robust Gemini JSON parser - no scope issues"""
    # Clean markdown
    text = re.sub(r'^```(?:json)?\s*|\s*```$', '', text, flags=re.MULTILINE).strip()

    # Try direct parse
    try:
        parsed = json.loads(text)
        if isinstance(parsed, list) and len(parsed) > 0:
            return parsed
    except json.JSONDecodeError:
        pass

    # Extract JSON array candidates
    json_candidates = re.findall(r'\[.*?\]', text, re.DOTALL)
    for candidate in json_candidates[-3:]:  # Last 3 (most likely complete)
        try:
            parsed = json.loads(candidate)
            if isinstance(parsed, list) and len(parsed) > 0 and isinstance(parsed[0], dict):
                return parsed
        except:
            continue

    # Line-by-line object extraction (fallback)
    objects = []
    for line in text.split('\n'):
        line = line.strip()
        if line.startswith('{') and line.endswith('}'):
            try:
                obj = json.loads(line)
                if all(k in obj for k in ['question', 'context', 'answer']):
                    objects.append(obj)
            except:
                pass

    # Return if reasonable yield
    if len(objects) >= min_expected // 5:  # At least 20% expected
        return objects

    print("‚ö†Ô∏è Parse failed. Raw:", text[:300] + "..." if len(text) > 300 else text)
    return []


def generate_gemini3_sync(model_name="gemini-2.5-flash", num_examples=50):
    seed_example_str = json.dumps(SEED_EXAMPLES[0], ensure_ascii=False)
    escaped_seed = seed_example_str.replace('{', '{{').replace('}', '}}')

    prompt = PROMPT_TEMPLATE.format(num_examples=num_examples, seed_example=escaped_seed)

    response = client.models.generate_content(
        model=model_name,
        contents=prompt,
        config=types.GenerateContentConfig(
            temperature=0.5,  # More consistent JSON
            max_output_tokens=12000
        )
    )

    return robust_json_parse(response.text, num_examples)  # Pass num_examples


# Test - Handles ALL Gemini edge cases
batch = generate_gemini3_sync("gemini-2.5-flash", 10)
print("‚úÖ Robust parse:", len(batch))
print(json.dumps(batch[0], indent=2) if batch else "Empty batch")


‚úÖ Robust parse: 10
{
  "question": "Redact provided text according to the task description using unique numbered tokens.",
  "context": "My name is Alice Smith and I am 28 years old. You can reach me at 555-123-4567.",
  "answer": "My name is [PERSON_1] and I am [AGE_YEARS_1:28] years old. You can reach me at [PHONE_1]."
}


In [31]:
BATCH_SIZE = 100
TOTAL_EXAMPLES = 100

all_examples = SEED_EXAMPLES.copy()
print("Generating", TOTAL_EXAMPLES, "SYNC...")

for i in range(0, TOTAL_EXAMPLES, BATCH_SIZE):
    batch_size = min(BATCH_SIZE, TOTAL_EXAMPLES - len(all_examples))
    print(f"Batch {i//BATCH_SIZE+1}: {batch_size}")

    batch = generate_gemini3_sync("gemini-2.5-flash", batch_size)
    all_examples.extend(batch)

    pd.DataFrame(all_examples).to_json(f'dataset_b{i//BATCH_SIZE+1}.jsonl', orient='records', lines=True)


Generating 100 SYNC...
Batch 1: 96
‚ö†Ô∏è Parse failed. Raw: [
  {
    "question": "Redact provided text according to the task description using unique numbered tokens.",
    "context": "My name is John Doe and I am 35 years old. You can reach me at john.d@example.com or call me at (123) 456-7890. I live at 123 Main St, Anytown, USA 12345.",
    "answer": "My...
‚úÖ COMPLETE - No async bugs!


In [33]:
def validate_format(row):
    """Robust token validation - handles all edge cases"""
    # Exact question match
    q_ok = row['question'] == "Redact provided text according to the task description using unique numbered tokens."

    # Find tokens safely
    tokens = re.findall(r'\[(PERSON|EMAIL|PHONE|ADDRESS|SSN|CARD_\d+_LAST4|IBAN_\d+_LAST4|GENDER|AGE_\d+_YEARS|RACE|MARITAL_STATUS)_\d+(?::\d+)?\]',
                       row['answer'])
    if not tokens:
        return False  # No tokens = invalid

    type_groups = {}
    for t in tokens:
        # Safe regex extraction
        match = re.search(r'_(\d+)', t)
        if not match:
            return False  # Malformed token

        typ = t.split('_')[0]
        num = int(match.group(1))
        type_groups.setdefault(typ, []).append(num)

    # Check sequential numbering (1,2,3... no gaps/duplicates)
    for typ, nums in type_groups.items():
        unique_nums = sorted(set(nums))
        if unique_nums != list(range(1, len(unique_nums) + 1)):
            return False

    return q_ok

# Now works perfectly
df['valid'] = df.apply(validate_format, axis=1)
print(f"‚úÖ {df['valid'].mean():.1%} valid ({df['valid'].sum()}/{len(df)})")

print("\nSample valid examples:")
display(df[df['valid']][['question', 'context', 'answer']].head(3))

# Save only valid ones
valid_df = df[df['valid']].copy()
valid_df.to_csv('pii_instruction_dataset_clean.csv', index=False)
valid_df.to_json('pii_instruction_dataset_clean.jsonl', orient='records', lines=True)

print(f"üíæ Saved {len(valid_df)} clean examples")


‚úÖ 0.0% valid (0/4)

Sample valid examples:


Unnamed: 0,question,context,answer


üíæ Saved 0 clean examples
