In [2]:
import os
import json
import random
from faker import Faker
import subprocess

# Shared Faker instance; inject_realistic_pii draws synthetic names, emails,
# phone numbers and addresses from it.
fake = Faker()

# Directory that receives one JSON file per generated sample.
OUTPUT_DIR = "llm"

# How many samples generate_dataset() produces (numbered 1..NUM_SAMPLES).
NUM_SAMPLES = 50

# Pool of journal topics; each sample draws a few distinct entries from it.
TOPICS = [
    "relationships",
    "school",
    "work",
    "daily life",
    "health",
    "stress",
    "money",
    "habits",
    "goals",
    "emotions",
]

def generate_segment_with_llm(topic, model="llama3.2:3b"):
    """Ask a local Ollama model for a short journal segment about *topic*.

    The prompt is piped to ``ollama run <model>`` on stdin and the model's
    stdout is returned with surrounding whitespace removed.

    Args:
        topic: Subject the journal segment should be about.
        model: Ollama model tag to run. Defaults to the model this script
            was written against.

    Returns:
        The generated segment text (never empty).

    Raises:
        RuntimeError: If ``ollama`` exits with a non-zero status or prints
            no text, so that a failed generation is reported instead of
            silently yielding an empty segment.
        OSError: Propagated from ``subprocess.run`` if the ``ollama``
            executable cannot be started.
    """
    prompt = f"""
Write a short personal journal segment (3-10 sentences) about the topic "{topic}".
Style: casual, slightly emotional, reflective, natural phrasing.
Fully synthetic. Do NOT use any real names or real places.
"""
    result = subprocess.run(
        ["ollama", "run", model],
        input=prompt,
        capture_output=True,
        text=True,
        encoding="utf-8",
        # Undecodable bytes in the model output are dropped rather than
        # aborting the whole generation.
        errors="ignore",
    )

    if result.returncode != 0:
        raise RuntimeError(
            f"ollama exited with status {result.returncode} for topic "
            f"{topic!r}: {result.stderr.strip()}"
        )

    segment = result.stdout.strip()
    if not segment:
        raise RuntimeError(f"ollama returned no text for topic {topic!r}")

    return segment

def inject_realistic_pii(sentences):
    """Append synthetic PII to a random subset of *sentences*.

    Between one and four distinct sentences (never more than there are
    sentences) are chosen at random. Each gets a follow-up sentence of the
    form ``You can reach them at <value>`` appended, where ``<value>`` is a
    Faker-generated name, e-mail address, phone number or single-line
    address.

    The appended text carries no trailing period: generate_sample splits
    sentences on "." and re-adds the final period when it assembles the
    transcript, so adding one here would produce "..".

    Args:
        sentences: List of sentence strings. The list is modified in place.

    Returns:
        A ``(sentences, pii_items)`` tuple. ``sentences`` is the same list
        object that was passed in; ``pii_items`` is a list of dicts with the
        keys ``type``, ``text`` and ``sentence_index``.
    """
    pii_items = []

    # Nothing to attach PII to. Returning early also avoids asking
    # random.sample for more items than the population holds.
    if not sentences:
        return sentences, pii_items

    synthetic_pii_patterns = [
        ("PERSON", fake.name),
        ("EMAIL", fake.email),
        ("PHONE", fake.phone_number),
        # Faker addresses are multi-line; flatten them so they fit a sentence.
        ("ADDRESS", lambda: fake.address().replace("\n", ", ")),
    ]

    # Cap the draw at the number of sentences; random.sample raises
    # ValueError when the sample is larger than the population.
    num_pii = min(random.randint(1, 4), len(sentences))
    chosen_sentences = random.sample(range(len(sentences)), num_pii)

    for idx in chosen_sentences:
        pii_type, generator = random.choice(synthetic_pii_patterns)
        value = generator()

        # Close the host sentence first unless it already ends in terminal
        # punctuation, so the two sentences do not run together.
        joiner = " " if sentences[idx].endswith((".", "!", "?")) else ". "
        sentences[idx] += f"{joiner}You can reach them at {value}"

        pii_items.append({
            "type": pii_type,
            "text": value,
            "sentence_index": idx,
        })

    return sentences, pii_items

def generate_sample(i):
    """Build one synthetic transcript record with segment and PII labels.

    Two to five distinct topics are drawn from TOPICS, one LLM-written
    segment is generated per topic, the segments are split into sentences,
    synthetic PII is injected, and everything is joined into one transcript.

    All character offsets are computed against the final transcript (after
    PII injection) and are half-open, so
    ``transcript[start_char:end_char]`` yields exactly the annotated span.

    Args:
        i: Sample number; used to build ``transcript_id``.

    Returns:
        A dict with the keys ``transcript_id``, ``transcript``, ``segments``
        (one dict per topic with ``id``, ``topic``, ``start_char``,
        ``end_char`` and the raw LLM ``text``), ``topics`` and ``pii`` (one
        dict per injected value with ``type``, ``text``, ``start_char`` and
        ``end_char``). A segment that produced no sentences has
        ``start_char`` and ``end_char`` set to ``None``.
    """
    num_segments = random.randint(2, 5)
    topics = random.sample(TOPICS, num_segments)

    # STEP 1 — generate one segment of text per topic via the LLM.
    segment_texts = [generate_segment_with_llm(topic) for topic in topics]

    # STEP 2 — flatten the segments into one sentence list, remembering
    # which slice [first, stop) of that list belongs to which segment.
    sentences = []
    segment_sentence_ranges = []
    for seg_text in segment_texts:
        seg_sentences = [s.strip() for s in seg_text.split(".") if s.strip()]
        first = len(sentences)
        sentences.extend(seg_sentences)
        segment_sentence_ranges.append((first, len(sentences)))

    # STEP 3 — inject PII. This lengthens some sentences, so no character
    # offset may be computed before this point.
    sentences, pii_raw = inject_realistic_pii(sentences)

    pii_by_sentence = {}
    for item in pii_raw:
        pii_by_sentence.setdefault(item["sentence_index"], []).append(item)

    # STEP 4 — lay out the transcript and record offsets as we go.
    rendered_sentences = []
    sentence_spans = []
    offsets = []
    cursor = 0
    for idx, sentence in enumerate(sentences):
        if idx > 0:
            cursor += 1  # the single space that separates sentences
        rendered = sentence + "."

        for item in pii_by_sentence.get(idx, ()):
            value = item["text"]
            # The injected value sits at the end of its sentence, so search
            # from the right and only within this sentence; an identical
            # string earlier in the transcript must not be matched.
            start = cursor + rendered.rindex(value)
            offsets.append({
                "type": item["type"],
                "text": value,
                "start_char": start,
                "end_char": start + len(value),
            })

        sentence_spans.append((cursor, cursor + len(rendered)))
        rendered_sentences.append(rendered)
        cursor += len(rendered)

    transcript = " ".join(rendered_sentences)

    # STEP 5 — derive segment boundaries from the final sentence positions.
    segment_boundaries = []
    for seg_id, (topic, seg_text, (first, stop)) in enumerate(
        zip(topics, segment_texts, segment_sentence_ranges)
    ):
        if first < stop:
            start_char = sentence_spans[first][0]
            end_char = sentence_spans[stop - 1][1]
        else:
            # The segment contributed no sentences, so it has no span.
            start_char = None
            end_char = None

        segment_boundaries.append({
            "id": seg_id,
            "topic": topic,
            "start_char": start_char,
            "end_char": end_char,
            "text": seg_text,
        })

    return {
        "transcript_id": f"synthetic_{i:04d}",
        "transcript": transcript,
        "segments": segment_boundaries,
        "topics": topics,
        "pii": offsets,
    }


def generate_dataset():
    """Generate NUM_SAMPLES samples and write each to its own JSON file.

    Files are written to OUTPUT_DIR (created if missing) and named after the
    zero-padded sample number. A failure while producing or writing one
    sample is printed and does not stop the remaining samples.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for sample_number in range(1, NUM_SAMPLES + 1):
        try:
            record = generate_sample(sample_number)
            target_path = f"{OUTPUT_DIR}/{sample_number:04d}.json"
            with open(target_path, "w", encoding="utf-8") as out_file:
                json.dump(record, out_file, indent=2)
            print(f"Generated {sample_number}")
        except Exception as exc:
            # Per-sample boundary: report the problem and carry on.
            print(f"Error on sample {sample_number}: {exc}")


# Generate the dataset only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    generate_dataset()

Generated 1
Generated 2
Generated 3
Generated 4
Generated 5
Generated 6
Generated 7
Generated 8
Generated 9
Generated 10
Generated 11
Generated 12
Generated 13
Generated 14
Generated 15
Generated 16
Generated 17
Generated 18
Generated 19
Generated 20
Generated 21
Generated 22
Generated 23
Generated 24
Generated 25
Generated 26
Generated 27
Generated 28
Generated 29
Generated 30
Generated 31
Generated 32
Generated 33
Generated 34
Generated 35
Generated 36
Generated 37
Generated 38
Generated 39
Generated 40
Generated 41
Generated 42
Generated 43
Generated 44
Generated 45
Generated 46
Generated 47
Generated 48
Generated 49
Generated 50
