In [1]:
import os
import random
import json
from faker import Faker

fake = Faker("en_US")

# Seed BOTH random sources so the generated dataset is reproducible:
#   * `random` drives template/topic/position choices in this file;
#   * Faker draws names, e-mails, phone numbers, cities and addresses from its
#     own generator, which `random.seed` does not touch.
random.seed(42)
Faker.seed(42)

# Directory the per-sample JSON files are written to, and how many to create.
OUTPUT_DIR = "Faker"
NUM_SAMPLES = 50


# ---------------------------------------
# Topics and building blocks
# ---------------------------------------

# Topic key -> list of concrete "detail" phrases. A topic key labels a
# transcript segment; one of its phrases fills the {detail} placeholder.
TOPICS = {
    "daily_life": ["morning routine", "cooking", "cleaning", "errands", "commute"],
    "school": ["assignments", "lectures", "deadlines", "group project", "teachers"],
    "work": ["meetings", "coworkers", "deadlines", "tasks", "office politics"],
    "relationships": ["arguments", "crushes", "friends", "family drama"],
    "health": ["sleep", "gym", "injuries", "stress symptoms"],
    "stress": ["overthinking", "pressure", "burnout signs"],
    "money": ["bills", "budgeting", "overspending", "saving"],
    "goals": ["plans", "future", "consistency", "habits"],
    "habits": ["routine", "bad habits", "new habits", "productivity"],
    "emotions": ["anxiety", "motivation", "frustration", "confidence"]
}

# Sentence openers substituted for the {tone} placeholder.
TONE_VARIATIONS = [
    "Honestly", "To be real", "For some reason", "Weirdly enough",
    "I swear", "Not gonna lie", "Lately", "Sometimes", "I guess"
]

# Spoken-style filler words substituted for the {filler} placeholder.
FILLERS = ["uh", "like", "you know", "I mean", "kinda", "sort of", "honestly"]

# Sentence templates (deliberately open-ended) consumed via str.format in
# generate_sentence. Placeholders used: {tone}, {detail}, {filler}, {emotion}
# come from the tables in this section; {person}, {place}, {phone}, {email}
# are filled with Faker-generated values.
SENTENCE_SHAPES = [
    "{tone} I’ve been dealing with {detail} and it’s messing with me.",
    "Today I kept thinking about {detail}, and {filler} it made everything feel heavier.",
    "I talked to {person} about {detail}, but I’m not sure it helped.",
    "I ended up going to {place} because I needed a break from {detail}.",
    "I feel {emotion} whenever I think about {detail}, and it’s getting annoying.",
    "Sometimes I wonder if {detail} is the reason I can’t focus.",
    "My mind keeps jumping back to {detail}, even when I’m trying to chill.",
    "It’s strange how {detail} still affects me after all this time.",
    "I wrote an email to {email} about {detail}, but I regret it now.",
    "I called {person} at {phone} and didn’t even know what to say.",
    "Ended up ranting about {detail} out loud like a crazy person.",
    "Went outside for air, but {detail} kept looping in my head.",
    "Last night I couldn’t sleep because of {detail}.",
    "I keep telling myself I'll fix {detail}, but then I don’t.",
    "Maybe I’m just overthinking {detail}, but whatever."
]


# Feeling words substituted for the {emotion} placeholder.
EMOTIONS = ["tired", "anxious", "okayish", "motivated", "overwhelmed", "frustrated", "numb"]


def generate_sentence(topic):
    """Return one journal-style sentence about ``topic``.

    A template is picked at random from SENTENCE_SHAPES and its placeholders
    are filled with a random detail phrase of the given topic, random
    tone/filler/emotion words, and Faker-generated person, place, phone and
    e-mail values.

    Args:
        topic: A key of TOPICS; a KeyError is raised for unknown keys.

    Returns:
        The formatted sentence as a string.
    """
    template = random.choice(SENTENCE_SHAPES)

    # All values are drawn up front, in this fixed order, whether or not the
    # chosen template references them; str.format ignores unused keywords.
    substitutions = {
        "tone": random.choice(TONE_VARIATIONS),
        "detail": random.choice(TOPICS[topic]),
        "filler": random.choice(FILLERS),
        "emotion": random.choice(EMOTIONS),
        "person": fake.name(),
        "place": fake.city(),
        "phone": fake.phone_number(),
        "email": fake.email(),
    }
    return template.format(**substitutions)


def inject_pii(text, insert_text, pii_type):
    """Splice ``insert_text`` into ``text`` at a random character offset.

    The offset is drawn uniformly from 0..len(text) inclusive, so the value
    may land at the very start, the very end, or in the middle of a word.

    Args:
        text: The string to insert into.
        insert_text: The PII value to insert verbatim.
        pii_type: Label stored under the ``"type"`` key of the annotation.

    Returns:
        A ``(new_text, tag)`` tuple where ``tag`` is a dict with keys
        ``type``, ``text``, ``start_char`` and ``end_char`` (end exclusive),
        with offsets relative to ``new_text``.
    """
    offset = random.randint(0, len(text))
    head, tail = text[:offset], text[offset:]
    annotation = {
        "type": pii_type,
        "text": insert_text,
        "start_char": offset,
        "end_char": offset + len(insert_text),
    }
    return head + insert_text + tail, annotation


def _make_random_pii():
    """Pick a random PII category and return ``(pii_type, value)`` from Faker."""
    pii_type = random.choice(["PERSON", "EMAIL", "PHONE", "ADDRESS"])
    if pii_type == "PERSON":
        value = fake.name()
    elif pii_type == "EMAIL":
        value = fake.email()
    elif pii_type == "PHONE":
        value = fake.phone_number()
    else:
        # Replace newlines in the address so the transcript stays on one line.
        value = fake.address().replace("\n", " ")
    return pii_type, value


def generate_transcript(sample_id):
    """Generate one synthetic journal transcript with segment and PII labels.

    Between 2 and 6 distinct topics are chosen; each becomes one segment made
    of the same number of generated sentences. With probability 0.5 per
    segment, one extra PII value is spliced into that segment at a random
    position and recorded as a tag.

    All offsets are consistent with the returned transcript:
    ``transcript[seg["start_char"]:seg["end_char"]] == seg["text"]`` for every
    segment, and ``transcript[tag["start_char"]:tag["end_char"]] ==
    tag["text"]`` for every PII tag. Segments are separated by a single space
    that belongs to neither neighbour.

    Args:
        sample_id: Integer used to build the ``transcript_id``.

    Returns:
        A dict with keys ``transcript_id``, ``transcript``, ``segments``,
        ``topics``, ``pii`` and ``summary``.
    """
    num_segments = random.randint(2, 6)
    chosen_topics = random.sample(list(TOPICS.keys()), num_segments)

    total_sentences = random.randint(3, 50)

    # Integer division: every segment gets the same count (at least one), so
    # the real sentence total can differ from `total_sentences`.
    sentences_per_segment = max(1, total_sentences // num_segments)

    segment_texts = []
    segments = []
    pii_tags = []
    cursor = 0  # offset in the final transcript where the next segment starts

    for seg_id, topic in enumerate(chosen_topics):
        segment_text = " ".join(
            generate_sentence(topic) for _ in range(sentences_per_segment)
        )

        # Occasional extra random PII. It is injected into the segment BEFORE
        # offsets are recorded, so the insertion can never shift the offsets
        # of segments or tags that were already stored.
        local_tag = None
        if random.random() < 0.5:
            pii_type, insert = _make_random_pii()
            segment_text, local_tag = inject_pii(segment_text, insert, pii_type)

        start_char = cursor
        end_char = start_char + len(segment_text)

        if local_tag is not None:
            # inject_pii reports offsets relative to the segment text;
            # translate them into transcript coordinates.
            pii_tags.append({
                "type": local_tag["type"],
                "text": local_tag["text"],
                "start_char": local_tag["start_char"] + start_char,
                "end_char": local_tag["end_char"] + start_char
            })

        segments.append({
            "id": seg_id,
            "topic": topic,
            "text": segment_text,
            "start_char": start_char,
            "end_char": end_char
        })
        segment_texts.append(segment_text)

        cursor = end_char + 1  # skip the single-space separator

    # Joining (instead of appending "text + ' '" and stripping afterwards)
    # keeps the last segment's end_char inside the transcript.
    transcript = " ".join(segment_texts)

    # NOTE(review): names, e-mails and phone numbers that the sentence
    # templates themselves embed are not tagged here; only the extra injected
    # values appear in `pii`. Confirm whether that is intended.

    summary = f"This journal touches on {', '.join(chosen_topics)}."

    return {
        "transcript_id": f"synthetic_{sample_id:04d}",
        "transcript": transcript,
        "segments": segments,
        "topics": chosen_topics,
        "pii": pii_tags,
        "summary": summary
    }


def generate_dataset():
    """Write NUM_SAMPLES synthetic transcripts as JSON files into OUTPUT_DIR.

    The directory is created if missing. Samples are numbered from 1 and
    stored as ``0001.json``, ``0002.json``, ...; a progress line is printed
    after each file is written.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for sample_no in range(1, NUM_SAMPLES + 1):
        record = generate_transcript(sample_no)
        target_path = os.path.join(OUTPUT_DIR, f"{sample_no:04d}.json")

        with open(target_path, "w", encoding="utf-8") as out_file:
            json.dump(record, out_file, indent=2)

        print(f"Generated sample {sample_no}")


# Entry point: build the dataset only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    generate_dataset()

Generated sample 1
Generated sample 2
Generated sample 3
Generated sample 4
Generated sample 5
Generated sample 6
Generated sample 7
Generated sample 8
Generated sample 9
Generated sample 10
Generated sample 11
Generated sample 12
Generated sample 13
Generated sample 14
Generated sample 15
Generated sample 16
Generated sample 17
Generated sample 18
Generated sample 19
Generated sample 20
Generated sample 21
Generated sample 22
Generated sample 23
Generated sample 24
Generated sample 25
Generated sample 26
Generated sample 27
Generated sample 28
Generated sample 29
Generated sample 30
Generated sample 31
Generated sample 32
Generated sample 33
Generated sample 34
Generated sample 35
Generated sample 36
Generated sample 37
Generated sample 38
Generated sample 39
Generated sample 40
Generated sample 41
Generated sample 42
Generated sample 43
Generated sample 44
Generated sample 45
Generated sample 46
Generated sample 47
Generated sample 48
Generated sample 49
Generated sample 50
