In [None]:
import json
import os
import random
import uuid

import names
from openai import OpenAI
# Read the key from the OPENAI_API_KEY environment variable so a real secret
# never has to be saved inside the notebook. The "sk-xxxx" placeholder is kept
# only as a fallback for when the variable is unset; replace it or set the
# variable before making requests.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "sk-xxxx"))

In [10]:
# Big Five personality traits, each mapped to a list of five descriptive
# adjectives/phrases. Key order is significant: it is the order in which the
# trait names are handed to random.sample elsewhere in this notebook.
TRAITS = {
    "openness": [
        "curious",
        "imaginative",
        "open to new experiences",
        "creative",
        "adventurous",
    ],
    "conscientiousness": [
        "organized",
        "responsible",
        "hardworking",
        "disciplined",
        "detail-oriented",
    ],
    "extraversion": [
        "outgoing",
        "talkative",
        "energetic",
        "sociable",
        "assertive",
    ],
    "agreeableness": [
        "kind",
        "cooperative",
        "empathetic",
        "helpful",
        "trusting",
    ],
    "neuroticism": [
        "anxious",
        "moody",
        "emotional",
        "insecure",
        "sensitive",
    ],
}

# Life settings a generated description may be asked to cover.
CONTEXTS = [
    "workplace",
    "social life",
    "emotional experiences",
]

In [11]:
def generate_person():
    """Create one random synthetic person profile.

    Returns:
        dict: ``name`` (from ``names.get_full_name()``), ``age`` (integer in
        the inclusive range 20-60), ``occupation`` and ``gender`` (each
        picked at random from a fixed list).
    """
    occupations = ["teacher", "engineer", "artist", "doctor", "student", "writer", "designer"]
    genders = ["male", "female", "non-binary"]

    # Fields are drawn in the order name -> age -> occupation -> gender so the
    # sequence of random calls stays stable.
    full_name = names.get_full_name()
    age = random.randint(20, 60)
    occupation = random.choice(occupations)
    gender = random.choice(genders)

    return {
        "name": full_name,
        "age": age,
        "occupation": occupation,
        "gender": gender,
    }

def select_traits():
    """Pick two or three distinct traits and give each a random strength.

    Returns:
        dict: maps each chosen key of ``TRAITS`` to a strength drawn
        uniformly between 0.1 and 1.0 and rounded to two decimals. Entries
        appear in the order the traits were sampled.
    """
    how_many = random.randint(2, 3)
    chosen = random.sample(list(TRAITS.keys()), k=how_many)

    strengths = {}
    for trait_name in chosen:
        strengths[trait_name] = round(random.uniform(0.1, 1.0), 2)
    return strengths

def generate_document(person, traits_dict, model="gpt-4.1"):
    """Ask the LLM for a short narrative paragraph that portrays a person.

    Args:
        person: Mapping that provides at least the ``name``, ``age`` and
            ``occupation`` keys, which are interpolated into the prompt.
        traits_dict: Mapping of trait name to strength; every pair is
            rendered as ``"<trait> (strength <value>)"`` in the prompt.
        model: Model identifier forwarded to ``client.responses.create``.
            Defaults to ``"gpt-4.1"``, the value that used to be hard-coded,
            so existing two-argument calls behave as before.

    Returns:
        dict: ``doc_id`` (random UUID4 string), ``text`` (the model output
        with surrounding whitespace stripped), ``context`` (the list of one
        to three contexts sampled from ``CONTEXTS``) and ``source`` (always
        ``"synthetic_llm"``).
    """
    trait_descriptions = ", ".join(
        f"{trait} (strength {strength})" for trait, strength in traits_dict.items()
    )
    # Sample between one and three distinct contexts to steer the narrative.
    contexts = random.sample(CONTEXTS, k=random.randint(1, 3))
    context_str = ", ".join(contexts)

    # The prompt text (including its indentation) is sent to the model as-is.
    prompt = f"""
    Write a coherent paragraph (5-8 sentences) describing {person['name']}, 
    a {person['age']}-year-old {person['occupation']}. 
    The description should illustrate their personality traits ({trait_descriptions})
    through behaviors, emotions, and interactions. 
    Cover contexts like {context_str}. 
    Do not list traits explicitly — show them naturally through storytelling and actions.
    """

    response = client.responses.create(model=model, input=prompt)
    text = response.output_text.strip()
    return {
        "doc_id": str(uuid.uuid4()),
        "text": text,
        "context": contexts,
        "source": "synthetic_llm"
    }

In [12]:
def create_record(person, traits_dict, document):
    """Bundle a person, their traits and a generated document into one record.

    Args:
        person: Person profile, stored unchanged under ``"person"``.
        traits_dict: Mapping of trait name to strength; converted into a
            list of ``{"name": ..., "strength": ...}`` dicts in mapping order.
        document: Document payload, stored unchanged under ``"document"``.

    Returns:
        dict: keys ``person_id`` (fresh UUID4 string), ``person``,
        ``traits`` and ``document``, in that order.
    """
    record_id = str(uuid.uuid4())

    trait_entries = []
    for trait_name, strength in traits_dict.items():
        trait_entries.append({"name": trait_name, "strength": strength})

    record = {"person_id": record_id}
    record["person"] = person
    record["traits"] = trait_entries
    record["document"] = document
    return record

In [13]:
def generate_dataset(n_samples=100):
    """Generate ``n_samples`` synthetic records.

    For every sample a person and a trait selection are created, a document
    is generated for that pair, and the three are combined into one record.

    Args:
        n_samples: Number of records to produce (default 100).

    Returns:
        list: the records, in generation order.
    """

    def _build_one():
        # One full pipeline pass: person -> traits -> document -> record.
        person = generate_person()
        traits_dict = select_traits()
        document = generate_document(person, traits_dict)
        return create_record(person, traits_dict, document)

    return [_build_one() for _ in range(n_samples)]

def split_dataset(records, train_ratio=0.7, val_ratio=0.15):
    """Randomly partition ``records`` into train, validation and test lists.

    The input is copied before shuffling, so the caller's list keeps its
    original order. This keeps the cell idempotent when it is re-run in a
    notebook.

    Args:
        records: Sequence of records to split.
        train_ratio: Fraction of records for the training split; the count
            is truncated with ``int``.
        val_ratio: Fraction of records for the validation split; the count
            is truncated with ``int``.

    Returns:
        tuple: ``(train, val, test)`` lists. ``test`` receives whatever is
        left after the train and validation slices.
    """
    # Shuffle a private copy instead of mutating the caller's list in place.
    shuffled = list(records)
    random.shuffle(shuffled)

    total = len(shuffled)
    n_train = int(total * train_ratio)
    n_val = int(total * val_ratio)
    val_end = n_train + n_val

    train = shuffled[:n_train]
    val = shuffled[n_train:val_end]
    test = shuffled[val_end:]
    return train, val, test

def save_json(data, filename):
    """Write ``data`` to ``filename`` as JSON indented by two spaces.

    The file is opened in text write mode, so any existing content at that
    path is replaced.

    Args:
        data: JSON-serializable object to store.
        filename: Destination path.
    """
    with open(filename, mode="w") as out_file:
        json.dump(data, out_file, indent=2)

In [14]:
if __name__ == "__main__":
    # Driver cell: generate the records, split them, and write one JSON file
    # per split.
    print("Generating synthetic personality dataset...")
    # Every sample goes through generate_document, which issues one request
    # via the OpenAI client, so run time grows with n_samples.
    all_records = generate_dataset(n_samples=100)  # you can scale up later
    # Uses split_dataset's defaults: 0.7 train, 0.15 validation, remainder test.
    train, val, test = split_dataset(all_records)

    # Output paths carry no directory component, so they resolve against the
    # current working directory.
    save_json(train, "train_dataset.json")
    save_json(val, "validation_dataset.json")
    save_json(test, "test_dataset.json")

    # Summary of how many records were produced and how they were split.
    print(f"✅ Generated {len(all_records)} samples.")
    print(f"Train: {len(train)}, Validation: {len(val)}, Test: {len(test)}")

Generating synthetic personality dataset...
✅ Generated 100 samples.
Train: 70, Validation: 15, Test: 15
