In [2]:
from persona_generation_prompt import PROMPT
import json

In [3]:
# Show the system prompt imported from persona_generation_prompt for inspection.
print(PROMPT)


You must generate a set of fictional but realistic Iranian elderly personas that represent cultural, geographical, and social diversity in Iran.

Rules:
- All personas must be elderly (age 65–90).
- Diversity must be reflected across all components, but keep the integrity and realism of each persona.
- Personas should reflect cultural and social realities of Iran.
- Reactions and attitudes do not need to be "correct" or "moral"; they may be shaped by culture, personal experience, or limitations.
- Personality traits and psychological states should be consistent with the individual’s background.
- Use the JSON format provided below.
- For every variable, use values consistent with Iranian context.
- Do not omit any variable.

Variable definitions and accepted values:
Here’s the translation of your variables and values into **English**:

### 1. **Demographic Information**

* **Age**: integer
* **Gender**: ["Male", "Female", "Non-binary", "Other"]
* **Marital Status**: ["Single", "Marrie

In [4]:
from openai import OpenAI
import os

# Endpoint and credentials for the OpenAI client. The key comes from the
# METIS_API_KEY environment variable and is None when that variable is unset.
BASE_URL = "https://api.metisai.ir/openai/v1"
API_KEY = os.environ.get("METIS_API_KEY")

# Shared client used by every request in this notebook.
client = OpenAI(api_key=API_KEY, base_url=BASE_URL, http_client=None)

In [10]:
# Model identifier sent with each chat-completion request; also the default
# model name for the token-counting helpers.
MODEL_TO_USE: str = "gpt-5-nano"

# Sampling settings. In the visible code call_llm sends only TEMPERATURE;
# TOP_P, PRESENCE_PENALTY and FREQUENCY_PENALTY are not passed to the API.
TEMPERATURE: int = 1
TOP_P: float = 0.9
PRESENCE_PENALTY: float = 0.3
FREQUENCY_PENALTY: float = 0.4

In [28]:
def messages(persona_count: int):
    """Build the chat payload that requests ``persona_count`` personas.

    Returns a two-item list: a system message carrying ``PROMPT`` followed by
    a user message asking for a bare JSON array.
    """
    system_message = {"role": "system", "content": PROMPT}
    user_message = {
        "role": "user",
        "content": f"Generate {persona_count} personas. Only give the JSON array with no extra text or formatting. Don't wrap the array in markdown formatting.",
    }
    return [system_message, user_message]


In [29]:
def call_llm(model, messages):
    """Send ``messages`` to ``model`` through the shared client.

    Returns the raw completion object. Only the temperature is set
    explicitly; top_p, presence_penalty and frequency_penalty are not sent.
    """
    request = {
        "model": model,
        "temperature": TEMPERATURE,
        "messages": messages,
    }
    return client.chat.completions.create(**request)

def parse_response(resp):
    """Return the message content of the first choice in ``resp``."""
    first_choice = resp.choices[0]
    return first_choice.message.content

In [None]:
# Token counting helpers using tiktoken
# Requires: pip install tiktoken
import tiktoken
from typing import List, Dict, Any


def num_tokens_from_messages(messages: List[Dict[str, Any]], model: str = MODEL_TO_USE) -> int:
    """Estimate how many tokens a list of chat messages consumes.

    Every message costs a fixed overhead plus the encoded length of each of
    its values; a ``name`` key adds a per-name adjustment, and a constant is
    added once for the assistant's reply priming. When tiktoken cannot
    resolve ``model``, the ``cl100k_base`` encoding is used instead.

    The result is a heuristic estimate for budgeting and monitoring, not an
    exact count.
    """
    try:
        tokenizer = tiktoken.encoding_for_model(model)
    except Exception:
        # Model name not resolvable by tiktoken: use a generic encoding.
        tokenizer = tiktoken.get_encoding("cl100k_base")

    # Overheads follow public guidance; the two listed snapshots use
    # different per-message and per-name values than everything else.
    is_legacy_snapshot = model in ("gpt-3.5-turbo-0301", "gpt-4-0314")
    per_message = 4 if is_legacy_snapshot else 3
    per_name = -1 if is_legacy_snapshot else 1

    count = 0
    for entry in messages:
        count += per_message
        for field, content in entry.items():
            # Non-string values are serialized to JSON text before encoding.
            text = content if isinstance(content, str) else json.dumps(content, ensure_ascii=False)
            count += len(tokenizer.encode(text))
            if field == "name":
                count += per_name

    # Assistant priming (heuristic).
    return count + 3


def num_tokens_from_string(s: str, model: str = MODEL_TO_USE) -> int:
    """Count the tokens ``s`` encodes to under ``model``'s tokenizer.

    Uses the ``cl100k_base`` encoding when tiktoken cannot resolve ``model``.
    """
    try:
        tokenizer = tiktoken.encoding_for_model(model)
    except Exception:
        tokenizer = tiktoken.get_encoding("cl100k_base")
    token_ids = tokenizer.encode(s)
    return len(token_ids)


# Example persona record, passed to estimate_persona_tokens below to size the
# expected output. Its fields match the first persona in the recorded response
# further down in this notebook.
SAMPLE_PERSONA = {
    "age": 65,
    "gender": "Male",
    "marital_status": "Married",
    "children": "2-3",
    "living_situation": "Living with Family",
    "general_health": "Good",
    "chronic_disease": "High Blood Pressure",
    "mobility": "Independent",
    "hearing_senses": "Good",
    "vision_senses": "Good",
    "daily_energy": "High",
    "personality_type": "ISTJ",
    "cognitive_status": "Healthy Memory",
    "dominant_emotion": "Calm",
    "emotional_intelligence": "High",
    "iq": "Average",
    "attitude_toward_aging": "Acceptance",
    "main_social_role": "Grandfather",
    "social_support": "Large Family",
    "social_participation": "Active",
    "income": "Retirement Pension",
    "economic_decile": 6,
    "housing": "Own Home",
    "religion_and_sect": "Shia Muslim",
    "internalized_moral_traits": ["Respectful", "Reliable", "Generous"],
    "religiosity_level": "Average",
    "ethnicity": "Persian",
    "language": "Persian",
    "important_personal_experiences": "Educational Achievement",
    "life_satisfaction": "Satisfied",
    "meaning_and_purpose_in_old_age": "Helping Family"
}


def estimate_persona_tokens(persona_sample: Dict[str, Any], persona_count: int, model: str = MODEL_TO_USE) -> Dict[str, int]:
    """Estimate the token cost of one persona and of ``persona_count`` copies.

    The sample is serialized to JSON (non-ASCII characters kept as-is) and
    counted once; the total is that count multiplied by ``persona_count``.

    Returns a dict with keys ``single_persona_tokens`` and
    ``total_personas_tokens``.
    """
    serialized = json.dumps(persona_sample, ensure_ascii=False)
    per_persona = num_tokens_from_string(serialized, model)
    estimate = {"single_persona_tokens": per_persona}
    estimate["total_personas_tokens"] = per_persona * persona_count
    return estimate


def estimate_run_tokens(persona_count: int, response_text: str, model: str = MODEL_TO_USE) -> Dict[str, int]:
    """Estimate token usage for one run that requests ``persona_count`` personas.

    Returns a dict with:
    - ``input_tokens``: estimate for the ``messages(persona_count)`` payload
    - ``output_tokens``: tokens in ``response_text`` (a falsy value counts as "")
    - ``total``: the sum of the two
    """
    prompt_tokens = num_tokens_from_messages(messages(persona_count), model)
    completion_tokens = num_tokens_from_string(response_text if response_text else "", model)
    return {
        "input_tokens": prompt_tokens,
        "output_tokens": completion_tokens,
        "total": prompt_tokens + completion_tokens,
    }


# Usage examples: the two prints below run as-is; the commented-out block
# needs `personas` (the response text) to exist, so uncomment it after a run.
print(num_tokens_from_messages(messages(1)))
print(estimate_persona_tokens(SAMPLE_PERSONA, 50))
# if 'personas' in globals():
#     print(estimate_run_tokens(50, personas))


In [30]:
# Smoke test: send a trivial question to confirm the client and model respond.
test_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of France?"},
]
resp = call_llm(MODEL_TO_USE, test_messages)
parse_response(resp)


'Paris.'

In [31]:
# Request 50 personas in a single chat-completion call.
resp = call_llm(MODEL_TO_USE, messages(50))

In [32]:
# Keep the reply text of the first choice; the prompt asks for a bare JSON array.
personas = parse_response(resp)

In [33]:
# Display the raw reply text as returned by the model.
personas

'[\n  {\n    "age": 65,\n    "gender": "Male",\n    "marital_status": "Married",\n    "children": "2-3",\n    "living_situation": "Living with Family",\n    "general_health": "Good",\n    "chronic_disease": "High Blood Pressure",\n    "mobility": "Independent",\n    "hearing_senses": "Good",\n    "vision_senses": "Good",\n    "daily_energy": "High",\n    "personality_type": "ISTJ",\n    "cognitive_status": "Healthy Memory",\n    "dominant_emotion": "Calm",\n    "emotional_intelligence": "High",\n    "iq": "Average",\n    "attitude_toward_aging": "Acceptance",\n    "main_social_role": "Grandfather",\n    "social_support": "Large Family",\n    "social_participation": "Active",\n    "income": "Retirement Pension",\n    "economic_decile": 6,\n    "housing": "Own Home",\n    "religion_and_sect": "Shia Muslim",\n    "internalized_moral_traits": ["Respectful", "Reliable", "Generous"],\n    "religiosity_level": "Average",\n    "ethnicity": "Persian",\n    "language": "Persian",\n    "important

In [34]:
# Pretty-print the parsed personas. An empty reply is treated like a missing
# one (matching the guard used when saving): json.loads("") would raise
# JSONDecodeError instead of reporting that there is nothing to show.
if not personas:
    print("Personas is None")
else:
    print(json.dumps(json.loads(personas), ensure_ascii=False, indent=4))

[
    {
        "age": 65,
        "gender": "Male",
        "marital_status": "Married",
        "children": "2-3",
        "living_situation": "Living with Family",
        "general_health": "Good",
        "chronic_disease": "High Blood Pressure",
        "mobility": "Independent",
        "hearing_senses": "Good",
        "vision_senses": "Good",
        "daily_energy": "High",
        "personality_type": "ISTJ",
        "cognitive_status": "Healthy Memory",
        "dominant_emotion": "Calm",
        "emotional_intelligence": "High",
        "iq": "Average",
        "attitude_toward_aging": "Acceptance",
        "main_social_role": "Grandfather",
        "social_support": "Large Family",
        "social_participation": "Active",
        "income": "Retirement Pension",
        "economic_decile": 6,
        "housing": "Own Home",
        "religion_and_sect": "Shia Muslim",
        "internalized_moral_traits": [
            "Respectful",
            "Reliable",
            "Generous"

In [35]:
import os
from time import time

# Unix time in seconds (a float) keeps each run's output file name distinct.
timestamp = time()

if personas:
    # Parse and re-serialize BEFORE opening the file: a malformed reply then
    # raises here and no empty or truncated file is left behind.
    _serialized_personas = json.dumps(json.loads(personas), ensure_ascii=False)

    # open() does not create missing parent directories, so make sure the
    # output folder exists.
    os.makedirs("personas", exist_ok=True)
    with open(f"personas/personas{timestamp}.json", "w", encoding="utf-8") as f:
        f.write(_serialized_personas)
else:
    print("Personas is None")