In [1]:
from persona_generation_prompt import PROMPT
import json

In [2]:
# Show the system prompt imported from persona_generation_prompt so the reader can inspect it.
print(PROMPT)


You must generate a set of fictional but realistic Iranian elderly personas that represent cultural, geographical, and social diversity in Iran.

Rules:
- All personas must be elderly (age 65–90).
- Diversity must be reflected across all components, but keep the integrity and realism of each persona.
- Personas should reflect cultural and social realities of Iran.
- Reactions and attitudes do not need to be "correct" or "moral"; they may be shaped by culture, personal experience, or limitations.
- Personality traits and psychological states should be consistent with the individual’s background.
- Use the JSON format provided below.
- For every variable, use values consistent with Iranian context.
- Do not omit any variable.

Variable definitions and accepted values:
Here’s the translation of your variables and values into **English**:

### 1. **Demographic Information**

* **Age**: integer
* **Gender**: ["Male", "Female", "Non-binary", "Other"]
* **Marital Status**: ["Single", "Marrie

In [3]:
from openai import OpenAI
import os

# Endpoint the OpenAI client below is pointed at instead of the SDK's default base URL.
BASE_URL = "https://api.metisai.ir/openai/v1"
# The key is read from the environment rather than hardcoded.
# os.getenv returns None when METIS_API_KEY is not set.
API_KEY = os.getenv("METIS_API_KEY")

# Shared client used by call_llm_batch and poll_batch_status in later cells.
client = OpenAI(
    api_key=API_KEY,
    base_url=BASE_URL,
    http_client=None
)

In [63]:
# Model name passed to call_llm_batch and used as the default for the token-counting helpers.
MODEL_TO_USE = "gpt-5-mini"

# Sampling settings.
# NOTE(review): none of the cells visible here reference these four constants --
# call_llm_batch only puts "model" and "messages" in the request body -- so they
# appear to have no effect on the generated personas. TODO confirm intent.
TEMPERATURE = 1
TOP_P = 0.9
PRESENCE_PENALTY = 0.3
FREQUENCY_PENALTY = 0.4

In [64]:
def messages(persona_count: int):
    """Build the two-message chat payload that requests `persona_count` personas.

    The system message carries PROMPT; the user message states how many
    personas to generate and asks for a bare JSON array with no markdown.

    Args:
        persona_count: Number of personas named in the user instruction.

    Returns:
        A list of two dicts (system, then user), each with "role" and "content".
    """
    system_message = {"role": "system", "content": PROMPT}
    user_message = {
        "role": "user",
        "content": f"Generate {persona_count} personas. Only give the JSON array with no extra text or formatting. Don't wrap the array in markdown formatting.",
    }
    return [system_message, user_message]


In [65]:
def call_llm_batch(model, messages, batch_count=1, batch_file_path="batch_input.jsonl"):
    """
    Submit one Batch API job made of `batch_count` identical chat-completion requests.

    The requests are written to a JSONL file, the file is uploaded, and a single
    batch is created from it. The call returns as soon as the batch is created;
    use `poll_batch_status` to check on it later.

    Args:
        model (str): The model to use.
        messages (list): Chat messages (system and user roles) sent in every request.
        batch_count (int): Number of request lines written to the input file. All of
            them are submitted together as one batch, with custom ids
            "request-1" .. "request-N".
        batch_file_path (str): Path to save the batch input file (overwritten on each call).

    Returns:
        The batch object returned by `client.batches.create`.
    """
    import json

    # Step 1: Prepare the batch input file (one JSON request per line)
    with open(batch_file_path, "w", encoding="utf-8") as f:
        for i in range(batch_count):
            batch_request = {
                "custom_id": f"request-{i+1}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": model,
                    "messages": messages,
                }
            }
            # ensure_ascii=False keeps non-ASCII prompt text readable in the file
            f.write(json.dumps(batch_request, ensure_ascii=False) + "\n")

    # Step 2: Upload the batch input file.
    # The handle is opened in a `with` block so it is closed once the upload
    # returns (or raises) instead of being left to the garbage collector.
    with open(batch_file_path, "rb") as batch_file:
        batch_input_file = client.files.create(
            file=batch_file,
            purpose="batch"
        )

    # Step 3: Create the batch
    batch = client.batches.create(
        input_file_id=batch_input_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": "Batch processing for persona generation"
        }
    )

    return batch

In [66]:
# Token counting helpers using tiktoken
# Requires: pip install tiktoken
import tiktoken
from typing import List, Dict, Any


def num_tokens_from_messages(messages: List[Dict[str, Any]], model: str = MODEL_TO_USE) -> int:
    """Estimate how many tokens a list of chat messages consumes.

    Every message adds a fixed per-message overhead plus the encoded length of
    each of its values; a "name" key adds a further adjustment, and a constant
    3 tokens are added once for the assistant reply priming. The overhead
    values are heuristics, so the result is an estimate for budgeting rather
    than exact accounting.

    Args:
        messages: Chat messages as dicts. Non-string values are serialized
            with json.dumps before being encoded.
        model: Model name used to look up the tiktoken encoding. If the lookup
            fails, the `cl100k_base` encoding is used instead.

    Returns:
        The estimated token count.
    """
    try:
        tokenizer = tiktoken.encoding_for_model(model)
    except Exception:
        # model name unknown to tiktoken: use a generic encoding
        tokenizer = tiktoken.get_encoding("cl100k_base")

    # Two older model snapshots use different overhead values from the rest.
    is_legacy_snapshot = model in ("gpt-3.5-turbo-0301", "gpt-4-0314")
    per_message_overhead = 4 if is_legacy_snapshot else 3
    name_adjustment = -1 if is_legacy_snapshot else 1

    def field_cost(key, value):
        # Anything that is not already text is counted by its JSON form.
        text = value if isinstance(value, str) else json.dumps(value, ensure_ascii=False)
        cost = len(tokenizer.encode(text))
        return cost + name_adjustment if key == "name" else cost

    count = 3  # assistant priming (heuristic)
    for msg in messages:
        count += per_message_overhead + sum(field_cost(k, v) for k, v in msg.items())
    return count


def num_tokens_from_string(s: str, model: str = MODEL_TO_USE) -> int:
    """Count the tokens `s` encodes to under the model's tiktoken encoding.

    Uses the `cl100k_base` encoding when tiktoken cannot resolve `model`.
    """
    try:
        tokenizer = tiktoken.encoding_for_model(model)
    except Exception:
        tokenizer = tiktoken.get_encoding("cl100k_base")
    token_ids = tokenizer.encode(s)
    return len(token_ids)


# One hand-written example persona. It is passed to estimate_persona_tokens in the
# usage example below to estimate how many tokens a single persona JSON takes.
# NOTE(review): the keys are presumably meant to mirror the variables listed in
# PROMPT -- verify against the prompt file.
SAMPLE_PERSONA = {
    "age": 65,
    "gender": "Male",
    "marital_status": "Married",
    "children": "2-3",
    "living_situation": "Living with Family",
    "general_health": "Good",
    "chronic_disease": "High Blood Pressure",
    "mobility": "Independent",
    "hearing_senses": "Good",
    "vision_senses": "Good",
    "daily_energy": "High",
    "personality_type": "ISTJ",
    "cognitive_status": "Healthy Memory",
    "dominant_emotion": "Calm",
    "emotional_intelligence": "High",
    "iq": "Average",
    "attitude_toward_aging": "Acceptance",
    "main_social_role": "Grandfather",
    "social_support": "Large Family",
    "social_participation": "Active",
    "income": "Retirement Pension",
    "economic_decile": 6,
    "housing": "Own Home",
    "religion_and_sect": "Shia Muslim",
    "internalized_moral_traits": ["Respectful", "Reliable", "Generous"],
    "religiosity_level": "Average",
    "ethnicity": "Persian",
    "language": "Persian",
    "important_personal_experiences": "Educational Achievement",
    "life_satisfaction": "Satisfied",
    "meaning_and_purpose_in_old_age": "Helping Family"
}


def estimate_persona_tokens(persona_sample: Dict[str, Any], persona_count: int, model: str = MODEL_TO_USE) -> Dict[str, int]:
    """Estimate the token cost of one persona and of `persona_count` personas.

    The sample is serialized to JSON (non-ASCII characters kept as-is) and
    tokenized once; the total is that single count multiplied by
    `persona_count`.

    Returns:
        A dict with keys "single_persona_tokens" and "total_personas_tokens".
    """
    serialized = json.dumps(persona_sample, ensure_ascii=False)
    per_persona = num_tokens_from_string(serialized, model)

    estimate = {}
    estimate["single_persona_tokens"] = per_persona
    estimate["total_personas_tokens"] = per_persona * persona_count
    return estimate


def estimate_run_tokens(persona_count: int, response_text: str, model: str = MODEL_TO_USE) -> Dict[str, int]:
    """Estimate token usage for one request that asks for `persona_count` personas.

    Returns a dict with:
    - input_tokens: estimate for the `messages(persona_count)` payload
    - output_tokens: tokens in `response_text` (an empty or None response counts as "")
    - total: input_tokens + output_tokens
    """
    prompt_cost = num_tokens_from_messages(messages(persona_count), model)
    reply_cost = num_tokens_from_string(response_text if response_text else "", model)
    return dict(
        input_tokens=prompt_cost,
        output_tokens=reply_cost,
        total=prompt_cost + reply_cost,
    )


# Usage examples. The first two run as-is; the last one stays commented out
# because it needs a `personas` response string to exist in the kernel.
print(num_tokens_from_messages(messages(1)))
print(estimate_persona_tokens(SAMPLE_PERSONA, 50))
# if 'personas' in globals():
#     print(estimate_run_tokens(50, personas))


1321
{'single_persona_tokens': 278, 'total_personas_tokens': 13900}


In [67]:
from openai.types import Batch

def poll_batch_status(batch: Batch):
    """Check a batch once and return its result file content if it has completed.

    This performs a single status lookup; it does not wait or retry. Call it
    again later while the batch is still in progress.

    Args:
        batch: The batch object returned when the batch was created; only its
            `id` is used, and the current state is fetched again from the API.

    Returns:
        The response from `client.files.content(...)` for the output file, or
        for the error file when the completed batch has no output file.
        None when the batch has not completed, has ended in a terminal
        non-completed state, or completed without either file.
    """
    resp = client.batches.retrieve(batch.id)

    if resp.status == "completed":
        # Prefer the output file; fall back to the error file if that is all there is.
        result_file_id = resp.output_file_id or resp.error_file_id
        if result_file_id:
            return client.files.content(result_file_id)
        print("Batch completed but no output_file_id")
        return None

    # These states are final: the batch will never reach "completed", so saying
    # "not completed yet" would invite the caller to keep polling forever.
    if resp.status in ("failed", "expired", "cancelled"):
        print(f"Batch status: {resp.status}. The batch ended without completing and will not finish.")
        errors = getattr(resp, "errors", None)
        if errors:
            print(f"Batch errors: {errors}")
        return None

    print(f"Batch status: {resp.status}. Not completed yet.")
    return None


In [80]:
def parse_response(resp):
    """Extract the assistant message text from each line of a batch output file.

    Args:
        resp: An object whose `.text` is the JSONL batch output (one JSON
            document per line), or a falsy value when there is nothing to parse.

    Returns:
        A list with one string per non-empty line: the content of the first
        choice's message. Empty when `resp` is falsy.

    Raises:
        json.JSONDecodeError: If a non-empty line is not valid JSON.
        KeyError / TypeError / IndexError: If a line does not have the
            response -> body -> choices[0] -> message -> content shape
            (for example a line describing a failed request).
    """
    contents = []
    if not resp:
        return contents

    # Split on "\n" only (not str.splitlines) so unusual line-separator characters
    # inside a JSON string cannot break a record apart. Blank lines are skipped
    # instead of assuming the text ends with exactly one trailing newline, so the
    # final record is kept whether or not a trailing newline is present.
    for raw_line in resp.text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        single_batch_resp = json.loads(line)
        answer = single_batch_resp['response']['body']['choices'][0]['message']['content']
        contents.append(answer)

    return contents

In [None]:
from time import time

def save_batch_output(batch):
    """Fetch a finished batch's answers and save them as JSONL under `personas/`.

    Each answer is expected to be a JSON document (the prompt asks for a bare
    JSON array); it is parsed and re-serialized so every output line is one
    compact JSON value.

    Args:
        batch: The batch object to look up (passed to `poll_batch_status`).

    Returns:
        The path of the written file, or None when the batch has no result yet.

    Raises:
        json.JSONDecodeError: If any answer is not valid JSON. Nothing is
            written in that case.
    """
    import os

    timestamp = time()

    output_path = f"personas/batch_output_{timestamp}.jsonl"
    result = poll_batch_status(batch)
    if not result:
        print("Batch not completed yet or no output available.")
        return

    # Parse every answer before opening the file, so one malformed answer
    # cannot leave a half-written output file behind.
    persona_batches = [json.loads(answer) for answer in parse_response(result)]

    # A fresh checkout may not have the output directory yet.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        for persona_batch in persona_batches:
            # ensure_ascii=False keeps non-ASCII text readable in the UTF-8 file
            # instead of \uXXXX escapes, matching the other json.dumps calls here.
            f.write(json.dumps(persona_batch, ensure_ascii=False) + "\n")

    return output_path

In [69]:
# Smoke test of the batch pipeline: a trivial question submitted as a batch of
# 2 identical requests, to check submission and parsing before the real run.
test_messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant."
    },
    {
        "role": "user",
        "content": "What is the capital of France?"
    }
]

# `resp` holds the created batch object; the next cell polls it.
resp = call_llm_batch(MODEL_TO_USE, test_messages, 2)

In [None]:
# Single status check of the batch in `resp`; re-run this cell until the batch
# has completed, then the parsed answers are printed.
result = poll_batch_status(resp)
if result:
    print(parse_response(result))

['The capital of France is Paris.', 'The capital of France is Paris.']


In [None]:
# Real run: one batch of 20 requests, each asking for 10 personas.
# This overwrites `resp` from the smoke test above.
resp = call_llm_batch(MODEL_TO_USE, messages(10), 20)

In [None]:
# Single status check of the persona batch; a non-None result means its file content is available.
result = poll_batch_status(resp)
result

<openai._legacy_response.HttpxBinaryResponseContent at 0x7358f65638c0>

In [94]:
# Write the persona batch's answers to a timestamped JSONL file under personas/.
# Note that save_batch_output polls the batch again itself rather than reusing `result`.
save_batch_output(resp)