In [1]:

import asyncio
import json
import os
from typing import List

import openai
import pandas as pd
import yaml
from dotenv import load_dotenv
from openai import AsyncOpenAI
from openai.types.responses import ParsedResponse
from tqdm.asyncio import tqdm

from schema import NerResult

# Load environment variables from the .env file
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")
OPENAI_MODEL = os.getenv("OPENAI_MODEL")
# NOTE: the "REASOINING" spelling is kept as-is because it is the exact key
# looked up in the environment; renaming it would require changing the .env file.
OPENAI_REASOINING_EFFORT = os.getenv("OPENAI_REASOINING_EFFORT")
print(OPENAI_MODEL)

PROMPT_NAME = os.getenv("PROMPT_NAME")
print("Prompt:", PROMPT_NAME)

# Validate the settings *before* building the client, so a missing value fails
# with this message instead of an error raised from inside the client constructor.
if OPENAI_API_KEY is None or OPENAI_BASE_URL is None:
    raise RuntimeError("OPENAI_API_KEY or OPENAI_BASE_URL environment variable is not set")

# Async client used by every request in this notebook
client = AsyncOpenAI(
    base_url=OPENAI_BASE_URL,
    api_key=OPENAI_API_KEY
)

# Example: module-level openai settings.
# NOTE(review): `client` above is configured explicitly and does not read these
# module attributes; confirm whether they are still needed.
openai.api_key = OPENAI_API_KEY
openai.api_base = OPENAI_BASE_URL

# ——————————————
# 2. YAML template loader
def load_prompt_template(yaml_path: str) -> dict:
    """Read the prompt template stored at ``yaml_path``.

    The file is opened as UTF-8 text and parsed with ``yaml.safe_load``;
    whatever the parser produces is returned unchanged.
    """
    with open(yaml_path, mode="r", encoding="utf-8") as template_file:
        return yaml.safe_load(template_file)

# ——————————————
# 3. Async OpenAI call
async def call_openai_for_ner(instruction: str, prompt: str) -> "ParsedResponse[NerResult]":
    """Send one NER request through the Responses API and return the full response.

    Args:
        instruction: Text sent as the ``developer`` message.
        prompt: Text sent as the ``user`` message.

    Returns:
        The response object produced by ``client.responses.parse`` (not a bare
        ``NerResult``). Later cells in this notebook read ``.usage`` from it and
        serialize it with ``.model_dump()``. Per the openai-python documentation
        the parsed ``NerResult`` is available on the response as ``output_parsed``.
    """
    messages = [
        {"role": "developer", "content": instruction},
        {"role": "user", "content": prompt}
    ]
    # Structured-output request: `text_format` asks the SDK to parse the model
    # output into NerResult; the reasoning effort comes from the environment.
    response = await client.responses.parse(
        model=OPENAI_MODEL,
        input=messages,
        text_format=NerResult,
        reasoning={"effort": OPENAI_REASOINING_EFFORT}
    )
    return response

async def extract(text: str, yaml_template_path: str):
    """Run NER extraction for a single document.

    Args:
        text: The document text. It is substituted for the ``{TEXT}``
            placeholder via ``str.replace``, so it must be one string,
            not a list of strings.
        yaml_template_path: Path of the YAML prompt template. The template is
            expected to provide ``prompt.developer`` and ``prompt.user1``.

    Returns:
        Whatever ``call_openai_for_ner`` returns for the rendered prompt.
    """
    # NOTE(review): the template file is re-read on every call; fine for small
    # samples, but consider loading it once if this is run over a full dataset.
    tpl = load_prompt_template(yaml_template_path)
    instruction = tpl['prompt']['developer']
    # Insert the document text into the user part of the YAML template
    prompt = tpl["prompt"]["user1"].replace("{TEXT}", text)
    response = await call_openai_for_ner(instruction, prompt)
    return response

gpt-5-mini-2025-08-07-dev
Prompt: try2


# 1. Generate Sample

In [2]:
# Names that identify the input parquet file and the prompt template.
dataset_name = "NIKL_NEWSPAPER_2023_CSV"
fname = "NEWSPAPER_2022_1"
prompt_path = f"prompt/{PROMPT_NAME}.yaml"

# Load the corpus and show its size and columns.
df = pd.read_parquet(f"source/{dataset_name}/{fname}.parquet")
print(df.shape, df.columns)

(103464, 11) Index(['file_id', 'doc_id', 'title', 'author', 'publisher', 'date', 'topic',
       'original_topic', 'sentence_ids', 'sentence_offsets', 'text'],
      dtype='object')


In [3]:
n = 32
# Fixed seed so a rerun draws the same documents; this keeps cost estimates
# comparable across prompts and models.
SAMPLE_SEED = 42
sample = df.sample(n, random_state=SAMPLE_SEED)
texts = sample["text"].tolist()
print(len(texts))

32


In [4]:
tasks = [extract(text, prompt_path) for text in texts]
responses = await tqdm.gather(*tasks)

100%|██████████| 32/32 [00:33<00:00,  1.05s/it]


# 2. Process Usage
## try2
```
[gpt-5-mini-minimal]
Avg Input Tokens 2756.156 Output Tokens 1714.812
Max Input Tokens 3604.000 Output Tokens 3396.000

gpt-5-mini-minimal per 1K samples Avg $4.119 Max $7.693
NEWSPAPER_2022_1 103464 samples gpt-5-mini-minimal Avg $426.133 Max $795.949

[gpt-5-nano-minimal]
Avg Input Tokens 2807.594 Output Tokens 1174.375
Max Input Tokens 3795.000 Output Tokens 2543.000

gpt-5-nano-minimal per 1K samples Avg $0.610 Max $1.207
NEWSPAPER_2022_1 103464 samples gpt-5-nano-minimal Avg $63.126 Max $124.876
```

## Baseline:

gpt-5-nano (minimal)
```
per 1K samples Avg $1.024 Max $2.723
NEWSPAPER_2022_1 103464 samples Avg $105.896 Max $281.707
```

gpt-5-mini (minimal)
```
gpt-5-mini per 1K samples Avg $5.118 Max $13.614
NEWSPAPER_2022_1 103464 samples gpt-5-mini Avg $529.479 Max $1408.533
```

In [5]:
# Token counts reported in each response's `usage` field.
input_tokens = [response.usage.input_tokens for response in responses]
output_tokens = [response.usage.output_tokens for response in responses]

# Average over the responses actually collected rather than a hard-coded 32,
# so the numbers stay correct when the sample size changes.
num_responses = len(responses)
avg_input_tokens = sum(input_tokens) / num_responses
avg_output_tokens = sum(output_tokens) / num_responses

print(f"Avg Input Tokens {avg_input_tokens:.3f} Output Tokens {avg_output_tokens:.3f}")
print(f"Max Input Tokens {max(input_tokens):.3f} Output Tokens {max(output_tokens):.3f}")

Avg Input Tokens 2756.156 Output Tokens 1714.812
Max Input Tokens 3604.000 Output Tokens 3396.000


In [6]:
# Price table per model; rates are applied per 1M (1_000_000) tokens.
pricings = {
    "gpt-5-mini": {"input": 0.25, "output": 2.0},
    "gpt-5-nano": {"input": 0.05, "output": 0.4},
}


def calculate_price(model, input_tokens, output_tokens, n):
    """Estimate the cost of ``n`` requests with the given per-request token counts.

    Args:
        model: Key into ``pricings``; an unknown key raises ``KeyError``.
        input_tokens: Input tokens per request.
        output_tokens: Output tokens per request.
        n: Number of requests to price.

    Returns:
        Input cost plus output cost, using the per-1M-token rates in ``pricings``.
    """
    tokens_per_million = 1_000_000
    input_cost = n * (input_tokens / tokens_per_million) * pricings[model]["input"]
    output_cost = n * (output_tokens / tokens_per_million) * pricings[model]["output"]
    return input_cost + output_cost

In [7]:
pricing_n = 1000
# Key into `pricings`; change to "gpt-5-nano" when pricing a nano run.
pricing_model = "gpt-5-mini"

# The token counts above were produced by OPENAI_MODEL, so flag a pricing key
# that does not match it instead of silently printing a mislabeled estimate.
if OPENAI_MODEL and not OPENAI_MODEL.startswith(pricing_model):
    print(f"Warning: pricing_model={pricing_model!r} does not match OPENAI_MODEL={OPENAI_MODEL!r}")

avg_price = calculate_price(pricing_model, avg_input_tokens, avg_output_tokens, n=pricing_n)
max_price = calculate_price(pricing_model, max(input_tokens), max(output_tokens), n=pricing_n)

print(f"{pricing_model}-{OPENAI_REASOINING_EFFORT} per 1K samples Avg ${avg_price:.3f} Max ${max_price:.3f}")

gpt-5-mini-minimal per 1K samples Avg $4.119 Max $7.693


In [8]:
# avg_price / max_price are priced per `pricing_n` samples, so scale by
# rows / pricing_n rather than a separate literal that could drift out of sync.
num_rows = df.shape[0]
df_avg_price = (num_rows/pricing_n)*avg_price
df_max_price = (num_rows/pricing_n)*max_price

print(f"{fname} {num_rows} samples {pricing_model}-{OPENAI_REASOINING_EFFORT} Avg ${df_avg_price:.3f} Max ${df_max_price:.3f}")

NEWSPAPER_2022_1 103464 samples gpt-5-mini-minimal Avg $426.133 Max $795.949


In [9]:
# One output folder per prompt / model / reasoning-effort combination.
output_dir = f"sample/{PROMPT_NAME}-{OPENAI_MODEL}-{OPENAI_REASOINING_EFFORT}"
os.makedirs(output_dir, exist_ok=True)

# responses[i] is written under the doc_id of sample row i, so both must have
# the same length for the pairing to be meaningful.
assert len(responses) == sample.shape[0], "responses and sample are out of sync"

for doc_id, response in zip(sample["doc_id"], responses):
    # Explicit UTF-8: ensure_ascii=False writes non-ASCII text verbatim, which
    # would otherwise depend on the platform's default encoding.
    with open(f"{output_dir}/{doc_id}.json", "w", encoding="utf-8") as f:
        json.dump(response.model_dump(), f, ensure_ascii=False, indent=4)