In [1]:
import asyncio
from dotenv import load_dotenv
import os
import yaml
from typing import List

import pandas as pd
from openai import AsyncOpenAI
from tqdm.asyncio import tqdm


from schema import NerResult

# Load variables from the .env file into the process environment.
load_dotenv()

# Connection and model settings, all read from environment variables.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")
OPENAI_MODEL = os.getenv("OPENAI_MODEL")
# NOTE: "REASOINING" is a misspelling of "REASONING". The name is kept
# unchanged because other cells reference this constant and the value is read
# from an environment variable with the same spelling.
OPENAI_REASOINING_EFFORT = os.getenv("OPENAI_REASOINING_EFFORT")
print(OPENAI_MODEL, OPENAI_REASOINING_EFFORT)

# Name of the prompt template; later used to build the YAML template path.
PROMPT_NAME = os.getenv("PROMPT_NAME")
print("Prompt:", PROMPT_NAME)

# Validate the required settings BEFORE building the client, so that a missing
# value is reported by this explicit error rather than by whatever the client
# constructor does when it receives None.
if OPENAI_API_KEY is None or OPENAI_BASE_URL is None:
    raise RuntimeError("OPENAI_API_KEY or OPENAI_BASE_URL environment variable is not set")

# Async client used by call_openai_for_ner for every request.
client = AsyncOpenAI(
    base_url=OPENAI_BASE_URL,
    api_key=OPENAI_API_KEY,
)

# Example: module-level openai settings.
# NOTE(review): the visible cells only use `client`; these module-level
# assignments look unused, and `api_base` appears to be the pre-1.0 attribute
# name -- confirm before relying on or removing them.
import openai
openai.api_key = OPENAI_API_KEY
openai.api_base = OPENAI_BASE_URL

# ——————————————
# 2. YAML 템플릿 읽기 함수
def load_prompt_template(yaml_path: str) -> dict:
    """Parse the YAML prompt template stored at ``yaml_path``.

    Args:
        yaml_path: Path of the YAML file; it is opened as UTF-8 text.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content.
    """
    with open(yaml_path, mode="r", encoding="utf-8") as template_file:
        return yaml.safe_load(template_file)

# ——————————————
# 3. OpenAI 비동기 호출 함수
async def call_openai_for_ner(instruction: str, prompt: str) -> NerResult:
    """Send one NER request and return the parsed structured output.

    Args:
        instruction: Content of the message sent with the "developer" role.
        prompt: Content of the message sent with the "user" role.

    Returns:
        ``response.output_parsed`` from ``client.responses.parse``, requested
        with ``text_format=NerResult``.
    """
    # The instruction goes in the "developer" role; no system message is sent.
    developer_message = {"role": "developer", "content": instruction}
    user_message = {"role": "user", "content": prompt}

    # Asynchronous structured-output call through the shared module-level client.
    response = await client.responses.parse(
        model=OPENAI_MODEL,
        input=[developer_message, user_message],
        text_format=NerResult,
        reasoning={"effort": OPENAI_REASOINING_EFFORT},
    )

    # Token usage is printed for every request.
    print(response.usage)
    return response.output_parsed

# —
# 4. 여러 텍스트에 대해 처리하고 DataFrame 변환
async def process_texts(texts: List[str], yaml_template_path: str) -> pd.DataFrame:
    """Run NER on every text concurrently and flatten the entities into a table.

    Args:
        texts: Raw texts to annotate. Each one is substituted for the
            ``{TEXT}`` placeholder of the template's ``prompt.user1`` entry.
        yaml_template_path: Path of the YAML prompt template. It must provide
            ``prompt.developer`` (instruction) and ``prompt.user1`` (user
            prompt containing ``{TEXT}``).

    Returns:
        A DataFrame with one row per extracted entity and the columns
        ``tagged_text``, ``entity_value`` and ``entity_label``. The frame is
        empty (but still has these columns) when ``texts`` is empty or no
        entity was extracted.

    Raises:
        KeyError: If the template lacks ``prompt.developer`` or ``prompt.user1``.
    """
    tpl = load_prompt_template(yaml_template_path)
    instruction = tpl["prompt"]["developer"]
    user_template = tpl["prompt"]["user1"]

    # One coroutine per text; gather runs them concurrently with a progress bar.
    tasks = [
        call_openai_for_ner(instruction, user_template.replace("{TEXT}", txt))
        for txt in texts
    ]
    results = await tqdm.gather(*tasks)

    records = []
    skipped = 0
    for ner_res in results:
        # A response without parsed output yields None; skip it instead of
        # failing the whole batch on attribute access.
        if ner_res is None:
            skipped += 1
            continue
        for ent in ner_res.entities:
            records.append({
                "tagged_text": ner_res.tagged_text,
                "entity_value": ent.value,
                "entity_label": ent.label.value,
                # "entity_sentence": ent.sentence
            })
    if skipped:
        print(f"Skipped {skipped} text(s) with no parsed NER result")

    # Build the frame once, after the loop: it was previously rebuilt on every
    # iteration and left unbound when there were no results at all.
    return pd.DataFrame(records, columns=["tagged_text", "entity_value", "entity_label"])

gpt-5.1 none
Prompt: try2


In [2]:
# Source corpus: dataset directory and the parquet file stem inside it.
dataset_name = "NIKL_NEWSPAPER_2023_CSV"
fname = "NEWSPAPER_2022_1"

parquet_path = f"source/{dataset_name}/{fname}.parquet"
df = pd.read_parquet(parquet_path)
print(df.shape, df.columns)

# YAML prompt template selected through the PROMPT_NAME setting.
prompt_path = f"prompt/{PROMPT_NAME}.yaml"

(103464, 11) Index(['file_id', 'doc_id', 'title', 'author', 'publisher', 'date', 'topic',
       'original_topic', 'sentence_ids', 'sentence_offsets', 'text'],
      dtype='object')


In [3]:
# Number of documents per topic.
df["topic"].value_counts()

topic
사회       33513
경제       16340
정치       15751
생활       15104
스포츠       7564
IT/과학     5248
문화        4048
미용/건강     3819
연예        2077
Name: count, dtype: int64

In [4]:
# Draw a small random sample of documents to send through the NER pipeline.
n = 5
# Fixed seed so that Restart & Run All selects the same documents every time.
SAMPLE_SEED = 42
sample = df.sample(n, random_state=SAMPLE_SEED)
texts = sample["text"].tolist()
print(len(texts))

5


In [5]:
result = await process_texts(texts, prompt_path)

 20%|██        | 1/5 [00:09<00:38,  9.53s/it]

ResponseUsage(input_tokens=2537, input_tokens_details=InputTokensDetails(cached_tokens=0), output_tokens=639, output_tokens_details=OutputTokensDetails(reasoning_tokens=0), total_tokens=3176)


 40%|████      | 2/5 [00:20<00:30, 10.12s/it]

ResponseUsage(input_tokens=2760, input_tokens_details=InputTokensDetails(cached_tokens=2048), output_tokens=1196, output_tokens_details=OutputTokensDetails(reasoning_tokens=0), total_tokens=3956)


 60%|██████    | 3/5 [00:20<00:11,  5.80s/it]

ResponseUsage(input_tokens=2782, input_tokens_details=InputTokensDetails(cached_tokens=0), output_tokens=1535, output_tokens_details=OutputTokensDetails(reasoning_tokens=0), total_tokens=4317)


 80%|████████  | 4/5 [00:22<00:04,  4.08s/it]

ResponseUsage(input_tokens=2642, input_tokens_details=InputTokensDetails(cached_tokens=2048), output_tokens=1424, output_tokens_details=OutputTokensDetails(reasoning_tokens=0), total_tokens=4066)


100%|██████████| 5/5 [00:26<00:00,  5.22s/it]

ResponseUsage(input_tokens=2832, input_tokens_details=InputTokensDetails(cached_tokens=2048), output_tokens=1602, output_tokens_details=OutputTokensDetails(reasoning_tokens=0), total_tokens=4434)





In [6]:
# Tag identifying this run: prompt name, model and reasoning effort.
run_tag = f"{PROMPT_NAME}-{OPENAI_MODEL}-{OPENAI_REASOINING_EFFORT}"
print(run_tag)

# Create the output directory first so the write cannot fail on a fresh
# checkout after the API calls have already been made.
output_dir = "results/sample"
os.makedirs(output_dir, exist_ok=True)

# Tab-separated output without the DataFrame index column.
result.to_csv(f"{output_dir}/{run_tag}.tsv", sep="\t", index=False)

try2-gpt-5.1-none
