# 보고서 키워드 추출 및 요약 

## 세팅

In [1]:
import openai 
import time 
import json 
import os 
from tqdm import tqdm
from dotenv import load_dotenv


load_dotenv()
openai.api_key = os.getenv('OPENAI_API_KEY')

In [None]:
def load_jsonl_data(path):
    """Load a JSON Lines file into a list of decoded values.

    Args:
        path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A list holding one decoded JSON value per non-blank line, in file
        order. An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Blank or whitespace-only lines (for example a trailing newline left
        # by an editor) are not records; feeding them to json.loads would
        # raise, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]


def save_to_jsonl_file(data, path):
    """Write each item of ``data`` to ``path`` as one JSON document per line.

    The file is opened in write mode with UTF-8 encoding, so existing content
    is replaced. Non-ASCII characters are written as-is rather than escaped.

    Args:
        data: Iterable of JSON-serializable values.
        path: Destination file path.
    """
    with open(path, mode='w', encoding='utf-8') as out:
        for record in data:
            out.write(json.dumps(record, ensure_ascii=False))
            out.write('\n')

## OpenAI API (GPT)

In [2]:
def get_response_gpt(system_message, user_message, model_name, temperature, max_tokens, top_p, retry_attempts=3, delay=5):
    """Request one chat completion and return the reply text.

    Args:
        system_message: Content of the ``system`` chat message.
        user_message: Content of the ``user`` chat message.
        model_name: Model identifier passed to the API as ``model``.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion token limit forwarded to the API.
        top_p: ``top_p`` value forwarded to the API.
        retry_attempts: Maximum number of API calls made before giving up.
        delay: Seconds to sleep between failed attempts.

    Returns:
        The message content of the first choice, or ``None`` when the request
        is rejected as invalid or every attempt fails.
    """
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]

    # Transient failures worth retrying. In the pre-1.0 openai package the
    # rate-limit, timeout, connection and service-unavailable errors are
    # siblings of APIError (all derive from OpenAIError), so catching APIError
    # alone would let them propagate and abort the whole batch.
    retryable_errors = (
        openai.error.APIError,
        openai.error.APIConnectionError,
        openai.error.RateLimitError,
        openai.error.ServiceUnavailableError,
        openai.error.Timeout,
    )

    for attempt in range(1, retry_attempts + 1):
        try:
            response = openai.ChatCompletion.create(
                model=model_name,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
                top_p=top_p,
            )
            return response['choices'][0]['message']['content']
        except openai.error.InvalidRequestError as e:
            # Retrying cannot fix a malformed request, so give up immediately.
            print(f"Invalid request error: {e}")
            return None
        except retryable_errors as e:
            print(f"API error on attempt {attempt}: {e}")
            # No point in waiting once the last attempt has already failed.
            if attempt < retry_attempts:
                time.sleep(delay)

    print("Max retries reached, skipping this data.")
    return None


def save_dataset_gpt(data, save_path, model_name, system_message, user_message, params):
    """Generate a response per document and append results to a JSONL file.

    Records already present in ``save_path`` (matched by their ``idx`` value,
    which is the position of the entry in ``data``) are skipped, so an
    interrupted run can be resumed by calling the function again.

    Args:
        data: Sequence of dicts; each must provide a ``"content"`` key.
        save_path: JSONL file that results are appended to.
        model_name: Model identifier forwarded to ``get_response_gpt``.
        system_message: System prompt, passed through unchanged.
        user_message: Template formatted with ``document=<content>``.
        params: Dict with ``"temperature"``, ``"max_tokens"`` and ``"top_p"``.

    Returns:
        The list of previously saved records followed by the newly created
        ones. A record's ``"response"`` is ``None`` when the request failed.
    """
    dataset = load_jsonl_data(save_path) if os.path.exists(save_path) else []

    print(f"existing dataset length: {len(dataset)}")
    existing_datas = {item['idx'] for item in dataset}

    # Generation settings do not change between documents; read them once.
    temperature = params["temperature"]
    max_tokens = params["max_tokens"]
    top_p = params["top_p"]

    with open(save_path, "a", encoding='utf-8') as f:
        # Wrapping the data itself (rather than the enumerate iterator) lets
        # tqdm pick up the length and display the total and percentage.
        for idx, entry in enumerate(tqdm(data)):
            if idx in existing_datas:
                continue

            doc = entry["content"]
            response = get_response_gpt(
                system_message,
                user_message.format(document=doc),
                model_name,
                temperature,
                max_tokens,
                top_p,
            )

            new_entry = {"idx": idx, "document": doc, "response": response}
            dataset.append(new_entry)

            json.dump(new_entry, f, ensure_ascii=False)
            f.write('\n')
            # Push each record out of the write buffer right away so a crash
            # or interrupt does not lose results the resume logic relies on.
            f.flush()

    return dataset

## run 

In [3]:
# Input documents, output file and model for the plain-text run.
data_path = "document.jsonl"
save_path = "response.jsonl"
model_name = "gpt-4o"

# System prompt (Korean): asks for a keyword list followed by a summary of at
# most three sentences, with no extra commentary.
system_message = (
    "### INSTRUCTION ###\n"
    "당신은 텍스트 분석 및 요약 전문가입니다. "
    "아래 문서에서 가장 핵심적인 키워드들을 추출하여 리스트 형식으로 제공해주세요. "
    "줄바꿈을 하여 문서의 내용을 3문장 이내로 요약해주세요. "
    "요약 시에는 '~이다'와 같은 종결 어미를 사용하세요. "
    "다른 부차적인 말 없이 키워드 및 요약만 추출하세요.\n\n"
    "### OUTPUT FORMAT ###\n"
    "['키워드1', '키워드2', '키워드3', ...]\n"
    "'요약내용'"
)
# The {document} placeholder is filled per entry inside save_dataset_gpt.
user_message = "### DOCUMENT ###\n{document}"

# Generation settings forwarded to the API.
params = dict(temperature=0.5, max_tokens=500, top_p=1)

data = load_jsonl_data(data_path)
dataset = save_dataset_gpt(
    data,
    save_path,
    model_name,
    system_message,
    user_message,
    params,
)
print(f"Dataset processed and saved. Total entries: {len(dataset)}")

existing dataset length: 0


10it [00:27,  2.70s/it]

Dataset processed and saved. Total entries: 10





# json format으로 응답받기 (파싱 용이, 일관된 품질 향상)

## OpenAI API (GPT)

In [None]:
def get_response_gpt(system_message, user_message, model_name, temperature, max_tokens, top_p, retry_attempts=3, delay=5):
    """Request one chat completion in JSON mode and return the reply text.

    Identical in contract to the plain-text variant, except that the request
    carries ``response_format={"type": "json_object"}``.

    Args:
        system_message: Content of the ``system`` chat message.
        user_message: Content of the ``user`` chat message.
        model_name: Model identifier passed to the API as ``model``.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion token limit forwarded to the API.
        top_p: ``top_p`` value forwarded to the API.
        retry_attempts: Maximum number of API calls made before giving up.
        delay: Seconds to sleep between failed attempts.

    Returns:
        The message content of the first choice (a string; it is not parsed
        here), or ``None`` when the request is rejected as invalid or every
        attempt fails.
    """
    json_format = {"type": "json_object"}
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]

    # Transient failures worth retrying. In the pre-1.0 openai package the
    # rate-limit, timeout, connection and service-unavailable errors are
    # siblings of APIError (all derive from OpenAIError), so catching APIError
    # alone would let them propagate and abort the whole batch.
    retryable_errors = (
        openai.error.APIError,
        openai.error.APIConnectionError,
        openai.error.RateLimitError,
        openai.error.ServiceUnavailableError,
        openai.error.Timeout,
    )

    for attempt in range(1, retry_attempts + 1):
        try:
            response = openai.ChatCompletion.create(
                model=model_name,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
                top_p=top_p,
                response_format=json_format,
            )
            return response['choices'][0]['message']['content']
        except openai.error.InvalidRequestError as e:
            # Retrying cannot fix a malformed request, so give up immediately.
            print(f"Invalid request error: {e}")
            return None
        except retryable_errors as e:
            print(f"API error on attempt {attempt}: {e}")
            # No point in waiting once the last attempt has already failed.
            if attempt < retry_attempts:
                time.sleep(delay)

    print("Max retries reached, skipping this data.")
    return None


## run

In [6]:
# Input documents, output file and model for the JSON-mode run.
data_path = "document.jsonl"
save_path = "response_json.jsonl"
model_name = 'gpt-4o'

# System prompt (Korean): asks for a keyword list and a summary of at most
# three sentences, returned as a JSON object.
# The system message is sent verbatim and never goes through str.format, so
# the example object uses single braces; doubled "{{ }}" would reach the model
# literally. The trailing comma is also dropped so the example is valid JSON.
system_message = "### INSTRUCTION ###\n당신은 텍스트 분석 및 요약 전문가입니다. 아래 문서에서 가장 핵심적인 키워드들을 추출하여 리스트 형식으로 만들고 문서의 내용을 3문장 이내로 요약하여 JSON 형태로 제공하세요. 요약 시에는 '~이다'와 같은 종결 어미를 사용하세요.\n\n### OUTPUT FORMAT ###\n{\n  \"keywords\": \"list\",\n \"summary\": \"string\"\n}"
# Only this template is formatted: {document} is filled per entry inside
# save_dataset_gpt.
user_message = "### DOCUMENT ###\n{document}"

# Generation settings forwarded to the API.
params = {
    'temperature': 0.5,
    'max_tokens': 500,
    'top_p': 1,
}

data = load_jsonl_data(data_path)
dataset = save_dataset_gpt(data, save_path, model_name, system_message, user_message, params)
print(f"Dataset processed and saved. Total entries: {len(dataset)}")

existing dataset length: 0


10it [00:32,  3.20s/it]

Dataset processed and saved. Total entries: 10



