각 chunk로부터 다양한 질문을 생성해서 JSONL 파일로 저장.

In [None]:
import os
from dotenv import load_dotenv
# Presumably loads variables (e.g. OPENAI_API_KEY, read later via os.getenv)
# from a local .env file into the process environment -- confirm against the
# python-dotenv documentation.
load_dotenv()

In [2]:
from openai import OpenAI
import os
import json
# Fail fast with a clear message when the API key is absent. Writing the
# result of os.getenv() back into os.environ is a no-op when the key exists
# and raises an opaque TypeError (None is not a str) when it does not.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or define it in your .env file."
    )

# Chat model used for question generation.
llm_model = "gpt-4o-2024-08-06"
# Shared client; with no arguments it takes its credentials from the
# environment (see the OpenAI Python library documentation).
client = OpenAI()

In [7]:
# Load the pre-chunked documents: one JSON object per line (JSON Lines).
# The encoding is explicit so non-ASCII text does not depend on the platform
# default, and blank lines (e.g. a trailing newline) are skipped instead of
# crashing json.loads.
with open("../../data/chunked_documents_300.jsonl", encoding="utf-8") as f:
    chunks = [json.loads(line) for line in f if line.strip()]

In [None]:
# Inspect the first loaded chunk record to check its structure.
chunks[0]

In [5]:
# System prompt for the question generator: asks for up to five
# non-duplicate Korean questions related to the supplied reference text,
# returned as a JSON object of the form {"questions": [...]}.
augment_instruct = """
## Role
가상 데이터 생성기

## Instructions
- 주어진 레퍼런스 정보를 보고 레퍼런스와 관련 있는 질문을 중복 없이 한국어로 최대 5개 생성해줘.
- 아래 JSON 포맷으로 생성해줘.

## Output format
{"questions": [$question1, $question2, $question3, $question4, $question5]}
"""

In [6]:
def autogen(messages, llm_model, *, temperature=0.8, timeout=10, seed=1):
    """Send one chat-completion request and return the raw response.

    The request goes through the module-level ``client`` and asks for a
    JSON-object response format.

    Args:
        messages: Chat messages to send (the caller passes a list of
            ``{"role": ..., "content": ...}`` dicts).
        llm_model: Name of the model to query.
        temperature: Sampling temperature forwarded to the API.
        timeout: Request timeout forwarded to the API.
        seed: Sampling seed forwarded to the API.

    Returns:
        The object returned by ``client.chat.completions.create``.

    Raises:
        Any exception raised by the client call is propagated unchanged so
        the caller can decide whether to retry.
    """
    print("Requesting...")
    result = client.chat.completions.create(
        model=llm_model,
        messages=messages,
        temperature=temperature,
        response_format={"type": "json_object"},
        timeout=timeout,
        seed=seed,
    )
    print("Received.")
    return result

In [None]:
# Number of chunks loaded.
len(chunks)

In [None]:
import time

# Index of the next chunk to process. Results are appended to the output
# file chunk by chunk, so an interrupted run can be resumed by setting this
# to the first unprocessed offset.
chunkOffset = 0
max_len = len(chunks)
target_filename = 'questions_from_chunks_5.jsonl'

# Bound the retries per chunk so a permanent failure (bad credentials,
# persistently malformed output, ...) cannot spin forever.
max_attempts = 5
retry_delay_sec = 5

while chunkOffset < max_len:
    chunk = chunks[chunkOffset]

    messages = [
        {"role": "system", "content": augment_instruct},
        {"role": "user", "content": chunk['content']}
    ]

    # The request and the parsing of its payload form one retryable unit:
    # a missing/invalid JSON body or a missing "questions" key is handled
    # the same way as a transport error.
    for attempt in range(1, max_attempts + 1):
        try:
            result = autogen(messages, llm_model)
            questions = json.loads(result.choices[0].message.content)['questions']
            break
        except Exception as e:
            print(f"Exception ocurred. {e}")
            if attempt == max_attempts:
                # Out of attempts: surface the original error to the user.
                raise
            print("Let's retry!")
            time.sleep(retry_delay_sec)

    print(f'chunkOffset: {chunkOffset}, Gen counts: {len(questions)},  Gen questions: {questions}')

    # An explicit raise (rather than assert) so the check survives `python -O`.
    if not questions:
        raise ValueError(f"No questions were generated for chunk {chunkOffset}.")

    # Append one JSON object per question, tagged with its source chunk index.
    # UTF-8 is explicit because ensure_ascii=False emits raw non-ASCII text.
    with open(target_filename, 'a', encoding='utf-8') as f:
        f.writelines(
            json.dumps(
                {"question": question, "chunkOffset": chunkOffset},
                ensure_ascii=False,
            ) + '\n'
            for question in questions
        )

    chunkOffset += 1