### OpenAI 설정

In [3]:
from openai import OpenAI
import os

# Build the shared API client; the key is read from the environment so it
# never appears in the notebook itself.
openai_api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=openai_api_key)

In [7]:
# System prompt used as the "system" message by both the single test call and
# every batch request built later in this notebook. It instructs the model to
# return a JSON object holding the English inputs (`sentences_en`) and their
# Korean translations (`sentences_ko`), and it states that each request
# carries 10 sentences.
translation_system_prompt = '''
You are a professional translator, fluent in both English and Korean.
Your goal is to translate the following 10 sentences from English to Korean.
You will be provided with a list of 10 English sentences, and you will output a json object containing the Korean translation of each sentence.

{
    sentences_en: string[] // Array of 10 English sentences
    sentences_ko: string[] // Array of 10 Korean translations of the English sentences
}

Please produce translations that read naturally in Korean. Use polite, standard Korean (존댓말) and maintain the original context as much as possible.
'''

### OpenAI API 테스트

In [39]:
def get_translated_sentences(sentences_en):
    """Request a translation from the chat-completions API and return the raw reply.

    Args:
        sentences_en: Text sent verbatim as the user message. Callers in this
            notebook pass English sentences joined with newlines.

    Returns:
        The ``content`` of the first choice in the response, unparsed. The
        request asks for the ``json_object`` response format, so this is
        expected to be a JSON string.
    """
    conversation = [
        {"role": "system", "content": translation_system_prompt},
        {"role": "user", "content": sentences_en},
    ]

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0.7,
        response_format={"type": "json_object"},
        messages=conversation,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [40]:
import pandas as pd

# Read the sampled sentence CSV (path is relative to this notebook)
file_path = '../../datasets/csv/combined_ko_sampled300k.csv'
df = pd.read_csv(file_path)

# Print a plain-text preview of the leading rows
preview = df.head()
print(preview)

                                            english
0                                   Serious people.
1                            Could i say something?
2                              I have your daughter
3                       What are we supposed to do?
4  They're not even sure that he knows how to talk.


In [41]:
# Pull the 'english' column out as a plain Python list, then peek at the start
english_column = df['english']
sentences_en = english_column.tolist()
sentences_en[:3]

['Serious people.', 'Could i say something?', 'I have your daughter']

In [42]:
# Smoke-test the prompt on one group of ten sentences (indices 20-29)
sample_sentences = sentences_en[20:30]
sample_text = '\n'.join(sample_sentences)
result = get_translated_sentences(sample_text)
print(result)

{
    "sentences_en": [
        "Sugar is an essential energy source of the brain and...",
        "Take some cheese, it's good.",
        "I really want to lose my inhibitions. You know, be able to talk to strangers. Break the ice.",
        "Well, okay, then.",
        "I need to get out of here.",
        "I want to do it... there.",
        "It is all I can do to keep you alive!",
        "But I couldn't say no.",
        "I got a hundred bucks if you can be here in five. Thanks.",
        "Sometimes it's hard to tell."
    ],
    "sentences_ko": [
        "당은 뇌의 필수적인 에너지원입니다...",
        "치즈 좀 드세요, 맛있어요.",
        "저는 정말로 제 억제를 없애고 싶어요. 아시죠, 낯선 사람과 이야기할 수 있게요. 분위기를 깨고 싶어요.",
        "그럼, 알겠어요.",
        "저는 여기서 나가야 해요.",
        "저는 거기서 하고 싶어요.",
        "당신을 살리는 것이 제가 할 수 있는 전부입니다!",
        "하지만 거절할 수는 없었어요.",
        "5분 안에 여기 올 수 있다면 100달러 드릴게요. 감사합니다.",
        "가끔은 판단하기가 어렵습니다."
    ]
}


### batch 데이터 준비

In [43]:
# Group the sentences into consecutive chunks of ten, each chunk joined into a
# single newline-separated string (one string per API request).
sentence_list = [
    "\n".join(sentences_en[start:start + 10])
    for start in range(0, len(sentences_en), 10)
]

sentence_list[:3]

["Serious people.\nCould i say something?\nI have your daughter\nWhat are we supposed to do?\nThey're not even sure that he knows how to talk.\nHow long until the scanners are back online?\nNow, will you get the luggage off immediately and take it straight to our rooms?\n-What does that mean?\nJohnny, it's seven and a half.\nHe'll get himself killed.",
 'My lord the Prince Alexandre de Grasillac de Morvan Lebro!\nOn the Enterprise, our condition is rapidly worsening.\nThis is the "young little white girl with the big black guys" scene.\nLike you did with us.\nThere was one man who could tilt the balance. - Greetings, Pashabhai.\nQuickly! - Hurry!\nAnd where does the boss man sit?\n- They told you about that, huh?\nGod, please save me\nGlienicke Bridge.',
 "Sugar is an essential energy source of the brain and...\n- Take some cheese, it's good.\nI really want to lose my inhibitions. You know, be able to talk to strangers. Break the ice.\nWell, okay, then.\nI need to get out of here.\nI w

In [44]:
# Build one batch request per ten-sentence chunk. Each request carries the
# same model settings and system prompt as the single-call test above and is
# tagged with a unique custom_id derived from its position.
tasks = []

for idx, sentence10_en in enumerate(sentence_list):
    messages = [
        {"role": "system", "content": translation_system_prompt},
        {"role": "user", "content": sentence10_en},
    ]
    body = {
        "model": "gpt-4o-mini",
        "temperature": 0.7,
        "response_format": {"type": "json_object"},
        "messages": messages,
    }
    task = {
        "custom_id": f"task-{idx}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": body,
    }
    tasks.append(task)
    

In [45]:
# Display the first request as a sanity check on the task structure
tasks[0]

{'custom_id': 'task-0',
 'method': 'POST',
 'url': '/v1/chat/completions',
 'body': {'model': 'gpt-4o-mini',
  'temperature': 0.7,
  'response_format': {'type': 'json_object'},
  'messages': [{'role': 'system',
    'content': '\nYou are a professional translator, fluent in both English and Korean.\nYour goal is to translate the following 10 sentences from English to Korean.\nYou will be provided with a list of 10 English sentences, and you will output a json object containing the Korean translation of each sentence.\n\n{\n    sentences_en: string[] // Array of 10 English sentences\n    sentences_ko: string[] // Array of 10 Korean translations of the English sentences\n}\n\nPlease produce translations that read naturally in Korean. Use polite, standard Korean (존댓말) and maintain the original context as much as possible.\n'},
   {'role': 'user',
    'content': "Serious people.\nCould i say something?\nI have your daughter\nWhat are we supposed to do?\nThey're not even sure that he knows h

### batch 파일 업로드

In [47]:
import json

# Write the requests as JSONL: one JSON-encoded task per line.
file_name = "../../batch_file/batch_en-ko_translation.jsonl"
with open(file_name, "w") as file:
    file.writelines(json.dumps(obj) + "\n" for obj in tasks)

In [48]:
# Upload the JSONL request file with purpose "batch". Opening it in a `with`
# block guarantees the local file handle is closed once the upload call
# returns, instead of staying open for the life of the kernel.
with open(file_name, "rb") as batch_input:
    batch_file = client.files.create(
        file=batch_input,
        purpose="batch"
    )

In [51]:
# Show the uploaded file's metadata returned by the upload call
print(batch_file)

FileObject(id='file-Pk9ui24823feWhd8bjykR8', bytes=47254016, created_at=1735041657, filename='batch_en-ko_translation.jsonl', object='file', purpose='batch', status='processed', status_details=None)


### batch 생성

In [52]:
# Create the batch job from the uploaded file. The endpoint is the same path
# used in the "url" field of every request line; the completion window is 24h.
batch_request_args = {
    "input_file_id": batch_file.id,
    "endpoint": "/v1/chat/completions",
    "completion_window": "24h",
}
batch_job = client.batches.create(**batch_request_args)

In [None]:
# Display the id of the batch job created above
batch_job.id