### OpenAI 설정

In [5]:
from openai import OpenAI
import os

client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY")
    )

### Batch 검색

In [None]:
batch_job = client.batches.retrieve("batch_id") # replace with your batch id
dict(batch_job)

### 결과 검색

In [45]:
result_file_id = batch_job.output_file_id
print(result_file_id)

if result_file_id is not None: # 배치 작업이 끝나면 result_file_id가 생성됨
    result = client.files.content(result_file_id).content
else:
    print("No result file id found")

file-3Vqc96X5JjZLFdPYJf3gE9


In [46]:
result_file_name = "../../api_output/batch_job_results_translate_241230.jsonl"

# 파일로 저장
with open(result_file_name, "wb") as file:
    file.write(result)

In [47]:
import json

results = []
with open(result_file_name, "r") as file:
    for line in file:
        json_object = json.loads(line)
        results.append(json_object)

### 결과 읽기 및 저장

In [48]:
for res in results[:5]:
    task_id = res['custom_id']
    idx = int(task_id.split('-')[1])
    result = json.loads(res['response']['body']['choices'][0]['message']['content'])
    print(f"Task ID: {task_id}, Index: {idx}\nEn: {result['sentences_en']}\nKo: {result['sentences_ko']}\nSP: {result['sentences_sp']}\nFR: {result['sentences_fr']}\nJP: {result['sentences_ja']}")
    

Task ID: task-0, Index: 0
En: ["Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him.", 'To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink, and celebrity parties.', "At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film 'Hostel: Part II,' currently six places below his number one movie on the UK box office chart.", 'His agent and publicist had no comment on his plans.', 'Despite his growing fame and riches, the actor says he is keeping his feet firmly on the ground.', "At first, it's hard to determine where the people are.", 'He says the arrests often result from confrontations with police.', 'We toured the jail with Leifman.', 'Leifman says about one-third of all people in Miami-Dade county jails are mentally ill.', "Most often, th

In [49]:
import csv

with open("../../datasets/csv/translation_241230.csv", "w") as f_out:
    writer = csv.writer(f_out)
    writer.writerow(["English", "Korean", "Spanish", "French", "Japanese"])
    
    seen_pairs = set()
    
    for res in results:
        try:
            task_id = res['custom_id']
            idx = int(task_id.split('-')[1])
            result = json.loads(res['response']['body']['choices'][0]['message']['content'])
            sentences_en = result['sentences_en']
            sentences_ko = result['sentences_ko']
            sentences_sp = result['sentences_sp']
            sentences_fr = result['sentences_fr']
            sentences_ja = result['sentences_ja']
            
            for en, ko, sp, fr, ja in zip(sentences_en, sentences_ko, sentences_sp, sentences_fr, sentences_ja):
                pair = (en, ko)
                if pair not in seen_pairs:
                    writer.writerow([en, ko, sp, fr, ja])
                    seen_pairs.add(pair)
        except:
            print(res)

{'id': 'batch_req_6772609690bc8190a1f6526d51c8f138', 'custom_id': 'task-31', 'response': {'status_code': 200, 'request_id': '4b7a8c2370232a31f5f6b48f7976f980', 'body': {'id': 'chatcmpl-Ak57GkRL4Oa6XkuTmwcZVBC6Jbjjf', 'object': 'chat.completion', 'created': 1735546118, 'model': 'gpt-4o-mini-2024-07-18', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\n    "sentences_en": [\n        "KANSAS CITY, Missouri (CNN) -- President Bush drew parallels between the aftermath of the Vietnam War and the potential costs of pulling out of Iraq in a speech Wednesday.",\n        "President Bush draws parallels Wednesday between the cost of pulling out of Iraq and \'the tragedy of Vietnam.\'",\n        "\'Three decades later, there is a legitimate debate about how we got into the Vietnam War and how we left,\' Bush told members of the Veterans of Foreign Wars, at their convention in Kansas City, Missouri.",\n        "\'Whatever your position in that debate, one unmistakable legacy

### Huggingface에 업로드

In [None]:
from datasets import load_dataset

datasets = load_dataset("csv", data_files="../../datasets/csv/translation_241230.csv", split="train")

datasets.push_to_hub(repo_id="Jooinjang/translation_241230", private=True)

In [None]:
datasets[100202]

{'English': "It just happens naturally; they don't even try to learn.",
 'Korean': '그것은 자연스럽게 일어납니다. 그들은 배우려고 하지도 않아요.',
 'Spanish': 'Sucede de forma natural; ni siquiera intentan aprender.',
 'French': "Cela se produit naturellement ; ils n'essaient même pas d'apprendre.",
 'Japanese': 'それは自然に起こることです。彼らは学ぼうともしません。'}

In [None]:
from datasets import load_dataset

ds = load_dataset("Jooinjang/translation_241230", split="train")

ds.train_test_split(test_size=0.1)

Generating train split: 100%|██████████| 113597/113597 [00:00<00:00, 360533.10 examples/s]


DatasetDict({
    train: Dataset({
        features: ['English', 'Korean', 'Spanish', 'French', 'Japanese'],
        num_rows: 102237
    })
    test: Dataset({
        features: ['English', 'Korean', 'Spanish', 'French', 'Japanese'],
        num_rows: 11360
    })
})

In [None]:
new_samples = []
for line in ds:
    new_samples.append({
        "input_text": line["English"],
        "output_text": line["Korean"]
    })
    new_samples.append({
        "input_text": line["Korean"],
        "output_text": line["English"]
    })

from datasets import Dataset
new_ds = Dataset.from_list(new_samples)

In [None]:
new_ds = new_ds.shuffle(seed=3407)

new_ds.train_test_split(test_size=0.1).push_to_hub(repo_id="Jooinjang/translation_241230_enko", private=True)

Creating parquet from Arrow format: 100%|██████████| 205/205 [00:02<00:00, 71.68ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:09<00:00,  9.88s/it]
Creating parquet from Arrow format: 100%|██████████| 23/23 [00:00<00:00, 70.64ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.42s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/Jooinjang/translation_241230_enko/commit/88fdd5716b2c89cd955620c40599b1b4db694e89', commit_message='Upload dataset', commit_description='', oid='88fdd5716b2c89cd955620c40599b1b4db694e89', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Jooinjang/translation_241230_enko', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Jooinjang/translation_241230_enko'), pr_revision=None, pr_num=None)

In [51]:
SYSTEM_PROMPT = (
    "You are a highly capable bilingual translation assistant, skilled in both Korean and English."
    "your task is to accurately translate text while preserving meaning, tone, and style. Follow these guidelines carefully:\n\n"
    "1. If the text is in Korean, translate it into English. If the text is in English, translate it into Korean.\n"
    "2. Preserve the original meaning, Maintain the tone and style of the input text.\n"
    "3. **Proper nouns** (e.g. names, places, organizations) should remain in their original form unless widely accepted translations exist."
)


dataset = load_dataset("Jooinjang/translation_241230_enko")['train']

In [None]:
from datasets import Dataset

def temp():
    dataset = load_dataset("Jooinjang/translation_241230_enko")["train"]    
    messeges = []
    for i in dataset:
        messege = []
        messege.append(
            {
                "role": "system",
                "content": SYSTEM_PROMPT,
            }
        )
        messege.append(
            {
                "role": "user",
                "content": "*** Input Text:\n" + i["input_text"],
            }
        )
        messege.append(
            {
                "role": "assistant",
                "content": "*** Translation:\n" + i["output_text"],
            }
        )
        messeges.append(messege)

    return Dataset.from_dict({"messeges": messeges})

In [68]:
ds = temp()

In [70]:
ds['messages'][0]

[{'content': 'You are a highly capable bilingual translation assistant, skilled in both Korean and English.your task is to accurately translate text while preserving meaning, tone, and style. Follow these guidelines carefully:\n\n1. If the text is in Korean, translate it into English. If the text is in English, translate it into Korean.\n2. Preserve the original meaning, Maintain the tone and style of the input text.\n3. **Proper nouns** (e.g. names, places, organizations) should remain in their original form unless widely accepted translations exist.',
  'role': 'system'},
 {'content': "*** Input Text:\n나는 레이첼 요크가 '루시' 역할을 훌륭하게 소화했다고 생각합니다. 그녀는 무엇이든 연기(및 노래)할 수 있는 카멜레온입니다!",
  'role': 'user'},
 {'content': "*** Translation:\nI thought Rachel York was fantastic as 'Lucy.' She is a chameleon who can play (and sing) anything!",
  'role': 'assistant'}]