### OpenAI 설정

In [25]:
from openai import OpenAI
import os

# Build the API client, passing the key read from the OPENAI_API_KEY environment
# variable (os.getenv yields None when the variable is unset).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

### Batch 조회

In [30]:
# Fetch the batch job record for the given ID.
batch_job = client.batches.retrieve(
    "batch_id",  # replace with your batch id
)
# Show the job's fields as a plain dict (last expression of the cell is displayed).
dict(batch_job)

{'id': 'batch_676aa2f4fc2081909d1995a11a67a048',
 'completion_window': '24h',
 'created_at': 1735041781,
 'endpoint': '/v1/chat/completions',
 'input_file_id': 'file-Pk9ui24823feWhd8bjykR8',
 'object': 'batch',
 'status': 'completed',
 'cancelled_at': None,
 'cancelling_at': None,
 'completed_at': 1735050539,
 'error_file_id': None,
 'errors': None,
 'expired_at': None,
 'expires_at': 1735128181,
 'failed_at': None,
 'finalizing_at': 1735046710,
 'in_progress_at': 1735041787,
 'metadata': None,
 'output_file_id': 'file-EcAcgAVTSjX8bWgmhSRGxg',
 'request_counts': BatchRequestCounts(completed=30000, failed=0, total=30000)}

### 결과 조회

In [31]:
# ID of the batch's output file; it is only populated once the batch job has finished.
result_file_id = batch_job.output_file_id
print(result_file_id)

if result_file_id is not None:
    # Download the output file's raw content (bytes).
    result = client.files.content(result_file_id).content
else:
    # Reset explicitly so that a stale `result` left over from an earlier run
    # (or an unbound name) is never mistaken for this batch's output.
    result = None
    print("No result file id found")

file-EcAcgAVTSjX8bWgmhSRGxg


In [32]:
# Where the raw batch output (JSON Lines) is stored on disk.
result_file_name = "../../api_output/batch_job_results_translate_en-ko.jsonl"

# Save to file: write the downloaded bytes unchanged.
with open(result_file_name, "wb") as out_file:
    out_file.write(result)

In [33]:
import json

# Parse the saved JSON Lines file into a list with one parsed object per line.
results = []
# The responses contain Korean text, so decode explicitly as UTF-8 instead of
# relying on the platform's default encoding.
with open(result_file_name, "r", encoding="utf-8") as file:
    for line in file:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        if not line.strip():
            continue
        results.append(json.loads(line))

### 결과 읽기 및 저장

In [34]:
# Preview the first five responses.
# The parsed payload is bound to `translation` rather than `result`, so the raw
# downloaded bytes held in `result` are not overwritten by this cell.
for res in results[:5]:
    task_id = res['custom_id']
    # The numeric index is the part of the custom_id after the first '-'.
    idx = int(task_id.split('-')[1])
    # The assistant message content is itself a JSON document; decode it.
    translation = json.loads(res['response']['body']['choices'][0]['message']['content'])
    print(f"Task ID: {task_id}, Index: {idx}\nEn: {translation['sentences_en']}\nKo: {translation['sentences_ko']}")
    

Task ID: task-0, Index: 0
En: ['Serious people.', 'Could I say something?', 'I have your daughter.', 'What are we supposed to do?', "They're not even sure that he knows how to talk.", 'How long until the scanners are back online?', 'Now, will you get the luggage off immediately and take it straight to our rooms?', '-What does that mean?', "Johnny, it's seven and a half.", "He'll get himself killed."]
Ko: ['진지한 사람들입니다.', '제가 한 말씀 드려도 될까요?', '제가 당신의 딸을 데리고 있습니다.', '우리는 무엇을 해야 하나요?', '그들은 그가 말을 할 줄 아는지조차 확신하지 못하고 있습니다.', '스캐너가 다시 작동하기까지 얼마나 걸리나요?', '지금 짐을 즉시 내려서 우리 방으로 바로 가져다 주시겠어요?', '-그게 무슨 뜻이에요?', 'Johnny, 그건 7.5입니다.', '그는 스스로를 위험에 빠뜨릴 것입니다.']
Task ID: task-1, Index: 1
En: ['My lord the Prince Alexandre de Grasillac de Morvan Lebro!', 'On the Enterprise, our condition is rapidly worsening.', "This is the 'young little white girl with the big black guys' scene.", 'Like you did with us.', 'There was one man who could tilt the balance. - Greetings, Pashabhai.', 'Quickly! - Hurry!', 'And w

In [36]:
import csv

# Flatten every batch response into de-duplicated (en, ko) rows of a CSV file.
# newline="" is what the csv module requires for files it writes to, and an
# explicit UTF-8 encoding keeps the Korean text independent of the platform default.
with open("../../datasets/csv/TED2020+OpenSubtitles300k.csv", "w", newline="", encoding="utf-8") as f_out:
    writer = csv.writer(f_out)
    writer.writerow(["en", "ko"])

    # (en, ko) tuples that have already been written.
    seen_pairs = set()

    for res in results:
        try:
            task_id = res['custom_id']
            # The assistant message content is itself a JSON document; a response
            # that was cut short fails to decode here and is reported below.
            parsed = json.loads(res['response']['body']['choices'][0]['message']['content'])
            sentences_en = parsed['sentences_en']
            sentences_ko = parsed['sentences_ko']

            # zip() stops at the shorter list; report the mismatch instead of
            # dropping the extra sentences silently.
            if len(sentences_en) != len(sentences_ko):
                print(
                    f"Warning: {task_id} has {len(sentences_en)} en vs "
                    f"{len(sentences_ko)} ko sentences; extra items are dropped"
                )

            for en, ko in zip(sentences_en, sentences_ko):
                pair = (en, ko)
                if pair not in seen_pairs:
                    writer.writerow([en, ko])
                    seen_pairs.add(pair)
        except (KeyError, IndexError, TypeError, ValueError) as exc:
            # Only data-shape / decoding problems are treated as "skip this record";
            # anything else (e.g. KeyboardInterrupt, I/O errors) still propagates.
            # json.JSONDecodeError is a subclass of ValueError.
            print(f"Skipping response ({exc!r}):")
            print(res)

{'id': 'batch_req_676ab7ea605081908a6c6b066e0a8b34', 'custom_id': 'task-3339', 'response': {'status_code': 200, 'request_id': '29b835cec8a112dc8a78ba34c3199c25', 'body': {'id': 'chatcmpl-Ahy3CRFOY7cNPlpqtwff5bpHcFqSN', 'object': 'chat.completion', 'created': 1735042302, 'model': 'gpt-4o-mini-2024-07-18', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\n    "sentences_en": [\n        "I love my job, being The Flash.",\n        "Maybe I could\'ve forgiven you and helped raise your kid.",\n        "- They all did.",\n        "That\'s a $350 window.",\n        "And yet, across the gulf of space, minds immeasurably superior to ours regarded this earth with envious eyes, and, slowly and surely, they drew their', 'refusal': None}, 'logprobs': None, 'finish_reason': 'content_filter'}], 'usage': {'prompt_tokens': 247, 'completion_tokens': 83, 'total_tokens': 330, 'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0}, 'completion_tokens_details': {'reasoning_to

### Hugging Face Hub에 업로드

In [46]:
from datasets import load_dataset

# Load the generated CSV as the "train" split.
csv_path = "../../datasets/csv/TED2020+OpenSubtitles300k.csv"
datasets = load_dataset("csv", data_files=csv_path, split="train")

# Upload to the Hub as a private dataset repository.
datasets.push_to_hub(
    repo_id="Jooinjang/my_translate_300k_en-ko",
    private=True,
)

Creating parquet from Arrow format: 100%|██████████| 297/297 [00:00<00:00, 1082.03ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.78s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/Jooinjang/my_translate_300k_en-ko/commit/48b24db010b945d3e2762f338fe0aed9ac6daabc', commit_message='Upload dataset', commit_description='', oid='48b24db010b945d3e2762f338fe0aed9ac6daabc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Jooinjang/my_translate_300k_en-ko', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Jooinjang/my_translate_300k_en-ko'), pr_revision=None, pr_num=None)

In [45]:
# Spot-check rows 100-109 of the loaded dataset; the slice is displayed as a
# dict mapping each column name ('en', 'ko') to a list of values.
datasets[100:110]

{'en': ['-For helping you move the bricks!',
  'All the children available for adoption have some genetic advantage.',
  "I'm trying to communicate.",
  "It's at the hotel.",
  'Non, merci, quand meme.',
  "I'm kind of a vegetarian.",
  'So you feel like telling me what happened to you that night now?',
  "It's a blessed spot.",
  'There is a new proconsul in the Romulan Senate.',
  "I made Dimpy lie, goldy's stomach isn't aching I wanted to meet you.."],
 'ko': ['- 벽돌 옮기는 걸 도와주기 위해서요!',
  '입양 가능한 모든 아이들은 유전적으로 어떤 장점이 있어요.',
  '저는 소통하려고 하고 있어요.',
  '그것은 호텔에 있어요.',
  '아니요, 감사합니다, 그럼에도 불구하고.',
  '저는 채식주의자 같은 편이에요.',
  '그래서 이제 그날 밤 당신에게 무슨 일이 있었는지 말해줄 기분인가요?',
  '그곳은 축복받은 장소예요.',
  '로물루스 상원에 새로운 총독이 생겼어요.',
  '제가 딤피에게 거짓말하게 했어요, 골디의 배는 아프지 않아요, 당신을 만나고 싶었어요.']}

In [50]:
from datasets import load_dataset

# Reload the "train" split of the dataset from the Hub.
ds = load_dataset("Jooinjang/my_translate_300k_en-ko", split="train")

# Hold out 10% as a test set. A fixed seed makes the shuffle reproducible across
# runs, and the result is kept in a variable so later cells can actually use it.
ds_split = ds.train_test_split(test_size=0.1, seed=42)
ds_split

DatasetDict({
    train: Dataset({
        features: ['en', 'ko'],
        num_rows: 266773
    })
    test: Dataset({
        features: ['en', 'ko'],
        num_rows: 29642
    })
})