In [1]:
import asyncio as aio
import os
from typing import Optional

import instructor
import pandas as pd
import tqdm.notebook as tqdm
from openai import AsyncOpenAI
from pydantic import BaseModel

# Read the OpenAI API key from a local secret file (kept outside the notebook)
# and expose it through the environment, stripped of surrounding whitespace.
with open("../../openai.secret") as secret_file:
    os.environ.update(OPENAI_API_KEY=secret_file.read().strip())

In [2]:
# CSV file that stores the translations produced with the "loose" prompt.
TRANSLATED_FILE_PATH_LOOSE = "czech-simpleqa_raw_loose.csv"

# "Loose" translation prompt: everything may be translated into Czech.
# Filled in with `str.format(problem=..., answer=...)`, so the only curly braces
# allowed in the text are the `{problem}` and `{answer}` placeholders.
# Every few-shot example follows the exact response format requested in the prompt
# (`translated_problem:` / `translated_answer:` inside a closed code fence).
TRANSLATE_PROMPT_LOOSE = """
You are given a problem text and the correct answer to the problem. Your task is to translate
both the problem and the answer to Czech language. It's okay to change the word order in the
translation to achieve natural sounding Czech sentences but do NOT change the meaning of the text.
Respond in the following format:

translated_problem: your translation of the problem
translated_answer: your translation of the answer

Example 1:
```
problem: During what year did Julie Mehretu first marry Jessica Rankin?
answer: 2008

translated_problem: Během kterého roku se Julie Mehretu poprvé provdala za Jessicu Rankin?
translated_answer: 2008
```

Example 2:
```
problem: Which place in the Jammu division is known as Chota Kashi?
answer: PURMADAL

translated_problem: Které místo v oblasti Jammu je známé jako Chota Kashi?
translated_answer: PURMADAL
```

Example 3:
```
problem: As of 2020, what efficiency rate in percent was achieved by the latest perovskite solar cells?
answer: 25.5

translated_problem: Jakého procenta účinnosti dosáhly v roce 2020 nejnovější perovskitové solární články?
translated_answer: 25.5
```

Example 4:
```
problem: In 1981, in what Broadway musical did Janet Hubert make her debut?
answer: The First

translated_problem: V jakém muzikálu na Broadwayi debutovala v roce 1981 Janet Hubert?
translated_answer: The First
```

Example 5:
```
problem: What was the full name of the first Prime Minister of Congo?
answer: Patrice Émery Lumumba

translated_problem: Jaké bylo celé jméno prvního premiéra Konga?
translated_answer: Patrice Émery Lumumba
```

problem: {problem}
answer: {answer}
"""

In [3]:
# CSV file that stores the translations produced with the "strict" prompt.
TRANSLATED_FILE_PATH_STRICT = "czech-simpleqa_raw_strict.csv"

# "Strict" translation prompt: same as the loose one, plus an instruction to keep
# titles and names (papers, reports, works of art, institutions, places) untranslated
# unless a well-established Czech form exists.
# Filled in with `str.format(problem=..., answer=...)`, so the only curly braces
# allowed in the text are the `{problem}` and `{answer}` placeholders.
# Every few-shot example follows the exact response format requested in the prompt
# (`translated_problem:` / `translated_answer:` inside a closed code fence).
TRANSLATE_PROMPT_STRICT = """
You are given a problem text and the correct answer to the problem. Your task is to translate
both the problem and the answer to Czech language. It's okay to change the word order in the
translation to achieve natural sounding Czech sentences but do NOT change the meaning of the text.
Do NOT translate titles and names of scientific papers, reports, works of arts, institutions, or
places unless they have a well-established Czech translation.

Respond in the following format:

translated_problem: your translation of the problem
translated_answer: your translation of the answer

Example 1:
```
problem: During what year did Julie Mehretu first marry Jessica Rankin?
answer: 2008

translated_problem: Během kterého roku se Julie Mehretu poprvé provdala za Jessicu Rankin?
translated_answer: 2008
```

Example 2:
```
problem: Which place in the Jammu division is known as Chota Kashi?
answer: PURMADAL

translated_problem: Které místo v oblasti Jammu je známé jako Chota Kashi?
translated_answer: PURMADAL
```

Example 3:
```
problem: As of 2020, what efficiency rate in percent was achieved by the latest perovskite solar cells?
answer: 25.5

translated_problem: Jakého procenta účinnosti dosáhly v roce 2020 nejnovější perovskitové solární články?
translated_answer: 25.5
```

Example 4:
```
problem: In 1981, in what Broadway musical did Janet Hubert make her debut?
answer: The First

translated_problem: V jakém muzikálu na Broadwayi debutovala v roce 1981 Janet Hubert?
translated_answer: The First
```

Example 5:
```
problem: In the 2022 research paper titled "Analyzing EEG Data with Machine and Deep Learning: A Benchmark" by Danilo Avola et al., what are the four machine learning models that were used?
answer: MLP, CNN, LSTM, and GRU.

translated_problem: Jaké čtyři modely strojového učení byly použity ve výzkumném článku z roku 2022 s názvem "Analyzing EEG Data with Machine and Deep Learning: A Benchmark" od Danila Avoly a kol.?
translated_answer: MLP, CNN, LSTM a GRU.
```

problem: {problem}
answer: {answer}
"""

In [4]:
# Structured response schema for one translated item; `translate` passes it as
# `response_model` so the client returns an instance of this class.
# NOTE(review): documented with `#` comments rather than a class docstring on purpose —
# pydantic may surface a docstring in the generated schema description; confirm before adding one.
class Translation(BaseModel):
    # Translation of the problem text (the prompts ask for Czech).
    translated_problem: str
    # Translation of the reference answer (the prompts ask for Czech).
    translated_answer: str

# Shared async OpenAI client wrapped by instructor; used by `translate`.
# Presumably authenticates via the OPENAI_API_KEY environment variable set in the first cell — TODO confirm.
client = instructor.from_openai(AsyncOpenAI())

In [5]:
async def translate(
    problem: str,
    answer: str,
    prompt: str,
    model_args: Optional[dict] = None,
) -> Translation:
    """Translate a single problem/answer pair through the chat model.

    Args:
        problem: Original problem text, substituted for `{problem}` in `prompt`.
        answer: Original answer text, substituted for `{answer}` in `prompt`.
        prompt: Template containing `{problem}` and `{answer}` placeholders;
            it is filled in with `str.format`.
        model_args: Optional extra keyword arguments forwarded to
            `client.chat.completions.create`. A `"model"` entry overrides the
            default model (`"gpt-4o"`). `None` means no extra arguments.

    Returns:
        The `Translation` object returned by the client for this pair.
    """
    # Work on a copy so the caller's dict is never mutated, and pull out "model"
    # so it cannot collide with the explicit `model=` keyword below.
    extra_args = dict(model_args) if model_args else {}
    model_name = extra_args.pop("model", "gpt-4o")

    formatted_prompt = prompt.format(
        problem=problem,
        answer=answer,
    )
    return await client.chat.completions.create(
        model=model_name,
        response_model=Translation,
        messages=[{"role": "user", "content": formatted_prompt}],
        **extra_args,
    )

In [6]:
def init(translated_file_path: str) -> None:
    """Create the working CSV for a translation run unless it already exists.

    When `translated_file_path` is absent, the SimpleQA test set is downloaded,
    two placeholder columns (`translated_problem`, `translated_answer`) filled
    with the string "missing" are appended, and the result is written to
    `translated_file_path` without the index. An existing file is left untouched.
    """
    if not os.path.exists(translated_file_path):
        source_set = pd.read_csv(
            "https://openaipublic.blob.core.windows.net/simple-evals/simple_qa_test_set.csv"
        )
        # "missing" marks rows that still need a translation.
        seeded_set = source_set.assign(
            translated_problem="missing",
            translated_answer="missing",
        )
        seeded_set.to_csv(translated_file_path, index=False)
    else:
        print("File exists, nothing to init.")

In [7]:
async def run_translation(
    translated_file_path: str,
    prompt: str,
    model_args: Optional[dict] = None,
    max_tasks: int = 20,
) -> pd.DataFrame:
    """Translate every still-untranslated row of the working CSV, in batches.

    Rows whose `translated_problem` equals the string "missing" are sent to
    `translate` in concurrent batches of `max_tasks`. After each batch the
    data frame is written back to `translated_file_path`, so an interrupted
    run can be resumed by calling this function again.

    Args:
        translated_file_path: CSV to read from and write back to.
        prompt: Prompt template forwarded to `translate`.
        model_args: Optional extra arguments forwarded to `translate`.
        max_tasks: Number of concurrent requests per batch (must be >= 1).

    Returns:
        The data frame with the translations filled in.

    Raises:
        ValueError: If `max_tasks` is smaller than 1.
        BaseException: The first error raised by a request in a batch; it is
            re-raised only after the successful results of that batch were saved.
    """
    if max_tasks < 1:
        raise ValueError("max_tasks must be at least 1")

    czech_simpleqa = pd.read_csv(translated_file_path)

    def save() -> None:
        czech_simpleqa.to_csv(translated_file_path, index=False)

    async def run_batch(batch: list) -> None:
        # return_exceptions=True keeps one failed request from discarding the
        # results of the other requests in the same batch.
        results = await aio.gather(
            *(
                translate(row["problem"], row["answer"], prompt, model_args)
                for _, row in batch
            ),
            return_exceptions=True,
        )

        first_error = None
        for (i, _), result in zip(batch, results):
            if isinstance(result, BaseException):
                if first_error is None:
                    first_error = result
                continue  # row keeps its "missing" marker and is retried on the next run
            czech_simpleqa.loc[i, "translated_problem"] = result.translated_problem
            czech_simpleqa.loc[i, "translated_answer"] = result.translated_answer

        # Persist whatever succeeded before surfacing the failure.
        save()
        if first_error is not None:
            raise first_error

    pending = []
    for i, row in tqdm.tqdm(list(czech_simpleqa.iterrows())):
        if row["translated_problem"] == "missing":
            pending.append((i, row))

        if len(pending) == max_tasks:
            await run_batch(pending)
            pending = []

    # Flush the last, possibly partial (or empty) batch.
    await run_batch(pending)
    return czech_simpleqa

In [12]:
# init(TRANSLATED_FILE_PATH_STRICT)
# await run_translation(TRANSLATED_FILE_PATH_STRICT, TRANSLATE_PROMPT_STRICT)

File exists, nothing to init.


  0%|          | 0/4326 [00:00<?, ?it/s]

Unnamed: 0,metadata,problem,answer,translated_problem,translated_answer
0,"{'topic': 'Science and technology', 'answer_ty...",Who received the IEEE Frank Rosenblatt Award i...,Michio Sugeno,Kdo obdržel v roce 2010 cenu IEEE Frank Rosenb...,Michio Sugeno
1,"{'topic': 'Science and technology', 'answer_ty...",Who was awarded the Oceanography Society's Jer...,Annick Bricaud,Kdo obdržel Jerlovovu cenu od Oceanography Soc...,Annick Bricaud
2,"{'topic': 'Geography', 'answer_type': 'Place',...",What's the name of the women's liberal arts co...,Radcliffe College,Jak se jmenuje ženská vysoká škola svobodných ...,Radcliffe College
3,"{'topic': 'Sports', 'answer_type': 'Person', '...",In whose honor was the Leipzig 1877 tournament...,Adolf Anderssen,Na počest koho byl zorganizován turnaj v Lipsk...,Adolf Anderssen
4,"{'topic': 'Art', 'answer_type': 'Person', 'url...","According to Karl Küchler, what did Empress El...",Poet Henrich Heine.,"Podle Karla Küchlera, co zobrazovala oblíbená ...",Básník Heinrich Heine.
...,...,...,...,...,...
4321,"{'topic': 'Art', 'answer_type': 'Date', 'urls'...","The book ""Rhine"" by Anselm Kiefer is from what...",1981.,"Kniha ""Rhine"" od Anselma Kiefera je z jakého r...",1981.
4322,"{'topic': 'Video games', 'answer_type': 'Perso...",What was the first and last name of the voice ...,Jodelle Ferland,"Jaké bylo křestní a příjmení hlasového herce, ...",Jodelle Ferland
4323,"{'topic': 'Music', 'answer_type': 'Date', 'url...",What month and year was Miranda Lambert's albu...,October 2010,V kterém měsíci a roce získalo album Mirandy L...,říjen 2010
4324,"{'topic': 'Sports', 'answer_type': 'Date', 'ur...","Provide the day, month, and year Gazprom becam...",17th July 2012,"Uveďte den, měsíc a rok, kdy se Gazprom stal o...",17. července 2012


In [16]:
# init(TRANSLATED_FILE_PATH_LOOSE)
# await run_translation(TRANSLATED_FILE_PATH_LOOSE, TRANSLATE_PROMPT_LOOSE)

  0%|          | 0/4326 [00:00<?, ?it/s]

Unnamed: 0,metadata,problem,answer,translated_problem,translated_answer
0,"{'topic': 'Science and technology', 'answer_ty...",Who received the IEEE Frank Rosenblatt Award i...,Michio Sugeno,Kdo obdržel cenu IEEE Frank Rosenblatt v roce ...,Michio Sugeno
1,"{'topic': 'Science and technology', 'answer_ty...",Who was awarded the Oceanography Society's Jer...,Annick Bricaud,Kdo obdržel Jerlovovu cenu společnosti Oceanog...,Annick Bricaud
2,"{'topic': 'Geography', 'answer_type': 'Place',...",What's the name of the women's liberal arts co...,Radcliffe College,Jak se jmenuje ženská vysoká škola svobodných ...,Radcliffe College
3,"{'topic': 'Sports', 'answer_type': 'Person', '...",In whose honor was the Leipzig 1877 tournament...,Adolf Anderssen,Na počest koho byl uspořádán turnaj v Lipsku v...,Adolf Anderssen
4,"{'topic': 'Art', 'answer_type': 'Person', 'url...","According to Karl Küchler, what did Empress El...",Poet Henrich Heine.,"Podle Karla Küchlera, co zobrazovala oblíbená ...",Básníka Heinricha Heineho.
...,...,...,...,...,...
4321,"{'topic': 'Art', 'answer_type': 'Date', 'urls'...","The book ""Rhine"" by Anselm Kiefer is from what...",1981.,"Z jakého roku je kniha ""Rhine"" od Anselma Kief...",1981.
4322,"{'topic': 'Video games', 'answer_type': 'Perso...",What was the first and last name of the voice ...,Jodelle Ferland,"Jaké bylo jméno a příjmení hlasové herečky, kt...",Jodelle Ferland
4323,"{'topic': 'Music', 'answer_type': 'Date', 'url...",What month and year was Miranda Lambert's albu...,October 2010,V kterém měsíci a roce bylo album Mirandy Lamb...,Říjen 2010
4324,"{'topic': 'Sports', 'answer_type': 'Date', 'ur...","Provide the day, month, and year Gazprom becam...",17th July 2012,"Uveďte den, měsíc a rok, kdy se Gazprom stal o...",17. červenec 2012
