In [None]:
import json
import pandas as pd
from openai import OpenAI
import tiktoken
from time import sleep
from dotenv import load_dotenv
from IPython.display import clear_output

In [None]:
# Load environment variables from a .env file before any client is created.
# NOTE(review): presumably this supplies the OpenAI API key used by OpenAI() below — confirm.
load_dotenv()

In [None]:
def load_data(file_path: str) -> tuple:
    """Read the articles CSV and split it into English and non-English rows.

    Args:
        file_path: Path to a CSV file containing an ``article_language`` column.

    Returns:
        A ``(df_eng, df_non_eng)`` tuple of DataFrames. ``df_eng`` holds the rows
        whose ``article_language`` equals ``"English"``; ``df_non_eng`` holds the
        remaining rows. With pandas' default CSV parsing, rows with a missing
        language compare unequal to ``"English"`` and therefore end up in
        ``df_non_eng``.
    """
    df = pd.read_csv(file_path)
    # Compute the mask once so both halves are guaranteed to be complementary.
    is_english = df["article_language"] == "English"
    return df[is_english], df[~is_english]


def create_translation_records(df: pd.DataFrame, model: str = "gpt-4o-mini") -> tuple:
    """Build one chat-completion batch request per row and count prompt tokens.

    Each row is turned into a request dict with ``custom_id``, ``method``,
    ``url`` and ``body`` keys, where ``custom_id`` is the stringified
    ``article_id`` of the row.

    Args:
        df: Frame with ``article_id``, ``article_title``, ``article_text`` and
            ``article_language`` columns.
        model: Model name written into every request body and used to pick the
            tiktoken encoding.

    Returns:
        A ``(records, total_tokens)`` tuple: the list of request dicts and the
        summed token count of the message contents. Only the ``content`` string
        is encoded, so any per-message overhead is not included in the count.
    """
    records = []
    total_tokens = 0
    encoding = tiktoken.encoding_for_model(model)

    for row in df.itertuples():
        # Prepend the title when there is one; otherwise translate the text alone.
        text = (
            f"{row.article_title} {row.article_text}"
            if pd.notna(row.article_title)
            else row.article_text
        )
        message = {
            "role": "system",
            "content": f"Translate the following {row.article_language} text to English. Don't include anything other than the translation: '{text}'.",
        }

        record = {
            "custom_id": str(row.article_id),
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {"model": model, "messages": [message]},
        }

        total_tokens += len(encoding.encode(message["content"]))
        records.append(record)

    # Guard the average: an empty frame would otherwise raise ZeroDivisionError.
    avg_tokens = total_tokens / len(records) if records else 0
    print(f"Total tokens: {total_tokens}")
    print(f"Avg tokens per record: {avg_tokens}")
    return records, total_tokens


def estimate_costs(total_tokens: int, model: str = "gpt-4o-mini") -> dict:
    """Estimate the cost of a translation run from the prompt token count.

    The same ``total_tokens`` figure is used for both the input and the output
    estimate, i.e. the output is assumed to be as long as the input.

    Args:
        total_tokens: Number of prompt tokens across all requests.
        model: Currently unused; the prices below are hard-coded example values
            for gpt-4o-mini. Kept so existing callers continue to work.

    Returns:
        Dict with ``input_cost``, ``output_cost`` and ``total_cost`` keys, each
        a string made of the amount rounded to two decimals followed by ``$``.
    """
    # Example prices in dollars per 1M tokens (not per single token).
    price_per_million_input = 0.075
    price_per_million_output = 0.300

    millions_of_tokens = total_tokens / 1_000_000
    input_cost = round(millions_of_tokens * price_per_million_input, 2)
    output_cost = round(millions_of_tokens * price_per_million_output, 2)
    # Round the sum as well: adding two rounded floats can reintroduce
    # binary floating-point noise in the printed total.
    total_cost = round(input_cost + output_cost, 2)

    return {
        "input_cost": f"{input_cost}$",
        "output_cost": f"{output_cost}$",
        "total_cost": f"{total_cost}$",
    }


def save_records_to_jsonl(records: list, file_name: str = "batch.jsonl") -> None:
    """Write ``records`` to ``file_name`` in JSON Lines form.

    Every record is serialized with ``json.dumps`` and terminated by a newline,
    so the file holds exactly one JSON document per line. An existing file at
    ``file_name`` is overwritten.
    """
    # Lazily serialize; nothing is encoded until the file is open for writing.
    serialized_lines = (json.dumps(entry) + "\n" for entry in records)
    with open(file_name, "w") as handle:
        handle.writelines(serialized_lines)
    print(f"Records saved to {file_name}")


def create_translation_batch(client: OpenAI, file_path: str) -> str:
    """Upload a JSONL request file and start a batch job for it.

    Args:
        client: OpenAI client used for the upload and the batch creation.
        file_path: Path to the JSONL file of chat-completion requests.

    Returns:
        The id of the created batch request.
    """
    # Use a context manager so the file handle is closed once the upload is done
    # (the bare open() previously leaked it).
    with open(file_path, "rb") as batch_file:
        batch_input_file = client.files.create(file=batch_file, purpose="batch")

    batch_request = client.batches.create(
        input_file_id=batch_input_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={"description": "Translation Batch"},
    )
    return batch_request.id


def monitor_batch(client: OpenAI, request_id: str, poll_interval: float = 3) -> None:
    """Poll a batch until it completes, printing its progress in between.

    Args:
        client: OpenAI client used to retrieve the batch.
        request_id: Id of the batch to watch.
        poll_interval: Seconds to wait between two polls.

    Raises:
        RuntimeError: If the batch reaches a terminal status other than
            ``"completed"`` (``"failed"``, ``"expired"`` or ``"cancelled"``).
            Without this check the loop would never end for such batches.
    """
    unsuccessful_terminal_states = {"failed", "expired", "cancelled"}

    while True:
        status = client.batches.retrieve(request_id)
        if status.status == "completed":
            break
        if status.status in unsuccessful_terminal_states:
            raise RuntimeError(
                f"Batch {request_id} ended with status '{status.status}': "
                f"{getattr(status, 'errors', None)}"
            )
        print(f"Status: {status.status}, Request Counts: {status.request_counts}")
        sleep(poll_interval)
        # Wipe the previous status line so the cell output does not grow unbounded.
        clear_output()


def save_batch_output(client: OpenAI, request_id: str, output_file: str) -> None:
    """Download the output file of a batch and store it locally.

    Args:
        client: OpenAI client used to retrieve the batch and its output file.
        request_id: Id of the batch whose results should be saved.
        output_file: Local path the results are written to.

    Raises:
        RuntimeError: If the batch has no output file id, which would otherwise
            surface as an opaque error from the file download call.
    """
    status = client.batches.retrieve(request_id)
    if not status.output_file_id:
        raise RuntimeError(
            f"Batch {request_id} has no output file (status: '{status.status}')."
        )
    file_response = client.files.content(status.output_file_id)
    file_response.write_to_file(output_file)
    print(f"Batch results saved to {output_file}")


def load_translated_data(file_path: str) -> pd.DataFrame:
    """Parse a batch result JSONL file into a frame of translations.

    For every line the ``custom_id`` is taken as ``article_id`` and the content
    of the first choice's message as ``text_translated``.

    Args:
        file_path: Path to the JSONL file downloaded from the batch output.

    Returns:
        DataFrame with the columns ``article_id`` (string, as stored in
        ``custom_id``) and ``text_translated``. The columns are present even
        when the file contains no records.
    """
    translated_data = []
    # Read explicitly as UTF-8 so translated text is not decoded with a
    # platform-specific default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline) instead of
                # failing inside json.loads.
                continue
            content = json.loads(line)
            article_id = content["custom_id"]
            text = content["response"]["body"]["choices"][0]["message"]["content"]
            translated_data.append({"article_id": article_id, "text_translated": text})
    return pd.DataFrame(translated_data, columns=["article_id", "text_translated"])


def merge_translated_data(
    df_eng: pd.DataFrame, df_non_eng: pd.DataFrame, translated_df: pd.DataFrame
) -> pd.DataFrame:
    """Replace non-English texts with their translations and append English rows.

    Args:
        df_eng: English rows; must contain ``article_language``.
        df_non_eng: Non-English rows; must contain ``article_id``,
            ``article_text`` and ``article_language``.
        translated_df: Frame with ``article_id`` and ``text_translated``.

    Returns:
        A new frame with the translated non-English rows first, followed by the
        English rows, without the ``article_language`` column. The join is an
        inner merge, so non-English rows without a translation are dropped.
    """
    # Translated ids come from the batch ``custom_id`` and are strings, while the
    # ids read from the CSV may be numeric; align the key dtype so the merge
    # does not fail on mismatched key types.
    key_dtype = df_non_eng["article_id"].dtype
    if translated_df["article_id"].dtype != key_dtype:
        translated_df = translated_df.assign(
            article_id=translated_df["article_id"].astype(key_dtype)
        )

    df_non_eng = (
        df_non_eng.merge(translated_df, on=["article_id"])
        .drop(["article_text", "article_language"], axis=1)
        # The translation takes over the role of the original text column.
        .rename(columns={"text_translated": "article_text"})
    )
    df_eng = df_eng.drop(["article_language"], axis=1)
    return pd.concat([df_non_eng, df_eng], ignore_index=True)

In [None]:
# Split the raw dataset: only the non-English part is sent for translation.
df_eng, df_non_eng = load_data("../datasets/raw/euvsdisinfo.csv")

In [None]:
# Build one batch request per non-English article and count the prompt tokens.
records, total_tokens = create_translation_records(df_non_eng)

In [None]:
# Rough cost estimate derived from the prompt token count computed above.
costs = estimate_costs(total_tokens)
print("Estimated costs:", costs)

In [None]:
# Split the requests into two halves and write each half to its own JSONL file.
# NOTE(review): presumably done to stay within Batch API size/queue limits — confirm.
batch1 = records[: len(records) // 2]
batch2 = records[len(records) // 2 :]

save_records_to_jsonl(batch1, "batch1.jsonl")
save_records_to_jsonl(batch2, "batch2.jsonl")

In [None]:
# Client shared by all batch cells below; no credentials are passed explicitly.
# NOTE(review): presumably picks up the API key from the environment loaded via load_dotenv() — confirm.
client = OpenAI()

In [None]:
# Upload the first half and start its batch job; keep the id for polling and download.
batch_request_id_1 = create_translation_batch(client, "batch1.jsonl")

In [None]:
# Blocks this cell until the first batch reports "completed".
monitor_batch(client, batch_request_id_1)

In [None]:
# Upload the second half and start its batch job (submitted after the first one was monitored).
batch_request_id_2 = create_translation_batch(client, "batch2.jsonl")

In [None]:
# Blocks this cell until the second batch reports "completed".
monitor_batch(client, batch_request_id_2)

In [None]:
# Download both result files; the batch id is part of the file name so the
# next cell can locate them again.
save_batch_output(client, batch_request_id_1, f"result-{batch_request_id_1}.jsonl")
save_batch_output(client, batch_request_id_2, f"result-{batch_request_id_2}.jsonl")

In [None]:
# Parse both result files and stack them into one frame of translations.
translated_df_1 = load_translated_data(f"result-{batch_request_id_1}.jsonl")
translated_df_2 = load_translated_data(f"result-{batch_request_id_2}.jsonl")
translated_df = pd.concat([translated_df_1, translated_df_2], ignore_index=True)

# Swap the translations into the non-English rows and append the English rows.
final_df = merge_translated_data(df_eng, df_non_eng, translated_df)

In [None]:
# Persist the fully-English dataset. index=False keeps the meaningless RangeIndex
# (created by concat(..., ignore_index=True)) from being written as an extra
# unnamed column.
final_df.to_csv("../datasets/euvsdisinfo_translated.csv", index=False)