In [49]:
import pandas as pd
import tiktoken

import json
import pandas as pd
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from openai import OpenAI
from tqdm import tqdm
import os

In [26]:
# Tokenizer for the 'gpt-4o' model; the token-counting cell calls its encode().
encoder = tiktoken.encoding_for_model(model_name='gpt-4o')

In [55]:
# Load the dataset from the CSV in the working directory; later cells iterate
# over its rows through `df`.
df = pd.read_csv(filepath_or_buffer='Arena_QS_updated.csv')

In [4]:
# Columns joined together when counting tokens: five leading fields, then the
# five question columns, then the five answer columns in the same order.
cols = [
    'group', 'category', 'subcategory', 'source', 'text',
    *(
        f'{kind}_QS{suffix}'
        for suffix in ('', '_ANS')
        for kind in ('WHY', 'WHAT', 'HOW', 'DESCRIBE', 'ANALYZE')
    ),
]

In [25]:
# Count the tokens of every row's selected columns joined by single spaces.
# Missing cells (NaN/None) are skipped and any non-string cell is stringified,
# so a single bad cell cannot abort the whole pass with a TypeError raised by
# str.join. For rows where every cell is a string the joined text is unchanged.
num_tokens = 0
for idx, row in tqdm(df.iterrows(), total=len(df)):
    row_parts = [str(row[col]) for col in cols if pd.notna(row[col])]
    num_tokens += len(encoder.encode(' '.join(row_parts)))

100%|██████████| 7799/7799 [03:29<00:00, 37.26it/s] 


In [26]:
# Display the total token count accumulated by the counting loop.
num_tokens

208935118

In [28]:
# One 0/1 quality score per question kind; each description ties the key to the
# ordinal position of its question-answer pair in the prompt.
response_schemas = [
    ResponseSchema(
        name=schema_name,
        description=f"Quality score (0 or 1) for the {ordinal} question-answer pair.",
    )
    for schema_name, ordinal in (
        ("WHY", "first"),
        ("WHAT", "second"),
        ("HOW", "third"),
        ("DESCRIBE", "fourth"),
        ("ANALYZE", "fifth"),
    )
]

# Parser built from the schemas, and the format-instruction text it generates
# (appended to the end of every user prompt).
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

def format_gpt4o_batch_prompt(row: pd.Series, dataset_name: str, index: int, gpt: str = 'gpt-4o-mini'):
    """
    Build one batch request (a JSON-serialisable dict) that asks the model to
    score the question-answer pairs of a single DataFrame row.

    Every question column ``<KIND>_QS`` is paired with its own answer column
    ``<KIND>_QS_ANS``. A pair is left out of the prompt when either side is
    missing or blank; the remaining pairs keep their original 1-5 numbering so
    the section markers stay aligned with the WHY/WHAT/HOW/DESCRIBE/ANALYZE
    order used by the response schema.

    Args:
        row: Row holding ``text`` plus the ``*_QS`` / ``*_QS_ANS`` columns.
        dataset_name: Prefix of the request's ``custom_id``.
        index: Row identifier, appended to ``custom_id`` after a dash.
        gpt: Model name placed in the request body.

    Returns:
        dict with ``custom_id``, ``method``, ``url`` and ``body`` keys, ready to
        be dumped as one line of a batch JSONL file.

    Note:
        Reads the module-level ``format_instructions`` string, which must be
        defined before this function is called.
    """
    # Derive each (question, answer) column pair from the question kind so a
    # question can never be matched with another kind's answer.
    question_kinds = ("WHY", "WHAT", "HOW", "DESCRIBE", "ANALYZE")
    qa_mapping = [(f"{kind}_QS", f"{kind}_QS_ANS") for kind in question_kinds]

    text = row["text"]

    qa_sections = []
    for i, (q_col, a_col) in enumerate(qa_mapping, start=1):
        question = row.get(q_col, "")
        answer = row.get(a_col, "")
        if pd.isnull(question) or pd.isnull(answer):
            continue
        # Stringify before .strip() so a non-string cell cannot raise here.
        question, answer = str(question), str(answer)
        if not question.strip() or not answer.strip():
            continue
        section = (
            f"=== Сұрақ {i} басталады ===\n"
            f"Сұрақ: {question}\n"
            f"Жауап: {answer}\n"
            f"=== Сұрақ {i} аяқталды ==="
        )
        qa_sections.append(section)

    # Source text, then the delimited pairs, then the grading instruction and
    # the parser's format instructions.
    prompt_content = (
        f"{text}\n\n" +
        "\n\n".join(qa_sections) +
        "\n\n" +
        "Жоғарыдағы мәтін мен сұрақ-жауап жұптарын мұқият оқыңыз. Әр жұптың сапасын бағалаңыз: "
        "егер жұп сапалы болса 1, ал сапасыз болса 0 деп белгілеңіз. "
        "Нәтижені тек төмендегі JSON құрылымында, қосымша түсініктеме бермей көрсетіңіз:\n\n" +
        format_instructions
    )

    payload = {
        "custom_id": f"{dataset_name}-{index}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": gpt,
            "messages": [
                {
                    "role": "system",
                    "content": (
                        "Сіз қазақ тілінде жауап беретін білімді, пайдалы көмекшісіз. "
                        "Мәтін мен сұрақ-жауап жұптарын мұқият оқып, әр жұптың сапасын бағалаңыз. "
                        "Жауаптарыңызды жоғарыда көрсетілген JSON құрылымында, қосымша түсініктеме немесе мәтінсіз беріңіз."
                    )
                },
                {
                    "role": "user",
                    "content": prompt_content
                }
            ],
            "max_tokens": 150,
            "temperature": 0
        }
    }

    return payload


# Write one JSON request per DataFrame row, rolling over to a new
# batch_requests_part_<n>.jsonl file whenever the next line would push the
# current part past max_file_size bytes.
# NOTE(review): 200 MiB presumably mirrors the Batch API input-file size limit;
# the API also caps the number of requests per file, which is not checked here
# — confirm against the current limits.
max_file_size = 200 * 1024 * 1024
file_count = 1
current_file_size = 0

current_filename = f"batch_requests_part_{file_count}.jsonl"
# newline="\n" disables newline translation so the bytes written match the
# byte count tracked in current_file_size on every platform.
current_file = open(current_filename, "w", encoding="utf-8", newline="\n")

try:
    for i, row in df.iterrows():
        payload = format_gpt4o_batch_prompt(row, "extended_law", i)
        json_line = json.dumps(payload, ensure_ascii=False) + "\n"
        line_size = len(json_line.encode("utf-8"))

        # Roll over only when the current part already holds data; otherwise a
        # single oversized request would leave an empty part file behind.
        if current_file_size > 0 and current_file_size + line_size > max_file_size:
            current_file.close()
            file_count += 1
            current_filename = f"batch_requests_part_{file_count}.jsonl"
            current_file = open(current_filename, "w", encoding="utf-8", newline="\n")
            current_file_size = 0

        current_file.write(json_line)
        current_file_size += line_size
finally:
    # Close the open part even if building or writing a request fails.
    current_file.close()

print(f"Created {file_count} file(s).")

Created 6 file(s).


In [29]:
# Upload every generated JSONL part, then create one batch job per uploaded file.
# NOTE(review): `openai_key` is not assigned in any cell visible in this
# notebook — confirm it is defined before this cell runs.
client = OpenAI(api_key=openai_key)

uploaded_file_ids = []
for part in range(1, file_count + 1):
    filename = f"batch_requests_part_{part}.jsonl"
    print(f"Uploading {filename}...")
    # Context manager closes the handle once the upload call returns, so file
    # descriptors are not leaked (one per part).
    with open(filename, "rb") as request_file:
        batch_input_file = client.files.create(
            file=request_file,
            purpose="batch"
        )
    uploaded_file_ids.append(batch_input_file.id)
    print(f"Uploaded file {filename} with ID: {batch_input_file.id}")

# One batch per uploaded file, all targeting the chat-completions endpoint.
batches = []
for file_id in uploaded_file_ids:
    batches.append(client.batches.create(
        input_file_id=file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": "nightly eval job"
        }
    ))

Uploading batch_requests_part_1.jsonl...
Uploaded file batch_requests_part_1.jsonl with ID: file-2E39vqi2NdaHMDS37te9Ne
Uploading batch_requests_part_2.jsonl...
Uploaded file batch_requests_part_2.jsonl with ID: file-V7cFVi5Re3Utq2WAQ9brdm
Uploading batch_requests_part_3.jsonl...
Uploaded file batch_requests_part_3.jsonl with ID: file-CAaa6d1FWByTduLFSwbMqJ
Uploading batch_requests_part_4.jsonl...
Uploaded file batch_requests_part_4.jsonl with ID: file-RTaj3YDgJVU5fe3Ayvq71S
Uploading batch_requests_part_5.jsonl...
Uploaded file batch_requests_part_5.jsonl with ID: file-ADPofjrcH4mP21LS2jM1C7
Uploading batch_requests_part_6.jsonl...
Uploaded file batch_requests_part_6.jsonl with ID: file-F6D7TWACmwrkddxasaLJg7


In [5]:
# Re-create the API client (same call as in the upload cell).
# NOTE(review): `openai_key` is not assigned in any cell visible in this
# notebook — confirm where it is defined.
client = OpenAI(api_key=openai_key)

In [44]:
from datetime import datetime

def format_timestamp(ts):
    """Render a POSIX timestamp as 'YYYY-MM-DD HH:MM:SS' in local time.

    Returns the placeholder "N/A" when ``ts`` is None.
    """
    if ts is None:
        return "N/A"
    moment = datetime.fromtimestamp(ts)
    return moment.strftime('%Y-%m-%d %H:%M:%S')

# List up to six batches, print each one's status and lifecycle timestamps,
# and collect the output file IDs of the batches that already have one.
batches = client.batches.list(limit=6)

output_files = []

for batch in batches.data:
    batch_id = batch.id
    print("Batch ID:", batch_id)

    # Retrieve the batch by ID and report from that object.
    batch_details = client.batches.retrieve(batch_id)
    status = batch_details.status
    print("Status:", status)
    for label, attr in (
        ("Created at:", "created_at"),
        ("In progress at:", "in_progress_at"),
        ("Finalizing at:", "finalizing_at"),
        ("Completed at:", "completed_at"),
        ("Expires at:", "expires_at"),
    ):
        print(label, format_timestamp(getattr(batch_details, attr)))

    output_file_id = batch_details.output_file_id
    if not output_file_id:
        print("Output File ID not available yet for this batch.")
    else:
        print("Output File ID:", output_file_id)
        output_files.append(output_file_id)
    print()

Batch ID: batch_67b5fd2e902881909af9275258629834
Status: completed
Created at: 2025-02-19 15:47:58
In progress at: 2025-02-19 15:48:04
Finalizing at: 2025-02-19 16:44:53
Completed at: 2025-02-19 16:48:43
Expires at: 2025-02-20 15:47:58
Output File ID: file-88EuGsnXxFqbVVBAS3HwCd

Batch ID: batch_67b5fd2dbe8481909e5fed2e2d89072a
Status: completed
Created at: 2025-02-19 15:47:57
In progress at: 2025-02-19 15:48:05
Finalizing at: 2025-02-19 16:43:34
Completed at: 2025-02-19 16:46:36
Expires at: 2025-02-20 15:47:57
Output File ID: file-DRrX99fsFYmRm3MeN2KaQU

Batch ID: batch_67b5fd2d1da88190a8117a3f92f32cf3
Status: completed
Created at: 2025-02-19 15:47:57
In progress at: 2025-02-19 15:48:06
Finalizing at: 2025-02-19 16:56:43
Completed at: 2025-02-19 17:05:34
Expires at: 2025-02-20 15:47:57
Output File ID: file-Q2rouDGautU5xcEWMuAfvA

Batch ID: batch_67b5fd2c6be88190aeb0c3a72ec776ed
Status: completed
Created at: 2025-02-19 15:47:56
In progress at: 2025-02-19 15:48:07
Finalizing at: 2025-02