In [2]:
import csv
import json
import os

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

#### Function to generate batch data

In [3]:
def generate_openai_batch_requests_json(
    csv_file, output_file, start_index, end_index, max_requests
):
    system_prompt = """
    You are an expert annotator. Your task is to classify the following text into one of the following categories:
    1. No Hate/Toxic: The comment contains no hate or toxic speech.
    2. Hate 1: Bias and prejudice (e.g., attitudes or beliefs about a group).
    3. Hate 2: Discrimination (expressing dislike or exclusion but without a call to action).
    4. Hate 3: Threats and violence (contains direct threats, incitement to violence, or serious harm).
    5. Toxic 1: Baseless complaints (complaints or frustrations without direct harm).
    6. Toxic 2: Insults or sarcasm directed at someone (humiliation or belittling without inciting violence).
    7. Toxic 3: Insults with high level of hostility, aggressive, harassment, or threats of harm (incitement to self-harm or targeted harassment).
    Please classify each comment based on the most severe form of speech present, either hate or toxic speech (but not both). The model should internally reason through the following process but only output the final classification (e.g., Toxic 2, Hate 1, etc.).
    Step-by-Step Classification Process:
    1. Identify Target: Determine whether the text targets a specific group or an individual (group = hate, individual = toxic).
    2. Assess Intensity: Evaluate the severity of the statement. Does it express an opinion, incite discrimination, or make threats?
    3. Apply Hate or Toxic Levels: Assign a label based on the most severe behavior present in the text (choose either hate or toxic, not both).
    4. Final Classification: Choose the highest relevant category based on the content.
    Examples:
    Example 1:
    Text: "Foreigners are taking all our jobs."
    Hate 1
    Example 2:
    Text: "You’re an idiot, why do you even try?"
    Toxic 2
    Example 3:
    Text: "I refuse to serve Black people in my restaurant."
    Hate 2
    Example 4:
    Text: "Go end your life, no one cares about you."
    Toxic 3
    Classify the text given by the user, output only the final classification. 
    """

    request_count = 0

    with open(csv_file, mode="r", encoding="utf-8") as file:
        csv_reader = csv.DictReader(file)
        with open(output_file, mode="w", encoding="utf-8") as output:
            for i, row in enumerate(csv_reader):
                # Check if the row falls within the start and end index range
                if i < start_index:
                    continue
                if i >= end_index or request_count >= max_requests:
                    break

                custom_id = f"request-{i + 1}"
                user_prompt = row["text"]
                request = {
                    "custom_id": custom_id,
                    "method": "POST",
                    "url": "/v1/chat/completions",
                    "body": {
                        "model": "gpt-4o-mini",
                        "messages": [
                            {"role": "system", "content": system_prompt},
                            {"role": "user", "content": user_prompt},
                        ],
                        "max_tokens": 10,
                    },
                }
                # Write each request as a separate JSON object on a new line
                output.write(json.dumps(request) + "\n")

                request_count += 1  # Increment the request counter

#### Creating the batch data

In [20]:
# Example usage:
batch_number = 7

generate_openai_batch_requests_json(
    csv_file="clean_concatenated_texts_data.csv",
    output_file=f"batch_requests_{batch_number}.json1",
    start_index=6000,  # Specify start index
    end_index=36000,  # Specify end index
    max_requests=30000,  # Specify max number of requests to generate
)

#### Uploading Your Batch Input File

In [21]:
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY", None)

client = OpenAI(api_key=api_key)

batch_input_file = client.files.create(
    file=open(
        f"C:/Users/richm/OneDrive/Desktop/DSA4264/DSA4264-Detoxify/batch_requests_{batch_number}.json1",
        "rb",
    ),
    purpose="batch",
)

#### Creating the Batch

In [22]:
batch_input_file_id = batch_input_file.id

description = f"classification of toxic/hate comments batch_{batch_number}"

client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={description: description},
)

Batch(id='batch_67063b0fda0c819087744646029c276e', completion_window='24h', created_at=1728461583, endpoint='/v1/chat/completions', input_file_id='file-4XZOyxf2RZfKTzSEpSIWktEg', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1728547983, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'classification of toxic/hate comments batch_7': 'classification of toxic/hate comments batch_7'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

#### Checking the Status of a Batch
https://platform.openai.com/batches/batch_670627cc819c819090732c245d2543c6

In [29]:
# print(client.batches.retrieve("batch_6706205e87808190b0658d09d60b64ae"))
# print(client.batches.retrieve("batch_67062689346081909f6d648882586a9f"))
# print(client.batches.retrieve("batch_67062782c47c8190ad7047745d92dc88"))
# print(client.batches.retrieve("batch_6706352d233481908494cc98ad3722d9"))
# print(client.batches.retrieve("batch_6706354d17c8819090d439ba3c5e794d"))
# print(client.batches.retrieve("batch_670635699a288190bdaee914d22a71c8"))
print(
    client.batches.retrieve("batch_67063b0fda0c819087744646029c276e")
)  # not processed

Batch(id='batch_67063b0fda0c819087744646029c276e', completion_window='24h', created_at=1728461583, endpoint='/v1/chat/completions', input_file_id='file-4XZOyxf2RZfKTzSEpSIWktEg', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1728547983, failed_at=None, finalizing_at=None, in_progress_at=1728461607, metadata={'classification of toxic/hate comments batch_7': 'classification of toxic/hate comments batch_7'}, output_file_id=None, request_counts=BatchRequestCounts(completed=9030, failed=0, total=30000))


#### Retrieving the Results of a Batch

In [27]:
file_response = client.files.content("file-9su2ZTmfcJlzRj7BgJSCeilW")

# Assuming file_response.text contains the full content with multiple JSON objects, one per line
file_content = file_response.text

# Split the content into individual lines (assuming each line is a separate JSON object)
lines = file_content.strip().splitlines()

# Initialize an empty list to store the labels
gold_labels = []

# Iterate through each line and parse the JSON object
for line in lines:
    try:
        # Parse each line as a JSON object
        response = json.loads(line)
        # Navigate to the 'content' field
        gold_label = response["response"]["body"]["choices"][0]["message"]["content"]
        gold_labels.append(gold_label)
    except json.JSONDecodeError as e:
        print(f"JSONDecodeError: {e} on line: {line}")
    except KeyError as e:
        print(f"Missing key in response: {e}")

# Now you have a list of gold labels
print(gold_labels)

['No Hate/Toxic', 'Toxic 2', 'Toxic 2', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'Toxic 2', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'Toxic 1', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'Toxic 2', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'Toxic 1', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'Toxic 1', 'No Hate/Toxic', 'Toxic 2', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'Toxic 1', 'Toxic 2', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No Hate/Toxic', 'No H

In [28]:
# Load the CSV file into a DataFrame
file_path = "clean_concatenated_texts_data_with_labels.csv"
df = pd.read_csv(file_path)

# Assign labels for rows with index 1000 to 1999
df.loc[6000:35999, "gold_label"] = gold_labels[:30000]

# Save the updated DataFrame back to a CSV file, retaining the index
df.to_csv("clean_concatenated_texts_data_with_labels.csv", index=False)

print("Gold labels have been assigned for rows 1000 to 1999, index retained.")

  df = pd.read_csv(file_path)


Gold labels have been assigned for rows 1000 to 1999, index retained.
