In [1]:
from pathlib import Path
import json
import time
import datetime
import os
import pandas as pd
from tqdm.auto import tqdm
from openai import OpenAI

In [2]:
# --- System prompt ---
# Sent as the "system" message on every labelling request made by label_text.
# It defines the label set and instructs the model to answer with a JSON
# object of the form {"labels": [...]}, which label_text parses with json.loads.
SYSTEM_PROMPT = """
You are an expert annotator responsible for labeling online messages to support AI systems that detect mental health risks, predatory behavior, and emotional distress. Your output will be used to train models for real-time safety monitoring.

Objective: Create high-quality labeled data for training AI models that can detect predatory behaviors, emotional distress, and mental health risks in online communications.

===========================
1) Label Set (Choose one or more)
===========================

• Suicidal Reference
  - Mentions or implies suicidal thoughts, ideation, or self-harm
  - Example: "Sometimes I think about ending it all."

• Emotional Distress
  - Signs of depression, anxiety, emotional suffering, psychological struggle
  - Example: "I just feel so lost lately, like nothing matters."

• Attacking Behavior
  - Bullying, sexually predatory content, threats, grooming, or intimidation
  - Example: "Don't tell your parents — it's our secret, okay?"

• Safe/Neutral
  - No concerning behavior, emotionally balanced, or informative
  - Example: "I got an A on my exam! So happy right now!"

• Unclear
  - Too vague or ambiguous to confidently assign a label
  - Example: "I'm fine." (without context)

===========================
2) Labeling Instructions
===========================

• Carefully read the message.
• Assign one or more appropriate labels from the Label Set.
• If the message is vague or lacks sufficient context, assign the 'Unclear' label.
• Respond **only** in JSON format with the keys: "labels" (list of strings).
• IMPORTANT: Always return labels as an array of strings, even for single labels.

--- Example Output Format ---
{"labels": ["Emotional Distress"]}
{"labels": ["Attacking Behavior"]}
{"labels": ["Suicidal Reference"]}
{"labels": ["Safe/Neutral"]}
{"labels": ["Unclear"]}
"""

In [3]:
# ── CONFIG ──────────────────────────────────────────────────────────────────────
INPUT_FILE = Path("Twitter.xlsx")      # workbook to label; main() reads every sheet of it
OPENAI_MODEL = "gpt-4o"  # model name passed to client.chat.completions.create in label_text
CHUNK_SIZE   = 10                       # main() rewrites the checkpoint file every CHUNK_SIZE labelled messages

In [4]:
def _get_openai_client():
    """Return a shared OpenAI client, creating it on first use.

    The API key is read from the ``OPENAI_API_KEY`` environment variable so
    that no credential is ever stored in the notebook. The client is cached on
    this function so repeated calls reuse one client instead of building a new
    one per message.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is unset or empty.
    """
    cached = getattr(_get_openai_client, "_client", None)
    if cached is None:
        api_key = os.environ.get("OPENAI_API_KEY")
        if not api_key:
            raise RuntimeError(
                "OPENAI_API_KEY is not set; export it before running this notebook."
            )
        cached = OpenAI(api_key=api_key)
        _get_openai_client._client = cached
    return cached


def label_text(text: str) -> dict:
    """Ask the chat model to label a single message.

    Args:
        text: The message to label. It is converted with ``str`` and stripped
            of surrounding whitespace before being sent as the user message.

    Returns:
        A dict with one key, ``"labels"``, holding a list of strings. When the
        request or the parsing of the reply fails, the error is printed and
        ``{"labels": ["Unclear - API Error"]}`` is returned so the caller can
        keep processing the remaining messages.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is not configured. This is a setup
            problem rather than a per-message failure, so it is not swallowed.
    """
    client = _get_openai_client()
    try:
        clean_text = str(text).strip()
        completion = client.chat.completions.create(
            model=OPENAI_MODEL,
            temperature=0,
            response_format={"type": "json_object"},
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": clean_text},
            ],
        )
        response_content = completion.choices[0].message.content
        result = json.loads(response_content)
        labels = result["labels"]
        # The model may return a bare string instead of a list; wrap it so
        # callers that join the labels do not split it into characters.
        if isinstance(labels, str):
            labels = [labels]
        return {"labels": [str(label) for label in labels]}
    except Exception as e:
        print(f"API error: {e}")
        return {"labels": ["Unclear - API Error"]}

In [5]:
def main(max_rows: "int | None" = None) -> None:
    """Label the messages found in ``INPUT_FILE`` and save them to Excel.

    Every sheet of the workbook is read; the ``"body"`` columns of all sheets
    that have one are concatenated, missing values are dropped, and each
    remaining message is passed to ``label_text``. The labels are stored as a
    comma-separated string in a ``"labels"`` column. A checkpoint file is
    rewritten every ``CHUNK_SIZE`` messages and a final file is written at the
    end; both are placed next to ``INPUT_FILE`` with a timestamp in the name.

    Args:
        max_rows: Optional cap on the number of messages to label (the first
            ``max_rows`` are kept). ``None`` (default) labels every message.

    Returns:
        None. Read/write failures are reported with ``print`` rather than
        raised.
    """
    print(f"Loading data from {INPUT_FILE}…")
    try:
        all_sheets = pd.read_excel(INPUT_FILE, sheet_name=None)
    except Exception as e:
        print(f"Error reading Excel file: {e}")
        return

    # Gather every "body" column we can find
    body_columns = [df["body"] for df in all_sheets.values() if "body" in df.columns]
    if not body_columns:
        # pd.concat raises ValueError on an empty list, so stop with a clear message.
        print(f'No "body" column found in any sheet of {INPUT_FILE}.')
        return

    bodies = pd.concat(body_columns, ignore_index=True).dropna().astype(str)
    if max_rows is not None:
        bodies = bodies.iloc[:max_rows]

    # reset_index gives a 0..n-1 index, which the positional writes below rely on.
    subset_df = bodies.reset_index(drop=True).to_frame(name="body")
    subset_df["labels"] = ""  # placeholder column
    total_rows = len(subset_df)
    print(f"Selected {total_rows} messages for labelling.")

    # Pre-compute filenames (derived from the input name, next to the input file)
    ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    final_outfile = INPUT_FILE.with_name(f"{INPUT_FILE.stem}_labelled_{ts}.xlsx")
    progress_file = INPUT_FILE.with_name(f"{INPUT_FILE.stem}_labelled_cp_{ts}.xlsx")

    print("Starting labelling process…")

    for idx, msg in enumerate(tqdm(subset_df["body"], desc="Labelling", total=total_rows), 1):
        time.sleep(0.2)  # short pause between consecutive API requests

        # Get labels for the message
        rec = label_text(msg)
        labels_list = rec.get("labels", ["Unclear - Processing Error"])
        # Guard the join: a bare string would be split into characters and a
        # non-sequence or non-string item would raise TypeError.
        if isinstance(labels_list, str):
            labels_list = [labels_list]
        elif not isinstance(labels_list, (list, tuple)):
            labels_list = ["Unclear - Processing Error"]
        subset_df.at[idx - 1, "labels"] = ", ".join(str(label) for label in labels_list)

        # Checkpoint save
        if idx % CHUNK_SIZE == 0 or idx == total_rows:
            try:
                subset_df.iloc[:idx].to_excel(progress_file, index=False)
                print(f"✓ Checkpoint: ({idx}/{total_rows}) ➜ {progress_file}")
            except Exception as e:
                print(f"Warning: Could not save checkpoint: {e}")

    # Final save
    try:
        subset_df.to_excel(final_outfile, index=False)
        print(f"✓ Finished. Full file saved ➜ {final_outfile}")
    except Exception as e:
        print(f"Error saving final file: {e}")

In [6]:
# Entry point: run the full labelling pipeline when this cell is executed.
if __name__ == "__main__":
    main()

Loading data from Twitter.xlsx…
Selected 3075 messages for labelling.
Starting labelling process…


Labelling:   0%|          | 0/3075 [00:00<?, ?it/s]

✓ Checkpoint: (10/3075) ➜ Twitter_labelled_cp_20250529_182509.xlsx
✓ Checkpoint: (20/3075) ➜ Twitter_labelled_cp_20250529_182509.xlsx
✓ Checkpoint: (30/3075) ➜ Twitter_labelled_cp_20250529_182509.xlsx
✓ Checkpoint: (40/3075) ➜ Twitter_labelled_cp_20250529_182509.xlsx
✓ Checkpoint: (50/3075) ➜ Twitter_labelled_cp_20250529_182509.xlsx
✓ Checkpoint: (60/3075) ➜ Twitter_labelled_cp_20250529_182509.xlsx
✓ Checkpoint: (70/3075) ➜ Twitter_labelled_cp_20250529_182509.xlsx
✓ Checkpoint: (80/3075) ➜ Twitter_labelled_cp_20250529_182509.xlsx
✓ Checkpoint: (90/3075) ➜ Twitter_labelled_cp_20250529_182509.xlsx
✓ Checkpoint: (100/3075) ➜ Twitter_labelled_cp_20250529_182509.xlsx
✓ Checkpoint: (110/3075) ➜ Twitter_labelled_cp_20250529_182509.xlsx
✓ Checkpoint: (120/3075) ➜ Twitter_labelled_cp_20250529_182509.xlsx
✓ Checkpoint: (130/3075) ➜ Twitter_labelled_cp_20250529_182509.xlsx
✓ Checkpoint: (140/3075) ➜ Twitter_labelled_cp_20250529_182509.xlsx
✓ Checkpoint: (150/3075) ➜ Twitter_labelled_cp_20250529_1