In [43]:
from pathlib import Path
import json
import time
import datetime
import os
import pandas as pd
from typing import Final, Dict, List
from tqdm.auto import tqdm
from google import genai
from google.genai import types

In [44]:
# --- Configuration ---
# Instruction text that label_text() prepends to every message before sending
# it to the model. It defines the five-label taxonomy and asks for a JSON reply
# of the form {"labels": [...]}.
# NOTE: this whole string is runtime input to the model; editing any wording
# (including the examples) can change the labels that come back.
SYSTEM_PROMPT = """
You are an expert annotator responsible for labeling online messages to support AI systems that detect mental health risks, predatory behavior, and emotional distress. Your output will be used to train models for real-time safety monitoring.

Objective: Create high-quality labeled data for training AI models that can detect predatory behaviors, emotional distress, and mental health risks in online communications.

===========================
1) Label Set (Choose one or more)
===========================

• Suicidal Reference
  - Mentions or implies suicidal thoughts, ideation, or self-harm
  - Example: "Sometimes I think about ending it all."

• Emotional Distress
  - Signs of depression, anxiety, emotional suffering, psychological struggle
  - Example: "I just feel so lost lately, like nothing matters."

• Attacking Behavior
  - Bullying, sexually predatory content, threats, grooming, or intimidation
  - Example: "Don't tell your parents — it's our secret, okay?"

• Safe/Neutral
  - No concerning behavior, emotionally balanced, or informative
  - Example: "I got an A on my exam! So happy right now!"

• Unclear
  - Too vague or ambiguous to confidently assign a label
  - Example: "I'm fine." (without context)

===========================
2) Labeling Instructions
===========================

• Carefully read the message.
• Assign one or more appropriate labels from the Label Set.
• If the message is vague or lacks sufficient context, assign the 'Unclear' label.
• Respond **only** in JSON format with the keys: "labels" (list of strings).
• IMPORTANT: Always return labels as an array of strings, even for single labels.

--- Example Output Format ---
{"labels": ["Emotional Distress"]}
{"labels": ["Attacking Behavior"]}
{"labels": ["Suicidal Reference"]}
{"labels": ["Safe/Neutral"]}
{"labels": ["Unclear"]}
"""

In [None]:
# Excel workbook holding the messages to label; main() reads every sheet of it
# and writes its output files next to it.
INPUT_FILE = Path("Reddit_data_final.xlsx")
# Model identifier passed to client.models.generate_content() in label_text().
GEMINI_MODEL = "gemini-2.5-flash-preview-05-20"
# Number of processed rows between checkpoint saves in main().
CHUNK_SIZE = 10
# Read the key from the environment so it is never stored in the notebook.
# Raises KeyError if GEMINI_API_KEY is not set.
# NOTE(review): the recorded cell output shows KeyError: 'GEMINI_KEY', which
# does not match this lookup — the saved output looks stale; re-run to confirm.
GEMINI_API_KEY = os.environ["GEMINI_API_KEY"]

KeyError: 'GEMINI_KEY'

In [None]:
# Module-level Gemini client, created once and reused by label_text() for
# every request; authenticated with GEMINI_API_KEY.
client = genai.Client(api_key=GEMINI_API_KEY)

def _parse_labels(raw: str) -> dict[str, list[str]]:
    """Validate the model's JSON reply and normalise it to ``{"labels": [str, ...]}``.

    Returns an ``"Error - Invalid JSON"`` record when ``raw`` is not valid
    JSON, and an ``"Error - Invalid Format"`` record when the decoded value is
    not an object with a ``"labels"`` entry holding a string or a list of
    strings. A bare string is wrapped in a one-element list.
    """
    try:
        result = json.loads(raw)
    except json.JSONDecodeError:
        return {"labels": ["Error - Invalid JSON"]}

    # json.loads may yield a list, str, number, ... — only an object is usable.
    if not isinstance(result, dict) or "labels" not in result:
        return {"labels": ["Error - Invalid Format"]}

    labels = result["labels"]
    if isinstance(labels, str):
        # Tolerate {"labels": "Unclear"}: callers join the list, and joining a
        # bare string would split it into single characters.
        labels = [labels]
    if not isinstance(labels, list) or not all(isinstance(item, str) for item in labels):
        return {"labels": ["Error - Invalid Format"]}
    return {"labels": labels}


def label_text(text: str) -> dict[str, list[str]]:
    """Classify one message with the Gemini model and return its labels.

    The message is appended to ``SYSTEM_PROMPT`` and sent through the
    module-level ``client`` using ``GEMINI_MODEL``, requesting a JSON reply at
    temperature 0.

    Args:
        text: The message to classify. ``None``, empty and whitespace-only
            values are not sent to the API.

    Returns:
        A dict with a single key ``"labels"`` mapping to a list of strings.
        On success these are the labels from the model's reply. Otherwise the
        list holds exactly one diagnostic string:

        * ``"Unclear - Empty Text"`` – nothing to classify.
        * ``"Error - Response Blocked"`` – no candidate was returned, or the
          first candidate's finish reason is not ``"STOP"``.
        * ``"Error - No Content"`` / ``"Error - No Parts"`` /
          ``"Error - No Text"`` – the candidate carried no usable text.
        * ``"Error - Invalid JSON"`` / ``"Error - Invalid Format"`` – the
          reply could not be parsed into a list of string labels.
        * ``"Error - API Failure"`` – any exception raised while calling the
          API or reading the response (the exception is printed).

    This function never raises; every failure is reported through the
    returned labels so a long labelling loop keeps running.
    """
    text = (text or "").strip()
    if not text:
        return {"labels": ["Unclear - Empty Text"]}

    prompt = f"{SYSTEM_PROMPT}\n\nText to classify:\n{text}"

    try:
        resp = client.models.generate_content(
            model=GEMINI_MODEL,
            contents=[prompt],
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                temperature=0,
            ),
        )

        # A response may come back without any candidates; indexing [0]
        # blindly would raise and be misreported as an API failure.
        candidates = resp.candidates
        if not candidates:
            return {"labels": ["Error - Response Blocked"]}
        candidate = candidates[0]

        # Anything other than a normal stop is treated as unusable.
        if candidate.finish_reason != "STOP":
            return {"labels": ["Error - Response Blocked"]}

        content = candidate.content
        if content is None:
            return {"labels": ["Error - No Content"]}

        if not content.parts:
            return {"labels": ["Error - No Parts"]}

        # Only the first part is read.
        response_content = content.parts[0].text
        if response_content is None:
            return {"labels": ["Error - No Text"]}

        return _parse_labels(response_content)

    except Exception as e:
        # Deliberate best-effort: one failed request must not abort the run.
        print(f"API call failed: {e}")
        return {"labels": ["Error - API Failure"]}

In [None]:
def _format_labels(labels: object) -> str:
    """Render a labels value as the comma-separated string stored in the sheet."""
    if labels is None:
        return "Unclear - Processing Error"
    if isinstance(labels, (list, tuple)):
        # str() each item so one non-string label cannot abort the whole run.
        return ", ".join(str(label) for label in labels)
    # A bare string (or other scalar) is written as-is rather than being
    # joined character by character.
    return str(labels)


def main(start: int = 8500, stop: int = 13000, delay: float = 0.2) -> None:
    """Label a slice of the workbook's messages and save the result to Excel.

    Reads every sheet of ``INPUT_FILE``, concatenates all ``body`` columns,
    drops missing values, and labels rows ``start:stop`` of the combined
    series with ``label_text``. A checkpoint workbook is rewritten every
    ``CHUNK_SIZE`` rows (and after the last row); the complete result is
    written to a separate timestamped workbook at the end. Both files are
    created next to ``INPUT_FILE``.

    Args:
        start: Positional offset of the first message to label, counted in
            the combined series after missing values were dropped.
        stop: Positional offset one past the last message to label.
        delay: Seconds to sleep before each API request.

    Returns:
        None. Progress and errors are reported with ``print``; failures to
        read or write files are reported and do not raise.
    """
    print(f"Loading data from {INPUT_FILE}…")
    try:
        all_sheets = pd.read_excel(INPUT_FILE, sheet_name=None)
    except Exception as e:
        print(f"Error reading Excel file: {e}")
        return

    # Gather every "body" column we can find. pd.concat raises ValueError on
    # an empty list, so bail out with a clear message instead.
    body_columns = [df["body"] for df in all_sheets.values() if "body" in df.columns]
    if not body_columns:
        print(f'Error: no sheet in {INPUT_FILE} has a "body" column.')
        return

    bodies = pd.concat(body_columns, ignore_index=True).dropna().astype(str)

    # reset_index gives the subset a 0..n-1 index, which the loop below relies
    # on when writing with .at[row_pos, ...].
    subset_df = bodies.iloc[start:stop].reset_index(drop=True).to_frame(name="body")
    subset_df["labels"] = ""  # placeholder column
    total_rows = len(subset_df)
    print(f"Selected {total_rows} messages for labelling.")
    if total_rows == 0:
        print(f"Nothing to label: rows {start}:{stop} are outside the {len(bodies)} available messages.")
        return

    # Pre-compute filenames
    ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    final_outfile = INPUT_FILE.with_name(f"Data_labelled_{ts}.xlsx")
    progress_file = INPUT_FILE.with_name(f"Data_labelled_cp_{ts}.xlsx")

    print("Starting labelling process…")

    for idx, msg in enumerate(tqdm(subset_df["body"], desc="Labelling", total=total_rows), 1):
        time.sleep(delay)  # pause between consecutive requests

        # Get labels for the message
        rec = label_text(msg)
        labels_value = rec.get("labels", ["Unclear - Processing Error"])
        subset_df.at[idx - 1, "labels"] = _format_labels(labels_value)

        # Checkpoint save: rewrite the rows processed so far.
        if idx % CHUNK_SIZE == 0 or idx == total_rows:
            try:
                subset_df.iloc[:idx].to_excel(progress_file, index=False)
                print(f"✓ Checkpoint: ({idx}/{total_rows}) ➜ {progress_file}")
            except Exception as e:
                # A failed checkpoint is not fatal; labelling continues.
                print(f"Warning: Could not save checkpoint: {e}")

    # Final save
    try:
        subset_df.to_excel(final_outfile, index=False)
        print(f"✓ Finished. Full file saved ➜ {final_outfile}")
    except Exception as e:
        print(f"Error saving final file: {e}")

In [None]:
# Run the labelling job only when this code is executed as the main module,
# not when it is imported.
if __name__ == "__main__":
    main()

Loading data from Reddit_data_final.xlsx…
Selected 4500 messages for labelling.
Starting labelling process…


Labelling:   0%|          | 0/4500 [00:00<?, ?it/s]

✓ Checkpoint: (10/4500) ➜ Data_labelled_cp_20250529_145205.xlsx
✓ Checkpoint: (20/4500) ➜ Data_labelled_cp_20250529_145205.xlsx
✓ Checkpoint: (30/4500) ➜ Data_labelled_cp_20250529_145205.xlsx
✓ Checkpoint: (40/4500) ➜ Data_labelled_cp_20250529_145205.xlsx
✓ Checkpoint: (50/4500) ➜ Data_labelled_cp_20250529_145205.xlsx
✓ Checkpoint: (60/4500) ➜ Data_labelled_cp_20250529_145205.xlsx
✓ Checkpoint: (70/4500) ➜ Data_labelled_cp_20250529_145205.xlsx
✓ Checkpoint: (80/4500) ➜ Data_labelled_cp_20250529_145205.xlsx
✓ Checkpoint: (90/4500) ➜ Data_labelled_cp_20250529_145205.xlsx
✓ Checkpoint: (100/4500) ➜ Data_labelled_cp_20250529_145205.xlsx
✓ Checkpoint: (110/4500) ➜ Data_labelled_cp_20250529_145205.xlsx
✓ Checkpoint: (120/4500) ➜ Data_labelled_cp_20250529_145205.xlsx
✓ Checkpoint: (130/4500) ➜ Data_labelled_cp_20250529_145205.xlsx
✓ Checkpoint: (140/4500) ➜ Data_labelled_cp_20250529_145205.xlsx
✓ Checkpoint: (150/4500) ➜ Data_labelled_cp_20250529_145205.xlsx
✓ Checkpoint: (160/4500) ➜ Data_la