In [None]:
!pip install openai

Collecting openai
  Downloading openai-1.68.2-py3-none-any.whl.metadata (25 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.9.0-cp310-cp310-win_amd64.whl.metadata (5.3 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.7-py3-none-any.whl.metadata (21 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.68.2-py3-none-any.whl (606 kB)
   ---------------------------------------- 0.0/606.1 kB ? eta -:--:--
   ---------------------------------------- 606.1/606.1 kB 7.3 MB/s eta 0:00:00
Downloading distro-1.9.0-py3-none-any.whl (20 kB)
Downloading httpx-0.28.1-py3-none-any.whl (73 kB)
Downloading httpcore-1.0.7-py3-none-any.whl 

In [None]:
import pandas as pd

In [None]:
from openai import OpenAI
import json

In [None]:
# Prepare the API client.
# Credentials are read from the environment so they are never stored in the
# notebook (or leaked through version control / shared outputs):
#   OPENAI_API_KEY     - required; a missing key fails here with a KeyError
#   OPENAI_ORG_ID      - optional organization id
#   OPENAI_PROJECT_ID  - optional project id
import os

client = OpenAI(
    organization=os.environ.get("OPENAI_ORG_ID"),
    project=os.environ.get("OPENAI_PROJECT_ID"),
    api_key=os.environ["OPENAI_API_KEY"],
)

In [None]:
# System prompt for speaker attribution: a single note can mix statements from
# several parties, so the model is asked to split it into one entry per speaker
# and to return them as a JSON object under the `utterances` key. The text also
# tells the model to ignore event reminders and form invitations.
prompt_role = """
You will be given a conversation between several parties. Your task is to parse the conversation and discern who said what in a conversation. Possible parties include the following:
- BB (big brother);
- BS (big sister);
- LB (little brother);
- LS (little sister);
- PG (parents)
- BIG (big brother and sister);
- BBBS (big brother and big sister);
- Little (little brother and sister);

Note that these parties may be referred to by other labels (e.g., different capitalizations). The conversation may also include other parties not listed above. Please use your best judgment to determine who said what.

Each conversation can have multiple parties saying multiple things. Make sure to extract all of them, not just the first one. Return your result as a JSON object with a key `utterances`, which is a list of all parsed speaker-text pairs.

Also, some of the conversations may include event reminders or inviting them to fill out event forms. If you detect those sort of things, try to ignore them and focus on what the parties describe.
"""

In [None]:
# Structured-output specification handed to the model call: the reply must be a
# JSON object holding an `utterances` array, each element carrying exactly the
# string fields `who` and `what` (no extra properties, strict validation).
output_format = dict(
    format=dict(
        type="json_schema",
        name="role_identification",
        schema=dict(
            type="object",
            properties=dict(
                utterances=dict(
                    type="array",
                    items=dict(
                        type="object",
                        properties=dict(
                            who=dict(type="string"),
                            what=dict(type="string"),
                        ),
                        required=["who", "what"],
                        additionalProperties=False,
                    ),
                ),
            ),
            required=["utterances"],
            additionalProperties=False,
        ),
        strict=True,
    ),
)

In [None]:
from typing import List, Dict
from tqdm import tqdm

# Function to call the model for a single conversation
def extract_utterances_from_conversation(
    client,
    conversation: str,
    prompt_role: str,
    output_format: Dict,
    model: str = "gpt-4o",
) -> Dict:
    """Ask the model to split one conversation note into speaker/utterance pairs.

    Args:
        client: Object exposing ``responses.create(...)`` (the OpenAI client).
        conversation: Raw text of a single note.
        prompt_role: System prompt sent ahead of the note.
        output_format: Value forwarded as the ``text=`` argument of the request
            (the structured-output specification).
        model: Model name used for the request; defaults to ``"gpt-4o"``.

    Returns:
        The decoded JSON reply on success. A blank note yields
        ``{"utterances": []}`` without contacting the API. Any failure
        (request error, malformed JSON, ...) yields ``{"error": <message>}``
        instead of raising, so one bad note cannot abort a batch.
    """
    # Blank notes carry nothing to attribute; skip the API round-trip rather
    # than asking the model to parse an empty message.
    if not conversation or not conversation.strip():
        return {"utterances": []}

    try:
        response = client.responses.create(
            model=model,
            input=[
                {"role": "system", "content": prompt_role},
                {"role": "user", "content": conversation}
            ],
            text=output_format,
            temperature=0.0
        )
        return json.loads(response.output_text)
    except Exception as e:
        # Deliberately broad: the failure is reported as data so the caller can
        # record it against this note and carry on with the remaining ones.
        return {"error": str(e)}

# Batch processing function
def process_conversations_batch(client, conversations: List[str], output_path: str):
    """Parse every conversation in order and dump the outcomes to a JSON file.

    Each note is passed to ``extract_utterances_from_conversation`` together
    with the module-level ``prompt_role`` and ``output_format``; a progress bar
    tracks the loop. The collected results (one per input, same order) are
    written to ``output_path`` as indented UTF-8 JSON and also returned.
    """
    parsed_results = [
        extract_utterances_from_conversation(client, note, prompt_role, output_format)
        for note in tqdm(conversations, desc="Processing conversations")
    ]

    # Persist the whole batch in one go, keeping non-ASCII text readable.
    with open(output_path, 'w', encoding='utf-8') as out_file:
        json.dump(parsed_results, out_file, ensure_ascii=False, indent=2)

    print(f"\n Saved {len(parsed_results)} processed results to: {output_path}")
    return parsed_results

In [None]:
# Read the training workbook and keep only its first 50 rows for this run.
df = pd.read_excel("Training.xlsx")
df_sample = df.head(50)

# Missing notes become empty strings so every entry handed to the model is text.
conversations = (
    df_sample["Match Support Contact Notes"]
    .fillna("")
    .astype(str)
    .tolist()
)

results = process_conversations_batch(
    client=client, conversations=conversations, output_path="./parsed_results.json"
)

# Flatten to one spreadsheet row per utterance. "Row" is the position of the
# source note within the sample; a note whose result has no `utterances` key
# contributes a single ERROR row carrying the recorded message.
flat_records = []
for row_number, parsed in enumerate(results):
    if "utterances" not in parsed:
        flat_records.append(
            {
                "Row": row_number,
                "Who": "ERROR",
                "What": parsed.get("error", "Unknown error"),
            }
        )
        continue
    flat_records.extend(
        {
            "Row": row_number,
            "Who": entry.get("who", ""),
            "What": entry.get("what", ""),
        }
        for entry in parsed["utterances"]
    )

# Save as Excel file
df_output = pd.DataFrame(flat_records)
df_output.to_excel("./parsed_results.xlsx", index=False)

Processing conversations: 100%|████████████████████████████████████████████████████████| 50/50 [01:41<00:00,  2.03s/it]


✅ Saved 50 processed results to: ./parsed_results.json



