In [1]:
import json
import pandas as pd
import re
import numpy as np
import os
from star_utils import query_llm_proxy, get_llm_proxy_api_token
# Obtain the proxy credential once; it is passed as `api_token` to
# query_llm_proxy in the re-annotation loop further down.
api_token = get_llm_proxy_api_token()

In [2]:
# Existing LLM annotations: one record per (user_idx, task_idx) pair stored
# under the "task_tool_mappings" key.
# JSON text is UTF-8; pin the encoding so the read does not depend on the
# platform's default locale encoding.
with open("task_tool_mapping_out.json", "r", encoding="utf-8") as f:
    data = json.load(f)
annotation = pd.DataFrame(data["task_tool_mappings"])

# Source user profiles in JSON-lines form; each row carries a `tasks` list
# and the `user_idx` that the annotations refer to.
tasks = pd.read_json("user_tasks.json", lines=True)

In [3]:
# --- Coverage check 1: is every user_idx in the observed min..max range present? ---
user_ids = annotation["user_idx"]
expected = set(range(user_ids.min(), user_ids.max() + 1))
present = set(user_ids.unique())
missing = expected.difference(present)
print("missing user_idx:", sorted(missing))

# --- Coverage check 2: does every user have exactly 10 distinct task_idx values? ---
counts = annotation.groupby("user_idx")["task_idx"].nunique()
bad = counts.loc[counts.ne(10)]
print(f"\nusers with wrong task count: {len(bad)}")
display(bad.head())

missing user_idx: []

users with wrong task count: 3


user_idx
1558    9
6626    6
9440    9
Name: task_idx, dtype: int64

In [15]:
# Show the stored annotations of one of the incomplete users.
annotation.loc[annotation["user_idx"].eq(1558)]

Unnamed: 0,user_idx,task_idx,task,tool_to_search
5180,1558,0,Troubleshoot internet connection issues for 10...,Confluence
5181,1558,1,Update knowledge base articles on streaming de...,Confluence
5182,1558,2,Respond to 25 customer inquiries via phone and...,Gmail
5183,1558,3,Monitor ticket volume and prioritize tasks for...,Jira
5184,1558,4,Implement new software update for IT system ma...,Confluence
5185,1558,5,Research alternative solutions for outdated ha...,Slack
5186,1558,6,Collaborate with IT team to resolve network co...,Confluence
5187,1558,7,Document troubleshooting steps for common brow...,Google Calendar
5188,1558,8,Schedule training session on cybersecurity bes...,Google Sheets


In [29]:
# Look up the source profiles (including the full task lists) of the three
# users whose annotations are incomplete.
incomplete_user_mask = tasks["user_idx"].isin([1558, 6626, 9440])
missing_rows = tasks.loc[incomplete_user_mask]
missing_rows

Unnamed: 0,name,employee_tenure,department,job_title,company_size,company_sector,tasks,user_idx
520,Benjamin,experienced,Customer Support / Service,Help Desk Technician,small,Media & Entertainment,[Troubleshoot internet connection issues for 1...,1558
5076,Sophia,experienced,Customer Support / Service,Support Agent,large,Manufacturing & Industrial,[Respond to 5 customer emails regarding delaye...,9440
6676,Liam,new,Marketing,Digital Advertising Coordinator,large,Financial Services & Banking,[Reviewing and optimizing ad campaigns for imp...,6626


In [34]:
# Tools the model may answer with. This mirrors the "ALLOWED TOOLS" list in
# PROMPT; keep the two in sync when either changes.
ALLOWED_TOOLS = [
    "Coda",
    "Confluence",
    "Jira",
    "Google Doc",
    "Slack",
    "Miro",
    "Google Slides",
    "Google Sheets",
    "Gmail",
    "GitHub",
]

# One entry per task: the single tool the employee would search first.
# The `enum` makes the schema itself reject tools outside the allowed list
# instead of relying on the prompt wording alone (the existing annotations
# contain at least one off-list value, e.g. "Google Calendar").
task_tool_mapping_item = {
    "type": "object",
    "properties": {
        "tool_to_search": {"type": "string", "enum": ALLOWED_TOOLS}
    },
    "required": ["tool_to_search"],
    "additionalProperties": False
}

# Response schema handed to query_llm_proxy: an object holding a non-empty
# array of per-task mappings, in the same order as the tasks in the prompt.
schema = {
    "name": "task_tool_mapping",
    "strict": True,
    "type": "object",
    "properties": {
        "task_tool_mappings": {
            "type": "array",
            "items": task_tool_mapping_item,
            "minItems": 1
        }
    },
    "required": ["task_tool_mappings"],
    "additionalProperties": False
}

# Prompt template for the tool-selection request.
# Single-brace fields ({employee_tenure}, {job_title}, {department},
# {company_size}, {company_sector}, {formatted_tasks}) are filled in by
# str.format in build_prompt; the doubled braces in the JSON example are
# format escapes that render as literal braces. The trailing .strip()
# removes the newline right after the opening quotes and before the closing
# ones.
PROMPT = r"""
You are an employee who must complete **10 distinct tasks**.
Before doing each task you will SEARCH for supporting resources
(templates, SOPs, past examples, documentation).

**Your job:**  
For **each task**, decide which **one** internal tool you would click FIRST
to find those resources.

────────────────────────────────────────────────────────
ROLE CONTEXT
  • Position: {employee_tenure} {job_title}  
  • Department: {department}  
  • Company:   {company_size}-size {company_sector} organisation
────────────────────────────────────────────────────────
ALLOWED TOOLS — choose from *this list only*:  
  Coda · Confluence · Jira · Google Doc · Slack · Miro ·  
  Google Slides · Google Sheets · Gmail · GitHub
────────────────────────────────────────────────────────
OUTPUT RULES  
  • Pick **exactly one** tool *from the list above* for every task.  
  • Do **NOT** invent tools or return multiple tools.  
  • Return **only** valid JSON that matches the schema below—no prose.

    Example:
    {{
      "task_tool_mappings": [
        {{ "tool_to_search": "Confluence" }},
        ...
      ]
    }}

TASKS  
{formatted_tasks}
""".strip()


def build_prompt(row):
    """Render PROMPT for a single user row.

    `row` must provide the role fields used by the template
    (employee_tenure, job_title, department, company_size, company_sector)
    and a `tasks` sequence. Each task is placed on its own line, prefixed
    with its position counted from 0.
    """
    role_fields = (
        "employee_tenure",
        "job_title",
        "department",
        "company_size",
        "company_sector",
    )
    numbered_lines = [f"{idx}. {task}" for idx, task in enumerate(row["tasks"])]
    template_values = {field: row[field] for field in role_fields}
    template_values["formatted_tasks"] = "\n".join(numbered_lines)
    return PROMPT.format(**template_values)

output_file = "missing_annotations.json"

# Seed the output file with an empty result set so the read-modify-write
# cycle in the loop always finds valid JSON.
if not os.path.exists(output_file):
    with open(output_file, 'w', encoding="utf-8") as f:
        json.dump({"task_tool_mappings": []}, f)

# Users whose re-annotation failed; reported once the loop is done.
failed_user_idx = []

for _, row in missing_rows.iterrows():
    user_idx = int(row["user_idx"])  # plain int keeps the record JSON-serialisable
    try:
        user_tasks = list(row["tasks"])

        response = query_llm_proxy(
            build_prompt(row),
            schema=schema,
            sleep_time=1,
            backend='openai_direct_chat_gpt4o',
            api_token=api_token
        )

        text_chunk = response.get('chunk', {}).get('text', '{}')
        parsed = json.loads(text_chunk) if isinstance(text_chunk, str) else text_chunk
        tool_rows = parsed.get("task_tool_mappings", [])

        # The answer is matched to tasks purely by position, so a short or
        # long answer would either leave tasks unannotated (the very problem
        # this cell repairs) or index past the task list. Reject it instead.
        if len(tool_rows) != len(user_tasks):
            raise ValueError(
                f"expected {len(user_tasks)} tool mappings, got {len(tool_rows)}"
            )

        fixed = [
            {
                "user_idx": user_idx,
                "task_idx": i,
                "task": task,
                "tool_to_search": tool_row["tool_to_search"]
            }
            for i, (task, tool_row) in enumerate(zip(user_tasks, tool_rows))
        ]

        with open(output_file, 'r', encoding="utf-8") as f:
            existing = json.load(f)

        # Replace (rather than append to) any earlier result for this user so
        # that re-running the cell does not accumulate duplicate rows.
        kept = [
            entry for entry in existing["task_tool_mappings"]
            if entry.get("user_idx") != user_idx
        ]
        existing["task_tool_mappings"] = kept + fixed

        # Written after every user so completed work survives a later failure.
        with open(output_file, 'w', encoding="utf-8") as f:
            json.dump(existing, f, indent=2)

        print(f"Completed user_idx={user_idx}")

    except Exception as e:
        # Best effort: keep going with the remaining users, but remember the
        # failure so the final message is accurate.
        failed_user_idx.append(user_idx)
        print(f"Error for user_idx={user_idx}: {e}")

if failed_user_idx:
    print(f"Finished with errors for user_idx: {failed_user_idx}")
else:
    print("All users processed successfully")

Completed user_idx=1558
Completed user_idx=9440
Completed user_idx=6626
All users processed successfully


In [35]:
# Read back the regenerated annotations written by the re-annotation loop
# and preview the first rows.
with open("missing_annotations.json", mode="r") as f:
    data = json.load(f)

missing_annotations = pd.DataFrame(data=data["task_tool_mappings"])
missing_annotations.head(5)

Unnamed: 0,user_idx,task_idx,task,tool_to_search
0,1558,0,Troubleshoot internet connection issues for 10...,Confluence
1,1558,1,Update knowledge base articles on streaming de...,Confluence
2,1558,2,Respond to 25 customer inquiries via phone and...,Gmail
3,1558,3,Monitor ticket volume and prioritize tasks for...,Jira
4,1558,4,Implement new software update for IT system ma...,Confluence


In [36]:
# Row count of the regenerated annotations (3 users x 10 tasks expected).
missing_annotations.shape[0]

30

In [38]:
# Replace the incomplete annotations with the regenerated ones.
# Only users that actually have regenerated rows are dropped, so a user
# whose re-annotation failed keeps the old rows instead of vanishing.
reannotated_user_idx = missing_annotations["user_idx"].unique()
annotation_kept = annotation[~annotation["user_idx"].isin(reannotated_user_idx)]

# ignore_index=True renumbers the result; without it the appended rows would
# reuse index labels 0..n-1 that already exist in `annotation`.
annotation = pd.concat([annotation_kept, missing_annotations], ignore_index=True)

In [39]:
# Re-run the coverage checks on the merged annotations.

# Which expected user_idx values are missing?
expected = set(range(annotation["user_idx"].min(), annotation["user_idx"].max() + 1))
present  = set(annotation["user_idx"].unique())
missing  = expected - present
print("missing user_idx:", sorted(missing))

# Verify each user_idx has exactly 10 task_idx values
counts = annotation.groupby("user_idx")["task_idx"].nunique()
bad    = counts[counts != 10]
print(f"\nusers with wrong task count: {len(bad)}")
display(bad.head())

# nunique() collapses repeated rows, so the check above still passes when a
# (user_idx, task_idx) pair occurs more than once (e.g. after merging the same
# re-annotations twice). Fail loudly in that case before the data is exported.
duplicate_pairs = annotation.duplicated(subset=["user_idx", "task_idx"])
assert not duplicate_pairs.any(), (
    f"{int(duplicate_pairs.sum())} duplicated (user_idx, task_idx) rows in annotation"
)

missing user_idx: []

users with wrong task count: 0


Series([], Name: task_idx, dtype: int64)

In [41]:
# Persist the repaired annotations as JSON lines (one record per line).
annotation.to_json(
    "annotation_1_out.json",
    orient="records",
    lines=True,
)

In [42]:
# Total number of annotation rows after the repair.
annotation.shape[0]

104560