In [None]:
import os
import re
import json
import pandas as pd
from openai import OpenAI

# OpenAI client; the API key is read from the OPENAI_API_KEY environment
# variable and falls back to an empty string when the variable is unset.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY", ""))

# File paths
# Base directory for the input and output files below
# (presumably the Colab working directory -- confirm).
version_dir = "/content"
# JSON file with per-file segment and word timestamps (loaded further down).
timestamp_file = os.path.join(version_dir, "task1_answer_timestamps.json")
# Destination of the aligned entity table written at the end of the script.
task2_output_file = os.path.join(version_dir, "task2_answer.txt")


# Prompt
def create_prompt(text: str) -> str:
    """Build the PHI/SHI entity-extraction prompt for one piece of transcript text.

    The prompt lists the allowed labels, gives an explanation and examples for
    each, adds rules for numeric entities and pronouns, and asks the model to
    answer with one ``LABEL: entity text`` line per entity.

    Args:
        text: Transcript text to embed at the end of the prompt, between the
            two ``---`` delimiter lines.

    Returns:
        The complete prompt string.

    NOTE(review): the instructions say to output entities "even if repeated",
    while the output-format section says "no duplicates" -- confirm which one
    is intended.
    NOTE(review): the FAMILYNAME examples "Ivan", "James" and "Emma" look like
    given names -- confirm they are the intended surname examples.
    """
    return f"""
You are a medical NER labeling assistant.

Your task is to extract **all words or phrases containing personal or sensitive health information (PHI/SHI)** from ENGLISH medical transcription text.

**Strictly use only the following labels, and always choose the most specific one.**
Do NOT skip any entity that matches below definitions. Output every possible entity, even if repeated or ambiguous.

LABELS:
AGE
DURATION
DATE
TIME
FAMILYNAME
PERSONALNAME
PATIENT
DOCTOR
PROFESSION
ORGANIZATION
HOSPITAL
DEPARTMENT
CITY
STATE
COUNTRY
COUNTY
DISTRICT
STREET
ZIP
LOCATION-OTHER
SET
ID_NUMBER
MEDICAL_RECORD_NUMBER
MEDICATION

### Label explanations & examples:
- AGE: Age/age range, e.g., "69", "six-year-old", "in her 40s", "a little younger", "adult"
- DURATION: Lengths/periods, e.g., "two hours", "15 minutes", "a week", "ages", "ten years"
- DATE: Dates, months, years, holidays, relative/explicit time, e.g., "Friday", "yesterday", "March 3, 1997", "2023", "today", "Easter", "weekend"
- TIME: Times of day, time expressions, e.g., "9:05 AM", "morning", "noontime", "tonight", "evening", "noon"
- FAMILYNAME: Surnames only, e.g., "Smith", "Ivan", "James", "Emma". **Do NOT include generic relationship terms ("mom", "dad", etc.)**
- PERSONALNAME: Given names/nicknames, e.g., "Emily", "Jess", "Kelly", "Sydney", "Vanessa", "Franco". **Do NOT include generic terms like "mom", "dad", "friend", "nurse", "sister", etc., or pronouns ("he", "she", "I", etc.).**
- PATIENT: Names, identifiers, or generic references to patients, e.g., "the patient", "this patient", "Elva Recidivi", "Jeremy case", "Susan Duren"
- DOCTOR: Doctor names/titles, e.g., "Dr. Smith", "Donald Jeremiah Burg", "Dr. Manion", "Dr. Taylor"
- PROFESSION: Occupations/titles (medical or non-medical), e.g., "chiropractor", "psychologist", "pastor", "academic", "researching", "GP", "president"
- ORGANIZATION: Organization/institution names, e.g., "Bank of America", "YMCA", "Boston Scientific", "Sealed Air Corporation"
- HOSPITAL: Hospital/clinic names, e.g., "Nambour General Hospital", "Ipswich Hospital"
- DEPARTMENT: Department names, e.g., "Tissue Pathology Department", "ICU", "Radiology Department"
- CITY: City/town names, e.g., "Kyabram", "Chicago"
- STATE: State/province/region names, e.g., "Ohio", "Victoria", "QLD"
- COUNTRY: Country names, e.g., "India", "Australia"
- COUNTY: County names, e.g., "Cheshire"
- DISTRICT: District names, e.g., "Greenwich"
- STREET: Street/road/avenue names, e.g., "Victor Street", "Walburgh Street"
- ZIP: Postal or ZIP codes, e.g., "5067", "8003"
- LOCATION-OTHER: Other location/place references not fitting above, e.g., "parking garage", "local", "doctor's office", "Chicago metro area"
- SET: Repeating or habitual time expressions, e.g., "every Monday", "once a week", "every day", "all week"
- ID_NUMBER: Any unique code, ID, or case number not covered above, e.g., "44B20748", "05T758305M"
- MEDICAL_RECORD_NUMBER: Unique medical record numbers, e.g., "558935.KOT", "7301067.DAE"
- MEDICATION: Drug/medication names, e.g., "Adderall", "Imipramine", "vitamin D", "Seroquel", "Welbutrin", "painkillers", "cocaine", "prescription"

---
**Special rules for numbers:**
- For any entity containing a number, always check context:
    - If it's a year, date, or record (e.g., "86" for 1986), label as DATE.
    - If it's a period/range ("10 minutes", "six years"), label as DURATION.
    - If it's an age ("7", "86-year-old", "adult"), label as AGE.
    - If it's a time ("9:05", "morning"), label as TIME.
    - If unsure, standalone numbers are usually DATE unless clearly age/duration/time.

**Do NOT mark pronouns ("I", "he", "she", "us", "them", etc.) as any entity.**
**Do NOT mark family/relationship words ("mom", "dad", "sister", "brother", etc.) as PERSONALNAME/FAMILYNAME.**

---
### Output format
LABEL: entity text
(one entity per line, no extra comments, no duplicates)

---

Now extract all entities from the following text:
---
{text}
---
"""










    # ...your original prompt, no changes needed
    # ...omitted, see the previous message

# 🔁 Call GPT
def call_gpt(text):
    """Send one transcript segment to the chat model and return its raw reply.

    Args:
        text: Transcript text to label; it is wrapped by ``create_prompt``.

    Returns:
        The reply text. An empty string is returned when ``text`` is empty or
        whitespace-only (no request is made), when the reply carries no
        content, or when the request raises.
    """
    # Nothing to label: skip the request rather than sending a prompt whose
    # text slot is empty.
    if not text or not text.strip():
        return ""
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": create_prompt(text)}],
            temperature=0,
        )
        # The reply content may be None; callers expect a string, so
        # normalise a missing reply to "".
        return response.choices[0].message.content or ""
    except Exception as e:
        # Deliberate best-effort: a failed segment is reported and skipped so
        # the remaining segments still get processed.
        print(f"[ERROR] GPT 呼叫失敗：{e}")
        return ""

# One reply line of the form "LABEL: entity text". The label starts with a
# word character and may contain hyphens (e.g. LOCATION-OTHER).
_ENTITY_LINE_RE = re.compile(r"(\w[\w-]*):\s*(.+)")


def extract_entities(text):
    """Parse ``LABEL: entity text`` lines out of a model reply.

    Args:
        text: Raw reply text; ``None`` or an empty string yields no entities.

    Returns:
        A list of ``(label, entity_text)`` tuples in the order they appear.
        Lines that do not start with ``LABEL:`` are ignored.
    """
    if not text:
        return []
    entities = []
    for line in text.strip().split("\n"):
        match = _ENTITY_LINE_RE.match(line.strip())
        if match:
            label, content = match.groups()
            entities.append((label.strip(), content.strip()))
    return entities

def normalize(text):
    """Lower-case ``text`` and drop every character that is not a-z or 0-9."""
    return re.sub(r"[^a-z0-9]", "", text.lower())


def fuzzy_match_entity(entity, word_segments):
    """Locate ``entity`` as a run of consecutive words and return its time span.

    Words are compared after ``normalize``, so case, spacing and punctuation
    differences between the entity text and the word list are ignored.

    Args:
        entity: Entity text to look for.
        word_segments: Sequence of dicts with ``"text"``, ``"start"`` and
            ``"end"`` keys, in spoken order.

    Returns:
        ``(start, end)`` taken from the first and last word of the earliest
        matching run, or ``(None, None)`` when there is no match or the
        entity contains no letters or digits.
    """
    target = normalize(entity)
    if not target:
        # An entity made only of punctuation would otherwise "match" any
        # punctuation-only word.
        return None, None

    # Normalize every word once instead of once per candidate window.
    tokens = [normalize(w["text"]) for w in word_segments]

    for i, first in enumerate(tokens):
        if not first:
            # Never start a span on a word that contributes no characters.
            continue
        joined = ""
        for j in range(i, len(tokens)):
            joined += tokens[j]
            if joined == target:
                return word_segments[i]["start"], word_segments[j]["end"]
            if not target.startswith(joined):
                # Adding more words can no longer produce the target.
                break
    return None, None

# Rows of [file_id, label, start, end, text] collected across all files.
output_data = []
# Pronouns that must never be emitted as PERSONALNAME (compared after normalize()).
INVALID_PERSONALNAMES = {"i", "he", "she", "him", "her", "you", "me", "they", "them", "we", "us"}

with open(timestamp_file, "r", encoding="utf-8") as f2:
    timestamps = json.load(f2)

for idx, (key, val) in enumerate(timestamps.items()):
    # Uncomment to limit a debug run to the first 20 entries:
    # if idx >= 20:
    #     break

    segments = val.get("segments", [])
    for seg_idx, seg in enumerate(segments):
        text = seg.get("text", "")
        print(f"[處理第 {idx} 筆 seg={seg_idx} → key={key}]")
        gpt_output = call_gpt(text)
        print(f"[GPT 回傳內容]\n{gpt_output}\n---")
        entities = extract_entities(gpt_output)
        # Word-level timestamps for this segment; entries without a word are dropped.
        word_segments = [{"text": w["word"], "start": w["start"], "end": w["end"]}
                         for w in seg.get("words", []) if w.get("word") is not None]

        has_output = False
        seen = set()
        for label, entity in entities:
            # Filter first, align afterwards: alignment is the expensive step
            # and its result is unused for duplicates and skipped pronouns.
            if (label, entity) in seen:
                continue
            seen.add((label, entity))
            if label == "PERSONALNAME" and normalize(entity) in INVALID_PERSONALNAMES:
                print(f"[SKIP] 排除代名詞：{entity}")
                continue
            start, end = fuzzy_match_entity(entity, word_segments)
            if start is not None:
                print(f"[✅ MATCHED] {entity} → {start:.2f}-{end:.2f}")
                output_data.append([key, label, f"{start:.2f}", f"{end:.2f}", entity])
                has_output = True
            else:
                print(f"[WARNING] 無法對齊：{entity}（經標準化後為：{normalize(entity)}）")
                # Optional fallback: also emit entities that could not be aligned.
                # output_data.append([key, label, "-1", "-1", entity])

        if not has_output:
            print(f"[❗NO OUTPUT] key={key} seg={seg_idx} 無可對齊實體")

# Write the aligned entities as a tab-separated file without header or index.
df = pd.DataFrame(output_data, columns=["file_id", "label", "start", "end", "text"])
print(f"✅ 共對齊成功 {len(output_data)} 筆")
print(f"[DEBUG] 前幾筆：\n{df.head()}")
print(f"[DEBUG] 寫入檔案路徑：{task2_output_file}")
df.to_csv(task2_output_file, sep="\t", index=False, header=False, lineterminator="\n")
print(f"✅ 輸出完成：{task2_output_file}")


[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m
[✅ MATCHED] Bruce Rock Memorial Hospital → 19.37-21.84
[✅ MATCHED] Dr. Lazaro Isha Linmuth → 24.46-27.29
[處理第 438 筆 seg=0 → key=71988]
[GPT 回傳內容]
PERSONALNAME: Bradley Beal
MEDICAL_RECORD_NUMBER: 1914059.bmt
DATE: July 10, 1990
STREET: East Como Street
CITY: Cooktown
STATE: Australian Capital Territory
ZIP: 3956
---
[✅ MATCHED] Bradley Beal → 2.12-3.06
[✅ MATCHED] 1914059.bmt → 7.31-11.88
[✅ MATCHED] July 10, 1990 → 13.02-14.54
[✅ MATCHED] East Como Street → 16.73-17.81
[✅ MATCHED] Cooktown → 18.17-18.69
[✅ MATCHED] Australian Capital Territory → 18.97-20.87
[✅ MATCHED] 3956 → 22.28-23.92
[處理第 439 筆 seg=0 → key=71995]
[GPT 回傳內容]
DATE: March 21, 2013
PERSONALNAME: Lily
TIME: 7.51 p.m.
DATE: May 24, 2063
DOCTOR: Dr. Randolph
DATE: 2047
---
[✅ MATCHED] March 21, 2013 → 0.03-1.23
[✅ MATCHED] 7.51 p.m. → 5.64-7.43
[✅ MATCHED] May 24, 2063 → 18.45-21.07
[✅ MATCHED] Dr. Randolph → 22.24-23.20
[✅ MATCHED] 2047 → 26.12-26.59
[處理第 440 筆 seg=0 → key=71997]
[GPT 