In [7]:
import pandas as pd

txt_path = "task2_answer.txt"

df = pd.read_csv(txt_path, sep=r'\s{2,}|\t', engine='python', header=None)

df.columns = ["File ID", "SHI Type", "Start Offset", "End Offset", "Text"]

print(df.head())

   File ID SHI Type  Start Offset  End Offset          Text
0       23     DATE         9.779      10.219      tomorrow
1       23     DATE        11.759      12.100         today
2       23     DATE        15.901      16.282         today
3       23     DATE        17.382      17.842      tomorrow
4      121     TIME         2.333       2.773  this morning


In [8]:
def get_shi_spans(df, file_id):
    file_df = df[df["File ID"] == file_id]
    spans = []
    for _, row in file_df.iterrows():
        spans.append({
            "start": float(row["Start Offset"]),
            "end": float(row["End Offset"]),
            "label": row["SHI Type"]
        })
    return spans

file_ids = df["File ID"].unique()
spans_all = []
for file_id in file_ids:
    span = get_shi_spans(df, file_id)
    spans_all.append(span)

In [9]:
print(spans_all)

[[{'start': 9.779, 'end': 10.219, 'label': 'DATE'}, {'start': 11.759, 'end': 12.1, 'label': 'DATE'}, {'start': 15.901, 'end': 16.282, 'label': 'DATE'}, {'start': 17.382, 'end': 17.842, 'label': 'DATE'}], [{'start': 2.333, 'end': 2.773, 'label': 'TIME'}, {'start': 6.405, 'end': 6.825, 'label': 'TIME'}, {'start': 16.614, 'end': 17.034, 'label': 'TIME'}], [{'start': 6.844, 'end': 7.304, 'label': 'DURATION'}], [{'start': 9.434, 'end': 10.296, 'label': 'DURATION'}], [{'start': 2.863, 'end': 3.183, 'label': 'PERSONALNAME'}, {'start': 16.534, 'end': 16.796, 'label': 'PERSONALNAME'}], [{'start': 13.884, 'end': 14.265, 'label': 'DOCTOR'}], [{'start': 5.745, 'end': 6.027, 'label': 'DURATION'}], [{'start': 22.03, 'end': 22.432, 'label': 'DATE'}, {'start': 22.63, 'end': 23.111, 'label': 'DATE'}, {'start': 26.615, 'end': 27.111, 'label': 'DATE'}], [{'start': 0.779, 'end': 1.04, 'label': 'FAMILYNAME'}, {'start': 1.48, 'end': 1.781, 'label': 'FAMILYNAME'}, {'start': 2.5, 'end': 2.8, 'label': 'FAMILYN

In [1]:
with open("tag.txt", "r") as f:
    label_list = [line.strip() for line in f.readlines() if line.strip()]

label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for label, idx in label2id.items()}

print("Label List:", label_list)
print("label2id:", label2id)
print("id2label:", id2label)


Label List: ['PATIENT', 'DOCTOR', 'USERNAME', 'PERSONALNAME', 'FAMILYNAME', 'HOSPITAL', 'DEPARTMENT', 'ROOM', 'STREET', 'CITY', 'DISTRICT', 'COUNTY', 'STATE', 'COUNTRY', 'ZIP', 'ORGANIZATION', 'LOCATION-OTHER', 'AGE', 'DATE', 'TIME', 'DURATION', 'SET', 'CONTACT', 'PHONE', 'FAX', 'EMAIL', 'URL', 'IPADDRESS', 'SOCIAL_SECURITY_NUMBER', 'MEDICAL_RECORD_NUMBER', 'HEALTH_PLAN_NUMBER', 'ACCOUNT_NUMBER', 'LICENSE_NUMBER', 'VEHICLE_ID', 'DEVICE_ID', 'BIOMETRIC_ID', 'ID_NUMBER']
label2id: {'PATIENT': 0, 'DOCTOR': 1, 'USERNAME': 2, 'PERSONALNAME': 3, 'FAMILYNAME': 4, 'HOSPITAL': 5, 'DEPARTMENT': 6, 'ROOM': 7, 'STREET': 8, 'CITY': 9, 'DISTRICT': 10, 'COUNTY': 11, 'STATE': 12, 'COUNTRY': 13, 'ZIP': 14, 'ORGANIZATION': 15, 'LOCATION-OTHER': 16, 'AGE': 17, 'DATE': 18, 'TIME': 19, 'DURATION': 20, 'SET': 21, 'CONTACT': 22, 'PHONE': 23, 'FAX': 24, 'EMAIL': 25, 'URL': 26, 'IPADDRESS': 27, 'SOCIAL_SECURITY_NUMBER': 28, 'MEDICAL_RECORD_NUMBER': 29, 'HEALTH_PLAN_NUMBER': 30, 'ACCOUNT_NUMBER': 31, 'LICENSE_N

In [2]:
with open("tag.txt", "r") as f:
    base_labels = [line.strip() for line in f.readlines() if line.strip()]

label_list = ["O"]
for label in base_labels:
    label_list.append(f"B-{label}")
    label_list.append(f"I-{label}")

label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for label, idx in label2id.items()}

print("label2id:", label2id)
print("id2label:", id2label)

label2id: {'O': 0, 'B-PATIENT': 1, 'I-PATIENT': 2, 'B-DOCTOR': 3, 'I-DOCTOR': 4, 'B-USERNAME': 5, 'I-USERNAME': 6, 'B-PERSONALNAME': 7, 'I-PERSONALNAME': 8, 'B-FAMILYNAME': 9, 'I-FAMILYNAME': 10, 'B-HOSPITAL': 11, 'I-HOSPITAL': 12, 'B-DEPARTMENT': 13, 'I-DEPARTMENT': 14, 'B-ROOM': 15, 'I-ROOM': 16, 'B-STREET': 17, 'I-STREET': 18, 'B-CITY': 19, 'I-CITY': 20, 'B-DISTRICT': 21, 'I-DISTRICT': 22, 'B-COUNTY': 23, 'I-COUNTY': 24, 'B-STATE': 25, 'I-STATE': 26, 'B-COUNTRY': 27, 'I-COUNTRY': 28, 'B-ZIP': 29, 'I-ZIP': 30, 'B-ORGANIZATION': 31, 'I-ORGANIZATION': 32, 'B-LOCATION-OTHER': 33, 'I-LOCATION-OTHER': 34, 'B-AGE': 35, 'I-AGE': 36, 'B-DATE': 37, 'I-DATE': 38, 'B-TIME': 39, 'I-TIME': 40, 'B-DURATION': 41, 'I-DURATION': 42, 'B-SET': 43, 'I-SET': 44, 'B-CONTACT': 45, 'I-CONTACT': 46, 'B-PHONE': 47, 'I-PHONE': 48, 'B-FAX': 49, 'I-FAX': 50, 'B-EMAIL': 51, 'I-EMAIL': 52, 'B-URL': 53, 'I-URL': 54, 'B-IPADDRESS': 55, 'I-IPADDRESS': 56, 'B-SOCIAL_SECURITY_NUMBER': 57, 'I-SOCIAL_SECURITY_NUMBER': 58

In [3]:
def assign_bio_labels(whisper_words, shi_spans, label2id):
    tokens = []
    ner_tags = []
    previous_label = None

    for word in whisper_words:
        tokens.append(word["word"])
        tag = "O"

        for span in shi_spans:
            if span["start"] <= word["start"] < span["end"]:
                if previous_label != span["label"]:
                    tag = f"B-{span['label']}"
                else:
                    tag = f"I-{span['label']}"
                previous_label = span["label"]
                break
        else:
            previous_label = None

        ner_tags.append(label2id.get(tag, label2id["O"]))

    return {"tokens": tokens, "ner_tags": ner_tags}


In [4]:
from Speech2Text.speech2text import transcribe, transcribe_with_timestamps

def get_whisper_words(audioFile):
    segments = transcribe_with_timestamps(audioFile)
    flat_words = []
    for segment in segments:
        flat_words.extend(segment)
    return flat_words


SyntaxError: invalid syntax (2552295309.py, line 1)

In [9]:
training_data = []

for file_id in file_ids:
    audio_path = f"./audio_folder/{file_id}.wav"
    whisper_words = get_whisper_words(audio_path)
    shi_spans = get_shi_spans(df, file_id)
    example = assign_bio_labels(whisper_words, shi_spans, label2id)
    training_data.append(example)

training_data



KeyboardInterrupt: 