In [7]:
import pandas as pd

txt_path = "task2_answer.txt"

df = pd.read_csv(txt_path, sep=r'\s{2,}|\t', engine='python', header=None)

df.columns = ["File ID", "SHI Type", "Start Offset", "End Offset", "Text"]

print(df.head())

   File ID SHI Type  Start Offset  End Offset          Text
0       23     DATE         9.779      10.219      tomorrow
1       23     DATE        11.759      12.100         today
2       23     DATE        15.901      16.282         today
3       23     DATE        17.382      17.842      tomorrow
4      121     TIME         2.333       2.773  this morning


In [8]:
def get_shi_spans(df, file_id):
    file_df = df[df["File ID"] == file_id]
    spans = []
    for _, row in file_df.iterrows():
        spans.append({
            "start": float(row["Start Offset"]),
            "end": float(row["End Offset"]),
            "label": row["SHI Type"]
        })
    return spans

file_ids = df["File ID"].unique()
spans_all = []
for file_id in file_ids:
    span = get_shi_spans(df, file_id)
    spans_all.append(span)

In [9]:
print(spans_all)

[[{'start': 9.779, 'end': 10.219, 'label': 'DATE'}, {'start': 11.759, 'end': 12.1, 'label': 'DATE'}, {'start': 15.901, 'end': 16.282, 'label': 'DATE'}, {'start': 17.382, 'end': 17.842, 'label': 'DATE'}], [{'start': 2.333, 'end': 2.773, 'label': 'TIME'}, {'start': 6.405, 'end': 6.825, 'label': 'TIME'}, {'start': 16.614, 'end': 17.034, 'label': 'TIME'}], [{'start': 6.844, 'end': 7.304, 'label': 'DURATION'}], [{'start': 9.434, 'end': 10.296, 'label': 'DURATION'}], [{'start': 2.863, 'end': 3.183, 'label': 'PERSONALNAME'}, {'start': 16.534, 'end': 16.796, 'label': 'PERSONALNAME'}], [{'start': 13.884, 'end': 14.265, 'label': 'DOCTOR'}], [{'start': 5.745, 'end': 6.027, 'label': 'DURATION'}], [{'start': 22.03, 'end': 22.432, 'label': 'DATE'}, {'start': 22.63, 'end': 23.111, 'label': 'DATE'}, {'start': 26.615, 'end': 27.111, 'label': 'DATE'}], [{'start': 0.779, 'end': 1.04, 'label': 'FAMILYNAME'}, {'start': 1.48, 'end': 1.781, 'label': 'FAMILYNAME'}, {'start': 2.5, 'end': 2.8, 'label': 'FAMILYN

In [1]:
with open("tag.txt", "r") as f:
    label_list = [line.strip() for line in f.readlines() if line.strip()]

label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for label, idx in label2id.items()}

print("Label List:", label_list)
print("label2id:", label2id)
print("id2label:", id2label)


Label List: ['PATIENT', 'DOCTOR', 'USERNAME', 'PERSONALNAME', 'FAMILYNAME', 'HOSPITAL', 'DEPARTMENT', 'ROOM', 'STREET', 'CITY', 'DISTRICT', 'COUNTY', 'STATE', 'COUNTRY', 'ZIP', 'ORGANIZATION', 'LOCATION-OTHER', 'AGE', 'DATE', 'TIME', 'DURATION', 'SET', 'CONTACT', 'PHONE', 'FAX', 'EMAIL', 'URL', 'IPADDRESS', 'SOCIAL_SECURITY_NUMBER', 'MEDICAL_RECORD_NUMBER', 'HEALTH_PLAN_NUMBER', 'ACCOUNT_NUMBER', 'LICENSE_NUMBER', 'VEHICLE_ID', 'DEVICE_ID', 'BIOMETRIC_ID', 'ID_NUMBER']
label2id: {'PATIENT': 0, 'DOCTOR': 1, 'USERNAME': 2, 'PERSONALNAME': 3, 'FAMILYNAME': 4, 'HOSPITAL': 5, 'DEPARTMENT': 6, 'ROOM': 7, 'STREET': 8, 'CITY': 9, 'DISTRICT': 10, 'COUNTY': 11, 'STATE': 12, 'COUNTRY': 13, 'ZIP': 14, 'ORGANIZATION': 15, 'LOCATION-OTHER': 16, 'AGE': 17, 'DATE': 18, 'TIME': 19, 'DURATION': 20, 'SET': 21, 'CONTACT': 22, 'PHONE': 23, 'FAX': 24, 'EMAIL': 25, 'URL': 26, 'IPADDRESS': 27, 'SOCIAL_SECURITY_NUMBER': 28, 'MEDICAL_RECORD_NUMBER': 29, 'HEALTH_PLAN_NUMBER': 30, 'ACCOUNT_NUMBER': 31, 'LICENSE_N

In [2]:
with open("tag.txt", "r") as f:
    base_labels = [line.strip() for line in f.readlines() if line.strip()]

label_list = ["O"]
for label in base_labels:
    label_list.append(f"B-{label}")
    label_list.append(f"I-{label}")

label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for label, idx in label2id.items()}

print("label2id:", label2id)
print("id2label:", id2label)

label2id: {'O': 0, 'B-PATIENT': 1, 'I-PATIENT': 2, 'B-DOCTOR': 3, 'I-DOCTOR': 4, 'B-USERNAME': 5, 'I-USERNAME': 6, 'B-PERSONALNAME': 7, 'I-PERSONALNAME': 8, 'B-FAMILYNAME': 9, 'I-FAMILYNAME': 10, 'B-HOSPITAL': 11, 'I-HOSPITAL': 12, 'B-DEPARTMENT': 13, 'I-DEPARTMENT': 14, 'B-ROOM': 15, 'I-ROOM': 16, 'B-STREET': 17, 'I-STREET': 18, 'B-CITY': 19, 'I-CITY': 20, 'B-DISTRICT': 21, 'I-DISTRICT': 22, 'B-COUNTY': 23, 'I-COUNTY': 24, 'B-STATE': 25, 'I-STATE': 26, 'B-COUNTRY': 27, 'I-COUNTRY': 28, 'B-ZIP': 29, 'I-ZIP': 30, 'B-ORGANIZATION': 31, 'I-ORGANIZATION': 32, 'B-LOCATION-OTHER': 33, 'I-LOCATION-OTHER': 34, 'B-AGE': 35, 'I-AGE': 36, 'B-DATE': 37, 'I-DATE': 38, 'B-TIME': 39, 'I-TIME': 40, 'B-DURATION': 41, 'I-DURATION': 42, 'B-SET': 43, 'I-SET': 44, 'B-CONTACT': 45, 'I-CONTACT': 46, 'B-PHONE': 47, 'I-PHONE': 48, 'B-FAX': 49, 'I-FAX': 50, 'B-EMAIL': 51, 'I-EMAIL': 52, 'B-URL': 53, 'I-URL': 54, 'B-IPADDRESS': 55, 'I-IPADDRESS': 56, 'B-SOCIAL_SECURITY_NUMBER': 57, 'I-SOCIAL_SECURITY_NUMBER': 58

In [3]:
def assign_bio_labels(whisper_words, shi_spans, label2id):
    tokens = []
    ner_tags = []
    previous_label = None

    for word in whisper_words:
        tokens.append(word["word"])
        tag = "O"

        for span in shi_spans:
            if span["start"] <= word["start"] < span["end"]:
                if previous_label != span["label"]:
                    tag = f"B-{span['label']}"
                else:
                    tag = f"I-{span['label']}"
                previous_label = span["label"]
                break
        else:
            previous_label = None

        ner_tags.append(label2id.get(tag, label2id["O"]))

    return {"tokens": tokens, "ner_tags": ner_tags}


In [4]:
from Speech2Text.speech2text import transcribe, transcribe_with_timestamps

def get_whisper_words(audioFile):
    segments = transcribe_with_timestamps(audioFile)
    flat_words = []
    for segment in segments:
        flat_words.extend(segment)
    return flat_words


SyntaxError: invalid syntax (2552295309.py, line 1)

In [10]:
training_data = []

for file_id in file_ids:
    audio_path = f"./audio_folder/{file_id}.wav"
    whisper_words = get_whisper_words(audio_path)
    shi_spans = get_shi_spans(df, file_id)
    example = assign_bio_labels(whisper_words, shi_spans, label2id)
    training_data.append(example)

training_data



[{'tokens': [' Yeah,',
   ' I',
   ' imagine',
   ' it',
   ' would.',
   ' Sorry,',
   ' go',
   ' ahead.',
   ' So',
   " it's",
   ' supposed',
   ' to',
   ' work',
   ' immediately,',
   ' right?',
   ' Yep.',
   ' So',
   " we'll",
   ' see',
   ' if',
   " I'm",
   ' productive',
   ' tomorrow.',
   ' I',
   ' hope',
   " I'm",
   ' productive',
   ' today.',
   " I've",
   ' actually',
   ' been',
   ' trying',
   ' to',
   ' plan.',
   ' If',
   ' I',
   ' do',
   ' the',
   ' titles',
   ' today,',
   ' then',
   ' I',
   ' can',
   ' do',
   ' my',
   ' laundry',
   ' tomorrow.',
   ' Right.',
   ' I',
   ' probably',
   ' could',
   ' bring',
   ' my',
   ' computer',
   ' and',
   ' do',
   ' titles',
   ' while',
   " I'm",
   ' doing',
   ' my',
   ' laundry.',
   ' If',
   ' I',
   ' was,',
   ' but',
   ' I',
   " won't",
   ' do',
   ' that.'],
  'ner_tags': [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,

In [11]:
import json

output_path = "training_data.json"

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(training_data, f, indent=2, ensure_ascii=False)

print(f"Saved to {output_path}")

Saved to training_data.json


In [13]:
!pip install datasets

Collecting datasets
  Using cached datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Using cached pyarrow-20.0.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Using cached datasets-3.6.0-py3-none-any.whl (491 kB)
Using cached dill-0.3.8-py3-none-any.whl (116 kB)
Using cached multiprocess-0.70.16-py311-none-any.whl (143 kB)
Using cached pyarrow-20.0.0-cp311-cp311-macosx_12_0_arm64.whl (30.9 MB)
Us

In [15]:
import json
from datasets import Dataset

with open("training_data.json", "r", encoding="utf-8") as f:
    training_data = json.load(f)

dataset = Dataset.from_list(training_data)


In [16]:
dataset = dataset.train_test_split(test_size=0.2)
train_dataset = dataset["train"]
val_dataset = dataset["test"]

In [17]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True
    )
    labels = []

    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)


Map: 100%|██████████| 311/311 [00:00<00:00, 4002.67 examples/s]
Map: 100%|██████████| 78/78 [00:00<00:00, 3546.60 examples/s]


In [18]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
!pip install --upgrade transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
[2K   [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m5.5/10.5 MB[0m [31m53.7 kB/s[0m eta [36m0:01:33[0m
[?25h
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[31mERROR: Exception:
Traceback (most recent call last):
  File "/Users/feliciarulita/miniconda3/lib/python3.11/site-packages/pip/_vendor/urllib3/response.py", line 438, in _error_catcher
    yield
  File "/Users/feliciarulita/miniconda3/lib/python3.11/site-packages/pip/_vendor/urllib3/response.py", line 561, in read
    data = self._fp_read(amt) if not fp_closed else b""
           ^^^^^^^^^^^^^^^^^^
  File "/Users/feliciarulita/miniconda3/lib/python

In [22]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./ner_shi_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs"
)


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'