# Detecting Filler Speech

In [0]:
%load_ext autoreload
%autoreload 1
%aimport data.adress
%aimport detectors.filler_speech.llm_agent
%aimport detectors.filler_speech.keyword_search

In [0]:
import sys
sys.path.append("..")
import numpy as np
import pandas as pd
import re
from joblib import Parallel, delayed
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

### Validation Data

In [0]:
from data.adress import load_CHAT_transcripts

# Keep only the columns this notebook works with.
columns_of_interest = ["Timestamp", "Speaker", "Transcript", "Transcript_clean", "Filler speech"]
data = load_CHAT_transcripts()[columns_of_interest]
data.head()

## Baseline: Keyword Search

In [0]:
import string
from detectors.filler_speech.keyword_search import FillerKeywordDetector

First, we look at the annotated fillers in the training dataset to help inform our list of keywords.

In [0]:
# Pull every "&"-prefixed token out of each training transcript, then list the distinct tokens.
annotated_fillers = data.loc["train", "Transcript"].apply(
    lambda transcript: re.findall(r"&(\w+)\s*", transcript)
)
np.unique(np.concatenate(annotated_fillers.values))

Then we define our keyword lists.

In [0]:
# Hesitation sounds.
filler_sounds = "ah eh er hm huh mm uh um".split()

# Single-word fillers.
filler_words = "like well so basically actually literally".split()

# Multi-word fillers; written out explicitly because each entry contains a space.
filler_phrases = ["you know", "i mean", "i guess", "you see"]

# Every lowercase ASCII letter as its own keyword.
# NOTE(review): this includes "a" and "i", which are also ordinary words — check how
# the detector matches keywords before trusting the "letters" configurations.
filler_letters = [*string.ascii_lowercase]

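Now we define a helper that runs the keyword detector with a given configuration and scores it against the annotations, then apply it to every configuration on the patient utterances in the training split.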

In [0]:
def run(cfg_name, cfg, pt_data):
    """Score one keyword-detector configuration against the annotated labels.

    Parameters
    ----------
    cfg_name : label for the configuration; returned unchanged as the first tuple item.
    cfg : iterable of positional arguments unpacked into ``FillerKeywordDetector``.
    pt_data : DataFrame providing the "Transcript_clean" and "Filler speech" columns.

    Returns
    -------
    tuple
        ``(cfg_name, precision, recall, f1, accuracy)``.
    """
    detector = FillerKeywordDetector(*cfg)
    y_true = pt_data["Filler speech"]

    # One detector result per utterance; an utterance counts as positive (1)
    # whenever the detector returned a non-empty result.
    detections = pt_data["Transcript_clean"].apply(detector.detect)
    y_pred = detections.apply(lambda hits: len(hits) != 0).astype(int)

    # Same metrics, same order as the returned tuple.
    metrics = (precision_score, recall_score, f1_score, accuracy_score)
    scores = [metric(y_true, y_pred) for metric in metrics]
    return (cfg_name, *scores)

In [0]:
# Keyword combinations that appear in more than one configuration.
sounds_words = filler_sounds + filler_words
sounds_words_phrases = sounds_words + filler_phrases
sounds_words_phrases_letters = sounds_words_phrases + filler_letters

# name -> positional arguments for FillerKeywordDetector.
# NOTE(review): the boolean presumably switches on non-word detection (it is True
# exactly for the "nonwords" configurations) — confirm against the detector signature.
configs = {
    "sounds": (filler_sounds, False),
    "words": (filler_words, False),
    "phrases": (filler_phrases, False),
    "letters": (filler_letters, False),
    "nonwords": ([], True),
    "sounds+words": (sounds_words, False),
    "sounds+phrases": (filler_sounds + filler_phrases, False),
    "sounds+nonwords": (filler_sounds, True),
    "sounds+words+phrases": (sounds_words_phrases, False),
    "sounds+words+letters": (sounds_words + filler_letters, False),
    "sounds+words+phrases+letters": (sounds_words_phrases_letters, False),
    "sounds+words+phrases+nonwords": (sounds_words_phrases, True),
    "sounds+words+phrases+letters+nonwords": (sounds_words_phrases_letters, True),
}

# Patient utterances from the training split only.
in_train_split = data.index.get_level_values("split") == "train"
pt_trn_idx = in_train_split & (data["Speaker"] == "Patient")
pt_trn_data = data.loc[pt_trn_idx]

# Evaluate every configuration in parallel worker processes.
results = Parallel(n_jobs=10)(
    delayed(run)(name, cfg, pt_trn_data) for name, cfg in tqdm(configs.items())
)

table = pd.DataFrame(results, columns=["config", "precision", "recall", "f1", "accuracy"])
table.to_csv("filler_trn_results.csv")

In [0]:
# Show the precision / recall / F1 / accuracy obtained by each keyword configuration.
table

## Prompt Engineering for the LLM-Based Detector

In [0]:
from detectors.filler_speech.llm_agent import FillerLLMDetector
from utils import llms

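Prompts: `prompt_v1` is given a single utterance, while `prompt_v2` is given a transcript and asks only about its last line.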

In [0]:
# v1: the "{}" placeholder receives a single utterance.
prompt_v1 = (
    'Identify all filler sounds (e.g., "uh"), words (e.g., "like"), or phrases (e.g., "you know") '
    'that indicate cognitive impairment in the following utterance:'
    '\n\n{}\n\n'
    'List each instance of filler as a bullet point in the order that they are spoken. '
    'Do not include any explanations. '
    'If a filler is repeated, list each occurrence in its own bullet point. '
    'If there are no fillers, return "None".'
)

# v2: the "{}" placeholder receives a transcript; only its last line is to be analysed.
prompt_v2 = (
    'Identify all filler sounds (e.g., "uh"), words (e.g., "like"), or phrases (e.g., "you know") '
    'that indicate cognitive impairment in the last line of the following transcript:'
    '\n\n{}\n\n'
    'List each instance of filler in the last line as a bullet point in the order that they are spoken. '
    'Do not include any explanations. '
    'If a filler is repeated, list each occurrence in its own bullet point. '
    'If there are no fillers, return "None".'
)

Detector

In [0]:
endpoint = "openai_gpt_4o"

# The API token is read from the notebook context exposed by `dbutils`.
notebook_ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
api_token = notebook_ctx.apiToken().get()

# Per-endpoint settings from `utils.llms`.
# NOTE(review): "rpm"/"tpm" are presumably request/token rate limits — confirm in utils.
llm_cfg = llms[endpoint]
d = FillerLLMDetector(endpoint, llm_cfg["model"], api_token, llm_cfg["rpm"], llm_cfg["tpm"])

In [0]:
# NOTE(review): this rebinds `d` to a detector built with no arguments, discarding the
# one configured with endpoint/model/token/rate limits in the previous cell — confirm
# which of the two is intended and drop the other.
d = FillerLLMDetector()

# Collect the detector output for every patient utterance, in row order.
# NOTE(review): unlike the keyword baseline, no split filter is applied here, so
# every row of `data` is visited — confirm that is intended.
outputs = []
for i, utt in data.iterrows():
    # Only rows whose Speaker is "Patient" are sent to the detector.
    if utt["Speaker"] == "Patient":
        outputs.append( d.detect(utt["Transcript_clean"]) )