# Detecting Filler Speech

In [0]:
%load_ext autoreload
%autoreload 1
%aimport data.adress
%aimport detectors.filler_speech.llm_agent
%aimport detectors.filler_speech.keyword_search

In [0]:
import sys
sys.path.append("..")
import numpy as np
import pandas as pd
import re
from pprint import pprint

### Validation Data

In [0]:
from data.adress import load_CHAT_transcripts

# Load the transcripts and keep only the columns this notebook uses.
data = load_CHAT_transcripts()[
    [
        "Timestamp",
        "Speaker",
        "Transcript",
        "Transcript_clean",
        "Filler speech",
    ]
]
data.head()

In [0]:
# Show the dimensions of the column-filtered transcript table.
data.shape

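Generate word-level labels: one 0/1 label per whitespace-separated token of the clean transcript, where 1 marks the token at which an annotated filler (`&...` in the raw transcript) begins.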

In [0]:
def generate_label_array(x):
    """Build a 0/1 label per token of the clean transcript for one row.

    Fillers are read from ``x["Transcript"]`` as the word characters that
    follow an ``&`` marker; underscores inside a filler stand for spaces, so
    a filler may span several tokens. The fillers are matched strictly in
    annotation order against the whitespace tokens of
    ``x["Transcript_clean"]``.

    Args:
        x: Mapping-like row providing the string fields ``"Transcript"``
            and ``"Transcript_clean"``.

    Returns:
        Integer numpy array with one entry per clean token. The entry is 1 at
        the token where a filler match starts and 0 everywhere else; the
        remaining tokens of a multi-word filler are left at 0.
    """
    annotated = re.findall(r"&(\w+)", x["Transcript"])
    words = x["Transcript_clean"].split()
    labels = np.zeros(len(words), dtype=int)

    next_filler = 0  # position of the next not-yet-matched filler
    pos = 0
    while pos < len(words):
        if next_filler < len(annotated):
            phrase = annotated[next_filler].replace("_", " ")
            span = len(phrase.split())
            if " ".join(words[pos:pos + span]) == phrase:
                # Flag the first token only, then jump past the whole phrase.
                labels[pos] = 1
                next_filler += 1
                pos += span
                continue

        pos += 1

    return labels

In [0]:
# Pass every row (axis=1) of `data` through generate_label_array.
true_lbls = data.apply(generate_label_array, axis=1)

In [0]:
# Display the generated labels for inspection.
true_lbls

## Prompt Engineering for the LLM-Based Detector

In [0]:
from detectors.filler_speech.llm_agent import FillerLLMDetector
from utils import llms

Prompts

In [0]:
# Prompt for a single utterance; `{}` is the placeholder for the utterance text.
prompt_v1 = (
    'Identify all filler sounds (e.g., "uh"), words (e.g., "like"), '
    'or phrases (e.g., "you know") that indicate cognitive impairment '
    'in the following utterance:\n\n{}\n\n'
    'List each instance of filler as a bullet point in the order that they are spoken. '
    'Do not include any explanations. '
    'If a filler is repeated, list each occurrence in its own bullet point. '
    'If there are no fillers, return "None".'
)

# Prompt for a multi-line transcript; only the last line is to be analysed.
prompt_v2 = (
    'Identify all filler sounds (e.g., "uh"), words (e.g., "like"), '
    'or phrases (e.g., "you know") that indicate cognitive impairment '
    'in the last line of the following transcript:\n\n{}\n\n'
    'List each instance of filler in the last line as a bullet point in the order that they are spoken. '
    'Do not include any explanations. '
    'If a filler is repeated, list each occurrence in its own bullet point. '
    'If there are no fillers, return "None".'
)

Detector

In [0]:
# Name of the LLM endpoint; also the key used to look up its settings in `llms`.
endpoint = "openai_gpt_4o"

# API token taken from the Databricks notebook context.
api_token = (
    dbutils.notebook.entry_point
    .getDbutils()
    .notebook()
    .getContext()
    .apiToken()
    .get()
)

# `rpm` / `tpm` are presumably requests- and tokens-per-minute limits — TODO confirm.
d = FillerLLMDetector(
    endpoint,
    llms[endpoint]["model"],
    api_token,
    llms[endpoint]["rpm"],
    llms[endpoint]["tpm"],
)

In [0]:
# NOTE(review): this replaces any previously configured detector with one built
# from the constructor's defaults — confirm that is intended.
d = FillerLLMDetector()

# Only patient utterances are scored.
patient_utts = data.loc[data["Speaker"] == "Patient", "Transcript_clean"]

# Key each detection by its row index: the analysis cells look results up by
# index (`data.index.map(outputs)`) and iterate `outputs.values()`, neither of
# which works on a plain list.
outputs = {idx: d.detect(text) for idx, text in patient_utts.items()}

## Comparative Approach: Keyword Search

In [0]:
import string
from detectors.filler_speech.keyword_search import FillerKeywordDetector

First, we look at the annotated fillers in the training dataset to help inform our list of keywords.

In [0]:
# Raw (annotated) transcripts of the "train" partition of the index.
train_transcripts = data.loc[("train",), "Transcript"]

# One list of `&`-marked fillers per transcript.
train_fillers = train_transcripts.apply(lambda x: re.findall(r"&(\w+)", x))

# Flatten the per-transcript lists and show the distinct fillers.
np.unique(np.concatenate(train_fillers.values))

In [0]:
# Non-lexical hesitation sounds.
filler_sounds = [
    "um",
    "uh",
    "er",
    "hm",
    "mm",
    "ah",
    "huh",
]

# Single words commonly used as fillers.
filler_words = [
    "like",
    "well",
    "so",
    "basically",
    "actually",
    "literally",
]

# Multi-word filler expressions.
filler_phrases = [
    "you know",
    "i mean",
    "i guess",
    "you see",
]

# Every lowercase ASCII letter is added as a one-letter keyword.
# `string.ascii_lowercase` is a str, so it is converted to a list of characters
# first; concatenating a list with a str raises TypeError.
# NOTE(review): this also makes ordinary words such as "a" and "i" keywords —
# confirm that is intended.
keywords = filler_sounds + filler_words + filler_phrases + list(string.ascii_lowercase)

Now we run the detector.

In [0]:
d = FillerKeywordDetector(keywords)

# Only patient utterances are scored.
patient_utts = data.loc[data["Speaker"] == "Patient", "Transcript_clean"]

# Key each detection by its row index: the analysis cells look results up by
# index (`data.index.map(outputs)`) and iterate `outputs.values()`, neither of
# which works on a plain list.
outputs = {idx: d.detect(text) for idx, text in patient_utts.items()}

## Analysis

In [0]:
# Attach each row's detector output by looking its index label up in `outputs`.
# NOTE(review): Index.map expects a callable, dict or Series, so `outputs` has to
# be keyed by row index rather than being a plain list — confirm the detector
# loop builds it that way.
data["output"] = data.index.map(outputs)

In [0]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Ground-truth labels for patient utterances only.
is_patient = data["Speaker"] == "Patient"
true = data.loc[is_patient, "Filler speech"].values

# A detector output equal to the string "None" counts as 0; anything else counts as 1.
pred = np.array([0 if out == "None" else 1 for out in outputs.values()])

print("ACC: %.2f" % (100 * accuracy_score(true, pred)))
print("F1:  %.2f" % (100 * f1_score(true, pred)))

# Rows of the confusion matrix are true classes, columns are predicted classes.
mtx = confusion_matrix(true, pred)
tp, fn = mtx[1, 1], mtx[1, 0]
tn, fp = mtx[0, 0], mtx[0, 1]

print("TPR: %.2f" % (100 * (tp / (tp + fn))))
print("TNR: %.2f" % (100 * (tn / (tn + fp))))
print("FPR: %.2f" % (100 * (fp / (fp + tn))))
print("FNR: %.2f" % (100 * (fn / (fn + tp))))
print("PRE: %.2f" % (100 * (tp / (tp + fp))))