# Detecting Filler Speech

TODO
- Run best config for baseline plus LLM critic post-processing
- Run LLM experiment

In [0]:
%load_ext autoreload
%autoreload 1
%aimport data.adress
%aimport data.observer
%aimport detectors.filler_speech.keyword_search
%aimport utils

In [0]:
import sys
sys.path.append("..")
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from openai import OpenAI
from pprint import pprint
from scipy.stats import ttest_ind, mannwhitneyu
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
# MLFlow
import mlflow
from mlflow.genai.scorers import Safety, scorer
from mlflow.entities import Feedback

### Load the Data

In [0]:
from data.adress import load_CHAT_transcripts, load_outcomes

In [0]:
# Load the ADReSS CHAT transcripts and keep only the columns this notebook uses.
transcript_columns = ["Timestamp", "Speaker", "Transcript", "Transcript_clean", "Filler speech"]
adress_trans = load_CHAT_transcripts()[transcript_columns]
adress_trans.head()

In [0]:
# Boolean row masks selecting patient utterances in the train and test splits.
is_patient_utt = adress_trans["Speaker"] == "Patient"
utt_split = adress_trans.index.get_level_values("split")
trn_pt_utt_idx = (utt_split == "train") & is_patient_utt
tst_pt_utt_idx = (utt_split == "test") & is_patient_utt

## Baseline: Keyword Search

In [0]:
from detectors.filler_speech.keyword_search import FillerKeywordDetector
from utils import create_custom_nlp

First, we look at the annotated fillers in the training dataset to help inform our list of keywords.

In [0]:
# Collect every "&"-prefixed token from the raw training transcripts of patients,
# then show the distinct set to inform the keyword lists below.
annotated_fillers = adress_trans.loc[trn_pt_utt_idx, "Transcript"].apply(
    lambda utterance: re.findall(r"&(\w+)\s*", utterance)
)
np.unique(np.concatenate(annotated_fillers.values))

Then we define our keyword lists.

In [0]:
import string

In [0]:
# Non-lexical hesitation sounds.
filler_sounds = ["ah", "eh", "er", "hm", "huh", "mm", "uh", "um"]

# Single words commonly used as verbal filler.
filler_words = ["like", "well", "so", "basically", "actually", "literally"]

# Multi-word filler expressions.
filler_phrases = ["you know", "i mean", "i guess", "you see"]

# Every lowercase ASCII letter as a stand-alone keyword.
filler_letters = [letter for letter in string.ascii_lowercase]

# Same as above minus "a", "i" and "o" (presumably because those occur as ordinary words).
filler_uncommonletters = [letter for letter in filler_letters if letter not in ("a", "i", "o")]

#### Explore different keyword sets

In [0]:
from joblib import Parallel, delayed
from tqdm import tqdm

In [0]:
# Keyword-detector configurations to sweep over.
# Each value is (keyword_list, nonword_flag) and is unpacked positionally into
# FillerKeywordDetector(nlp, keyword_list, nonword_flag). The flag is True exactly
# for the configurations whose name contains "nonwords".
configs = {
    "sounds": (filler_sounds, False),
    "letters": (filler_letters, False),
    "uncommonletters": (filler_uncommonletters, False),
    "words": (filler_words, False),
    "nonwords": ([], True),
    "phrases": (filler_phrases, False),
    "sounds+letters": (filler_sounds + filler_letters, False),
    "sounds+uncommonletters": (filler_sounds + filler_uncommonletters, False),
    "sounds+words": (filler_sounds + filler_words, False),
    "sounds+nonwords": (filler_sounds, True),
    "sounds+phrases": (filler_sounds + filler_phrases, False),
    "sounds+uncommonletters+words": (filler_sounds + filler_uncommonletters + filler_words, False),
    "sounds+uncommonletters+nonwords": (filler_sounds + filler_uncommonletters, True),
    # Fixed: the flag was True, which made this a duplicate of
    # "sounds+uncommonletters+nonwords+phrases".
    "sounds+uncommonletters+phrases": (filler_sounds + filler_uncommonletters + filler_phrases, False),
    # Fixed: this used filler_phrases, which made it a duplicate of
    # "sounds+nonwords+phrases".
    "sounds+words+nonwords": (filler_sounds + filler_words, True),
    "sounds+nonwords+phrases": (filler_sounds + filler_phrases, True),
    "sounds+uncommonletters+nonwords+words": (filler_sounds + filler_uncommonletters + filler_words, True), # new
    "sounds+uncommonletters+nonwords+phrases": (filler_sounds + filler_uncommonletters + filler_phrases, True), # new
    "sounds+letters+words+phrases+nonwords": (filler_sounds + filler_letters + filler_words + filler_phrases, True),
    "sounds+uncommonletters+words+nonwords+phrases": (filler_sounds + filler_uncommonletters + filler_words + filler_phrases, True),
}

In [0]:
def run(cfg_name, cfg, dataset):
    """Score one keyword-detector configuration on a set of utterances.

    Args:
        cfg_name: Label of the configuration; returned unchanged as the first element.
        cfg: Positional arguments passed to ``FillerKeywordDetector`` after the
            spaCy pipeline (in this notebook: keyword list and non-word flag).
        dataset: DataFrame with a "Transcript_clean" text column and a
            "Filler speech" label column.

    Returns:
        Tuple ``(cfg_name, precision, recall, f1, accuracy)``, where an utterance
        counts as positive when the detector returns at least one detection.
    """
    # A fresh spaCy pipeline per call, so the function is self-contained
    # when run in a worker process.
    detector = FillerKeywordDetector(create_custom_nlp(), *cfg)
    detections = dataset["Transcript_clean"].apply(detector.detect)

    # Utterance-level binary prediction: any detection at all -> 1.
    predicted = detections.apply(lambda spans: len(spans) > 0).astype(int)
    actual = dataset["Filler speech"]

    scores = [
        metric(actual, predicted)
        for metric in (precision_score, recall_score, f1_score, accuracy_score)
    ]
    return (cfg_name, *scores)

In [0]:
# Evaluate every configuration on the training-set patient utterances in parallel.
# Results arrive in completion order, not in the order of `configs`.
trn_pt_utts = adress_trans.loc[trn_pt_utt_idx]
sweep_jobs = (delayed(run)(cfg_name, cfg, trn_pt_utts) for cfg_name, cfg in configs.items())
results = list(
    tqdm(
        Parallel(n_jobs=10, return_as="generator_unordered")(sweep_jobs),
        total=len(configs),
    )
)

# One row per configuration; saved to disk for later reference.
table = pd.DataFrame(results, columns=["config", "precision", "recall", "f1", "accuracy"])
table.to_csv("filler_trn_results.csv")

In [0]:
# Rank the configurations by training-set recall (highest first), rounded to 3 decimals for display.
table.sort_values("recall", ascending=False).round(3)
# print(table.round(3).to_latex(index=False))

#### Evaluate best filler keyword detector configuration on the test data

In [0]:
# create custom spaCy
nlp = create_custom_nlp()
# init detector
keyword_detector_final = FillerKeywordDetector(nlp, filler_sounds + filler_uncommonletters, False)

# run on all data
outputs = adress_trans.apply(lambda x: keyword_detector_final.detect(x["Transcript_clean"]) if x["Speaker"] == "Patient" else [], axis=1)
adress_trans["baseline_output"] = outputs

# report performance on test data
true = adress_trans.loc[tst_pt_utt_idx, "Filler speech"]
pred = outputs.loc[tst_pt_utt_idx].apply(lambda x: len(x) > 0).astype(int)
print("TST PREC: %.3f" % (precision_score(true, pred)))
print("TST REC:  %.3f" % (recall_score(true, pred)))
print("TST F1:   %.3f" % (f1_score(true, pred)))
print("TST ACC:  %.3f" % (accuracy_score(true, pred)))

###### Investigate the failure cases

In [0]:
# Training-split patient utterances together with the baseline detections, for error analysis.
# Fixed: the mask is named `trn_pt_utt_idx`; `trn_pt_idx` is never defined in this notebook.
temp = adress_trans.loc[trn_pt_utt_idx].copy()
temp["output"] = outputs.loc[trn_pt_utt_idx]
# 1 when the detector returned at least one detection for the utterance, else 0.
temp["detected"] = temp["output"].apply(lambda x: len(x) > 0).astype(int)
temp

In [0]:
# Utterances labelled as having no filler speech for which the detector fired.
false_positive_mask = (temp["Filler speech"] == 0) & (temp["detected"] == 1)
with pd.option_context("display.max_rows", None, "display.max_colwidth", None):
    print("False positives:")
    print(temp.loc[false_positive_mask, ["Transcript_clean", "output"]])

In [0]:
# Utterances labelled as having filler speech for which the detector found nothing.
false_negative_mask = (temp["Filler speech"] == 1) & (temp["detected"] == 0)
with pd.option_context("display.max_rows", None, "display.max_colwidth", None):
    print("False negatives:")
    print(temp.loc[false_negative_mask, ["Transcript"]])

In [0]:
# Inspect how the custom pipeline tokenizes and flags a few word fragments.
doc = nlp("co th shor wai sau")
for token in doc:
    token_info = (
        token.text,
        token.lemma_,
        token.is_oov,
        token.is_punct,
        token.is_space,
        token._.is_silence_tag,
        token._.is_inaudible_tag,
        token._.is_event_tag,
    )
    print(*token_info)

###### (@Sriharsha) Try adding an LLM to postprocess
We are already achieving high performance with the keyword detector using filler sounds and uncommon letters. Can we improve performance by using an LLM to remove false positive detections?

*Experiment*: For each patient session, aggregate the utterances into a transcript and the filler detection lists (remember to offset them). Query an LLM to remove detections that are not actually filler. Try 3 different LLMs once you've settled on a good prompt. Also, try using MLFlow to log LLM outputs. It makes comparing the effect of different prompt versions a little easier to visualize.

In [0]:
from utils import llm_call

In [0]:
# Build an OpenAI-compatible client from the Databricks host credentials that
# MLflow resolves, pointed at the host's "/serving-endpoints" path.
mlflow_creds = mlflow.utils.databricks_utils.get_databricks_host_creds()
serving_endpoints_url = f"{mlflow_creds.host}/serving-endpoints"
client = OpenAI(api_key=mlflow_creds.token, base_url=serving_endpoints_url)

In [0]:
# Placeholder for the LLM-critic prompt. It is currently an empty string, so
# `prompt.format(text)` evaluates to "" until a prompt is written here.
prompt = ''''''

In [0]:
@scorer
def correct(expectations, outputs):
    """Utterance-level correctness scorer for MLflow evaluation.

    Args:
        expectations: Mapping holding the ground-truth flag under "has_filler".
        outputs: Mapping holding the predicted fillers (a sized collection)
            under "fillers".

    Returns:
        Feedback whose value is True when "at least one filler was predicted"
        equals the expected flag.
    """
    predicted_has_filler = len(outputs["fillers"]) > 0
    return Feedback(value=(expectations["has_filler"] == predicted_has_filler))

# Scorers applied to every evaluated row.
scorers = [Safety(), correct]

In [0]:
# NOTE(review): `data` is not defined anywhere in the visible notebook, so this cell
# raises NameError on a fresh kernel — presumably `adress_trans` was intended; confirm.
# NOTE(review): `outputs` is assigned above as the result of `adress_trans.apply(..., axis=1)`;
# indexing it with "output" looks like it targets a column that is not visible here
# (possibly the "output" column of the earlier `temp` frame) — confirm.
temp = data.loc["train"].copy()
# Drops the outermost index level so the values can align with `temp`'s index.
temp["filler_spans"] = outputs["output"].reset_index(level=0, drop=True)
temp

In [0]:
# Prefix each cleaned utterance with its speaker label ("<Speaker>: <text>").
# NOTE(review): `data` is not defined anywhere in the visible notebook — presumably
# `adress_trans` (or a per-session slice of it) was intended; confirm.
(data["Speaker"] + ": " + data["Transcript_clean"]).values

In [0]:
# Prediction function for MLflow: formats the prompt with the input text and calls
# `llm_call` (imported from utils) with the "openai_gpt_4o" endpoint name and a
# JSON-object response format.
# NOTE(review): `prompt` is assigned as an empty string above, so `prompt.format(text)`
# is "" unless `prompt` has been filled in — confirm before running.
fn = lambda text: llm_call(client, "openai_gpt_4o", None, prompt.format(text), {"type": "json_object"})

# NOTE(review): `trn_dataset` is not defined anywhere in the visible notebook; it must be
# built (with the fields the `correct` scorer reads) before this cell can run.
with mlflow.start_run(run_name="filler_LLMcritic_pv1") as run:
    result = mlflow.genai.evaluate(
        predict_fn=fn,
        data=trn_dataset,
        scorers=scorers
    )

## LLM-Based Detector

##### Explore different Prompts

In [0]:
# mlflow.openai.autolog()

# mlflow_creds = mlflow.utils.databricks_utils.get_databricks_host_creds()

# client = OpenAI(
#     api_key=mlflow_creds.token,
#     base_url=f"{mlflow_creds.host}/serving-endpoints"
# )

In [0]:
# Prompt v1: asks for a bullet list of fillers; the utterance is inserted at the
# `{}` placeholder via str.format.
prompt_v1 = '''Identify all filler sounds (e.g., \"uh\"), words (e.g., \"like\"), or phrases (e.g., \"you know\") that indicate cognitive impairment in the following utterance:

{}

List each instance of filler as a bullet point in the order that they are spoken. Do not include any explanations. If a filler is repeated, list each occurrence in its own bullet point. If there are no fillers, return \"None\".'''

# Prompt v2: asks for a JSON object with a "fillers" array of character spans.
# NOTE(review): this string has no `{}` placeholder, so `prompt_v2.format(text)` would
# return it unchanged without the utterance — the text presumably has to be passed to
# the model separately; confirm.
prompt_v2 = '''Your task is to identify all filler sounds (e.g., \"uh\"), words (e.g., \"like\"), or phrases (e.g., \"you know\") in text.

Your output must be a single JSON object with a single key "fillers" whose value is JSON array of objects. Each object in the array represents one detected filler and must have the following three keys:
- "filler_text": The exact filler text that was identified.
- "start_char": The starting character index of the filler in the utterance.
- "end_char": The index of the character *after* the last character of the filler.
'''

In [0]:
# Prediction function for MLflow: formats the prompt with the input text and calls
# `llm_call` with the "openai_gpt_4o" endpoint name and a JSON-object response format.
# NOTE(review): the run is named "..._pv2" but this formats `prompt` (assigned as an
# empty string earlier), not `prompt_v2` — confirm which prompt is intended.
fn = lambda text: llm_call(client, "openai_gpt_4o", None, prompt.format(text), {"type": "json_object"})

# NOTE(review): `trn_dataset` is not defined anywhere in the visible notebook.
with mlflow.start_run(run_name="llm_eval_gpt4o_pv2") as run:
    result = mlflow.genai.evaluate(
        predict_fn=fn,
        data=trn_dataset,
        scorers=scorers
    )

In [0]:
# TODO compute performance metrics

##### Correlation with outcome variables

In [0]:
# TODO run on test data and compute feature correlations

## Summary metrics for filler detections

In [0]:
# Load the ADReSS outcome table; later cells read its "split" index level and "AD_dx" column.
outcomes = load_outcomes()
outcomes.head()

In [0]:
# Index keys of the test-split rows: all of them, those with AD_dx == 1, and those with AD_dx == 0.
is_test_split = outcomes.index.get_level_values("split") == "test"
has_ad_dx = outcomes["AD_dx"] == 1
no_ad_dx = outcomes["AD_dx"] == 0

tst_pts = outcomes.loc[is_test_split].index.values
tst_ad_pts = outcomes.loc[is_test_split & has_ad_dx].index.values
tst_cn_pts = outcomes.loc[is_test_split & no_ad_dx].index.values

Filler Rate = 100 × (total number of detected fillers / total number of words spoken), i.e. the percentage of the patient's spoken words that are fillers

In [0]:
# filler rate
num = outputs.apply(len).groupby(level=("split", "ID")).sum()
den = adress_trans.apply(lambda x: sum([1 for token in nlp(x["Transcript_clean"]) if not (token.is_punct or token.is_space or token._.is_silence_tag or token._.is_inaudible_tag or token._.is_event_tag)]) if x["Speaker"] == "Patient" else 0, axis=1).groupby(level=("split", "ID")).sum()
filler_rates = 100 * num / den

In [0]:
# Distribution of the filler rate in the test split, AD group vs. control group.
fig, ax = plt.subplots()
ax.violinplot(
    (filler_rates.loc[tst_ad_pts], filler_rates.loc[tst_cn_pts]),
    showmedians=True,
)
ax.set_xticks([1, 2])
ax.set_xticklabels(["AD Group", "Control Group"])
ax.set_ylim([0, 100])
ax.set_ylabel("Filler Rate")
ax.grid()
plt.show()

Mean and standard deviation of the number of words between consecutive fillers

In [0]:
# mean/std number of words between fillers
num_words_btwn_fillers = pd.DataFrame(index=filler_rates.index, columns=["mean", "std", "mean_norm", "std_norm"], dtype=float)
for i, grp in adress_trans.groupby(level=("split", "ID")):
    pt_utts = grp.loc[grp["Speaker"] == "Patient", "Transcript_clean"]
    pt_trans = " ".join(pt_utts)
    pt_fillers = grp.loc[grp["Speaker"] == "Patient", "baseline_output"]
    # print(pt_utts)
    # print(pt_trans)
    # print(pt_fillers)

    ## get word-level character spans
    doc = nlp(pt_trans)
    # remove spaces, punctuation, and tags
    word_spans = [(token.idx, token.idx + len(token.text), token.text) for token in doc if not (token.is_punct or token.is_space or token._.is_silence_tag or token._.is_inaudible_tag or token._.is_event_tag)]
    # print(word_spans)
    # break

    ## offset character spans
    offset = 0
    for j in range(pt_fillers.shape[0] - 1):
        offset += len(pt_utts.loc[i].iloc[j]) + 1
        pt_fillers.loc[i].iloc[j+1] = [(span[0] + offset, span[1] + offset, span[2]) for span in pt_fillers.loc[i].iloc[j+1]]

    filler_spans = pt_fillers.sum()
    # print(filler_spans)
    # break

    ## compute metrics
    filler_word_idxs = [word_spans.index(span) for span in pt_fillers.sum()]
    dist_btwn_fillers = [filler_word_idxs[i] - filler_word_idxs[i-1] for i in range(1, len(filler_word_idxs))]
    num_words_btwn_fillers.loc[i, "mean"] = np.mean(dist_btwn_fillers) if len(dist_btwn_fillers) > 0 else len(word_spans)
    num_words_btwn_fillers.loc[i, "std"] = np.std(dist_btwn_fillers) if len(dist_btwn_fillers) > 0 else 0.0
    num_words_btwn_fillers.loc[i, "mean_norm"] = num_words_btwn_fillers.loc[i, "mean"] / len(word_spans)
    num_words_btwn_fillers.loc[i, "std_norm"] = num_words_btwn_fillers.loc[i, "std"] / len(word_spans)
    # break

In [0]:
# Distribution of the mean and std of words between fillers in the test split,
# AD group vs. control group.
plt.violinplot(
    (num_words_btwn_fillers.loc[tst_ad_pts, "mean"],
     num_words_btwn_fillers.loc[tst_cn_pts, "mean"],
     num_words_btwn_fillers.loc[tst_ad_pts, "std"],
     num_words_btwn_fillers.loc[tst_cn_pts, "std"]),
    showmedians=True,
)
plt.xticks(range(1, 5), ["Mean AD Group", "Mean Control Group", "Std AD Group", "Std Control Group"], rotation=45, ha="right")
# Fixed: the axis was labelled "Filler Rate", but this figure plots word counts between fillers.
plt.ylabel("Words Between Fillers")
plt.grid()
plt.show()
plt.close()

In [0]:
# Same comparison using the values normalized by each session's total word count.
plt.violinplot(
    (num_words_btwn_fillers.loc[tst_ad_pts, "mean_norm"],
     num_words_btwn_fillers.loc[tst_cn_pts, "mean_norm"],
     num_words_btwn_fillers.loc[tst_ad_pts, "std_norm"],
     num_words_btwn_fillers.loc[tst_cn_pts, "std_norm"]),
    showmedians=True,
)
plt.xticks(range(1, 5), ["Mean (norm) AD Group", "Mean (norm) Control Group", "Std (norm) AD Group", "Std (norm) Control Group"], rotation=45, ha="right")
# Fixed: the axis was labelled "Filler Rate", but this figure plots normalized word gaps.
plt.ylabel("Words Between Fillers (fraction of total words)")
plt.grid()
plt.show()
plt.close()

Analyze correlation between our metrics and the outcome variable

In [0]:
def _format_pvalue(pvalue):
    """Return the p-value rounded to 3 decimals, or "<0.001" when it is below 0.001."""
    return str(round(pvalue, 3)) if pvalue >= 0.001 else "<0.001"


# Per-session metrics to compare between the AD and control groups of the test split.
metrics_to_test = {
    "filler rate": filler_rates,
    "mean words between fillers": num_words_btwn_fillers["mean"],
    "std words between fillers": num_words_btwn_fillers["std"],
}

for metric, scores in metrics_to_test.items():
    ad_scores = scores.loc[tst_ad_pts]
    cn_scores = scores.loc[tst_cn_pts]

    # score averages
    mean_ad = ad_scores.mean()
    std_ad = ad_scores.std()
    mean_cn = cn_scores.mean()
    std_cn = cn_scores.std()

    # Group-difference tests.
    # Fixed: both tests compared the 0/1 diagnosis labels against the scores
    # (two different quantities); they now compare AD scores against control scores.
    res_ttest = ttest_ind(ad_scores, cn_scores)
    res_mannw = mannwhitneyu(ad_scores, cn_scores)
    # AUC of the raw score as a predictor of the diagnosis label.
    res_auc = roc_auc_score(outcomes.loc[tst_pts, "AD_dx"], scores.loc[tst_pts])

    # One LaTeX table row:
    # metric & AD mean (std) & CN mean (std) & t (p) & U (p) & AUC
    # Fixed: the Mann-Whitney column printed the t-test p-value.
    print("%s & %.2f (%.2f) & %.2f (%.2f) & %.2f (%s) & %.2f (%s) & %.2f \\\\" % (
        metric,
        mean_ad,
        std_ad,
        mean_cn,
        std_cn,
        res_ttest.statistic,
        _format_pvalue(res_ttest.pvalue),
        res_mannw.statistic,
        _format_pvalue(res_mannw.pvalue),
        res_auc,
    ))

## External Validation on Penn Data

In [0]:
from data.observer import load_penn_transcripts, load_penn_outcomes

In [0]:
# Load the Penn transcripts used for external validation; later cells read the
# "Speaker" and "Transcript" columns and the "visit_file" index level.
penn_trans = load_penn_transcripts()
penn_trans.head()
# TODO add zero padding to ids

In [0]:
# Preview zero-padded provider ids: drop the first two characters of each id, parse
# the rest as an integer, and re-emit it as "PR" + 3-digit zero-padded number.
# Fixed: `trans` is not defined in this notebook; the Penn transcripts are `penn_trans`.
penn_trans.index.get_level_values("provider_id").map(lambda x: f"PR{int(str(x)[2:]):03d}")

In [0]:
# Load the Penn outcome table; later cells look up its "AD_dx" column by visit file.
penn_outcomes = load_penn_outcomes()
penn_outcomes.head()

In [0]:
# run on data
outputs = penn_trans.apply(lambda x: keyword_detector_final.detect(x["Transcript"]) if x["Speaker"] == "Patient" else [], axis=1)
# compute filler rates
num = outputs.apply(len).groupby(level="visit_file").sum()
den = penn_trans.apply(lambda x: len(re.findall(r'\w+', x["Transcript"])) if x["Speaker"] == "Patient" else 0, axis=1).groupby(level="visit_file").sum()
filler_rates = 100 * num / den

In [0]:
# Two-sample t-test of the filler rate between AD and non-AD visits.
# Fixed: the original passed the diagnosis labels themselves as one sample, comparing
# labels against rates instead of comparing the rates of the two groups.
# Assumes "AD_dx" is coded 1 (AD) / 0 (control), as in the ADReSS outcomes — TODO confirm.
penn_ad_dx = penn_outcomes.loc[filler_rates.index, "AD_dx"]
ttest_ind(filler_rates[(penn_ad_dx == 1).values], filler_rates[(penn_ad_dx == 0).values])

In [0]:
# Mann-Whitney U test of the filler rate between AD and non-AD visits.
# Fixed: the original passed the diagnosis labels themselves as one sample, comparing
# labels against rates instead of comparing the rates of the two groups.
# Assumes "AD_dx" is coded 1 (AD) / 0 (control), as in the ADReSS outcomes — TODO confirm.
penn_ad_dx = penn_outcomes.loc[filler_rates.index, "AD_dx"]
mannwhitneyu(filler_rates[(penn_ad_dx == 1).values], filler_rates[(penn_ad_dx == 0).values])