# Detecting Vague Speech

**TODO**:
- Generator + LLM Critic experiment

In [0]:
# Autoreload mode 1: only modules registered with %aimport are reloaded before
# each cell runs, so edits to the project code below are picked up without a
# kernel restart.
# NOTE(review): %aimport imports its module right away, but `sys.path.append("..")`
# only runs in the next cell -- confirm these packages are importable from the
# notebook's working directory on a fresh kernel.
%load_ext autoreload
%autoreload 1
%aimport data.adress
%aimport detectors.common_detectors.keyword_detector
%aimport utils

In [0]:
import sys
sys.path.append("..")
import pandas as pd
from pprint import pprint
# Evaluation
import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
# Generative AI
from openai import OpenAI
from utils import llm_call
import mlflow
from mlflow.genai.scorers import Safety, scorer
from mlflow.entities import Feedback

## Load the Data

In [0]:
from data.adress import load_transcripts

In [0]:
# Load the transcripts and keep only the columns used in this notebook.
# "Vague" is the only annotation/label column retained by this selection.
adress_trans = load_transcripts()
adress_trans = adress_trans[["Timestamp", "Speaker", "Transcript", "Transcript_clean", "Vague"]]
adress_trans.head()

In [0]:
# Boolean row masks selecting the patient utterances of each data split
# (the split name is read from the "split" index level).
trn_pt_utt_idx, dev_pt_utt_idx, tst_pt_utt_idx = (
    (adress_trans.index.get_level_values("split") == split_name) & (adress_trans["Speaker"] == "Patient")
    for split_name in ("train", "dev", "test")
)

## Baseline: Keyword Search

In [0]:
from detectors.common_detectors.keyword_detector import KeywordDetector as VagueKeywordDetector
from utils import create_custom_nlp

First, we look at the annotated vague speech in the train dataset to help inform our list of keywords.

In [0]:
# Print the cleaned text of every training-split patient utterance annotated as
# vague ("Vague" == 1), showing up to 100 characters per utterance.
with pd.option_context("display.max_colwidth", 100):
    print(adress_trans.loc[trn_pt_utt_idx & (adress_trans["Vague"] == 1), "Transcript_clean"])

Then we define our keyword lists.

In [0]:
# Non-specific references: pronouns, placeholders and generic nouns.
# Every entry must be followed by a comma. Adjacent string literals are
# concatenated by Python, which previously merged "ones" and "part" into the
# single (useless) keyword "onespart".
non_specific_refs = [
    "anybody",
    "anyone",
    "anything",
    "area",
    "everything",
    "here",
    "it",
    "nothing",
    "one",
    "ones",
    "part",
    "place",
    "people",
    "person",
    "that",
    "there",
    "thing",
    "things",
    "this",
    "someone",
    "somebody",
    "something",
    "stuff",
    "whatever",
    "whichever",
]

# Hedging words and multi-word phrases, all written in lowercase.
# NOTE(review): presumably the detector matches case-insensitively and handles
# multi-token phrases such as "i don't know" -- confirm against KeywordDetector.
hedges = [
    "basically",
    "maybe", 
    "probably", 
    "possibly",
    "potentially",
    "perhaps",
    "somewhat",
    "i guess", 
    "i think",
    "kind of",
    "pretty much",
    "sort of",
    "i don't know",
    "more or less",
]

# vague_phrases = [
#     "and all", 
#     "a little", 
#     "and stuff", 
#     "all that",
#     "and things", 
#     # "i think",
#     # "looks like",
#     "or something", 
#     # "pretty much",
#     "sort of"
# ]

#### Explore different keyword sets

In [0]:
from joblib import Parallel, delayed
from tqdm import tqdm

In [0]:
# configs = {
#     "non_specific_nouns": (non_specific_nouns,),
#     "general_nouns": (general_nouns,),
#     "hedges": (hedges,),
#     "vague_phrases": (vague_phrases,),
#     "non_specific_nouns+general_nouns": (non_specific_nouns + general_nouns,),
#     "non_specific_nouns+hedges": (non_specific_nouns + hedges,),
#     "non_specific_nouns+vague_phrases": (non_specific_nouns + vague_phrases,),
#     "general_nouns+vague_phrases": (general_nouns + vague_phrases,),
#     "general_nouns+hedges": (general_nouns + hedges,),
#     "hedges+vague_phrases": (hedges + vague_phrases,),
#     "non_specific_nouns+general_nouns+hedges": (non_specific_nouns + general_nouns + hedges,),
#     "non_specific_nouns+general_nouns+vague_phrases": (non_specific_nouns + general_nouns + vague_phrases,),
#     "general_nouns+hedges+vague_phrases": (general_nouns + hedges + vague_phrases,),
#     "non_specific_nouns+general_nouns+hedges+vague_phrases": (non_specific_nouns + general_nouns + hedges + vague_phrases,)
# }

# Keyword configurations to compare. Each value is a tuple of positional
# arguments (here: a single keyword list) that is unpacked into the detector.
configs = dict([
    ("non_specific_refs", (non_specific_refs,)),
    ("hedges", (hedges,)),
    ("non_specific_refs+hedges", (non_specific_refs + hedges,)),
])

In [0]:
def run(cfg_name, cfg, dataset):
    """Score one keyword configuration on a set of utterances.

    Args:
        cfg_name: Label of the configuration; returned unchanged as the first element.
        cfg: Iterable of positional arguments passed to the detector after the
            spaCy pipeline (in this notebook: a one-element tuple holding a keyword list).
        dataset: DataFrame with a "Transcript_clean" text column and a binary
            "Vague" label column.

    Returns:
        Tuple ``(cfg_name, precision, recall, f1, accuracy)``. An utterance counts
        as predicted-vague when the detector reports at least one detection.
    """
    # A fresh pipeline and detector are built on every call.
    detector = VagueKeywordDetector(create_custom_nlp(), *cfg)
    detections = dataset["Transcript_clean"].apply(detector.detect)

    y_true = dataset["Vague"]
    y_pred = detections.apply(lambda out: len(out["detections"]) > 0).astype(int)

    scores = [
        metric(y_true, y_pred)
        for metric in (precision_score, recall_score, f1_score, accuracy_score)
    ]
    return (cfg_name, *scores)

In [0]:
# Evaluate every keyword configuration on the training-split patient utterances
# using up to 10 parallel jobs. With return_as="generator_unordered" results are
# yielded as they complete, so the row order of `table` may differ between runs.
results = Parallel(n_jobs=10, return_as="generator_unordered")(delayed(run)(cfg_name, configs[cfg_name], adress_trans.loc[trn_pt_utt_idx]) for cfg_name in configs)
# Consuming the generator drives the computation; tqdm shows progress per finished config.
results = [res for res in tqdm(results, total=len(configs))]

# One row per configuration; persisted to CSV (the default integer index is written too).
table = pd.DataFrame(results, columns=["config", "precision", "recall", "f1", "accuracy"])
table.to_csv("vague_trn_results.csv")

In [0]:
# Emit the metrics table (rounded to 3 decimals) as LaTeX source.
# table.sort_values("f1", ascending=False).round(3)
print(table.round(3).to_latex(index=False))

#### Evaluate best vague keyword detector configuration

In [0]:
# Create the custom spaCy pipeline.
nlp = create_custom_nlp()
# Initialise the final detector with the non-specific-reference keywords only.
keyword_detector_final = VagueKeywordDetector(nlp, non_specific_refs)
# Run on all patient utterances (every split); rows from other speakers get pd.NA.
adress_trans["keyword_dets"] = adress_trans.apply(lambda x: keyword_detector_final.detect(x["Transcript_clean"]) if x["Speaker"] == "Patient" else pd.NA, axis=1)

###### Performance on the test data.

In [0]:
def evaluate(true, pred):
    """Format binary-classification metrics as the cells of a LaTeX table row.

    Args:
        true: Ground-truth binary labels.
        pred: Predicted binary labels.

    Returns:
        A string of the form ``"P & R & F1 & Acc \\\\"`` with precision, recall,
        F1 and accuracy (in that order), each printed with three decimals.
    """
    metric_fns = (precision_score, recall_score, f1_score, accuracy_score)
    cells = " & ".join(f"{metric(true, pred):.3f}" for metric in metric_fns)
    return cells + " \\\\"

In [0]:
# Score the final keyword detector on the test-split patient utterances:
# an utterance is predicted vague when it has at least one detection.
true = adress_trans.loc[tst_pt_utt_idx, "Vague"]
pred = adress_trans.loc[tst_pt_utt_idx, "keyword_dets"].apply(lambda x: len(x["detections"]) > 0).astype(int)
print(evaluate(true, pred))

## LLM-Based Detector

##### Explore different Prompts

In [0]:
# Resolve the Databricks host and token through MLflow and point the OpenAI
# client at the workspace's "/serving-endpoints" base URL.
mlflow_creds = mlflow.utils.databricks_utils.get_databricks_host_creds()

client = OpenAI(
    api_key=mlflow_creds.token,
    base_url=f"{mlflow_creds.host}/serving-endpoints"
)

In [0]:
# Prompt template; the utterance text is inserted with `prompt.format(text)`.
# NOTE(review): the template is empty here, so `prompt.format(text)` yields an
# empty string -- fill in the prompt text before running the evaluations.
prompt = ''''''

In [0]:
# Build one MLflow evaluation record per training-split patient utterance.
# The ground-truth label is read from the "Vague" column: it is the only label
# column kept when `adress_trans` is subset after loading, so `row["Filler"]`
# raised a KeyError. The expectation key stays "has_filler" because the
# `correct` scorer and `process_mlflow_outputs` look the label up under that name.
trn_dataset = []
for idx, row in adress_trans.loc[trn_pt_utt_idx].iterrows():
    trn_dataset.append({
        # The index tuple is unpacked positionally into the three identifiers below.
        "split": idx[0],
        "ID": idx[1],
        "utt_num": idx[2],
        "inputs": {"text": row["Transcript_clean"]},
        "expectations": {"has_filler": bool(row["Vague"])}
    })

In [0]:
@scorer
def correct(expectations, outputs):
    """Scorer: does the prediction agree with the expected label?

    Args:
        expectations: Mapping holding the ground-truth flag under "has_filler".
        outputs: Model output; must hold a "detections" collection.

    Returns:
        Feedback whose value is True when the expected flag equals
        "at least one detection was returned", else False.
    """
    expected = expectations["has_filler"]
    predicted = len(outputs["detections"]) > 0
    return Feedback(value=(expected == predicted))

In [0]:
# Prediction function: format the prompt with the utterance text and request a
# JSON-object response from the "openai_gpt_4o" endpoint.
# NOTE(review): the parameter is named `text`, presumably so MLflow can pass the
# record's "inputs" dict ({"text": ...}) as keyword arguments -- confirm.
fn = lambda text: llm_call(client, "openai_gpt_4o", None, prompt.format(text), {"type": "json_object"})

# Evaluate on the training records and log the results under a named MLflow run.
with mlflow.start_run(run_name="llm_trn_gpt4o_pV7_2") as run:
    gpt_dets_trn = mlflow.genai.evaluate(
        predict_fn=fn,
        data=trn_dataset,
        scorers=[correct]
    )

In [0]:
def process_mlflow_outputs(result):
    """Turn an MLflow evaluation result into binary predictions and labels.

    Args:
        result: Object exposing ``result.tables["eval_results"]``, a DataFrame with
            a "response" column (model output per record) and an "assessments"
            column (list of mappings with "name" and "value" keys per record).

    Returns:
        A new DataFrame with the "response" and "assessments" columns plus:
        - "pred": 1 when the response is a dict with a non-empty "detections"
          entry, otherwise 0 (non-dict responses count as "no detection").
        - "label": integer value of the first assessment named "has_filler".

    Raises:
        IndexError: if a record has no assessment named "has_filler".
    """
    # Take an explicit copy: adding columns to a column-subset selection can
    # trigger pandas' SettingWithCopyWarning.
    outputs = result.tables["eval_results"][["response", "assessments"]].copy()
    outputs["pred"] = outputs["response"].apply(
        lambda x: isinstance(x, dict) and len(x["detections"]) > 0
    ).astype(int)
    outputs["label"] = outputs["assessments"].apply(
        lambda x: [a["value"] for a in x if a["name"] == "has_filler"][0]
    ).astype(int)
    return outputs

In [0]:
# Precision / recall / F1 / accuracy of GPT-4o on the training records, as a LaTeX row.
outputs = process_mlflow_outputs(gpt_dets_trn)
print(evaluate(outputs["label"], outputs["pred"]))

##### Compare performance of different LLMs

In [0]:
# Same prompt and training records as the GPT-4o run, but served by the
# "meta_llama_v3_1_8b_instruct" endpoint. Note that `fn` is rebound here.
fn = lambda text: llm_call(client, "meta_llama_v3_1_8b_instruct", None, prompt.format(text), {"type": "json_object"})

with mlflow.start_run(run_name="llm_eval_llama3_1_8b_pV7_2") as run:
    llama_dets_trn = mlflow.genai.evaluate(
        predict_fn=fn,
        data=trn_dataset,
        scorers=[correct]
    )

In [0]:
# Precision / recall / F1 / accuracy of Llama 3.1 8B on the training records, as a LaTeX row.
outputs = process_mlflow_outputs(llama_dets_trn)
print(evaluate(outputs["label"], outputs["pred"]))

Llama performs extremely poorly with this prompt.

##### Evaluate the best prompt on the test data

In [0]:
# Build one MLflow evaluation record per test-split patient utterance.
# The ground-truth label is read from the "Vague" column: it is the only label
# column kept when `adress_trans` is subset after loading, so `row["Filler"]`
# raised a KeyError. The expectation key stays "has_filler" because the
# `correct` scorer and `process_mlflow_outputs` look the label up under that name.
tst_dataset = []
for idx, row in adress_trans.loc[tst_pt_utt_idx].iterrows():
    tst_dataset.append({
        # The index tuple is unpacked positionally into the three identifiers below.
        "split": idx[0],
        "ID": idx[1],
        "utt_num": idx[2],
        "inputs": {"text": row["Transcript_clean"]},
        "expectations": {"has_filler": bool(row["Vague"])}
    })

In [0]:
# Evaluate the chosen prompt with the "openai_gpt_4o" endpoint on the test records.
fn = lambda text: llm_call(client, "openai_gpt_4o", None, prompt.format(text), {"type": "json_object"})

with mlflow.start_run(run_name="gpt_eval_tst") as run:
    gpt_dets_tst = mlflow.genai.evaluate(
        predict_fn=fn,
        data=tst_dataset,
        scorers=[correct]
    )

In [0]:
# Precision / recall / F1 / accuracy of GPT-4o on the test records, as a LaTeX row.
outputs = process_mlflow_outputs(gpt_dets_tst)
print(evaluate(outputs["label"], outputs["pred"]))

## (@Sriharsha) Try using an LLM critic to post-process keyword detections.
We are already achieving high performance with the keyword detector using filler sounds and uncommon letters. Can we improve performance by using an LLM to remove false positive detections?

*Experiment*: For each patient session, aggregate the utterances into a transcript and the filler detection lists (remember to offset them). Query an LLM to remove detections that are not actually filler. Try 3 different LLMs once you've settled on a good prompt. Also, try using MLFlow to log LLM outputs. It makes comparing the effect of different prompt versions a little easier to visualize.

##### Investigate the failure cases of prior methods.

Keyword search detector

In [0]:
# Keyword-detector false positives on the dev split: patient utterances annotated
# as not vague ("Vague" == 0) that still received at least one detection.
# "Vague" is the only label column kept in `adress_trans`; filtering on "Filler"
# raised a KeyError. Non-dict entries (pd.NA for other speakers) count as "no detection".
with pd.option_context("display.max_rows", None, "display.max_colwidth", None):
    print("False positives:")
    pprint(adress_trans.loc[dev_pt_utt_idx & (adress_trans["Vague"] == 0) & (adress_trans["keyword_dets"].apply(lambda x: isinstance(x, dict) and len(x["detections"]) > 0)), ["Transcript", "keyword_dets"]].values)

In [0]:
# Keyword-detector false negatives on the dev split: patient utterances annotated
# as vague ("Vague" == 1) for which the detector returned no detection.
# "Vague" is the only label column kept in `adress_trans`; filtering on "Filler"
# raised a KeyError. Non-dict entries (pd.NA for other speakers) are excluded.
with pd.option_context("display.max_rows", None, "display.max_colwidth", None):
    print("False negatives:")
    pprint(adress_trans.loc[dev_pt_utt_idx & (adress_trans["Vague"] == 1) & (adress_trans["keyword_dets"].apply(lambda x: isinstance(x, dict) and len(x["detections"]) == 0)), ["Transcript", "keyword_dets"]].values)

LLM-based detector

In [0]:
# LLM-detector false positives on the dev split: patient utterances annotated as
# not vague ("Vague" == 0) that still received at least one detection.
# "Vague" is the only label column kept in `adress_trans`; filtering on "Filler"
# raised a KeyError.
# NOTE(review): no visible cell assigns adress_trans["llm_dets"]; the LLM outputs
# must be written to that column before this cell can run.
with pd.option_context("display.max_rows", None, "display.max_colwidth", None):
    print("False positives:")
    pprint(adress_trans.loc[dev_pt_utt_idx & (adress_trans["Vague"] == 0) & (adress_trans["llm_dets"].apply(lambda x: isinstance(x, dict) and len(x["detections"]) > 0)), ["Transcript", "llm_dets"]].values)

In [0]:
# LLM-detector false negatives on the dev split: patient utterances annotated as
# vague ("Vague" == 1) for which the LLM returned no detection.
# "Vague" is the only label column kept in `adress_trans`; filtering on "Filler"
# raised a KeyError.
# NOTE(review): no visible cell assigns adress_trans["llm_dets"]; the LLM outputs
# must be written to that column before this cell can run.
with pd.option_context("display.max_rows", None, "display.max_colwidth", None):
    print("False negatives:")
    pprint(adress_trans.loc[dev_pt_utt_idx & (adress_trans["Vague"] == 1) & (adress_trans["llm_dets"].apply(lambda x: isinstance(x, dict) and len(x["detections"]) == 0)), ["Transcript", "llm_dets"]].values)

##### Explore different prompts.

In [0]:
# @Sriharsha Have the llm output a json object with the key "fillers" whose value is a list of json objects for each detection. Each detection object must have a "text" key whose value is the filler sound/word/phrase and a "span" key whose value is a list of the start and end character index of the filler in the utterance.
# NOTE(review): the `correct` scorer reads outputs["detections"], not "fillers" --
# confirm which key the critic is expected to return.

# Placeholder template: it contains a single positional "{}" field and nothing else.
critic_prompt = '''{}'''

In [0]:
# Evaluation records for the critic experiments. All three lists are empty
# placeholders here and must be populated before the evaluation cells run.
dataset_kw = []
dataset_llm = []
dataset = []

In [0]:
# Critic prediction function taking a transcript and its detections.
# NOTE(review): `critic_prompt` currently holds a single "{}" field, so
# `str.format` inserts only `transcript` and silently ignores `detections`.
fn = lambda transcript, detections: llm_call(client, "openai_gpt_4o", None, critic_prompt.format(transcript, detections), {"type": "json_object"})

# NOTE(review): `dataset` is an empty list at this point -- populate it first.
with mlflow.start_run(run_name="Keywords + Critic pv0") as run:
    result = mlflow.genai.evaluate(
        predict_fn=fn,
        data=dataset,
        scorers=[correct]
    )

##### Evaluate the best prompt

In [0]:
# Run the critic prompt over the keyword-detector records (`dataset_kw`).
# NOTE(review): the run is named "LLM + Critic pv0" although it evaluates
# `dataset_kw`, and the next cell reuses the same run name for `dataset_llm` --
# confirm whether this one should be "Keywords + Critic pv0".
fn = lambda text: llm_call(client, "openai_gpt_4o", None, critic_prompt.format(text), {"type": "json_object"})

with mlflow.start_run(run_name="LLM + Critic pv0") as run:
    result = mlflow.genai.evaluate(
        predict_fn=fn,
        data=dataset_kw,
        scorers=[correct]
    )

In [0]:
# Run the critic prompt over the LLM-detector records (`dataset_llm`).
# `result` is rebound here, replacing the value from the previous evaluation cell.
fn = lambda text: llm_call(client, "openai_gpt_4o", None, critic_prompt.format(text), {"type": "json_object"})

with mlflow.start_run(run_name="LLM + Critic pv0") as run:
    result = mlflow.genai.evaluate(
        predict_fn=fn,
        data=dataset_llm,
        scorers=[correct]
    )

## Summary metrics for vague speech detections

In [0]:
from data.adress import load_outcomes

In [0]:
# Load the outcomes table; the cells below use its "split" index level and "AD_dx" column.
outcomes = load_outcomes()
outcomes.head()

In [0]:
# Index values of the training-split rows of `outcomes`: all rows, rows with
# AD_dx == 1 (AD group) and rows with AD_dx == 0 (control group).
trn_pts = outcomes.loc[(outcomes.index.get_level_values("split") == "train")].index.values
trn_ad_pts = outcomes.loc[(outcomes.index.get_level_values("split") == "train") & (outcomes["AD_dx"] == 1)].index.values
trn_cn_pts = outcomes.loc[(outcomes.index.get_level_values("split") == "train") & (outcomes["AD_dx"] == 0)].index.values

In [0]:
# Index values of the test-split rows of `outcomes`: all rows, rows with
# AD_dx == 1 (AD group) and rows with AD_dx == 0 (control group).
tst_pts = outcomes.loc[(outcomes.index.get_level_values("split") == "test")].index.values
tst_ad_pts = outcomes.loc[(outcomes.index.get_level_values("split") == "test") & (outcomes["AD_dx"] == 1)].index.values
tst_cn_pts = outcomes.loc[(outcomes.index.get_level_values("split") == "test") & (outcomes["AD_dx"] == 0)].index.values

**Vague Term Rate** = Total number of detected vague terms / Total number of words spoken

In [0]:
def compute_vague_term_rate(outputs):
    """Vague terms per 100 spoken patient tokens, per ("split", "ID") group.

    Args:
        outputs: Series of detector outputs indexed like `adress_trans`; each
            entry is either missing or a mapping with a "detections" list.

    Returns:
        Series indexed by ("split", "ID"): 100 * detections / counted tokens.

    Reads the notebook-level `adress_trans` frame and `nlp` pipeline.
    """
    def _n_detections(det):
        # Missing entries contribute no detections.
        return 0 if pd.isna(det) else len(det["detections"])

    def _n_spoken_tokens(row):
        # Only patient speech is counted; other speakers are never tokenised.
        if not row["Speaker"] == "Patient":
            return 0
        # Skip punctuation, whitespace and the silence / inaudible / event tags
        # (custom token extensions, presumably registered by create_custom_nlp).
        return sum(
            1
            for token in nlp(row["Transcript_clean"])
            if not (
                token.is_punct
                or token.is_space
                or token._.is_silence_tag
                or token._.is_inaudible_tag
                or token._.is_event_tag
            )
        )

    num = outputs.apply(_n_detections).groupby(level=("split", "ID")).sum()
    den = adress_trans.apply(_n_spoken_tokens, axis=1).groupby(level=("split", "ID")).sum()
    return 100 * num / den

In [0]:
# Vague term rate of the keyword detector; assignment aligns on the index of `outcomes`.
outcomes["kw_vague_term_rate"] = compute_vague_term_rate(adress_trans["keyword_dets"])

**Vague Utterance Ratio**: Number of utterances containing vague terms / Total number of patient utterances

In [0]:
def vague_utterance_ratio(output_name):
    """Percentage of patient utterances with at least one detection, per ("split", "ID") group.

    Args:
        output_name: Name of the `adress_trans` column holding detector outputs
            (missing values, or mappings with a "detections" list).

    Returns:
        Series indexed by ("split", "ID"): 100 * flagged utterances / patient utterances.

    Reads the notebook-level `adress_trans` frame.
    """
    def _is_flagged(det):
        # Missing entries count as "not flagged".
        if pd.isna(det):
            return 0
        return int(len(det["detections"]) > 0)

    flagged_utts = adress_trans[output_name].apply(_is_flagged).groupby(level=("split", "ID")).sum()
    patient_utts = (adress_trans["Speaker"] == "Patient").groupby(level=("split", "ID")).sum()
    return 100 * flagged_utts / patient_utts

In [0]:
# Vague utterance ratio of the keyword detector; assignment aligns on the index of `outcomes`.
outcomes["kw_vague_utt_ratio"] = vague_utterance_ratio("keyword_dets")

Test for statistically significant differences in feature values between the AD and non-AD groups.

In [0]:
# Generates the rows of Table 10 and 11
def analysis(split_idx, split_ad, split_cn, metrics):
    """Print one LaTeX table row per metric comparing the AD and control groups.

    Each row has the form
    ``metric & mean_AD (std_AD) & mean_CN (std_CN) & t (p) & U (p) & AUC \\\\``.

    Args:
        split_idx: Index labels of all rows of `outcomes` in the split (used for the AUC).
        split_ad: Index labels of the AD rows in the split.
        split_cn: Index labels of the control rows in the split.
        metrics: Names of the `outcomes` columns to analyse.

    Reads the notebook-level `outcomes` frame. Returns None; rows are printed.
    """
    def _fmt_pvalue(p):
        # p-values below 0.001 are reported as "<0.001" (LaTeX math-mode "<").
        return str(round(p, 3)) if p >= 0.001 else "$<$0.001"

    for m in metrics:
        ad_vals = outcomes.loc[split_ad, m]
        cn_vals = outcomes.loc[split_cn, m]

        ## score averages
        mean_ad = ad_vals.mean()
        std_ad = ad_vals.std()
        mean_cn = cn_vals.mean()
        std_cn = cn_vals.std()

        ## group-difference tests: compare the metric values of the AD group with
        ## those of the control group (not the 0/1 labels with the metric values)
        res_ttest = ttest_ind(ad_vals, cn_vals)
        res_mannw = mannwhitneyu(ad_vals, cn_vals)
        ## discrimination: diagnosis as the label, metric value as the score
        res_auc   = roc_auc_score(outcomes.loc[split_idx, "AD_dx"], outcomes.loc[split_idx, m])

        print("%s & %.2f (%.2f) & %.2f (%.2f) & %.2f (%s) & %.2f (%s) & %.2f \\\\" %
                (m,
                mean_ad,
                std_ad,
                mean_cn,
                std_cn,
                res_ttest.statistic,
                _fmt_pvalue(res_ttest.pvalue),
                res_mannw.statistic,
                # report the Mann-Whitney p-value next to the U statistic
                _fmt_pvalue(res_mannw.pvalue),
                res_auc)
        )

In [0]:
# Columns of `outcomes` to include in the group-difference analysis.
mets = [
    "kw_vague_term_rate",
    "kw_vague_utt_ratio", 
]

In [0]:
# Table rows for the training split.
analysis(trn_pts, trn_ad_pts, trn_cn_pts, mets)

In [0]:
# Table rows for the test split.
analysis(tst_pts, tst_ad_pts, tst_cn_pts, mets)

In [0]:
# Save the two keyword-based features (with the `outcomes` index) to CSV.
outcomes[["kw_vague_term_rate", "kw_vague_utt_ratio"]].to_csv("vague_feats.csv")