# Detecting Vague Speech

**TODO**:
- Generator + LLM Critic experiment

In [0]:
# Autoreload mode 1: only the modules registered with %aimport below are
# re-imported before each cell runs, so edits to them are picked up live.
# NOTE(review): sys.path is only extended with ".." in the next cell; if these
# project modules live in the parent directory, confirm that this cell still
# works on a fresh kernel.
%load_ext autoreload
%autoreload 1
%aimport data.adress
%aimport detectors.common_detectors.keyword_detector
%aimport utils

In [0]:
import sys
sys.path.append("..")
import numpy as np
import pandas as pd
import pickle
from pprint import pprint
# Evaluation
from utils import evaluate
import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu
# Generative AI
import json
from openai import OpenAI
from utils import llm_call
import mlflow
from mlflow.genai.scorers import Safety, scorer
from mlflow.entities import Feedback

## Load the Data

In [0]:
from data.adress import load_transcripts

In [0]:
# Load all utterances and keep only the columns this notebook uses.
# .copy() makes the column subset an independent frame, so the columns added
# to it later (e.g. "keyword_dets") do not raise pandas' SettingWithCopyWarning.
adress_trans = load_transcripts()[["Timestamp", "Speaker", "Transcript", "Transcript_clean", "Vague"]].copy()
adress_trans.head()

In [0]:
# Boolean masks selecting the patient utterances of each data split.
_split_level = adress_trans.index.get_level_values("split")
_is_patient_utt = adress_trans["Speaker"] == "Patient"

trn_pt_utt_idx = (_split_level == "train") & _is_patient_utt
dev_pt_utt_idx = (_split_level == "dev") & _is_patient_utt
tst_pt_utt_idx = (_split_level == "test") & _is_patient_utt

## Baseline: Keyword Search

In [0]:
from detectors.common_detectors.keyword_detector import KeywordDetector as VagueKeywordDetector
from utils import create_custom_nlp

First, we look at the annotated vague speech in the train dataset to help inform our list of keywords.

In [0]:
# Print the cleaned text of every training patient utterance annotated as vague.
_vague_trn_mask = trn_pt_utt_idx & (adress_trans["Vague"] == 1)
with pd.option_context("display.max_colwidth", 100):
    print(adress_trans.loc[_vague_trn_mask, "Transcript_clean"])

Then we define our keyword lists.

In [0]:
# Non-specific reference words (generic pronouns, placeholders and locatives)
# used as keywords for vague speech.
# Every entry needs its own trailing comma: without one, Python joins adjacent
# string literals (previously "ones" and "part" collapsed into "onespart").
non_specific_refs = [
    "anybody",
    "anyone",
    "anything",
    "area",
    "everything",
    "here",
    "it",
    "nothing",
    "one",
    "ones",
    "part",
    "place",
    "people",
    "person",
    "that",
    "there",
    "thing",
    "things",
    "this",
    "someone",
    "somebody",
    "something",
    "stuff",
    "whatever",
    "whichever",
]

# Hedging words and multi-word expressions used as the second keyword set.
# All entries are lower-case; several are phrases rather than single tokens.
hedges = [
    "basically",
    "maybe", 
    "probably", 
    "possibly",
    "potentially",
    "perhaps",
    "somewhat",
    "i guess", 
    "i think",
    "kind of",
    "pretty much",
    "sort of",
    "i don't know",
    "more or less",
]

#### Explore different keyword sets

In [0]:
from joblib import Parallel, delayed
from tqdm import tqdm

In [0]:
# Keyword sets to compare. Each config is a 1-tuple because it is unpacked as
# the positional arguments that follow `nlp` when the detector is constructed.
_keyword_sets = {
    "non_specific_refs": non_specific_refs,
    "hedges": hedges,
    "non_specific_refs+hedges": non_specific_refs + hedges,
}
configs = {cfg_name: (keywords,) for cfg_name, keywords in _keyword_sets.items()}

In [0]:
def run(cfg_name, cfg, dataset):
    """Score one keyword configuration on ``dataset``.

    A fresh pipeline from ``create_custom_nlp`` and a detector built from
    ``cfg`` are created on every call. An utterance counts as predicted-vague
    when the detector returns at least one entry under ``"detections"``.

    Args:
        cfg_name: Label of the configuration; returned unchanged.
        cfg: Positional arguments passed to the detector after ``nlp``.
        dataset: DataFrame with ``Transcript_clean`` and ``Vague`` columns.

    Returns:
        Tuple ``(cfg_name, scores)`` where ``scores`` is the result of
        ``evaluate(true, pred)``.
    """
    detector = VagueKeywordDetector(create_custom_nlp(), *cfg)
    detections = dataset["Transcript_clean"].apply(detector.detect)
    flagged = detections.apply(lambda out: len(out["detections"]) > 0)
    scores = evaluate(dataset["Vague"], flagged.astype(int))
    return cfg_name, scores

In [0]:
# Score every keyword configuration on the training patient utterances in parallel.
_trn_pt_utts = adress_trans.loc[trn_pt_utt_idx]
results = Parallel(n_jobs=10, return_as="generator_unordered")(
    delayed(run)(cfg_name, cfg, _trn_pt_utts) for cfg_name, cfg in configs.items()
)
results = list(tqdm(results, total=len(configs)))

# Unpack into fresh names: rebinding `configs` here would replace the config
# dict with a tuple of names and break any re-run of this cell.
cfg_names, cfg_scores = zip(*results)
table = pd.DataFrame(cfg_scores, index=cfg_names)
table.to_csv("vague_trn_results.csv")

In [0]:
# Display the per-configuration scores computed on the training split.
table

In [0]:
# Emit the score table, rounded to 3 decimals, as LaTeX source.
# The commented line below is an alternative view sorted by the "f1" column.
# table.sort_values("f1", ascending=False).round(3)
print(table.round(3).to_latex())

#### Evaluate best vague keyword detector configuration

In [0]:
# Final detector: custom spaCy pipeline + the non-specific reference keywords.
nlp = create_custom_nlp()
keyword_detector_final = VagueKeywordDetector(nlp, non_specific_refs)


def _detect_if_patient(row):
    """Run the final detector on patient utterances; other speakers get pd.NA."""
    if row["Speaker"] == "Patient":
        return keyword_detector_final.detect(row["Transcript_clean"])
    return pd.NA


# Run on all patient utterances, in every split.
adress_trans["keyword_dets"] = adress_trans.apply(_detect_if_patient, axis=1)

###### Performance on the test data.

In [0]:
# Utterance-level evaluation of the keyword detector on the test patient utterances.
_tst_keyword_dets = adress_trans.loc[tst_pt_utt_idx, "keyword_dets"]
true = adress_trans.loc[tst_pt_utt_idx, "Vague"]
pred = _tst_keyword_dets.apply(lambda out: len(out["detections"]) > 0).astype(int)
print(evaluate(true, pred, return_latex=True))

## LLM-Based Detector

In [0]:
# Take host and token from the Databricks credentials resolved by MLflow, so no
# secret is written into the notebook.
mlflow_creds = mlflow.utils.databricks_utils.get_databricks_host_creds()

# OpenAI client pointed at the "/serving-endpoints" path of that host.
client = OpenAI(
    api_key=mlflow_creds.token,
    base_url=f"{mlflow_creds.host}/serving-endpoints"
)

In [0]:
# One evaluation record per (split, patient) session:
#   inputs       -> the whole session as "<utt_num>: [<Speaker>] <text>" lines
#   expectations -> {utt_num: Vague label} for the patient utterances only
datasets = {
    "train": [],
    "dev": [],
    "test": []
}

for (split, pt_id), grp in adress_trans.groupby(level=["split", "ID"]):
    transcript = "\n".join((grp.index.get_level_values("utt_num").astype(str) + ": [" + grp["Speaker"] + "] " + grp["Transcript_clean"]).values)
    datasets[split].append({
        # Store the session's actual split (previously hard-coded to "train",
        # which mislabelled every dev and test record).
        "split": split,
        "pt_id": pt_id,
        "inputs": {"text": transcript},
        "expectations": {"is_vague": {utt_num: row["Vague"] for (_, _, utt_num), row in grp.iterrows() if row["Speaker"] == "Patient"}}
    })

# pprint(datasets["train"])

In [0]:
@scorer
def correct(expectations, outputs):
    """Percentage of labelled utterances whose vague / not-vague status is predicted correctly.

    An utterance is predicted vague when at least one detection references its
    utterance number.

    Args:
        expectations: Dict whose ``"is_vague"`` entry maps utterance number to
            a label (truthy means vague).
        outputs: Detector output; a dict with a ``"detections"`` list whose
            items carry an ``"utt_num"``.

    Returns:
        ``Feedback`` whose value is the accuracy in percent (0-100).

    Raises:
        ZeroDivisionError: If ``expectations["is_vague"]`` is empty.
    """
    def _utt_key(utt_num):
        # Utterance numbers can arrive as ints or as numeric strings (the label
        # keys are parsed with int() elsewhere in this notebook), so both sides
        # are brought to a common form before they are compared.
        try:
            return int(utt_num)
        except (TypeError, ValueError):
            return str(utt_num)

    det_utts = {_utt_key(det["utt_num"]) for det in outputs["detections"]}
    det_err = expectations["is_vague"]

    # Correct = flagged and labelled vague, or not flagged and labelled not vague.
    n_correct = sum((_utt_key(utt_id) in det_utts) == bool(label) for utt_id, label in det_err.items())
    val = 100 * n_correct / len(det_err)

    return Feedback(value=val)

In [0]:
def process_mlflow_outputs(result):
    """Extract the model responses and parsed labels from an MLflow evaluation result.

    Args:
        result: Object whose ``tables["eval_results"]`` is a DataFrame with
            ``response`` and ``assessments`` columns. Each ``assessments``
            entry is a list of dicts with ``"name"`` and ``"value"``.

    Returns:
        A new DataFrame with columns ``response``, ``assessments`` and
        ``labels``, where ``labels`` is the JSON-decoded value of the first
        assessment named ``"is_vague"``.

    Raises:
        IndexError: If a row has no assessment named ``"is_vague"``.
    """
    # .copy(): add the "labels" column to an independent frame instead of a
    # slice of the result table (avoids pandas' SettingWithCopyWarning).
    outputs = result.tables["eval_results"][["response", "assessments"]].copy()
    outputs["labels"] = outputs["assessments"].apply(
        lambda assessments: json.loads([a["value"] for a in assessments if a["name"] == "is_vague"][0])
    )
    return outputs

def extract_true_pred_from_ouputs(outputs):
    """Flatten per-session labels and detections into utterance-level true / pred lists.

    Adds a ``utts_with_dets`` column to ``outputs`` (the unique utterance
    numbers that have at least one detection) as a side effect.

    Args:
        outputs: DataFrame with a ``response`` column (dicts holding a
            ``"detections"`` list whose items carry ``"utt_num"``) and a
            ``labels`` column (dicts mapping utterance number to label).

    Returns:
        Tuple ``(true, pred)`` of equal-length lists. ``pred`` is 1 for an
        utterance with at least one detection and 0 otherwise.
    """
    # int(): the model may return the utterance number as a string ("6"), which
    # would never match the integer comparison below if left as is.
    outputs["utts_with_dets"] = outputs["response"].apply(
        lambda resp: np.unique([int(det["utt_num"]) for det in resp["detections"]])
    )

    true, pred = [], []
    for labels, utts_with_dets in zip(outputs["labels"], outputs["utts_with_dets"]):
        flagged = set(utts_with_dets.tolist())
        true.extend(labels.values())
        pred.extend(1 if int(utt_num) in flagged else 0 for utt_num in labels)

    return true, pred

##### Try zero-shot prompting

In [0]:
# Archived zero-shot prompt, version "0". The two bare string expressions below
# are not assigned to anything; they only record this earlier prompt version.
"0"
'''# INSTRUCTIONS
You are a neurologist analyzing a patient's speech sample for signs of cognitive impairment.

Your task is to identify all instances of vague terms or phrases in the input provided below. 

### Definition
Vague speech occurs when a person struggles to retrieve specific, concrete words and opts to use general placeholders or less specific terms instead.

### Output Format
Your output must be a single JSON object with a single key "detections" whose value is an array of JSON objects. Each object in the array represents one detected vague speech segment and must have the following key-value pairs:
- "type": ``vague''.
- "text": The verbatim text of the vague speech detection.
- "utt_num": The utterance number where "text" was detected.
- "span": The character span of "text" in the "utt_num" utterance.

# INPUT
{input_text}
'''

# Archived zero-shot prompt, version "1_1" (unassigned string expressions kept
# for reference). Compared with version "0" it adds an exclusion for terms that
# are clear from context or are ordinary casual speech.
"1_1"
'''# INSTRUCTIONS
You are a neurologist analyzing a patient's speech sample for signs of cognitive impairment.

Your task is to identify all instances of vague terms or phrases in a patient's speech provided in the input below. 

### Definition
Vague speech occurs when a person struggles to retrieve specific, concrete words and opts to use general placeholders or less specific terms instead. However, do not flag a general term if its meaning is either made clear by the context (e.g., it has a specific antecedent) or if it represents a common and appropriate pattern of casual speech.

### Output Format
Your output must be a single JSON object with a single key "detections" whose value is an array of JSON objects. Each object in the array represents one detected vague speech segment and must have the following key-value pairs:
- "type": ``vague''.
- "text": The verbatim text of the vague speech detection.
- "utt_num": The utterance number where "text" was detected.
- "span": The character span of "text" in the "utt_num" utterance.

# INPUT
{input_text}
'''

# Active zero-shot prompt. `version` is interpolated into the MLflow run names
# and `zs_prompt` is filled in with .format(input_text=...) by the detector
# cells further down; "{input_text}" is the only format placeholder.
version = "2_1"
zs_prompt = '''# INSTRUCTIONS
You are a neurologist analyzing a patient's speech sample for signs of cognitive impairment.

Your task is to identify all instances of vague terms or phrases in a patient's speech provided in the input below. 

### Definition
Vague speech occurs when a person struggles to retrieve specific, concrete words and opts to use general placeholders or less specific terms instead. However, do not flag a vague term or phrase if its meaning is either made clear by the context (e.g., it has a specific antecedent) or if it represents a common and appropriate pattern of casual speech that does not indicate word-finding difficulty or uncertainty. Do not flag filler or event and inaudible tokens as vague terms. Do not flag an entire utterance as vague if only a few words or phrases are the source of the vagueness.

### Output Format
Your output must be a single JSON object with a single key "detections" whose value is an array of JSON objects. Each object in the array represents one detected vague speech segment and must have the following key-value pairs:
- "type": ``vague''.
- "text": The verbatim text of the vague speech detection.
- "utt_num": The utterance number where "text" was detected.
- "span": The character span of "text" in the "utt_num" utterance.

# INPUT
{input_text}
'''

# Archived zero-shot prompt, version "3" (unassigned string expressions kept
# for reference). It lists the exclusion criteria as separate bullet points.
"3"
'''# INSTRUCTIONS
You are a neurologist analyzing a patient's speech sample for signs of cognitive impairment.

Your task is to identify all instances of vague terms or phrases in a patient's speech provided in the input below. 

### Definition
Vague speech occurs when a person struggles to retrieve specific, concrete words and opts to use general placeholders or less specific terms instead. 

**Exclusion Criteria**
- Do not flag an entire utterance as vague if only a few words or phrases are the source of the vagueness.
- Do not flag filler or event and inaudible tokens as vague terms. 
- Do not flag a vague term or phrase if its meaning is either made clear by the context (e.g., it has a specific antecedent).
- Do not flag a vague term or phrase if it represents a common and appropriate pattern of casual speech rather than word-finding difficulty or uncertainty. 

### Output Format
Your output must be a single JSON object with a single key "detections" whose value is an array of JSON objects. Each object in the array represents one detected vague speech segment and must have the following key-value pairs:
- "type": ``vague''.
- "text": The verbatim text of the vague speech detection.
- "utt_num": The utterance number where "text" was detected.
- "span": The character span of "text" in the "utt_num" utterance.

# INPUT
{input_text}
'''

In [0]:
def llm_detector(text):
    """Fill the zero-shot prompt with one transcript and send it to the "openai_gpt_4o" endpoint via llm_call."""
    return llm_call(client, "openai_gpt_4o", None, zs_prompt.format(input_text=text), {"type": "json_object"})


# The run handle is deliberately not bound with `as run`: that name belongs to
# the notebook-level run() helper used by the keyword-configuration sweep, and
# rebinding it here would break a re-run of that sweep.
with mlflow.start_run(run_name=f"llm_trn_gpt4o_pV{version}"):
    gpt_dets_trn = mlflow.genai.evaluate(
        predict_fn=llm_detector,
        data=datasets["train"],
        scorers=[correct]
    )

In [0]:
# Collect responses and parsed labels of the zero-shot training run and print them.
outputs = process_mlflow_outputs(gpt_dets_trn)
print(outputs)

##### Try few-shot prompting

First we do an error analysis of the best zero-shot prompt on the development data.

In [0]:
def llm_detector(text):
    """Fill the zero-shot prompt with one transcript and send it to the "openai_gpt_4o" endpoint via llm_call."""
    return llm_call(client, "openai_gpt_4o", None, zs_prompt.format(input_text=text), {"type": "json_object"})


# The run handle is deliberately not bound with `as run`: that name belongs to
# the notebook-level run() helper used by the keyword-configuration sweep, and
# rebinding it here would break a re-run of that sweep.
with mlflow.start_run(run_name=f"llm_dev_gpt4o_pV{version}"):
    gpt_dets_dev = mlflow.genai.evaluate(
        predict_fn=llm_detector,
        data=datasets["dev"],
        scorers=[correct]
    )

Then we try adding the examples to the prompt.

In [0]:
# Few-shot prompt: the "2_1" zero-shot instructions plus one worked example with
# a correct and an incorrect output. Literal JSON braces are doubled ({{ }})
# because the template is filled with .format(input_text=...).
# NOTE(review): example utterances 5 and 6 are tagged "Provider:" / "Patient:"
# while utterances 7-10 (and the transcripts built for `datasets`) use the
# "[Speaker]" form -- confirm which one is intended.
# NOTE(review): both example outputs end their "detections" array with a
# trailing comma, which is not valid JSON although the prompt asks for JSON.
# NOTE(review): the "that girl" span [32, 42] equals the span given for utterance
# 6 and looks copied -- verify the example spans.
fs_version = "2_1_fewshot"
fs_prompt = '''# INSTRUCTIONS
You are a neurologist analyzing a patient's speech sample for signs of cognitive impairment.

Your task is to identify all instances of vague terms or phrases in a patient's speech provided in the input below. 

### Definition
Vague speech occurs when a person struggles to retrieve specific, concrete words and opts to use general placeholders or less specific terms instead. However, do not flag a vague term or phrase if its meaning is either made clear by the context (e.g., it has a specific antecedent) or if it represents a common and appropriate pattern of casual speech that does not indicate word-finding difficulty or uncertainty. Do not flag filler or event and inaudible tokens as vague terms. Do not flag an entire utterance as vague if only a few words or phrases are the source of the vagueness.

### Output Format
Your output must be a single JSON object with a single key "detections" whose value is an array of JSON objects. Each object in the array represents one detected vague speech segment and must have the following key-value pairs:
- "type": ``vague''.
- "text": The verbatim text of the vague speech detection.
- "utt_num": The utterance number where "text" was detected.
- "span": The character span of "text" in the "utt_num" utterance.

### Examples
**Input**:
5: Provider: okay tell me what else you see . 
6: Patient: [silence] some little knots or somethin . 
7: [Patient] I don't know . 
8: [Patient] [silence] [inaudible] . 
9: [Patient] [silence] some kind of a [inaudible] pan or somethin . 
10: [Patient] [silence] and that girl is there .

**Correct Output**:
{{
    "detections": [
        {{"type": "vague", "text": "or somethin", "utt_num": 6, "span": [32, 42]}},
        {{"type": "vague", "text": "or somethin", "utt_num": 9, "span": [41, 51]}},
    ]
}}

**Incorrect Output**
{{
    "detections": [
        {{"type": "vague", "text": "[silence]", "utt_num": 8, "span": [0, 9]}},
        {{"type": "vague", "text": "that girl", "utt_num": 10, "span": [32, 42]}},
    ]
}}
Error: Incorrectly flags (1) "that girl" which was used to identify a person by age and gender rather than express uncertainty about who this person is, and (2) annotation markers ([silence]). 

# INPUT
{input_text}
'''

In [0]:
def llm_detector(text):
    """Fill the few-shot prompt with one transcript and send it to the "openai_gpt_4o" endpoint via llm_call."""
    return llm_call(client, "openai_gpt_4o", None, fs_prompt.format(input_text=text), {"type": "json_object"})


# The run handle is deliberately not bound with `as run`: that name belongs to
# the notebook-level run() helper used by the keyword-configuration sweep, and
# rebinding it here would break a re-run of that sweep.
with mlflow.start_run(run_name=f"llm_trn_gpt4o_pV{fs_version}_run3"):
    gpt_fs_dets_trn = mlflow.genai.evaluate(
        predict_fn=llm_detector,
        data=datasets["train"],
        scorers=[correct]
    )

In [0]:
# Persist the few-shot training outputs so they can be re-scored without
# querying the model again; the file name carries the prompt version.
outputs = process_mlflow_outputs(gpt_fs_dets_trn)
outputs.to_pickle(f"llm_outputs/vague_trn_gpt_pV{fs_version}.pkl")

In [0]:
# Utterance-level scores of the few-shot prompt on the training split (LaTeX row).
true, pred = extract_true_pred_from_ouputs(outputs)
print(evaluate(true, pred, return_latex=True))

In [0]:
# Reload the saved few-shot training outputs and score them again. The path is
# the one written by the to_pickle call above when fs_version is "2_1_fewshot".
# Only unpickle files produced by this notebook: pickle.load executes code
# embedded in the file.
with open("llm_outputs/vague_trn_gpt_pV2_1_fewshot.pkl", "rb") as f:
    outputs = pickle.load(f)

true, pred = extract_true_pred_from_ouputs(outputs)
print(evaluate(true, pred, return_latex=True))

##### Finally, evaluate the best prompt

In [0]:
def llm_detector(text):
    """Fill the few-shot prompt with one transcript and send it to the "openai_gpt_4o" endpoint via llm_call."""
    return llm_call(client, "openai_gpt_4o", None, fs_prompt.format(input_text=text), {"type": "json_object"})


# The run handle is deliberately not bound with `as run`: that name belongs to
# the notebook-level run() helper used by the keyword-configuration sweep, and
# rebinding it here would break a re-run of that sweep.
with mlflow.start_run(run_name=f"llm_tst_gpt4o_pV{fs_version}"):
    gpt_dets_tst = mlflow.genai.evaluate(
        predict_fn=llm_detector,
        data=datasets["test"],
        scorers=[correct]
    )

In [0]:
# Persist the few-shot test outputs; the file name carries the prompt version.
outputs = process_mlflow_outputs(gpt_dets_tst)
outputs.to_pickle(f"llm_outputs/vague_tst_gpt_pV{fs_version}.pkl")

In [0]:
# Utterance-level scores of the few-shot prompt on the test split (LaTeX row).
true, pred = extract_true_pred_from_ouputs(outputs)
print(evaluate(true, pred, return_latex=True))

## (@Sriharsha) Try using an LLM critic to post-process keyword detections.
We are already achieving high performance with the keyword detector using filler sounds and uncommon letters. Can we improve performance by using an LLM to remove false positive detections?

*Experiment*: For each patient session, aggregate the utterances into a transcript and the filler detection lists (remember to offset them). Query an LLM to remove detections that are not actually filler. Try 3 different LLMs once you've settled on a good prompt. Also, try using MLFlow to log LLM outputs. It makes comparing the effect of different prompt versions a little easier to visualize.

##### Investigate the failure cases of prior methods.

Keyword search detector

In [0]:
# Dev patient utterances that the keyword detector flagged although they are
# not annotated as vague. The label column is "Vague"; adress_trans has no
# "Filler" column, so the previous filter raised a KeyError.
_kw_flagged = adress_trans["keyword_dets"].apply(lambda x: isinstance(x, dict) and len(x["detections"]) > 0)
with pd.option_context("display.max_rows", None, "display.max_colwidth", None):
    print("False positives:")
    pprint(adress_trans.loc[dev_pt_utt_idx & (adress_trans["Vague"] == 0) & _kw_flagged, ["Transcript", "keyword_dets"]].values)

In [0]:
# Dev patient utterances annotated as vague for which the keyword detector
# returned no detection. The label column is "Vague"; adress_trans has no
# "Filler" column, so the previous filter raised a KeyError.
_kw_not_flagged = adress_trans["keyword_dets"].apply(lambda x: isinstance(x, dict) and len(x["detections"]) == 0)
with pd.option_context("display.max_rows", None, "display.max_colwidth", None):
    print("False negatives:")
    pprint(adress_trans.loc[dev_pt_utt_idx & (adress_trans["Vague"] == 1) & _kw_not_flagged, ["Transcript", "keyword_dets"]].values)

LLM-based detector

In [0]:
# Dev patient utterances that the LLM detector flagged although they are not
# annotated as vague. The label column is "Vague"; adress_trans has no "Filler"
# column, so the previous filter raised a KeyError.
# NOTE(review): no cell visible in this notebook creates adress_trans["llm_dets"];
# the per-utterance LLM detections must be written to that column before this
# cell can run.
_llm_flagged = adress_trans["llm_dets"].apply(lambda x: isinstance(x, dict) and len(x["detections"]) > 0)
with pd.option_context("display.max_rows", None, "display.max_colwidth", None):
    print("False positives:")
    pprint(adress_trans.loc[dev_pt_utt_idx & (adress_trans["Vague"] == 0) & _llm_flagged, ["Transcript", "llm_dets"]].values)

In [0]:
# Dev patient utterances annotated as vague for which the LLM detector returned
# no detection. The label column is "Vague"; adress_trans has no "Filler"
# column, so the previous filter raised a KeyError.
# NOTE(review): no cell visible in this notebook creates adress_trans["llm_dets"];
# the per-utterance LLM detections must be written to that column before this
# cell can run.
_llm_not_flagged = adress_trans["llm_dets"].apply(lambda x: isinstance(x, dict) and len(x["detections"]) == 0)
with pd.option_context("display.max_rows", None, "display.max_colwidth", None):
    print("False negatives:")
    pprint(adress_trans.loc[dev_pt_utt_idx & (adress_trans["Vague"] == 1) & _llm_not_flagged, ["Transcript", "llm_dets"]].values)

##### Explore different prompts.

In [0]:
# @Sriharsha Have the llm output a json object with the key "fillers" whose value is a list of json objects for each detection. Each detection object must have a "text" key whose value is the filler sound/word/phrase and a "span" key whose value is a list of the start and end character index of the filler in the utterance.

# Placeholder template: the critic prompt is not written yet. As it stands,
# .format() just returns its first positional argument.
critic_prompt = '''{}'''

In [0]:
# Empty placeholders for the critic experiment's evaluation records; they are
# passed as `data` to the mlflow.genai.evaluate calls below.
dataset_kw = []
dataset_llm = []
dataset = []

In [0]:
def fn(transcript, detections):
    """Fill the critic prompt with a transcript and its detections and send it to the "openai_gpt_4o" endpoint via llm_call."""
    return llm_call(client, "openai_gpt_4o", None, critic_prompt.format(transcript, detections), {"type": "json_object"})


# The run handle is deliberately not bound with `as run`: that name belongs to
# the notebook-level run() helper used by the keyword-configuration sweep, and
# rebinding it here would break a re-run of that sweep.
with mlflow.start_run(run_name="Keywords + Critic pv0"):
    result = mlflow.genai.evaluate(
        predict_fn=fn,
        data=dataset,
        scorers=[correct]
    )

##### Evaluate the best prompt

In [0]:
def fn(text):
    """Fill the critic prompt with ``text`` and send it to the "openai_gpt_4o" endpoint via llm_call."""
    return llm_call(client, "openai_gpt_4o", None, critic_prompt.format(text), {"type": "json_object"})


# This run evaluates `dataset_kw`, so it is logged under the keyword-detector
# name (it was labelled "LLM + Critic pv0", the same name as the run on
# `dataset_llm`).
# The run handle is deliberately not bound with `as run`: that name belongs to
# the notebook-level run() helper used by the keyword-configuration sweep.
with mlflow.start_run(run_name="Keywords + Critic pv0"):
    result = mlflow.genai.evaluate(
        predict_fn=fn,
        data=dataset_kw,
        scorers=[correct]
    )

In [0]:
def fn(text):
    """Fill the critic prompt with ``text`` and send it to the "openai_gpt_4o" endpoint via llm_call."""
    return llm_call(client, "openai_gpt_4o", None, critic_prompt.format(text), {"type": "json_object"})


# The run handle is deliberately not bound with `as run`: that name belongs to
# the notebook-level run() helper used by the keyword-configuration sweep, and
# rebinding it here would break a re-run of that sweep.
with mlflow.start_run(run_name="LLM + Critic pv0"):
    result = mlflow.genai.evaluate(
        predict_fn=fn,
        data=dataset_llm,
        scorers=[correct]
    )

## Summary metrics for vague speech detections

In [0]:
from data.adress import load_outcomes

In [0]:
# Load the outcome table (it provides the "AD_dx" label used below) and preview it.
outcomes = load_outcomes()
outcomes.head()

In [0]:
# Index labels of the training patients: all, AD-diagnosed (AD_dx == 1) and controls (AD_dx == 0).
_is_trn_pt = outcomes.index.get_level_values("split") == "train"

trn_pts = outcomes.loc[_is_trn_pt].index.values
trn_ad_pts = outcomes.loc[_is_trn_pt & (outcomes["AD_dx"] == 1)].index.values
trn_cn_pts = outcomes.loc[_is_trn_pt & (outcomes["AD_dx"] == 0)].index.values

In [0]:
# Index labels of the test patients: all, AD-diagnosed (AD_dx == 1) and controls (AD_dx == 0).
_is_tst_pt = outcomes.index.get_level_values("split") == "test"

tst_pts = outcomes.loc[_is_tst_pt].index.values
tst_ad_pts = outcomes.loc[_is_tst_pt & (outcomes["AD_dx"] == 1)].index.values
tst_cn_pts = outcomes.loc[_is_tst_pt & (outcomes["AD_dx"] == 0)].index.values

**Vague Term Rate** = Total number of detected vague terms / Total number of words spoken

In [0]:
def compute_vague_term_rate(outputs):
    """Vague terms per 100 patient words, for each (split, ID) session.

    Args:
        outputs: Series aligned with ``adress_trans`` whose entries are either
            missing or a dict holding a ``"detections"`` list.

    Returns:
        Series indexed by (split, ID): 100 * detections / patient words. Words
        are the tokens of ``Transcript_clean`` (tokenised with the global
        ``nlp``) that are not punctuation, whitespace, or silence / inaudible /
        event tags.
    """
    def _n_detections(out):
        return 0 if pd.isna(out) else len(out["detections"])

    def _n_patient_words(row):
        if row["Speaker"] == "Patient":
            return sum(
                1
                for token in nlp(row["Transcript_clean"])
                if not (token.is_punct or token.is_space or token._.is_silence_tag or token._.is_inaudible_tag or token._.is_event_tag)
            )
        return 0

    session = ("split", "ID")
    num = outputs.apply(_n_detections).groupby(level=session).sum()
    den = adress_trans.apply(_n_patient_words, axis=1).groupby(level=session).sum()
    return 100 * num / den

In [0]:
# Vague term rate of the keyword detector, stored per session in `outcomes`.
outcomes["kw_vague_term_rate"] = compute_vague_term_rate(adress_trans["keyword_dets"])

**Vague Utterance Ratio**: Number of utterances containing vague terms / Total number of patient utterances

In [0]:
def vague_utterance_ratio(output_name):
    """Percentage of patient utterances with at least one detection, per (split, ID) session.

    Args:
        output_name: Name of the ``adress_trans`` column holding the detector
            outputs (missing values, or dicts with a ``"detections"`` list).

    Returns:
        Series indexed by (split, ID): 100 * flagged utterances / patient utterances.
    """
    def _is_flagged(out):
        if pd.isna(out):
            return 0
        return int(len(out["detections"]) > 0)

    session = ("split", "ID")
    flagged_per_session = adress_trans[output_name].apply(_is_flagged).groupby(level=session).sum()
    patient_utts_per_session = (adress_trans["Speaker"] == "Patient").groupby(level=session).sum()
    return 100 * flagged_per_session / patient_utts_per_session

In [0]:
# Vague utterance ratio of the keyword detector, stored per session in `outcomes`.
outcomes["kw_vague_utt_ratio"] = vague_utterance_ratio("keyword_dets")

Test for statistically significant differences in feature values between the AD and non-AD groups.

In [0]:
# Generates the rows of Table 10 and 11
def analysis(split_idx, split_ad, split_cn, metrics):
    """Print one LaTeX table row per metric comparing the AD and control groups.

    Row layout:
    ``metric & mean (std) AD & mean (std) CN & t (p) & U (p) & AUC \\\\``

    Args:
        split_idx: Index labels (in ``outcomes``) of all patients of the split;
            used for the ROC AUC of the metric against ``AD_dx``.
        split_ad: Index labels of the AD patients of the split.
        split_cn: Index labels of the control patients of the split.
        metrics: Names of the ``outcomes`` columns to analyse.

    Returns:
        None. The rows are written to stdout.
    """
    def _fmt_p(pvalue):
        # p-values below 0.001 are reported as "<0.001" (LaTeX math delimiters).
        return str(round(pvalue, 3)) if pvalue >= 0.001 else "$<$0.001"

    def _rank_auc(labels, scores):
        # ROC AUC through the rank-sum identity
        #   AUC = (R_pos - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg)
        # where R_pos is the sum of the (tie-averaged) ranks of the positives.
        is_pos = labels == 1
        n_pos = is_pos.sum()
        n_neg = len(labels) - n_pos
        rank_sum_pos = scores.rank()[is_pos].sum()
        return (rank_sum_pos - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg)

    for m in metrics:
        vals_ad = outcomes.loc[split_ad, m]
        vals_cn = outcomes.loc[split_cn, m]

        ## score averages
        mean_ad, std_ad = vals_ad.mean(), vals_ad.std()
        mean_cn, std_cn = vals_cn.mean(), vals_cn.std()

        ## group-difference tests: both compare the metric values of the AD
        ## group with those of the control group (not the labels with the metric)
        res_ttest = ttest_ind(vals_ad, vals_cn)
        res_mannw = mannwhitneyu(vals_ad, vals_cn)
        res_auc = _rank_auc(outcomes.loc[split_idx, "AD_dx"], outcomes.loc[split_idx, m])

        print("%s & %.2f (%.2f) & %.2f (%.2f) & %.2f (%s) & %.2f (%s) & %.2f \\\\" %
                (m,
                mean_ad,
                std_ad,
                mean_cn,
                std_cn,
                res_ttest.statistic,
                _fmt_p(res_ttest.pvalue),
                res_mannw.statistic,
                # the Mann-Whitney column reports its own p-value
                _fmt_p(res_mannw.pvalue),
                res_auc)
        )

In [0]:
# `outcomes` columns passed to analysis() for the group comparison.
mets = [
    "kw_vague_term_rate",
    "kw_vague_utt_ratio", 
]

In [0]:
# Group comparison on the training patients.
analysis(trn_pts, trn_ad_pts, trn_cn_pts, mets)

In [0]:
# Group comparison on the test patients.
analysis(tst_pts, tst_ad_pts, tst_cn_pts, mets)

In [0]:
# Save the two keyword-based vague-speech features (indexed like `outcomes`) to CSV.
outcomes[["kw_vague_term_rate", "kw_vague_utt_ratio"]].to_csv("vague_feats.csv")