# Detecting Vague Speech

**TODO**:
- Keyword detector experiment
- Generator + LLM Critic experiment

In [0]:
from detectors.common_detectors.keyword_detector import KeywordDetector
from utils import create_custom_nlp  # imported here so this cell runs on a fresh kernel

# Smoke test: run the generic keyword detector on a single example utterance.
nlp = create_custom_nlp()
detector = KeywordDetector(nlp, ["i mean"])
detector.detect("i mean i guess that's fine")

In [0]:
from detectors.vague_speech.vague_keyword_search import VagueKeywordDetector
from utils import create_custom_nlp  # imported here so this cell runs on a fresh kernel

# Smoke test: run the vague-speech keyword detector on a single example utterance.
nlp = create_custom_nlp()
detector = VagueKeywordDetector(nlp, ["i mean"])
detector.detect("i mean i guess that's fine")

In [0]:
# Enable IPython's autoreload extension so edits to local modules take effect without a kernel restart.
%load_ext autoreload
# Mode 1: only modules registered with %aimport are reloaded before each cell runs.
%autoreload 1
# Local modules under active development.
%aimport data.adress
%aimport detectors.vague_speech.vague_keyword_search
%aimport utils

In [0]:
import sys
sys.path.append("..")
import json
import numpy as np
import pandas as pd
import re
from pprint import pprint
# Evaluation
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind, mannwhitneyu
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
# Generative AI
from openai import OpenAI
from utils import llm_call
import mlflow
from mlflow.genai.scorers import Safety, scorer
from mlflow.entities import Feedback

## Load the Data

In [0]:
from data.adress import load_transcripts

In [0]:
# Load the transcripts and keep only the columns used in this notebook.
adress_trans = load_transcripts()[
    ["Timestamp", "Speaker", "Transcript", "Transcript_clean", "Vague"]
]
adress_trans.head()

In [0]:
# Boolean masks selecting patient utterances within each data split.
split_labels = adress_trans.index.get_level_values("split")
is_patient_utt = adress_trans["Speaker"] == "Patient"

trn_pt_utt_idx = (split_labels == "train") & is_patient_utt
dev_pt_utt_idx = (split_labels == "dev") & is_patient_utt
tst_pt_utt_idx = (split_labels == "test") & is_patient_utt

## NLP Baseline: Keyword Search

In [0]:
from detectors.vague_speech.vague_keyword_search import VagueKeywordDetector
from utils import create_custom_nlp

First, we look at the patient utterances annotated as vague in the training split to help inform our list of keywords.

In [0]:
# Training-split patient utterances that were annotated as vague.
train_vague_mask = trn_pt_utt_idx & (adress_trans["Vague"] == 1)
with pd.option_context("display.max_colwidth", 100):
    print(adress_trans.loc[train_vague_mask, "Transcript_clean"])

Then we define our keyword lists.

In [0]:
# Nouns that refer to something without naming it.
non_specific_nouns = [
    "anything",
    "everything",
    "nothing",
    "thing",
    "things",  # the comma matters: without it Python concatenates this with the next literal
    "something",
    "somethings",
    "stuff",
]

# Generic category nouns used in place of a more specific word.
general_nouns = [
    "person",
    "people",
    "man",
    "woman",
    "area",
    "place",
]

# Multi-word hedges and approximators.
vague_phrases = [
    "sort of",
    "kind of",
    "a little",
    "or something",
    "I dont know",
    "I guess",
    "I think",
    "i dont know",
]

#### Explore different keyword sets

In [0]:
from joblib import Parallel, delayed
from tqdm import tqdm

In [0]:
# Every non-empty combination of the three keyword sets, keyed by a "+"-joined name.
configs = {
    # single keyword sets
    "non_specific_nouns": non_specific_nouns,
    "general_nouns": general_nouns,
    "vague_phrases": vague_phrases,
    # pairs
    "non_specific_nouns+general_nouns": [*non_specific_nouns, *general_nouns],
    "non_specific_nouns+vague_phrases": [*non_specific_nouns, *vague_phrases],
    "general_nouns+vague_phrases": [*general_nouns, *vague_phrases],
    # all three
    "non_specific_nouns+general_nouns+vague_phrases": [
        *non_specific_nouns,
        *general_nouns,
        *vague_phrases,
    ],
}

In [0]:
def run(cfg_name, cfg, dataset):
    """Score one keyword configuration against the utterance-level annotations.

    Args:
        cfg_name: Label for the configuration; returned unchanged.
        cfg: Keyword list handed to ``VagueKeywordDetector``.
        dataset: DataFrame with a ``Transcript_clean`` text column and a
            ``Vague`` label column.

    Returns:
        Tuple ``(cfg_name, precision, recall, f1, accuracy)``.
    """
    # A fresh spaCy pipeline and detector are built on every call.
    keyword_detector = VagueKeywordDetector(create_custom_nlp(), cfg)
    detector_outputs = dataset["Transcript_clean"].apply(keyword_detector.detect)

    # An utterance is predicted vague when the detector reports at least one detection.
    y_true = dataset["Vague"]
    y_pred = detector_outputs.apply(lambda out: len(out["detections"]) > 0).astype(int)

    scores = tuple(
        metric(y_true, y_pred)
        for metric in (precision_score, recall_score, f1_score, accuracy_score)
    )
    return (cfg_name, *scores)

In [0]:
# Score every keyword configuration on the training-split patient utterances, in parallel.
train_patient_utts = adress_trans.loc[trn_pt_utt_idx]
jobs = (
    delayed(run)(name, keywords, train_patient_utts)
    for name, keywords in configs.items()
)
# Results are yielded as workers finish, so the row order is not fixed.
finished = Parallel(n_jobs=10, return_as="generator_unordered")(jobs)
results = list(tqdm(finished, total=len(configs)))

table = pd.DataFrame(results, columns=["config", "precision", "recall", "f1", "accuracy"])
table.to_csv("vague_trn_results.csv")

In [0]:
# Emit the training results, rounded to three decimals, as a LaTeX table.
latex_table = table.round(3).to_latex(index=False)
print(latex_table)

#### Evaluate best vague-speech keyword detector configuration

In [0]:
# Build the custom spaCy pipeline used by the detector.
nlp = create_custom_nlp()
# The final detector uses the vague-phrase keyword set.
keyword_detector_final = VagueKeywordDetector(nlp, vague_phrases)
# Run the detector on every patient utterance; rows from other speakers get pd.NA.
# Later cells read the "keyword_dets" column, so this assignment must execute.
adress_trans["keyword_dets"] = adress_trans.apply(
    lambda x: keyword_detector_final.detect(x["Transcript_clean"]) if x["Speaker"] == "Patient" else pd.NA,
    axis=1,
)

In [0]:
# Smoke test: run the final detector on a short utterance that contains one of the vague phrases.
keyword_detector_final.detect("hello i dont know")

###### Performance on the test data.

In [0]:
def evaluate(true, pred):
    """Format binary classification scores as the cells of a LaTeX table row.

    Args:
        true: Ground-truth binary labels.
        pred: Predicted binary labels.

    Returns:
        String of precision, recall, F1 and accuracy, each with three
        decimals, joined by `` & `` and ending with a LaTeX row break.
    """
    scorers = (precision_score, recall_score, f1_score, accuracy_score)
    cells = [f"{scorer(true, pred):.3f}" for scorer in scorers]
    return " & ".join(cells) + " \\\\"

In [0]:
# Utterance-level scores of the final keyword detector on test-split patient utterances.
test_patient_utts = adress_trans.loc[tst_pt_utt_idx]
true = test_patient_utts["Vague"]
# An utterance counts as predicted vague when it has at least one detection.
pred = test_patient_utts["keyword_dets"].apply(lambda out: len(out["detections"]) > 0).astype(int)
print(evaluate(true, pred))

## LLM-Based Detector

## Summary metrics for vague speech detections

In [0]:
from data.adress import load_outcomes

In [0]:
# Load the outcomes table; later cells read its "split" index level and its "AD_dx" column.
outcomes = load_outcomes()
outcomes.head()

In [0]:
# Index values of test-split participants: all, AD-diagnosed (AD_dx == 1), and controls (AD_dx == 0).
in_test_split = outcomes.index.get_level_values("split") == "test"

tst_pts = outcomes.loc[in_test_split].index.values
tst_ad_pts = outcomes.loc[in_test_split & (outcomes["AD_dx"] == 1)].index.values
tst_cn_pts = outcomes.loc[in_test_split & (outcomes["AD_dx"] == 0)].index.values

**Vague Term Rate**: Number of detected vague terms per 100 words spoken by the patient (100 × total detected vague terms / total patient words).

In [0]:
def compute_vague_term_rate(outputs):
    """Compute detected vague terms per 100 patient words for each (split, ID) group.

    Args:
        outputs: Series of detector outputs indexed like ``adress_trans``.
            Each entry is either missing or a mapping with a ``"detections"``
            collection.

    Returns:
        Series indexed by (split, ID) holding ``100 * detections / words``.

    Note:
        Reads the module-level ``adress_trans`` and ``nlp`` to count words.
    """

    def count_detections(output):
        # Missing outputs contribute no detections.
        return len(output["detections"]) if not pd.isna(output) else 0

    def count_patient_words(row):
        # Only patient speech counts toward the denominator.
        if row["Speaker"] != "Patient":
            return 0
        # Skip punctuation, whitespace and the custom silence/inaudible/event tags.
        return sum(
            1
            for token in nlp(row["Transcript_clean"])
            if not (
                token.is_punct
                or token.is_space
                or token._.is_silence_tag
                or token._.is_inaudible_tag
                or token._.is_event_tag
            )
        )

    group_levels = ("split", "ID")
    num = outputs.apply(count_detections).groupby(level=group_levels).sum()
    den = adress_trans.apply(count_patient_words, axis=1).groupby(level=group_levels).sum()
    return 100 * num / den

In [0]:
# Store the keyword-based vague term rate as a new column of `outcomes`.
# NOTE(review): the result is grouped by (split, ID); presumably `outcomes` shares that index so values align — confirm.
outcomes["kw_vague_term_rate"] = compute_vague_term_rate(adress_trans["keyword_dets"])

**Vague Utterance Ratio**: Proportion of utterances that contain vague language.

In [0]:
def vague_utt_ratio(grp):
    """Return the share of a participant's patient utterances with at least one detection.

    Args:
        grp: Rows of ``adress_trans`` for one (split, ID) group, with
            ``Speaker`` and ``keyword_dets`` columns.

    Returns:
        Fraction in [0, 1], or NaN when the group has no patient utterances.
    """
    # The detector only runs on patient utterances; other rows hold pd.NA and
    # must be excluded from both the count and the denominator.
    patient_dets = grp.loc[grp["Speaker"] == "Patient", "keyword_dets"]
    if patient_dets.empty:
        return float("nan")
    n_vague = sum(1 for det in patient_dets if det["detections"])
    return n_vague / len(patient_dets)


outcomes["kw_vague_utt_ratio"] = adress_trans.groupby(level=("split", "ID")).apply(vague_utt_ratio)

In [0]:
# Ground-truth vague utterance ratio: the share of each participant's patient
# utterances annotated as vague (Vague == 1).
gt_patient_vague = adress_trans.loc[adress_trans["Speaker"] == "Patient", "Vague"].eq(1)
outcomes["gt_vague_utt_ratio"] = gt_patient_vague.groupby(level=("split", "ID")).mean()

Test for statistically significant differences in feature values between the AD and non-AD groups.

In [0]:
# Participant-level features to compare between diagnostic groups.
metrics = ["kw_vague_term_rate", "kw_vague_utt_ratio", "gt_vague_utt_ratio"]

In [0]:
def _format_pvalue(pvalue):
    """Format a p-value for the LaTeX row; values below 0.001 become "$<$0.001"."""
    return str(round(pvalue, 3)) if pvalue >= 0.001 else "$<$0.001"


for m in metrics:
    # Feature values for each diagnostic group on the test split.
    ad_vals = outcomes.loc[tst_ad_pts, m]
    cn_vals = outcomes.loc[tst_cn_pts, m]

    ## score averages
    mean_ad = ad_vals.mean()
    std_ad = ad_vals.std()
    mean_cn = cn_vals.mean()
    std_cn = cn_vals.std()

    ## statistical tests
    # Both tests compare the AD group's values against the control group's values.
    res_ttest = ttest_ind(ad_vals, cn_vals)
    res_mannw = mannwhitneyu(ad_vals, cn_vals)
    # AUC treats the raw feature value as a score for the AD label.
    res_auc   = roc_auc_score(outcomes.loc[tst_pts, "AD_dx"], outcomes.loc[tst_pts, m])

    print("%s & %.2f (%.2f) & %.2f (%.2f) & %.2f (%s) & %.2f (%s) & %.2f \\\\" %
            (m,
            mean_ad,
            std_ad,
            mean_cn,
            std_cn,
            res_ttest.statistic,
            _format_pvalue(res_ttest.pvalue),
            res_mannw.statistic,
            # each statistic is reported with its own p-value
            _format_pvalue(res_mannw.pvalue),
            res_auc)
    )

In [0]:
# Write the two keyword-derived features to CSV; the ground-truth ratio column is not exported.
outcomes[["kw_vague_term_rate", "kw_vague_utt_ratio"]].to_csv("vague_feats.csv")