# Penn Data

In [0]:
# Reload edited project modules without restarting the kernel: in `%autoreload 1`
# mode only the modules registered through `%aimport` are reloaded.
%load_ext autoreload
%autoreload 1
# NOTE(review): `%aimport` imports the module right away. If `data` and `utils` are
# only reachable through the `sys.path.append("..")` in the next cell, that line has
# to run before these two on a fresh kernel -- confirm.
%aimport data.observer
%aimport utils

In [0]:
import sys
sys.path.append("..")

import data.observer as OBSERVER

import numpy as np
import pandas as pd
from tqdm import tqdm

import matplotlib.pyplot as plt

## Load the Data

#### Cognitive Impairment Outcomes

In [0]:
# Cognitive-impairment labels; the columns used below are patient_id, CI, Gender
# and "Age (in Years)".
# NOTE(review): absolute, environment-specific path -- consider a configurable data directory.
labels = pd.read_excel("/Volumes/biomedicalinformatics_analytics/dev_lab_johnson/watch/penn_CI_labels.xlsx")
# Strip surrounding whitespace so the later `isin` lookups on patient_id match exactly.
labels["patient_id"] = labels["patient_id"].str.strip()
labels.head()

In [0]:
# (number of rows, number of columns) of the labels table.
labels.shape

In [0]:
# Print one LaTeX table row describing both cohorts, impaired (CI == 1) first and
# normal (CI == 0) second. Each cohort contributes:
#     N & percent female & mean age $\pm$ standard deviation of age
# The percentage is formatted with %.0f so that it is rounded; the previous %d
# silently truncated the fractional part (e.g. 66.7 was printed as 66).
row_cells = []
for ci_flag in (1, 0):
    cohort = labels.loc[labels["CI"] == ci_flag]
    n_cohort = (labels["CI"] == ci_flag).sum()
    pct_female = 100 * (cohort["Gender"] == "Female").sum() / n_cohort
    ages = cohort["Age (in Years)"]
    row_cells.append("%d & %.0f\\%% & %.1f $\\pm$ %.1f" % (n_cohort, pct_female, ages.mean(), ages.std()))

# Cells are joined with the LaTeX column separator and closed with the row terminator.
print(" & ".join(row_cells) + " \\\\")

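Downsample the cognitively normal (CN) cohort to the size of the cognitively impaired (CI) cohort: out of 10,000 random CN subsamples, keep the one whose mean age and percentage of female patients are closest to those of the CI cohort.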

In [0]:
# Seed NumPy's global generator so the same candidate subsamples are drawn on every run.
np.random.seed(1234567)

# Matching targets: mean age and percent female of the impaired (CI == 1) cohort.
is_ci = labels["CI"] == 1
is_female = labels["Gender"] == "Female"
n_ci = is_ci.sum()
ci_mean_age = labels.loc[is_ci, "Age (in Years)"].mean()
ci_pct_female = 100 * (is_ci & is_female).sum() / n_ci

# Pool of cognitively normal (CI == 0) patients to sample from.
cn_patients = labels.loc[labels["CI"] == 0, "patient_id"]

# Draw 10,000 CN subsamples (without replacement) of the same size as the CI cohort
# and record the demographics of each one.
results = []
for i in range(10000):
    sample = np.random.choice(cn_patients, size=n_ci, replace=False)
    in_sample = labels["patient_id"].isin(sample)
    results.append({
        "pt_ids": sample,
        "avg_age": labels.loc[in_sample, "Age (in Years)"].mean(),
        "pct_female": 100 * (in_sample & is_female).sum() / sample.shape[0]
    })

# Score every candidate by its absolute distance to the CI cohort. The composite
# score adds the age difference (years) and the percent-female difference
# (percentage points) with equal weight.
candidate_subsamples = pd.DataFrame(results).assign(
    mean_age_diff=lambda df: (ci_mean_age - df["avg_age"]).abs(),
    pct_female_diff=lambda df: (ci_pct_female - df["pct_female"]).abs(),
    composite_diff=lambda df: df["mean_age_diff"] + df["pct_female_diff"],
)

# Keep the patient IDs of the best-matching candidate.
best_candidate = candidate_subsamples["composite_diff"].idxmin()
pt_subsample = candidate_subsamples.loc[best_candidate, "pt_ids"]
pt_subsample

'PT025', 'PT099', 'PT075', 'PT016', 'PT044', 'PT085', 'PT035', 'PT103', 'PT036', 'PT058', 'PT002', 'PT095'

In [0]:
# Balanced label set: every impaired patient plus the selected CN subsample.
labels_ds = labels.loc[(labels["CI"] == 1) | labels["patient_id"].isin(pt_subsample)]

# Print one LaTeX table row for the balanced cohorts, impaired (CI == 1) first and
# normal (CI == 0) second. Each cohort contributes:
#     N & percent female & mean age $\pm$ standard deviation of age
# The percentage is formatted with %.0f so that it is rounded; the previous %d
# silently truncated the fractional part (e.g. 66.7 was printed as 66).
row_cells_ds = []
for ci_flag in (1, 0):
    cohort_ds = labels_ds.loc[labels_ds["CI"] == ci_flag]
    n_cohort_ds = (labels_ds["CI"] == ci_flag).sum()
    pct_female_ds = 100 * (cohort_ds["Gender"] == "Female").sum() / n_cohort_ds
    ages_ds = cohort_ds["Age (in Years)"]
    row_cells_ds.append("%d & %.0f\\%% & %.1f $\\pm$ %.1f" % (n_cohort_ds, pct_female_ds, ages_ds.mean(), ages_ds.std()))

# Cells are joined with the LaTeX column separator and closed with the row terminator.
print(" & ".join(row_cells_ds) + " \\\\")

#### Cognitive Test Scores

In [0]:
# Load the label table through the project loader; its MMSE and FRS columns are
# plotted in the next two cells.
lbls = OBSERVER.load_labels()
lbls.head()

In [0]:
# Histogram of the MMSE scores: 15 bin edges, i.e. 14 equal-width bins, from 0 to 30.
bins = np.linspace(0, 30, 15)

fig, ax = plt.subplots()
# zorder=3 draws the bars above the grid (zorder=0).
ax.hist(lbls["MMSE"], bins=bins, edgecolor="k", zorder=3)
ax.set_xlabel("MMSE")
ax.set_ylabel("Frequency")
ax.set_xlim([-1, 31])
ax.grid(zorder=0)
plt.show()

In [0]:
# Histogram of the FRS scores: 15 bin edges, i.e. 14 equal-width bins, from 0 to 30.
bins = np.linspace(0, 30, 15)

fig, ax = plt.subplots()
# zorder=3 draws the bars above the grid (zorder=0).
ax.hist(lbls["FRS"], bins=bins, edgecolor="k", zorder=3)
ax.set_xlabel("FRS")
ax.set_ylabel("Frequency")
ax.set_xlim([-1, 31])
ax.grid(zorder=0)
plt.show()

## Labeling Probable MCI using GPT

In [0]:
!pip install --upgrade openai
dbutils.library.restartPython()

In [0]:
import os
import pandas as pd
import re
import tiktoken
import time
from openai import OpenAI

Load the data

In [0]:
# NOTE(review): absolute, environment-specific path -- consider a configurable data directory.
directory = "/Volumes/biomedicalinformatics_analytics/dev_lab_johnson/swimcap/Penn OBSERVER/problem_lists/"

# Read every regular file in the directory into one row of `problem_lists`, indexed
# by the file name without its extension.
idx, pls = [], []
# os.listdir() returns entries in arbitrary order; sort for a reproducible row order.
for file in sorted(os.listdir(directory)):
    path = os.path.join(directory, file)
    # Sub-directories cannot be opened as text files; skip them.
    if not os.path.isfile(path):
        continue
    with open(path, "r") as fp:
        # splitext() removes only the final extension; the previous rsplit(".")[0]
        # cut the name at its FIRST dot.
        idx.append(os.path.splitext(file)[0])
        pls.append(fp.read())

problem_lists = pd.DataFrame(data=pls, index=idx, columns=["problem_list"])
# Show a preview only: the full frame holds the raw text of every problem list.
problem_lists.head()

In [0]:
def num_tokens_from_messages(messages, model):
    """Return the number of tokens used by a list of messages.

    Args:
        messages: iterable of dicts; every value of every dict is passed to the
            encoder, and a "name" key adds one extra token.
        model: model name. Pinned snapshot names are counted directly; other names
            containing a known family ("gpt-3.5-turbo", "gpt-4o-mini", "gpt-4o",
            "gpt-4") print a warning and are counted as that family's pinned snapshot.

    Returns:
        Token count: 3 per message, plus the encoded length of every value, plus 1
        per "name" key, plus 3 for the reply priming.

    Raises:
        NotImplementedError: if `model` is neither a pinned snapshot nor contains a
            known family name.
    """
    # The encoding is resolved before the model name is validated, so an unknown
    # model prints this warning first and is rejected afterwards.
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using o200k_base encoding.")
        encoding = tiktoken.get_encoding("o200k_base")

    pinned_snapshots = (
        "gpt-3.5-turbo-0125",
        "gpt-4-0314",
        "gpt-4-32k-0314",
        "gpt-4-0613",
        "gpt-4-32k-0613",
        "gpt-4o-mini-2024-07-18",
        "gpt-4o-2024-08-06",
    )

    if model not in pinned_snapshots:
        # (family substring, snapshot to count as, warning). Order matters: the more
        # specific "gpt-4o-mini" has to be tested before "gpt-4o", and "gpt-4o"
        # before "gpt-4".
        family_fallbacks = (
            ("gpt-3.5-turbo", "gpt-3.5-turbo-0125",
             "Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0125."),
            ("gpt-4o-mini", "gpt-4o-mini-2024-07-18",
             "Warning: gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-mini-2024-07-18."),
            ("gpt-4o", "gpt-4o-2024-08-06",
             "Warning: gpt-4o and gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-2024-08-06."),
            ("gpt-4", "gpt-4-0613",
             "Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613."),
        )
        for family, snapshot, warning in family_fallbacks:
            if family in model:
                print(warning)
                return num_tokens_from_messages(messages, model=snapshot)
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}."""
        )

    tokens_per_message = 3
    tokens_per_name = 1

    total = 0
    for message in messages:
        total += tokens_per_message
        for field, text in message.items():
            total += len(encoding.encode(text))
            if field == "name":
                total += tokens_per_name
    # every reply is primed with <|start|>assistant<|message|>
    return total + 3

In [0]:
# API token of the current notebook session, used to authenticate against the
# model-serving endpoint.
DATABRICKS_TOKEN = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()

# NOTE(review): the workspace URL is hard-coded; the substitution-error section of this
# notebook derives the host from the MLflow credentials instead.
client = OpenAI(
    api_key=DATABRICKS_TOKEN,
    base_url="https://adb-2035410508966251.11.azuredatabricks.net/serving-endpoints"
)

model   = "openai_gpt_4o"
# NOTE(review): `tpm` and `rpm` are defined but nothing below reads them, so no rate
# limit is actually enforced by this cell.
tpm     = 1e6
rpm     = 6150

prompt  = (
    "Here is a patient's problem list summarizing their active health issues "
    "(e.g., diagnoses, chronic conditions, injuries):\n\n"
    "{}\n\n"
    "Based on this information, label the patient as either:\n\n"
    "- Probable MCI (Mild Cognitive Impairment), or\n"
    "- Healthy Control\n\n"
    "If the problem list includes multiple conditions that are commonly associated "
    "with cognitive decline, MCI, or Alzheimer's dementia (e.g., memory loss, gait abnormality), consider assigning "
    'the \"Probable MCI\" label. Otherwise, assign \"Healthy Control\".\n\n'
    "Format your response as follows:\n\n"
    "- Label: <Probable MCI or Healthy Control>\n"
    "- Reason: <comma-separated list of relevant issues from the problem list, or \"N/A\" if Healthy Control>"
)

# Endpoint name -> model name understood by the token counter ("openai_gpt_4o" -> "gpt-4o").
tokenizer_model = model.split("_", maxsplit=1)[1].replace("_", "-")

# Pick the two answer fields by name. The previous parser took the first two lines
# containing a colon, which raised IndexError on a short reply and silently picked
# the wrong lines whenever the model added any preamble containing a colon.
label_pattern = re.compile(r'^[\s\-*]*Label[ \t*]*:[ \t*]*(.*)$', re.MULTILINE | re.IGNORECASE)
reason_pattern = re.compile(r'^[\s\-*]*Reason[ \t*]*:[ \t*]*(.*)$', re.MULTILINE | re.IGNORECASE)

# Create the output columns up front (object dtype) so rows that cannot be parsed
# simply stay empty.
problem_lists["generated_label"] = None
problem_lists["reason"] = None

# `tokens_used` accumulates completion tokens only; `t` records the start time.
tokens_used = 0
t = time.time()
for i, row in problem_lists.iterrows():
    messages = [{"role": "user", "content": prompt.format(row.problem_list)}]
    n_query_tokens = num_tokens_from_messages(messages, tokenizer_model)
    print("N query tokens:", n_query_tokens)

    response = client.chat.completions.create(
        model=model,
        messages=messages
    )
    # The message content can be None; treat that as an empty reply.
    output = response.choices[0].message.content or ""
    n_response_tokens = response.usage.completion_tokens
    print("N response tokens:", n_response_tokens)

    tokens_used += n_response_tokens

    label_match = label_pattern.search(output)
    reason_match = reason_pattern.search(output)
    if label_match is None or reason_match is None:
        # Keep going: one malformed reply should not discard the labels already collected.
        print(f"Warning: could not parse the response for {i!r}; missing field(s) left empty.")
    if label_match is not None:
        problem_lists.loc[i, "generated_label"] = label_match.group(1).rstrip()
    if reason_match is not None:
        problem_lists.loc[i, "reason"] = reason_match.group(1).rstrip()


In [0]:
# Persist the problem lists together with the generated label and reason columns
# (written relative to the current working directory).
problem_lists.to_excel("visit_problem_lists_labeled.xlsx")

## Run detectors

In [0]:
from detectors.filler_speech.keyword_search import FillerKeywordDetector
from detectors.repetitive_speech.unigram_analysis import UnigramAnalysisDetector as WordRepetitionDetector
from detectors.vague_speech.keyword_search import VagueKeywordDetector
from utils import create_custom_nlp

In [0]:
# Load all trans from Matt's file
trans = pd.read_csv("all_watch_observer_trans.csv")
trans.head()

In [0]:
# Boolean mask of the rows spoken by the patient; the keyword detectors below are
# applied to these rows only.
pt_utts = (trans["Speaker"] == "Patient")

Filler speech

In [0]:
# init detector (the `nlp` pipeline created here is reused by the later cells)
nlp = create_custom_nlp()
filler_detector = FillerKeywordDetector(nlp)
# apply detector to all patient utterances
# (rows of other speakers are not passed to the detector and end up as NaN in the new column)
trans["filler_dets"] = trans.loc[pt_utts, "Transcript"].apply(filler_detector.detect)

Repetitive speech

In [0]:
# init detector (reuses the `nlp` pipeline created in the filler-speech cell)
repetition_detector = WordRepetitionDetector(nlp, window_size=2)
# apply detector to all patient utterances
# (rows of other speakers are not passed to the detector and end up as NaN in the new column)
trans["repetition_dets"] = trans.loc[pt_utts, "Transcript"].apply(repetition_detector.detect)

Substitution errors

In [0]:
import time
import mlflow
from openai import OpenAI
from utils import llm_call

In [0]:
# Prompt template for the LLM-based substitution-error detector; `{input_text}` is
# filled with one patient's numbered transcript.
# NOTE: this rebinds the notebook-level name `prompt`, which the MCI-labeling section
# used for a different template.
# The text is sent to the model verbatim, so its wording is left untouched here.
prompt = '''# INSTRUCTIONS
You are a neurologist analyzing a patient's speech sample for signs of cognitive impairment. 

Your task is to identify all substitution errors in a patient's speech provided in the input below.

### Definition
Substitution errors occur when a person involuntarily replaces their intended word with an unintended word while speaking. Focus on detecting the following five substitution errors types:
- Phonemic paraphasias, where sounds within the intended word are added, dropped, substituted, or rearranged (e.g., saying ``papple'' for ``apple''). 
- Semantic paraphasias, where the intended word is substituted entirely with another real word (e.g., saying ``cat'' for ``dog'').
- Neologisms, where the entire intended word is substituted with a non-word (e.g., saying "foundament" for "foundation").
- Morphological errors, where the intended word is used in the incorrect form, such as the wrong number (e.g., saying "child" for "children") or tense (e.g., saying "walked" for "walk").
- Intra-word dysfluencies, where the production of the intended word is disrupted by an inserted sound (e.g., saying "beuhcause" for "because").

Only flag single words that are clinically significant substitution errors and use the surrounding utterances to better understand the context of any given word.

### Output Format
Your output must be a single JSON object with a single key "detections" whose value is an array of JSON objects. Each object in the array represents one detected substitution error and must have the following keys-value pairs:
- "type": "substitution error".
- "utt_num": The number of the utterance in which the error occurs.
- "text": The verbatim text of the substition error.
- "span": The character span for the "text" in the "utt_num"-th utterance.
- "justification": A brief explanation of why the "text" is a substitution error within its specific context.

# INPUT
{input_text}
'''

# Host and token of the current Databricks workspace, obtained through MLflow.
mlflow_creds = mlflow.utils.databricks_utils.get_databricks_host_creds()

# OpenAI-compatible client pointed at the workspace's serving endpoints.
client = OpenAI(
    api_key=mlflow_creds.token,
    base_url=f"{mlflow_creds.host}/serving-endpoints"
)

# Detector callable: formats the prompt with the transcript and sends it to the
# "openai_gpt_4o" endpoint, requesting a JSON-object response format.
# `prompt` and `client` are looked up when the lambda is called, so rebinding either
# name later changes what the detector sends.
# NOTE(review): the third positional argument of `llm_call` is None -- presumably an
# optional system prompt; confirm against utils.llm_call.
sub_err_detector = lambda text: llm_call(client, "openai_gpt_4o", None, prompt.format(input_text=text), {"type": "json_object"})

In [0]:
# Run the LLM substitution-error detector once per patient on the full transcript.
# `detections` maps patient_id -> detector output; `failed_pt_ids` lists the patients
# whose call raised, so they can be retried instead of silently missing from the
# downstream features.
detections = {}
failed_pt_ids = []
with tqdm(total=trans["patient_id"].nunique()) as pbar:
    for pt_id, grp in trans.groupby("patient_id"):
        # One line per utterance: "<row index> <speaker>: <text>". The row index is the
        # utterance number the model is asked to report back as "utt_num".
        pt_transcript = "\n".join(grp.index.astype(str) + " " + grp["Speaker"].astype(str) + ": " + grp["Transcript"].astype(str))
        try:
            detections[pt_id] = sub_err_detector(pt_transcript)
        except Exception as e:
            # Best effort: keep processing the remaining patients, but record who failed.
            failed_pt_ids.append(pt_id)
            print(f"Substitution-error detection failed for patient {pt_id!r}: {e}")

        pbar.update(1)
        # Pause between requests to throttle calls to the serving endpoint.
        time.sleep(3)

if failed_pt_ids:
    print(f"{len(failed_pt_ids)} patient(s) have no substitution-error detections: {failed_pt_ids}")

In [0]:
# Attach the per-patient LLM detections to the utterance rows they refer to, using
# the same {"detections": [...]} cell layout as the other detector columns.
# "utt_num" is produced by the model, so it is validated before being used as a row
# label: it must be an integer-like value, exist in `trans`, belong to the patient
# the transcript was built from, and point at a patient utterance. Anything else is
# skipped and counted rather than crashing the cell or crediting a detection to the
# wrong row.
trans["sub_err_dets"] = pd.NA
n_skipped_dets = 0
for pt_id, dets in detections.items():
    for d in (dets.get("detections") or []):
        try:
            utt_num = int(d["utt_num"])
        except (KeyError, TypeError, ValueError):
            n_skipped_dets += 1
            continue

        if (
            utt_num not in trans.index
            or trans.at[utt_num, "patient_id"] != pt_id
            or trans.at[utt_num, "Speaker"] != "Patient"
        ):
            n_skipped_dets += 1
            continue

        cur_dets = trans.at[utt_num, "sub_err_dets"]

        # First detection for this utterance: start a new container.
        if pd.isna(cur_dets):
            cur_dets = {"detections": []}

        cur_dets["detections"].append(d)
        trans.at[utt_num, "sub_err_dets"] = cur_dets

if n_skipped_dets:
    print(f"Skipped {n_skipped_dets} detection(s) with a missing or invalid utterance number.")

Vague speech

In [0]:
# init detector (reuses the `nlp` pipeline created in the filler-speech cell)
vague_detector = VagueKeywordDetector(nlp)
# apply detector to all patient utterances
# (rows of other speakers are not passed to the detector and end up as NaN in the new column)
trans["vague_dets"] = trans.loc[pt_utts, "Transcript"].apply(vague_detector.detect)

Aggregate features

In [0]:
def compute_detection_rate(trans, output_name, pt_id_col_name, nlp):
    """Detections per 100 patient words, for every patient.

    Args:
        trans: utterance table with "Speaker" and "Transcript" columns, the detector
            column `output_name` and the grouping column `pt_id_col_name`.
        output_name: column whose cells are either missing or a dict holding a
            "detections" list.
        pt_id_col_name: column identifying the patient.
        nlp: callable turning a transcript into tokens exposing `is_punct`,
            `is_space` and the `_.is_silence_tag` / `_.is_inaudible_tag` /
            `_.is_event_tag` extension flags.

    Returns:
        Series indexed by patient: 100 * (number of detections over all rows of the
        patient) / (number of word tokens in the patient's own utterances).
    """

    def detection_count(cell):
        # Rows the detector was not applied to hold a missing value.
        if pd.isna(cell):
            return 0
        return len(cell["detections"])

    def is_word(token):
        # Punctuation, whitespace and the custom silence/inaudible/event tags are not words.
        return not (
            token.is_punct
            or token.is_space
            or token._.is_silence_tag
            or token._.is_inaudible_tag
            or token._.is_event_tag
        )

    def patient_word_count(row):
        # Only utterances spoken by the patient contribute to the denominator.
        if row["Speaker"] == "Patient":
            return sum([1 for token in nlp(row["Transcript"]) if is_word(token)])
        return 0

    by_patient = trans.groupby(pt_id_col_name)
    num = by_patient.apply(lambda grp: grp[output_name].apply(detection_count).sum())
    den = by_patient.apply(lambda grp: grp.apply(patient_word_count, axis=1).sum())
    return 100 * num / den

In [0]:
# Detections per 100 patient words for each detector column, one value per patient.
filler_rate = compute_detection_rate(trans, "filler_dets", "patient_id", nlp)
repetition_rate = compute_detection_rate(trans, "repetition_dets", "patient_id", nlp)
vague_term_rate = compute_detection_rate(trans, "vague_dets", "patient_id", nlp)
sub_error_rate = compute_detection_rate(trans, "sub_err_dets", "patient_id", nlp)

In [0]:
from sklearn.preprocessing import MinMaxScaler
import pickle

In [0]:
def inter_detection_distance(trans, output_name, pt_id_col_name, nlp, scaler=None, feat_name="IDD"):
    """Per-patient statistics of the word distance between consecutive detections.

    For every patient utterance holding more than one detection, each detection is
    mapped to the position of its word within the utterance, and the number of words
    lying between consecutive detected words is collected. Distances are only
    measured within an utterance, never across utterances.

    Args:
        trans: utterance table with "Speaker" and "Transcript" columns, the detector
            column `output_name` and the grouping column `pt_id_col_name`.
        output_name: column whose cells are either missing or a dict holding a
            "detections" list; each detection needs "text" and a two-element "span"
            (character start, character end) matching a word token exactly.
        pt_id_col_name: column identifying the patient.
        nlp: callable turning a transcript into tokens exposing `text`, `idx`,
            `is_punct`, `is_space` and the `_.is_silence_tag` /
            `_.is_inaudible_tag` / `_.is_event_tag` extension flags.
        scaler: fitted object with a `transform` method, applied to the column of
            per-patient means. Required, despite the `None` default.
        feat_name: suffix used in the output column names.

    Returns:
        DataFrame indexed by patient with the columns mean_<feat>, std_<feat>,
        mean_<feat>_norm, mean_<feat>_imputed (scaled mean, 1.0 where no distance
        could be measured) and std_<feat>_imputed (0 where none could be measured).

    Raises:
        ValueError: if `scaler` is None.
    """
    # Fail before the expensive tokenisation instead of with an AttributeError at the end.
    if scaler is None:
        raise ValueError("inter_detection_distance() requires a fitted `scaler` with a transform() method.")

    mean_col = f"mean_{feat_name}"
    std_col = f"std_{feat_name}"
    norm_col = f"mean_{feat_name}_norm"
    mean_imputed_col = f"mean_{feat_name}_imputed"
    std_imputed_col = f"std_{feat_name}_imputed"

    def word_spans(text):
        # (text, start, end) of every word token; punctuation, whitespace and the
        # custom silence/inaudible/event tags are not counted as words.
        return [
            (token.text, token.idx, token.idx + len(token.text))
            for token in nlp(text)
            if not (token.is_punct or token.is_space or token._.is_silence_tag or token._.is_inaudible_tag or token._.is_event_tag)
        ]

    pt_ids = trans[pt_id_col_name].unique()
    ifd_metrics = pd.DataFrame(index=pt_ids, columns=[mean_col, std_col, norm_col, mean_imputed_col, std_imputed_col], dtype=float)

    for pt_id, grp in trans.groupby(pt_id_col_name):
        distances = []
        for _, row in grp.iterrows():
            if row["Speaker"] != "Patient":
                continue
            cell = row[output_name]
            # A distance needs at least two detections in the same utterance.
            if pd.isna(cell) or len(cell["detections"]) <= 1:
                continue

            spans = word_spans(row["Transcript"])

            # Word positions of the detections. A set removes a word reported twice;
            # detections that do not line up with a word token are skipped.
            positions = set()
            for det in cell["detections"]:
                try:
                    positions.add(spans.index((det["text"], det["span"][0], det["span"][1])))
                except (ValueError, KeyError, IndexError, TypeError) as e:
                    print(f"Skipping detection {det!r}: {e}")

            # Detections are not guaranteed to arrive in utterance order; sort so that
            # every distance is a non-negative count of words between two neighbours.
            ordered = sorted(positions)
            distances.extend(later - earlier - 1 for earlier, later in zip(ordered, ordered[1:]))

        # Patients without any measurable distance keep NaN here and are imputed below.
        if distances:
            ifd_metrics.loc[pt_id, mean_col] = np.mean(distances)
            ifd_metrics.loc[pt_id, std_col] = np.std(distances)

    # Flatten the scaler output so that an (n, 1) array assigns cleanly to one column.
    ifd_metrics[norm_col] = np.asarray(scaler.transform(ifd_metrics[[mean_col]])).reshape(-1)
    ifd_metrics[mean_imputed_col] = ifd_metrics[norm_col].fillna(1.0)
    ifd_metrics[std_imputed_col] = ifd_metrics[std_col].fillna(0)

    return ifd_metrics

In [0]:
# Load the pickled scaler for the inter-filler distance and compute the IFD features.
# NOTE: pickle.load executes code embedded in the file -- only load scalers from a
# trusted source.
with open("IFD_scaler.pkl", "rb") as f:
    IFD_scaler = pickle.load(f)

IFD = inter_detection_distance(trans, "filler_dets", "patient_id", nlp, scaler=IFD_scaler, feat_name="IFD")

# Same for the inter-substitution-error distance (ISED), with its own pickled scaler.
with open("ISED_scaler.pkl", "rb") as f:
    ISED_scaler = pickle.load(f)

ISED = inter_detection_distance(trans, "sub_err_dets", "patient_id", nlp, scaler=ISED_scaler, feat_name="ISED")

In [0]:
def detected_utterances_ratio(trans, output_name, pt_id_col_name):
    """Percentage of a patient's rows that contain at least one detection.

    Args:
        trans: utterance table holding the detector column `output_name` and the
            grouping column `pt_id_col_name`.
        output_name: column whose cells are either missing or a dict holding a
            "detections" list.
        pt_id_col_name: column identifying the patient.

    Returns:
        Series indexed by patient: 100 * (rows with a non-empty "detections" list)
        / (all rows of that patient).
    """
    # NOTE(review): the denominator counts every row of the patient, whatever the
    # speaker -- confirm this is intended rather than patient utterances only.

    def has_detection(cell):
        # Missing cells (rows the detector was not applied to) count as "no detection".
        if pd.isna(cell):
            return False
        return len(cell["detections"]) > 0

    by_patient = trans.groupby(pt_id_col_name)
    flagged = by_patient.apply(lambda grp: grp[output_name].apply(has_detection).sum())
    total = by_patient.size()
    return 100 * flagged / total

In [0]:
# Percentage of each patient's rows with at least one vague-term detection.
vague_utt_ratio = detected_utterances_ratio(trans, "vague_dets", "patient_id")

In [0]:
# Part-of-speech tags that get a repetition-rate column; the commented-out tags are
# deliberately excluded.
POS = [
    "ADJ",      # adjective
    "ADP",      # adposition
    "ADV",      # adverb
    "AUX",      # auxiliary
    "CCONJ",    # coordinating conjunction
    "DET",      # determiner
    "INTJ",     # interjection
    "NOUN",     # noun
    "NUM",      # numeral
    "PART",     # particle
    "PRON",     # pronoun
    "PROPN",    # proper noun
    # "PUNCT",    # punctuation
    "SCONJ",    # subordinating conjunction
    # "SYM",      # symbol
    "VERB",     # verb
    "X",        # other
    # "SPACE",    # space
]

def pos_repetition_counts(trans, output_name, pt_id_col_name):
    """Repetitions per 100 patient words, broken down by part of speech.

    Uses the notebook-level `nlp` pipeline (it is not a parameter of this function).

    Args:
        trans: utterance table with "Speaker" and "Transcript" columns, the detector
            column `output_name` and the grouping column `pt_id_col_name`.
        output_name: column whose cells are either missing or a dict holding a
            "detections" list; every detection carries the repeated word in "text1".
        pt_id_col_name: column identifying the patient.

    Returns:
        DataFrame indexed by patient with one column per tag in `POS`, holding
        100 * (detections whose "text1" is tagged with that POS) / (patient words).
        Patients without any patient words keep NaN in every column.
    """
    pt_ids = trans[pt_id_col_name].unique()
    pos_metrics = pd.DataFrame(index=pt_ids, columns=POS, dtype=float)

    for pt_id, grp in trans.groupby(pt_id_col_name):
        counts = dict.fromkeys(POS, 0)
        num_words = 0
        for utt_num, row in grp.iterrows():
            if row["Speaker"] != "Patient":
                continue

            # Word tokens only: punctuation, whitespace and the custom
            # silence/inaudible/event tags are not counted.
            num_words += sum(1 for token in nlp(row["Transcript"]) if not (token.is_punct or token.is_space or token._.is_silence_tag or token._.is_inaudible_tag or token._.is_event_tag))

            cell = row[output_name]
            # Guard against rows without detector output, like the sibling feature functions do.
            if pd.isna(cell):
                continue

            for det in cell["detections"]:
                # The repeated word is tagged on its own, outside its sentence context.
                doc = nlp(det["text1"])
                try:
                    counts[doc[0].pos_] += 1
                except (KeyError, IndexError) as e:
                    # KeyError: tag not tracked in POS; IndexError: "text1" produced no token.
                    print(e)
                    print(row["Transcript"])
                    print(det)

        # No patient words: a rate is undefined, so leave the row as NaN instead of
        # raising ZeroDivisionError.
        if num_words == 0:
            continue

        for pos in POS:
            pos_metrics.loc[pt_id, pos] = 100 * counts[pos] / num_words

    return pos_metrics

In [0]:
# Repetitions per 100 patient words for every part-of-speech tag, one row per patient.
pos_rep_rates = pos_repetition_counts(trans, "repetition_dets", "patient_id")

Compile into feature file

In [0]:
# Assemble one row per patient; every concat aligns its pieces on the patient index.
# 1) rate / ratio features, named through `keys`
features = pd.concat([filler_rate, repetition_rate, sub_error_rate, vague_term_rate, vague_utt_ratio], keys=["filler_rate", "repetition_rate", "sub_err_rate", "vague_term_rate", "vague_utt_ratio"], axis=1) 
# 2) imputed inter-detection distance features (only the *_imputed columns are kept)
features = pd.concat([features, IFD[["mean_IFD_imputed", "std_IFD_imputed"]], ISED[["mean_ISED_imputed", "std_ISED_imputed"]]], axis=1)
# 3) per-POS repetition rates, with column names suffixed by "_repetition_rate"
features = pd.concat([features, pos_rep_rates.add_suffix('_repetition_rate')], axis=1)
features

In [0]:
# Write the per-patient feature table (index included) to the working directory.
features.to_csv("OBSERVER_features.csv")

In [0]:
# Pickle the utterance table together with the detector output columns added above.
trans.to_pickle("OBSERVER_trans_with_dets.pkl")