In [0]:
# Enable IPython's autoreload extension. Mode 1 reloads only the modules
# registered with %aimport, so edits to data.adress are picked up without
# restarting the kernel.
# NOTE(review): %aimport imports data.adress here, before the next cell appends
# ".." to sys.path — confirm the package resolves on a fresh kernel.
%load_ext autoreload
%autoreload 1
%aimport data.adress

In [0]:
import sys
sys.path.append("..")
import pickle
from pprint import pprint

### Validation Data

In [0]:
from data.adress import load_CHAT_transcripts

# Load the CHAT transcripts and keep only the columns used in this notebook.
# The explicit .copy() makes `data` an independent frame: a later cell adds an
# "output" column to it, and assigning into a bare column-subset selection
# triggers pandas' SettingWithCopyWarning.
data = load_CHAT_transcripts()
data = data[["Speaker", "Transcript", "Transcript_clean", "Utterance", "Filler Speech"]].copy()
data.head()

## LLM-Based Detector

In [0]:
from openai import OpenAI

# Model-serving endpoint of the Databricks workspace (OpenAI-compatible API).
DB_ENDPOINT_URL = "https://adb-2035410508966251.11.azuredatabricks.net/serving-endpoints"

# API token taken from the running notebook's context, so no secret is stored in the notebook.
DATABRICKS_TOKEN = (
    dbutils.notebook.entry_point.getDbutils()
    .notebook()
    .getContext()
    .apiToken()
    .get()
)

client = OpenAI(base_url=DB_ENDPOINT_URL, api_key=DATABRICKS_TOKEN)

In [0]:
def run(prompt, model, utt_per_query):
    """Query the model once per non-provider utterance in the global `data` frame.

    Args:
        prompt: Template string with a single `{}` placeholder that receives
            the utterance text.
        model: Name of the serving endpoint/model passed to the chat API.
        utt_per_query: Size of the sliding window, i.e. how many consecutive
            utterances (ending with the current one) are sent per request.

    Returns:
        Dict mapping `(train_test, patient_id, position)` to the raw text of
        the model's first completion choice.
    """
    results = {}
    by_patient = data.groupby(level=["train_test", "patient_id"])

    for (split, patient), turns in by_patient:
        for idx in range(len(turns)):
            key = (split, patient, idx)

            # Provider turns are never scored, so no request is made for them.
            if turns.loc[key, "Speaker"] == "Provider":
                continue

            # Window of up to `utt_per_query` utterances ending at the current
            # one; the .loc label slice includes both endpoints.
            first = max(0, idx - utt_per_query + 1)
            window = turns.loc[(slice(None), slice(None), slice(first, idx)), "Utterance"]
            query = prompt.format("\n".join(window.to_list()))

            completion = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": query}],
            )
            results[key] = completion.choices[0].message.content

    return results

### Prompts

Best version: v1

In [0]:
# v1: the model sees a single utterance and lists the fillers it contains.
prompt_v1 = (
    'Identify all filler sounds (e.g., "uh"), words (e.g., "like"), '
    'or phrases (e.g., "you know") that indicate cognitive impairment '
    'in the following utterance:\n\n{}\n\n'
    'List each instance of filler as a bullet point in the order that they are spoken. '
    'Do not include any explanations. '
    'If a filler is repeated, list each occurrence in its own bullet point. '
    'If there are no fillers, return "None".'
)

# v2: the model sees a multi-line transcript and is asked about its last line only.
prompt_v2 = (
    'Identify all filler sounds (e.g., "uh"), words (e.g., "like"), '
    'or phrases (e.g., "you know") that indicate cognitive impairment '
    'in the last line of the following transcript:\n\n{}\n\n'
    'List each instance of filler in the last line as a bullet point in the order that they are spoken. '
    'Do not include any explanations. '
    'If a filler is repeated, list each occurrence in its own bullet point. '
    'If there are no fillers, return "None".'
)

In [0]:
# One utterance per request, using the v1 prompt against the GPT-4o endpoint.
outputs = run(prompt=prompt_v1, model="openai_gpt_4o", utt_per_query=1)

In [0]:
# Persist the raw model responses so they can be reloaded without re-querying.
with open("outputs_filler_speech.pkl", mode="wb") as sink:
    pickle.dump(outputs, sink)

In [0]:
# Optional: uncomment to reload previously saved responses instead of re-running the queries.
# NOTE(review): this reads "outputs.pkl", while the cell above writes
# "outputs_filler_speech.pkl" — confirm which file is intended.
# with open("outputs.pkl", "rb") as f:  
#     outputs = pickle.load(f)

In [0]:
# Attach each model response to its row by looking the row's index entry up in
# `outputs`. `run` skips provider turns, so those keys are absent from the dict
# (presumably yielding missing values for those rows — verify).
data["output"] = data.index.map(outputs)

In [0]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Score the detector on patient utterances only. An utterance counts as a
# positive prediction when the model returned anything other than "None".
patient_rows = data.loc[data["Speaker"] == "Patient"]

# The label column is "Filler Speech" (capital S), matching the column selected
# when the frame was loaded.
# Assumes the labels are binary 0/1 — TODO confirm.
true = patient_rows["Filler Speech"].values

# Look each prediction up by the row's index key so labels and predictions stay
# aligned regardless of the insertion order of `outputs`. A patient row with no
# stored response raises KeyError instead of silently shifting the alignment.
pred = np.array([0 if outputs[key] == "None" else 1 for key in patient_rows.index])

print("ACC: %.2f" % (100 * accuracy_score(true, pred)))
print("F1:  %.2f" % (100 * f1_score(true, pred)))

# labels=[0, 1] keeps the matrix 2x2 even when one class is absent from both
# the labels and the predictions.
mtx = confusion_matrix(true, pred, labels=[0, 1])
tn, fp, fn, tp = mtx.ravel()
print("TPR: %.2f" % (100 * (tp / (tp + fn))))
print("TNR: %.2f" % (100 * (tn / (tn + fp))))
print("FPR: %.2f" % (100 * (fp / (fp + tn))))
print("FNR: %.2f" % (100 * (fn / (fn + tp))))
print("PRE: %.2f" % (100 * (tp / (tp + fp))))

### DEMO

In [0]:
# Demo input: patient turns only, one "PATIENT: ..." utterance per paragraph.
# NOTE(review): this is transcript text embedded directly in the notebook —
# confirm it is synthetic or de-identified before sharing.
demo_data = """PATIENT: um I don’t know why they brought me in to see you, to be honest with you.

PATIENT: um I would say that um and for my members, pretty good. But there’s a. So, you know, some things I forget.

PATIENT: You know, to be honest with you. Yeah. You know, I don’t forget too much uh on this with you.

PATIENT: Well um I live alone, so I make my meals. I um I go out to the store myself with my walker with the weather’s nice and it’s not too far. And I um you know. I talk to my friends on the phone, things like that.

PATIENT: I sleep well.

PATIENT: No, well, you know, in the afternoon, late afternoon, I start watching television at 5 o’clock.

PATIENT: No, I do not.

PATIENT: My mood? Or why, you know uh you know, not a whole lot of things I can do. You know, I’m happy to.

PATIENT: And so uh doesn’t seem that old to me. OK, because you know, I wish. Yeah. Yeah. And she lived alone for most of the time. Just towards the end. She needed help.

PATIENT: Yeah.

PATIENT: That’s.

PATIENT: Yeah.

PATIENT: Oh. Oh, well, I did home care.

PATIENT: I didn’t. When they gave me.

PATIENT: Yeah

PATIENT: I love my job. But I finally had to, you know, driving. I couldn’t drive. It wasn’t safe for me to be driving around

PATIENT: Well, I was pretty old. I worked till I was something they didn’t know it.

PATIENT: I did not know how old I was.

PATIENT: uh So I was able to get away with working. I think I was soft too. And I, OK, when I finally said, look, you know uh I’m done.

PATIENT: Would I go by?

PATIENT: I’m in. I mean, questioned

PATIENT: um The date today is um Oh um I’m not sure. And maybe maybe the seven close clothes. What is it? Those things um I think it’s um Tuesday. You know

PATIENT: Oh, Friday. I was I was thinking. Friday. But anyway, it’s all right uh I made a mistake.

PATIENT: 2024.

PATIENT: L R O W.

PATIENT: Yeah.

PATIENT: um Oh.

PATIENT: Oh, Carmella uh Carmella. Carmella.

PATIENT: Carmella. Carmella. I can’t think of the right name. But she’s going to be our next president. Maybe

PATIENT: um something I’ve seen recently. Well, I see Carmela and her partner. She recently got a vice president. So I watch that.

PATIENT: Oh, I can’t remember his name. The uh short name is a short name. He does. Yeah, I don’t know what it is at the. Moment.

PATIENT: This what you mean your thumb or the thumb?

PATIENT: Knuckles.

PATIENT: cactus?

PATIENT: What’s what?"""

# Blank lines separate the utterances. From here on `demo_data` is a list of
# utterance strings rather than a single string.
demo_data = demo_data.split("\n\n")

In [0]:
# Send each demo utterance to the model with the v1 prompt, one request apiece.
# NOTE: this rebinds `outputs`, which earlier held the dict returned by `run`.
outputs = []
for text in demo_data:
    query = {"role": "user", "content": prompt_v1.format(text)}
    response = client.chat.completions.create(model="openai_gpt_4o", messages=[query])
    outputs.append(response.choices[0].message.content)

In [0]:
# Show each demo utterance next to the model's response.
pprint([(utterance, reply) for utterance, reply in zip(demo_data, outputs)])