In [0]:
!pip install --upgrade openai
dbutils.library.restartPython()

In [0]:
# Enable IPython's autoreload extension so edits to the project module are
# picked up without restarting the kernel.
%load_ext autoreload
# Mode 1: only modules registered through %aimport are reloaded before each execution.
%autoreload 1
# NOTE(review): this import runs before the later cell that appends ".." to
# sys.path — confirm the `data` package is importable from the notebook's own directory.
%aimport data.load_adress_data

In [0]:
import sys
sys.path.append("..")

import pandas as pd
import time
from openai import OpenAI
from tqdm.notebook import tqdm
from utils import num_tokens_from_messages

In [0]:
# Model identifiers: `model` is handed to the token counter, while `model_name`
# is the model sent to the serving endpoint.
model, model_name = "gpt-4o", "openai_gpt_4o"

# Sampling settings forwarded with every chat-completion request.
temperature, top_p = 0.0, 1.0

# Rate-limit budgets used to pace requests: tokens per minute, requests per minute.
tpm, rpm = 1e6, 6150

# Number of consecutive transcript rows joined into a single query.
chunk_lines = 10

# Prompt template for short-utterance detection: instruction, transcript
# placeholder, and output format, separated by blank lines.
p_short_responses = "\n\n".join([
    'Identify all instances where the patient speaks in short utterances that may indicate cognitive impairment (e.g., patient responds to "How are you feeling?" with a one word answer "Okay.") in the following transcript:',
    '{}',
    'Return a bullet list of the short utterances. If there are none, return "None".',
])

# Prompt template for filler detection on a single utterance: instruction,
# utterance placeholder, and output format, separated by blank lines.
p_filler_speech = "\n\n".join([
    'Identify all filler words, phrases, or sounds that indicate possible cognitive impairment (e.g., "uh", "um", and "ah") from the following utterance:',
    '{}',
    'Return a bullet list of relevant fillers. If there are none, return "None".',
])

# Prompt template for repetition detection: instruction, transcript
# placeholder, and output format, separated by blank lines.
p_repetitive_speech = "\n\n".join([
    'Identify all instances where the patient’s speech shows repetition that may indicate cognitive impairment in the following transcript:',
    '{}',
    'Return a bullet list of the repeated content exactly as spoken. If there are none, return "None".',
])

# Prompt template for vague-language detection. The instruction is separated
# from the transcript by a blank line, like the other prompt templates
# (previously the transcript was glued directly onto "transcript:").
p_vague_speech = (
    "Identify all vague words or phrases that indicate possible cognitive impairment (e.g., \"you know\" or \"that thing\") from the following transcript:\n\n"
    "{}\n\n"
    "Return a bullet list of vague words or phrases exactly as spoken. If none are present, return \"None\"."
)

In [0]:
def extract_features(
    data,
    prompt,
    model,
    model_name,
    temperature,
    top_p,
    tpm,
    rpm,
    base_url="https://adb-2035410508966251.11.azuredatabricks.net/serving-endpoints",
):
    """Send one chat-completion request per row of ``data`` and record the replies.

    For every row, ``prompt.format(row["Query"])`` is sent as a single user
    message. The frame is updated in place (and also returned) with three
    columns: ``query_tokens`` (estimate from ``num_tokens_from_messages``),
    ``response`` (text of the first choice) and ``response_tokens``
    (``usage.completion_tokens`` as reported by the endpoint).

    Args:
        data: DataFrame with a ``"Query"`` column and unique index labels.
        prompt: Template with one ``{}`` placeholder for the query text.
        model: Model identifier passed to ``num_tokens_from_messages``.
        model_name: Model name sent to the serving endpoint.
        temperature: Sampling temperature forwarded to the endpoint.
        top_p: Nucleus-sampling parameter forwarded to the endpoint.
        tpm: Tokens-per-minute budget used to pace requests.
        rpm: Requests-per-minute budget used to pace requests.
        base_url: Base URL of the OpenAI-compatible serving endpoint.

    Returns:
        The same ``data`` object, with the result columns filled in.

    Raises:
        ValueError: If ``data`` has duplicate index labels.
    """
    # Results are written back by label; with duplicate labels one write would
    # hit several rows and the delay computation would fail on a Series.
    # Fail before any request is sent instead.
    if not data.index.is_unique:
        raise ValueError("extract_features requires `data` to have unique index labels")

    # Authenticate with the API token of the running Databricks notebook.
    DATABRICKS_TOKEN = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()

    client = OpenAI(
        api_key=DATABRICKS_TOKEN,
        base_url=base_url
    )

    # Seconds that one token / one request "costs" under the given budgets.
    t_per_token = 60 / tpm
    t_per_request = 60 / rpm

    with tqdm(total=data.shape[0]) as pbar:
        for idx, row in data.iterrows():
            messages = [{"role": "user", "content": prompt.format(row["Query"])}]
            data.loc[idx, "query_tokens"] = num_tokens_from_messages(messages, model)

            # send the query
            result = client.chat.completions.create(
                model=model_name,
                messages=messages,
                temperature=temperature,
                top_p=top_p
            )

            # log the response
            data.loc[idx, "response"] = result.choices[0].message.content
            data.loc[idx, "response_tokens"] = result.usage.completion_tokens
            pbar.update(1)

            # sleep to abide by rate limit: wait for whichever budget
            # (per-request or per-token) demands the longer pause
            delay = max(t_per_request, (data.loc[idx, "query_tokens"] + data.loc[idx, "response_tokens"]) * t_per_token)
            time.sleep(delay)

    return data

Load the data

In [0]:
# Import the loader by name instead of `import data.load_adress_data`: that
# statement rebinds the notebook-global `data`, which later cells reuse for a
# DataFrame, so re-running this cell would replace the DataFrame with the package.
from data.load_adress_data import load_adress_CHAT_transcripts

transcripts = load_adress_CHAT_transcripts()

In [0]:
# filler speech
# One query per patient utterance.
# NOTE: `data` is also the name of the package that provides load_adress_data;
# this assignment rebinds it to a DataFrame.
is_patient = transcripts["Speaker"] == "Patient"
data = transcripts.loc[is_patient, "Utterance"].to_frame(name="Query")
data = extract_features(
    data, p_filler_speech, model, model_name, temperature, top_p, tpm, rpm
)
data.head()

In [0]:
# Save the filler-speech results as an Excel workbook at a relative path.
filler_speech_xlsx = "adress_filler_speech.xlsx"
data.to_excel(filler_speech_xlsx)

In [0]:
# repetitive speech
# Label every transcript row with the number of its chunk (row position
# integer-divided by chunk_lines), then join each chunk's utterances into one
# newline-separated query.
chunk_ids = (pd.Series(range(transcripts.shape[0])) // chunk_lines).values
data2 = (
    transcripts["Utterance"]
    .groupby(chunk_ids)
    .agg(lambda utterances: "\n".join(utterances))
    .to_frame(name="Query")
)
data2 = extract_features(
    data2, p_repetitive_speech, model, model_name, temperature, top_p, tpm, rpm
)
data2.head()

In [0]:
# Save the repetitive-speech results as an Excel workbook at a relative path.
repetitive_speech_xlsx = "adress_repetitive_speech.xlsx"
data2.to_excel(repetitive_speech_xlsx)

In [0]:
# short responses
# Label every transcript row with the number of its chunk (row position
# integer-divided by chunk_lines), then join each chunk's utterances into one
# newline-separated query.
chunk_ids = (pd.Series(range(transcripts.shape[0])) // chunk_lines).values
data3 = (
    transcripts["Utterance"]
    .groupby(chunk_ids)
    .agg(lambda utterances: "\n".join(utterances))
    .to_frame(name="Query")
)
data3 = extract_features(
    data3, p_short_responses, model, model_name, temperature, top_p, tpm, rpm
)
data3.head()

In [0]:
# Save the short-response results as an Excel workbook at a relative path.
short_responses_xlsx = "adress_short_responses.xlsx"
data3.to_excel(short_responses_xlsx)

In [0]:
# vague speech
# Label every transcript row with the number of its chunk (row position
# integer-divided by chunk_lines), then join each chunk's utterances into one
# newline-separated query.
chunk_ids = (pd.Series(range(transcripts.shape[0])) // chunk_lines).values
data4 = (
    transcripts["Utterance"]
    .groupby(chunk_ids)
    .agg(lambda utterances: "\n".join(utterances))
    .to_frame(name="Query")
)
data4 = extract_features(
    data4, p_vague_speech, model, model_name, temperature, top_p, tpm, rpm
)
data4.head()

In [0]:
# Save the vague-speech results as an Excel workbook at a relative path.
vague_speech_xlsx = "adress_vague_speech.xlsx"
data4.to_excel(vague_speech_xlsx)