In [55]:
import sys

import pandas as pd
import numpy as np

from bertopic import BERTopic

import asyncio

from text_topic_extraction.concept_extractor.concept_extractor import Concept, PatientConcepts
from text_topic_extraction.text_topic_processor import TextTopicProcessor, AsyncTextTopicProcessor, BERTopicModeler, QuestionGenerator, AnswerGenerator
from text_topic_extraction.config import API_VERSION, AZURE_ENDPOINT, OPENAI_API_KEY, USE_AZURE

In [2]:
# Seed NumPy's legacy global RNG so the random row shuffle used for the
# train/test split is repeatable across kernel restarts.
SEED = 0
np.random.seed(SEED)

In [3]:
# Prompt used to turn topics into yes/no questions; it is handed to the
# processor as `concept_questions_prompt_template`. `{topics_text}` looks like
# the slot for the topic list.
# NOTE(review): the example JSON contains bare `{` / `}` next to the
# placeholder, which would break a plain str.format() call — presumably the
# library substitutes placeholders some other way; confirm before editing.
question_prompt = """
The following topics were found in clinical notes. Generate yes/no questions for the following medical topics:
{topics_text}
Return a JSON object with topic numbers as keys and questions as values. Example:
{
"1": "Does the note mention the patient having a history of diabetes?"
}
"""

# Prompt used to score one clinical note against a list of questions; it is
# handed to the processor as `answer_prompt_template`. `{prompt_questions}` and
# `{sentence}` look like the slots for the questions and the note text.
# NOTE(review): same bare-brace caveat as above for the example answer JSON.
# The wording is part of the request sent to the model, so any change to this
# text presumably invalidates previously cached responses — confirm.
answer_prompt = """
You will be given a clinical note. I will give you a series of questions. Your task is answer each question with a probability from 0 to 1. Summarize the response with a JSON that includes your answer to all of the questions. Questions:
{prompt_questions}

clinical note:
{sentence}

Example answer: {
    "0": 0,
    "1": 1,
    "2": 0.5
}
Answer all the questions and do not answer with anything else besides valid JSON. Do not add comments to the JSON.
"""

In [4]:
# Load the concept-extraction table. Later cells read two of its columns:
# `sentence` (used as the note text) and `llm_output` (a comma-separated
# concept string). The path is relative to the notebook's working directory.
concept_extractions_csv = "../llm-bart/exp_mimic/_output/long_notes/max_obs_-1/seed_0/gpt-4o-mini/concept_extractions.csv"
llm_extract_df = pd.read_csv(concept_extractions_csv)

In [67]:
# Peek at the raw comma-separated concept strings (pandas truncates the display).
llm_extract_df["llm_output"]

0       male, man, gentleman,age 82, elderly, senior,s...
1       81-year-old, elderly, senior,female, woman, ad...
2       75 years, elderly, senior,man, male,retired, u...
3       66 year old, elderly, senior,male, man, adult,...
4       female, woman, patient,elderly, senior, aged,m...
                              ...                        
7038    tobacco use, smoking, nicotine use,15 pack yea...
7039    86 M, elderly, senior,prostate cancer, cancer,...
7040    86 M, elderly, senior,prostate cancer, cancer,...
7041    elderly, senior, advanced age,female, woman,fe...
7042    male, man, gender,68 years old, elderly, senio...
Name: llm_output, Length: 7043, dtype: object

In [5]:
# Draw every row position exactly once in random order (sampling without
# replacement over the full table), then carve out two disjoint slices:
# the first 400 positions for training and the next 400 for testing.
n_rows = len(llm_extract_df)
rand_idxs = np.random.choice(n_rows, size=n_rows, replace=False)
train_idxs, test_idxs = rand_idxs[:400], rand_idxs[400:800]

In [31]:
# Map each training row (keyed by its positional index as a string) to a
# single-entry list of note dicts. Every note is labelled "H&P".
note_sentences = llm_extract_df["sentence"]
patient_notes = {
    str(row_pos): [{"note_text": note_sentences.iloc[row_pos], "note_type": "H&P"}]
    for row_pos in train_idxs
}

In [None]:
# Build one PatientConcepts record per training row from the comma-separated
# `llm_output` string. Concept ids come from a single running counter
# (`concept_id`), so they are unique across all patients.
concept_id = 0
patient_concepts = {}
for patient_id in train_idxs:
    raw_fragments = llm_extract_df.iloc[patient_id].llm_output.split(",")
    # Normalise each fragment, then drop those that are empty after stripping
    # (e.g. produced by ",," or a trailing comma) so that no blank-descriptor
    # Concept is created and fed into topic modeling.
    descriptors = [fragment.strip().lower() for fragment in raw_fragments]
    descriptors = [descriptor for descriptor in descriptors if descriptor]
    concept_list = [
        Concept(id=str(concept_id + offset), descriptor=descriptor)
        for offset, descriptor in enumerate(descriptors)
    ]
    # Advance the counter by the number of concepts actually created.
    concept_id += len(concept_list)
    patient_concepts[str(patient_id)] = PatientConcepts(
        patient_id=str(patient_id),
        all_concepts=concept_list,
    )

{'2200': PatientConcepts(patient_id='2200', all_concepts=[Concept(id='0', descriptor='nkda', synonyms=[], source='llm', patient_id='', note_id='', metadata={}), Concept(id='1', descriptor='no known drug allergies', synonyms=[], source='llm', patient_id='', note_id='', metadata={}), Concept(id='2', descriptor='social history', synonyms=[], source='llm', patient_id='', note_id='', metadata={}), Concept(id='3', descriptor='lifestyle factors', synonyms=[], source='llm', patient_id='', note_id='', metadata={}), Concept(id='4', descriptor='history', synonyms=[], source='llm', patient_id='', note_id='', metadata={}), Concept(id='5', descriptor='medical history', synonyms=[], source='llm', patient_id='', note_id='', metadata={}), Concept(id='6', descriptor='demographics', synonyms=[], source='llm', patient_id='', note_id='', metadata={}), Concept(id='7', descriptor='population data', synonyms=[], source='llm', patient_id='', note_id='', metadata={}), Concept(id='8', descriptor='health status',

In [58]:
# Constructor arguments gathered in one mapping so the configuration can be
# inspected on its own before the processor is built.
# NOTE(review): `use_azure` is hard-coded to False although USE_AZURE is
# imported from the config module and the Azure endpoint/version are still
# passed — confirm this is intentional.
processor_kwargs = dict(
    model_type="gpt-4o-mini-2024-07-18",  # alternative: "gpt-4o-2024-08-06"
    temperature=0.0,
    num_topics=None,
    use_cache=True,
    use_structured_output=True,
    use_azure=False,
    azure_endpoint=AZURE_ENDPOINT,
    api_version=API_VERSION,
    api_key=OPENAI_API_KEY,
    cache_dir="cache",
    # Bare pass-through template; this notebook builds its concepts by hand.
    concept_prompt_template="{notes}",
    concept_questions_prompt_template=question_prompt,
    answer_prompt_template=answer_prompt,
)
processor = TextTopicProcessor(**processor_kwargs)

2025-03-13 07:47:41,575 - text_topic_extraction.cache.cache_manager - INFO - Text Topic Extraction cache initialized at cache/text_topic_cache.duckdb


2025-03-13 07:47:41,585 - text_topic_extraction.text_topic_processor - INFO - Initialized TextTopicProcessor with model gpt-4o-mini-2024-07-18


In [59]:
# Attach the training notes to the processor.
# presumably read by processor._generate_answers, which is not handed the
# notes directly anywhere in this notebook — verify against the library.
processor.patient_notes = patient_notes

In [60]:
# Sanity check: display the client object held by the processor's
# `openai_client` wrapper to see which client type was constructed.
processor.openai_client.client

<openai.OpenAI at 0x3af902210>

In [61]:
# The string "default" on the processor is translated to None before being
# handed to the modeler; any other value is passed through unchanged.
embedding_model = None
if processor.embedding_model != "default":
    embedding_model = processor.embedding_model

# Topic modeler configured from the processor's own settings.
topic_modeler = BERTopicModeler(
    num_topics=processor.num_topics,
    embedding_model=embedding_model,
    vectorizer_model=processor.vectorizer_model,
    min_topic_size=processor.min_topic_size,
    language=processor.language,
    n_gram_range=processor.n_gram_range,
)

# Both generators share the processor's LLM client and reuse its prompt templates.
llm_client = processor.openai_client
question_generator = QuestionGenerator(
    llm_api=llm_client,
    batch_prompt_template=processor.concept_questions_prompt_template,
)
answer_generator = AnswerGenerator(
    llm_api=llm_client,
    prompt_template=processor.answer_prompt_template,
)

2025-03-13 07:47:54,119 - text_topic_extraction.topic_modeler.bertopic_modeler - INFO - Initialized BERTopicModeler with None topics


In [9]:
# Run topic modeling over the hand-built patient concepts.
# This calls the processor's underscore-prefixed helper directly, so the
# notebook is coupled to a private method of the library.
topics = processor._perform_topic_modeling(
    patient_concepts=patient_concepts,  # dict: patient id string -> PatientConcepts
    topic_modeler=topic_modeler,
)

2025-03-13 06:15:30,364 - text_topic_extraction.text_topic_processor - INFO - Performing topic modeling on concepts from 400 patients
2025-03-13 06:15:30,393 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
2025-03-13 06:15:30,393 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the env

In [20]:
# Generate one question per topic, sending the topics to the LLM in
# fixed-size batches (one _generate_questions call per batch).
# `topic_questions` only ever holds the most recent batch; the complete list
# across batches is accumulated in `all_topic_questions`.
batch_size = 100
all_topic_questions = []
for batch_start in range(0, len(topics), batch_size):
    batch_end = batch_start + batch_size
    topic_batch = topics[batch_start:batch_end]
    topic_questions = processor._generate_questions(
        question_generator=question_generator,
        topics=topic_batch,
    )
    # Show the first question of the batch and the batch size as a progress check.
    print("topic question ex:", topic_questions[0])
    print("num topics", len(topic_questions))
    all_topic_questions.extend(topic_questions)

2025-03-13 07:06:41,788 - text_topic_extraction.text_topic_processor - INFO - Generating questions for 100 topics
2025-03-13 07:07:25,019 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-13 07:07:25,073 - text_topic_extraction.cache.cache_manager - INFO - Stored response in cache with key 2db52454ab...
2025-03-13 07:07:25,075 - text_topic_extraction.text_topic_processor - INFO - Generated 100 questions
2025-03-13 07:07:25,075 - text_topic_extraction.text_topic_processor - INFO - Question for topic 0: Is the patient described as an elderly or aging person?
2025-03-13 07:07:25,075 - text_topic_extraction.text_topic_processor - INFO - Question for topic 1: Is the patient identified as a male cyclist?
2025-03-13 07:07:25,075 - text_topic_extraction.text_topic_processor - INFO - Question for topic 2: Is the patient's social history unknown?
2025-03-13 07:07:25,075 - text_topic_extraction.text_topic_processor - INFO - Question for topic

topic question ex: topic_id=0 topic_name='elderly aging person' question='Is the patient described as an elderly or aging person?' keywords=['elderly', 'aging', 'person', 'population', 'worker', 'mother', 'aged', '', '', ''] keyword_scores=[0.13809806533186764, 0.010444242025935841, 0.009521822757240776, 0.008182954483843301, 0.007694280806424764, 0.007232136342474878, 0.005685869822041315, 1e-05, 1e-05, 1e-05]
num topics 100


2025-03-13 07:07:58,180 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-13 07:07:58,214 - text_topic_extraction.cache.cache_manager - INFO - Stored response in cache with key ea74eadeb4...
2025-03-13 07:07:58,215 - text_topic_extraction.text_topic_processor - INFO - Generated 100 questions
2025-03-13 07:07:58,215 - text_topic_extraction.text_topic_processor - INFO - Question for topic 100: Is the patient retired or semiretired?
2025-03-13 07:07:58,216 - text_topic_extraction.text_topic_processor - INFO - Question for topic 101: Does the patient receive support from welfare or social services?
2025-03-13 07:07:58,216 - text_topic_extraction.text_topic_processor - INFO - Question for topic 102: Does the patient have any neurological conditions such as sclerosis or Parkinson's?
2025-03-13 07:07:58,216 - text_topic_extraction.text_topic_processor - INFO - Question for topic 103: Is the patient currently afebrile?
2025-03-13 07:07:58,

topic question ex: topic_id=100 topic_name='retired semiretired ' question='Is the patient retired or semiretired?' keywords=['retired', 'semiretired', '', '', '', '', '', '', '', ''] keyword_scores=[0.44407066909305615, 0.0979544590171156, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05]
num topics 100


2025-03-13 07:08:31,995 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-13 07:08:32,024 - text_topic_extraction.cache.cache_manager - INFO - Stored response in cache with key 014ba907c1...
2025-03-13 07:08:32,025 - text_topic_extraction.text_topic_processor - INFO - Generated 100 questions
2025-03-13 07:08:32,025 - text_topic_extraction.text_topic_processor - INFO - Question for topic 200: Is there a mention of a blockage or blocked artery?
2025-03-13 07:08:32,026 - text_topic_extraction.text_topic_processor - INFO - Question for topic 201: Is the location of interest near a door or in North Carolina?
2025-03-13 07:08:32,026 - text_topic_extraction.text_topic_processor - INFO - Question for topic 202: Is there a condition affecting the heart or cardiac function?
2025-03-13 07:08:32,027 - text_topic_extraction.text_topic_processor - INFO - Question for topic 203: Does the CXR show any proximal findings?
2025-03-13 07:08:32,027 - t

topic question ex: topic_id=200 topic_name='blockage blocked blockages' question='Is there a mention of a blockage or blocked artery?' keywords=['blockage', 'blocked', 'blockages', 'hardening', 'completely', 'proxy', 'partial', 'tips', 'full', 'severe'] keyword_scores=[0.4097204529421187, 0.17562817512034215, 0.15064215162786676, 0.0927989611741095, 0.0927989611741095, 0.07676921172102719, 0.05854272504011405, 0.05854272504011405, 0.05854272504011405, 0.050202533904243084]
num topics 100


2025-03-13 07:09:15,137 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-13 07:09:15,165 - text_topic_extraction.cache.cache_manager - INFO - Stored response in cache with key dec4d04a88...
2025-03-13 07:09:15,166 - text_topic_extraction.text_topic_processor - INFO - Generated 100 questions
2025-03-13 07:09:15,167 - text_topic_extraction.text_topic_processor - INFO - Question for topic 300: Is the son doing well?
2025-03-13 07:09:15,167 - text_topic_extraction.text_topic_processor - INFO - Question for topic 301: Does the patient have a history of NSCLC?
2025-03-13 07:09:15,167 - text_topic_extraction.text_topic_processor - INFO - Question for topic 302: Was an electrocardiogram performed to assess electrophysiological imbalance?
2025-03-13 07:09:15,168 - text_topic_extraction.text_topic_processor - INFO - Question for topic 303: Is the patient on anticoagulant therapy?
2025-03-13 07:09:15,168 - text_topic_extraction.text_topic_pr

topic question ex: topic_id=300 topic_name='son boy well' question='Is the son doing well?' keywords=['son', 'boy', 'well', 'unsupportive', 'sons', 'guy', 'boys', 'one', '', ''] keyword_scores=[0.5029629205425215, 0.32411825866800004, 0.1934455262195073, 0.15332002280939833, 0.15332002280939833, 0.15332002280939833, 0.15332002280939833, 0.06697587134552821, 1e-05, 1e-05]
num topics 100


2025-03-13 07:09:58,555 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-13 07:09:58,582 - text_topic_extraction.cache.cache_manager - INFO - Stored response in cache with key 34dd8a6b3e...
2025-03-13 07:09:58,583 - text_topic_extraction.text_topic_processor - INFO - Generated 100 questions
2025-03-13 07:09:58,583 - text_topic_extraction.text_topic_processor - INFO - Question for topic 400: Is there any mention of mitral or tricuspid regurgitation in the notes?
2025-03-13 07:09:58,584 - text_topic_extraction.text_topic_processor - INFO - Question for topic 401: Does the note mention sputum production or pseudogout?
2025-03-13 07:09:58,584 - text_topic_extraction.text_topic_processor - INFO - Question for topic 402: Is there any indication of speech being mute or silent in the notes?
2025-03-13 07:09:58,584 - text_topic_extraction.text_topic_processor - INFO - Question for topic 403: Does the note mention addiction or any related i

topic question ex: topic_id=400 topic_name='regurgitation mitral tricuspid' question='Is there any mention of mitral or tricuspid regurgitation in the notes?' keywords=['regurgitation', 'mitral', 'tricuspid', 'mildmoderate', 'moderate', 'mild', 'significant', 'no', '', ''] keyword_scores=[0.34598616026223356, 0.24645233086575832, 0.14830823676828894, 0.07836356721369248, 0.032760502527259275, 0.022373438653666373, 0.014168385097891879, 0.0007675627833816906, 1e-05, 1e-05]
num topics 100


2025-03-13 07:10:42,792 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-13 07:10:42,818 - text_topic_extraction.cache.cache_manager - INFO - Stored response in cache with key 6d2de472a9...
2025-03-13 07:10:42,820 - text_topic_extraction.text_topic_processor - INFO - Generated 100 questions
2025-03-13 07:10:42,821 - text_topic_extraction.text_topic_processor - INFO - Question for topic 500: Has the patient experienced any bradycardic episodes?
2025-03-13 07:10:42,821 - text_topic_extraction.text_topic_processor - INFO - Question for topic 501: Were any imaging studies conducted for the patient?
2025-03-13 07:10:42,821 - text_topic_extraction.text_topic_processor - INFO - Question for topic 502: Is there a mention of chronic diseases in the patient's history?
2025-03-13 07:10:42,822 - text_topic_extraction.text_topic_processor - INFO - Question for topic 503: Has the patient had multiple visits to the emergency department?
2025-03-

topic question ex: topic_id=500 topic_name='bradycardia bradycardic episodes' question='Has the patient experienced any bradycardic episodes?' keywords=['bradycardia', 'bradycardic', 'episodes', 'of', '', '', '', '', '', ''] keyword_scores=[0.9326929798923711, 0.1959089180342312, 0.06728279289415555, 0.005356944840087295, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05]
num topics 100


2025-03-13 07:11:29,526 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-13 07:11:29,560 - text_topic_extraction.cache.cache_manager - INFO - Stored response in cache with key b1d9d42498...
2025-03-13 07:11:29,563 - text_topic_extraction.text_topic_processor - INFO - Generated 100 questions
2025-03-13 07:11:29,566 - text_topic_extraction.text_topic_processor - INFO - Question for topic 600: Is the patient experiencing an upset stomach?
2025-03-13 07:11:29,566 - text_topic_extraction.text_topic_processor - INFO - Question for topic 601: Is there evidence of an intracranial hemorrhage?
2025-03-13 07:11:29,568 - text_topic_extraction.text_topic_processor - INFO - Question for topic 602: Is there a mention of pulmonary embolism (PE) in the notes?
2025-03-13 07:11:29,569 - text_topic_extraction.text_topic_processor - INFO - Question for topic 603: Have repeated ultrasounds been performed?
2025-03-13 07:11:29,570 - text_topic_extraction

topic question ex: topic_id=600 topic_name='upset stomach unsettled' question='Is the patient experiencing an upset stomach?' keywords=['upset', 'stomach', 'unsettled', 'stomachrelated', 'empty', 'gastrointestinal', 'no', '', '', ''] keyword_scores=[0.5446156418685284, 0.20143012681960099, 0.12594144730772006, 0.12594144730772006, 0.1022214600331953, 0.016312800089132676, 0.0012335830447205742, 1e-05, 1e-05, 1e-05]
num topics 100


2025-03-13 07:12:03,590 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-13 07:12:03,618 - text_topic_extraction.cache.cache_manager - INFO - Stored response in cache with key d0fbb0abce...
2025-03-13 07:12:03,619 - text_topic_extraction.text_topic_processor - INFO - Generated 100 questions
2025-03-13 07:12:03,619 - text_topic_extraction.text_topic_processor - INFO - Question for topic 700: Does the note mention high blood sugar levels?
2025-03-13 07:12:03,620 - text_topic_extraction.text_topic_processor - INFO - Question for topic 701: Does the note mention the use of PEEP in the patient's treatment?
2025-03-13 07:12:03,620 - text_topic_extraction.text_topic_processor - INFO - Question for topic 702: Does the note mention a history of deep vein thrombosis (DVT)?
2025-03-13 07:12:03,620 - text_topic_extraction.text_topic_processor - INFO - Question for topic 703: Does the note mention facial droop or palsy?
2025-03-13 07:12:03,621

topic question ex: topic_id=700 topic_name='sugar sugars high' question='Does the note mention high blood sugar levels?' keywords=['sugar', 'sugars', 'high', 'blood', 'disorder', 'issues', 'elevated', 'low', '', ''] keyword_scores=[0.37307719195694844, 0.07950558002581856, 0.035648919622125845, 0.024647281799542002, 0.0085145877848128, 0.008133985332005548, 0.0066019903299199875, 0.0043636627338354056, 1e-05, 1e-05]
num topics 100


2025-03-13 07:12:39,327 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-13 07:12:39,358 - text_topic_extraction.cache.cache_manager - INFO - Stored response in cache with key 71a391d641...
2025-03-13 07:12:39,359 - text_topic_extraction.text_topic_processor - INFO - Generated 100 questions
2025-03-13 07:12:39,360 - text_topic_extraction.text_topic_processor - INFO - Question for topic 800: Does the note mention any personal habits of the patient?
2025-03-13 07:12:39,360 - text_topic_extraction.text_topic_processor - INFO - Question for topic 801: Is there a mention of palliative or hospice care in the note?
2025-03-13 07:12:39,360 - text_topic_extraction.text_topic_processor - INFO - Question for topic 802: Does the note discuss an interfacility transfer?
2025-03-13 07:12:39,360 - text_topic_extraction.text_topic_processor - INFO - Question for topic 803: Is there a mention of strep throat or ST depression in the note?
2025-03-13

topic question ex: topic_id=800 topic_name='habits habit personal' question='Does the note mention any personal habits of the patient?' keywords=['habits', 'habit', 'personal', 'healthy', 'changes', 'in', 'past', 'no', '', ''] keyword_scores=[0.4933202564975499, 0.3012843032557335, 0.10944429166735978, 0.06956609684117471, 0.026262692953630935, 0.02438340415773635, 0.014752735008113596, 0.0018179118553776881, 1e-05, 1e-05]
num topics 100


2025-03-13 07:13:07,030 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-13 07:13:07,062 - text_topic_extraction.cache.cache_manager - INFO - Stored response in cache with key 3300cad7ec...
2025-03-13 07:13:07,064 - text_topic_extraction.text_topic_processor - INFO - Generated 100 questions
2025-03-13 07:13:07,064 - text_topic_extraction.text_topic_processor - INFO - Question for topic 900: Is the patient a widow or widower?
2025-03-13 07:13:07,064 - text_topic_extraction.text_topic_processor - INFO - Question for topic 901: Is there an elevated white blood cell count?
2025-03-13 07:13:07,065 - text_topic_extraction.text_topic_processor - INFO - Question for topic 902: Is the patient taking Pepcid for their condition?
2025-03-13 07:13:07,065 - text_topic_extraction.text_topic_processor - INFO - Question for topic 903: Does the patient have a genetic predisposition to any hereditary conditions?
2025-03-13 07:13:07,065 - text_topic_

topic question ex: topic_id=900 topic_name='widow companion widower' question='Is the patient a widow or widower?' keywords=['widow', 'companion', 'widower', '', '', '', '', '', '', ''] keyword_scores=[1.0140741236461426, 0.8898494206097336, 0.35263605246161617, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05]
num topics 100


2025-03-13 07:13:36,411 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-13 07:13:36,436 - text_topic_extraction.cache.cache_manager - INFO - Stored response in cache with key b4da17d661...
2025-03-13 07:13:36,438 - text_topic_extraction.text_topic_processor - INFO - Generated 100 questions
2025-03-13 07:13:36,438 - text_topic_extraction.text_topic_processor - INFO - Question for topic 1000: Does the note mention the presence of diffuse mild mediastinitis?
2025-03-13 07:13:36,438 - text_topic_extraction.text_topic_processor - INFO - Question for topic 1001: Did the patient receive help or assistance?
2025-03-13 07:13:36,439 - text_topic_extraction.text_topic_processor - INFO - Question for topic 1002: Does the note indicate a sedentary lifestyle?
2025-03-13 07:13:36,439 - text_topic_extraction.text_topic_processor - INFO - Question for topic 1003: Are the patient's troponin levels elevated?
2025-03-13 07:13:36,440 - text_topic_ext

topic question ex: topic_id=1000 topic_name='diffuse mild mediastinitis' question='Does the note mention the presence of diffuse mild mediastinitis?' keywords=['diffuse', 'mild', 'mediastinitis', 'fibrosing', 'dispersing', 'increase', 'pain', 'discomfort', '', ''] keyword_scores=[0.2761007388653334, 0.22373438653666372, 0.1959089180342312, 0.1959089180342312, 0.1959089180342312, 0.08558028005261938, 0.023319125187383383, 0.013109124430181732, 1e-05, 1e-05]
num topics 100


2025-03-13 07:14:06,708 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-13 07:14:06,727 - text_topic_extraction.cache.cache_manager - INFO - Stored response in cache with key a66baa3d41...
2025-03-13 07:14:06,728 - text_topic_extraction.text_topic_processor - INFO - Generated 100 questions
2025-03-13 07:14:06,729 - text_topic_extraction.text_topic_processor - INFO - Question for topic 1100: Is there evidence of fluid overload or volume buildup in the patient?
2025-03-13 07:14:06,729 - text_topic_extraction.text_topic_processor - INFO - Question for topic 1101: Were the testing results available earlier than expected?
2025-03-13 07:14:06,729 - text_topic_extraction.text_topic_processor - INFO - Question for topic 1102: Is the patient's hospital stay scheduled or recurrent?
2025-03-13 07:14:06,729 - text_topic_extraction.text_topic_processor - INFO - Question for topic 1103: Was the patient found down or in a compromised position?


topic question ex: topic_id=1100 topic_name='overload volume buildup' question='Is there evidence of fluid overload or volume buildup in the patient?' keywords=['overload', 'volume', 'buildup', 'control', 'fluid', '', '', '', '', ''] keyword_scores=[0.6536134893969836, 0.34657359027997264, 0.2203975327885101, 0.060909977499883174, 0.055409665033697375, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05]
num topics 100


2025-03-13 07:14:35,763 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-13 07:14:35,811 - text_topic_extraction.cache.cache_manager - INFO - Stored response in cache with key bc5b5583ab...
2025-03-13 07:14:35,812 - text_topic_extraction.text_topic_processor - INFO - Generated 100 questions
2025-03-13 07:14:35,812 - text_topic_extraction.text_topic_processor - INFO - Question for topic 1200: Is the patient a female in her 80s, 60s, or 50s?
2025-03-13 07:14:35,813 - text_topic_extraction.text_topic_processor - INFO - Question for topic 1201: Are there any communication challenges mentioned in the notes?
2025-03-13 07:14:35,813 - text_topic_extraction.text_topic_processor - INFO - Question for topic 1202: Was an abdominal ultrasound performed?
2025-03-13 07:14:35,813 - text_topic_extraction.text_topic_processor - INFO - Question for topic 1203: Is there a mention of deep vein thrombosis?
2025-03-13 07:14:35,814 - text_topic_extracti

topic question ex: topic_id=1200 topic_name='81f 60f 52f' question='Is the patient a female in her 80s, 60s, or 50s?' keywords=['81f', '60f', '52f', '40s50s', '25f', '41', '', '', '', ''] keyword_scores=[0.8177716802655624, 0.5037657892308802, 0.5037657892308802, 0.5037657892308802, 0.5037657892308802, 0.35498666425542863, 1e-05, 1e-05, 1e-05, 1e-05]
num topics 100


2025-03-13 07:14:49,080 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-13 07:14:49,108 - text_topic_extraction.cache.cache_manager - INFO - Stored response in cache with key c4edb2d511...
2025-03-13 07:14:49,110 - text_topic_extraction.text_topic_processor - INFO - Generated 19 questions
2025-03-13 07:14:49,110 - text_topic_extraction.text_topic_processor - INFO - Question for topic 1300: Is the patient receiving nonhospital medical treatment?
2025-03-13 07:14:49,110 - text_topic_extraction.text_topic_processor - INFO - Question for topic 1301: Was a tissue sample taken for analysis?
2025-03-13 07:14:49,110 - text_topic_extraction.text_topic_processor - INFO - Question for topic 1302: Is the severity of the condition considered medium or average?
2025-03-13 07:14:49,111 - text_topic_extraction.text_topic_processor - INFO - Question for topic 1303: Is there any mention of obstructed airways or obtundation?
2025-03-13 07:14:49,111

topic question ex: topic_id=1300 topic_name='nonhospital treatment medical' question='Is the patient receiving nonhospital medical treatment?' keywords=['nonhospital', 'treatment', 'medical', '', '', '', '', '', '', ''] keyword_scores=[0.2385167400774557, 0.07497007364545744, 0.03210455534029746, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05]
num topics 19


In [39]:
# Single-patient trial run: answer every generated question for the first
# training row before launching the multi-patient run.
patient_id = str(train_idxs[0])
topic_answers = answer_generator.generate_answers(
    patient_concepts=patient_concepts[patient_id],  # PatientConcepts for this patient
    topic_questions=all_topic_questions,  # questions accumulated over all batches
    patient_notes=patient_notes[patient_id],  # single-entry list of note dicts
)

2025-03-13 07:34:03,469 - text_topic_extraction.answer_generator.answer_generator - INFO - Generating answers for patient 2200 for 1319 questions
2025-03-13 07:34:03,470 - text_topic_extraction.answer_generator.answer_generator - INFO - Sending batch of 1319 questions for patient 2200
2025-03-13 07:34:03,485 - text_topic_extraction.llm_api.openai_client - INFO - Using cached structured response (key: 3ebf4b8fa8...)


In [68]:
# Preview only the first few answers; the full list has one entry per
# question and floods the cell output when displayed whole.
topic_answers[:10]

[TopicAnswer(topic_id=0, topic_name='elderly aging person', question='Is the patient described as an elderly or aging person?', probability=0.0, answer=False),
 TopicAnswer(topic_id=1, topic_name='male cyclist individual', question='Is the patient identified as a male cyclist?', probability=0.0, answer=False),
 TopicAnswer(topic_id=2, topic_name='social history unknown', question="Is the patient's social history unknown?", probability=1.0, answer=True),
 TopicAnswer(topic_id=3, topic_name='senior gentleman man', question='Is the patient referred to as a senior gentleman or man?', probability=0.0, answer=False),
 TopicAnswer(topic_id=4, topic_name='man lad ', question='Is the patient described as a man or lad?', probability=0.0, answer=False),
 TopicAnswer(topic_id=5, topic_name='female girl 75y', question='Is the patient a 75-year-old female or girl?', probability=0.0, answer=False),
 TopicAnswer(topic_id=6, topic_name='woman lady she', question='Is the patient referred to as a woman o

In [65]:
# Limit the run to the first 50 training patients.
subset_ids = [str(idx) for idx in train_idxs[:50]]
patient_concepts_subset = {pid: patient_concepts[pid] for pid in subset_ids}
# Replace the notes attached to the processor so they cover the same subset.
processor.patient_notes = {pid: patient_notes[pid] for pid in subset_ids}

# Answer every question for every patient in the subset.
patient_answers = processor._generate_answers(
    answer_generator=answer_generator,
    topic_questions=all_topic_questions,
    patient_concepts=patient_concepts_subset,
)

2025-03-13 07:50:24,451 - text_topic_extraction.text_topic_processor - INFO - Generating answers for 50 patients
2025-03-13 07:50:24,451 - text_topic_extraction.text_topic_processor - INFO - Generating answers for patient 2200
2025-03-13 07:50:24,452 - text_topic_extraction.answer_generator.answer_generator - INFO - Generating answers for patient 2200 for 1319 questions
2025-03-13 07:50:24,453 - text_topic_extraction.answer_generator.answer_generator - INFO - Sending batch of 1319 questions for patient 2200
2025-03-13 07:50:24,465 - text_topic_extraction.llm_api.openai_client - INFO - Using cached structured response (key: 3ebf4b8fa8...)
2025-03-13 07:50:24,470 - text_topic_extraction.text_topic_processor - INFO - Generating answers for patient 4627
2025-03-13 07:50:24,470 - text_topic_extraction.answer_generator.answer_generator - INFO - Generating answers for patient 4627 for 1319 questions
2025-03-13 07:50:24,471 - text_topic_extraction.answer_generator.answer_generator - INFO - Sen

KeyboardInterrupt: 

In [None]:
# NOTE(review): MatrixBuilder is not imported anywhere in this notebook. The
# import path below assumes it is exposed by text_topic_processor in the same
# way as the other helper classes imported from that module — confirm.
from text_topic_extraction.text_topic_processor import MatrixBuilder

# Create topic matrix. Keep a local name as well as the processor attribute:
# the lines below use the bare name, which was previously never defined.
topic_matrix = answer_generator.create_topic_matrix(patient_answers)
processor.topic_matrix = topic_matrix

# Create matrix DataFrames (always do this even if not saving)
binary_matrix_df = MatrixBuilder.create_binary_matrix(topic_matrix)
probability_matrix_df = MatrixBuilder.create_probability_matrix(topic_matrix)
processor.binary_matrix_df = binary_matrix_df
processor.probability_matrix_df = probability_matrix_df

# Create return dictionary with DataFrames
results = {
    "binary_matrix_df": binary_matrix_df,
    "probability_matrix_df": probability_matrix_df,
    "topic_matrix": topic_matrix,
    "topics": topics,
    # Use the list accumulated over all batches; `topic_questions` holds only
    # the final batch from the question-generation loop.
    "topic_questions": all_topic_questions,
}

''

In [None]:
# Persist the pipeline outputs under exp_mimic/_output.
# NOTE(review): `patient_concepts` covers every training patient while
# `patient_answers` was generated for a 50-patient subset — confirm that the
# mismatch is acceptable to _save_results.
processor._save_results(
    patient_concepts=patient_concepts,
    topics=topics,
    # Full question list from all batches; `topic_questions` is only the last batch.
    topic_questions=all_topic_questions,
    patient_answers=patient_answers,
    # Read the matrix from the processor attribute; no bare `topic_matrix`
    # name is assigned anywhere in the notebook.
    topic_matrix=processor.topic_matrix,
    output_dir="exp_mimic/_output",
    save_intermediate=False,
)