In [14]:
import sys
sys.path.append("text_topic_extraction")

import pandas as pd
import numpy as np

from bertopic import BERTopic

import asyncio
import pickle
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

from text_topic_extraction.utils.matrix_builder import MatrixBuilder
from text_topic_extraction.concept_extractor.concept_extractor import Concept, PatientConcepts
from text_topic_extraction.text_topic_processor import TextTopicProcessor, AsyncTextTopicProcessor, BERTopicModeler, QuestionGenerator, AnswerGenerator, AsyncAnswerGenerator
from text_topic_extraction.config import API_VERSION, AZURE_ENDPOINT, OPENAI_API_KEY, USE_AZURE

In [48]:
np.random.seed(0)

In [49]:
question_prompt = """
The following topics were found in clinical notes. Generate yes/no questions for the following medical topics:
{topics_text}
Return a JSON object with topic numbers as keys and questions as values. Example:
{
"1": "Does the note mention the patient having a history of diabetes?"
}
"""

answer_prompt = """
You will be given a clinical note. I will give you a series of questions. Your task is answer each question with a probability from 0 to 1. Summarize the response with a JSON that includes your answer to all of the questions. Questions:
{prompt_questions}

clinical note:
{sentence}

Example answer: {
    "Does this person smoke?": 0,
    "Does this person drink?": 1,
    "Does this note mention alcohol?": 0.5
}
Answer all the questions and do not answer with anything else besides valid JSON. Do not add comments to the JSON.
"""

In [50]:
llm_extract_df = pd.read_csv("../llm-bart/exp_mimic/_output/long_notes/max_obs_-1/seed_0/gpt-4o-mini/concept_extractions.csv")

In [51]:
llm_extract_df.llm_output

0       male, man, gentleman,age 82, elderly, senior,s...
1       81-year-old, elderly, senior,female, woman, ad...
2       75 years, elderly, senior,man, male,retired, u...
3       66 year old, elderly, senior,male, man, adult,...
4       female, woman, patient,elderly, senior, aged,m...
                              ...                        
7038    tobacco use, smoking, nicotine use,15 pack yea...
7039    86 M, elderly, senior,prostate cancer, cancer,...
7040    86 M, elderly, senior,prostate cancer, cancer,...
7041    elderly, senior, advanced age,female, woman,fe...
7042    male, man, gender,68 years old, elderly, senio...
Name: llm_output, Length: 7043, dtype: object

In [52]:
rand_idxs = np.random.choice(llm_extract_df.shape[0], llm_extract_df.shape[0], replace=False)
train_idxs = rand_idxs[:400]
test_idxs = rand_idxs[400:800]

In [7]:
patient_notes = {}
for patient_id in rand_idxs[:800]:
    note_text = llm_extract_df.iloc[patient_id].sentence
    note_type = "H&P"
    patient_notes[str(patient_id)] = [{"note_text": note_text, "note_type": note_type}]

In [8]:
concept_id = 0
patient_concepts = {}
for patient_id in rand_idxs[:800]:
    a = llm_extract_df.iloc[patient_id].llm_output.split(",")
    concept_list = [Concept(id=str(concept_id + concept_offset_id), descriptor=protoconcept.strip().lower()) for concept_offset_id, protoconcept in enumerate(a)]
    concept_id += len(concept_list)
    patient_concepts[str(patient_id)] = PatientConcepts(
        patient_id=str(patient_id),
        all_concepts=concept_list
    )

In [9]:
processor = TextTopicProcessor(
    model_type="gpt-4o-mini-2024-07-18",
    # "gpt-4o-mini-2024-07-18", 
    # model_type="gpt-4o-2024-08-06",
    temperature=0.0,
    num_topics=400,
    use_cache=True,
    use_structured_output=True,
    use_azure=False,
    azure_endpoint=AZURE_ENDPOINT,
    api_version=API_VERSION,
    api_key=OPENAI_API_KEY,
    cache_dir="cache",
    concept_prompt_template="{notes}",
    concept_questions_prompt_template=question_prompt,
    answer_prompt_template=answer_prompt,
)
async_processor = AsyncTextTopicProcessor(
    model_type="gpt-4o-mini-2024-07-18",
    # "gpt-4o-mini-2024-07-18", 
    # model_type="gpt-4o-2024-08-06",
    temperature=0.0,
    num_topics=400,
    use_cache=True,
    use_structured_output=True,
    use_azure=False,
    azure_endpoint=AZURE_ENDPOINT,
    api_version=API_VERSION,
    api_key=OPENAI_API_KEY,
    cache_dir="cache",
    concept_prompt_template="{notes}",
    concept_questions_prompt_template=question_prompt,
    answer_prompt_template=answer_prompt,
    max_concurrent_patients=40,
)

2025-03-13 16:01:12,200 - text_topic_extraction.cache.cache_manager - INFO - Text Topic Extraction cache initialized at cache/text_topic_cache.duckdb
2025-03-13 16:01:12,233 - text_topic_extraction.text_topic_processor - INFO - Initialized TextTopicProcessor with model gpt-4o-mini-2024-07-18
2025-03-13 16:01:12,239 - text_topic_extraction.cache.cache_manager - INFO - Text Topic Extraction cache initialized at cache/text_topic_cache.duckdb
2025-03-13 16:01:12,245 - text_topic_extraction.text_topic_processor - INFO - Initialized AsyncTextTopicProcessor with model gpt-4o-mini-2024-07-18


In [10]:
processor.patient_notes = patient_notes

In [11]:
processor.openai_client.client

<openai.OpenAI at 0x106616dd0>

In [12]:
topic_modeler = BERTopicModeler(
    num_topics=processor.num_topics,
    embedding_model=(
        processor.embedding_model if processor.embedding_model != "default" else None
    ),
    vectorizer_model=processor.vectorizer_model,
    min_topic_size=processor.min_topic_size,
    language=processor.language,
    n_gram_range=processor.n_gram_range,
)

question_generator = QuestionGenerator(
    llm_api=processor.openai_client,
    batch_prompt_template=processor.concept_questions_prompt_template,
)
answer_generator = AsyncAnswerGenerator(
    llm_api=async_processor.openai_client, prompt_template=processor.answer_prompt_template
)

2025-03-13 16:01:12,255 - text_topic_extraction.topic_modeler.bertopic_modeler - INFO - Initialized BERTopicModeler with 400 topics


In [13]:
topics = processor._perform_topic_modeling(
    patient_concepts=patient_concepts,
    topic_modeler=topic_modeler,
)

2025-03-13 16:01:12,258 - text_topic_extraction.text_topic_processor - INFO - Performing topic modeling on concepts from 800 patients
2025-03-13 16:01:12,284 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
2025-03-13 16:01:12,284 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


KeyboardInterrupt: 

In [None]:
batch_size = 200
all_topic_questions = []
for i in range(0, len(topics), batch_size):
    topic_batch = topics[i:i + batch_size]
    topic_questions = processor._generate_questions(
        topics=topic_batch,
        question_generator=question_generator,
    )
    print("topic question ex:", topic_questions[0])
    print("num topics", len(topic_questions))
    all_topic_questions += topic_questions

2025-03-13 15:03:47,644 - text_topic_extraction.text_topic_processor - INFO - Generating questions for 200 topics
2025-03-13 15:04:14,368 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-13 15:04:14,402 - text_topic_extraction.cache.cache_manager - INFO - Stored response in cache with key cb2a80c8fe...
2025-03-13 15:04:14,403 - text_topic_extraction.text_topic_processor - INFO - Generated 200 questions
2025-03-13 15:04:14,404 - text_topic_extraction.text_topic_processor - INFO - Question for topic 0: Is the patient an elderly senior adult?
2025-03-13 15:04:14,404 - text_topic_extraction.text_topic_processor - INFO - Question for topic 1: Is the patient a nonsmoker?
2025-03-13 15:04:14,404 - text_topic_extraction.text_topic_processor - INFO - Question for topic 2: Is the patient a woman?
2025-03-13 15:04:14,404 - text_topic_extraction.text_topic_processor - INFO - Question for topic 3: Has the patient been hospitalized?
2025-03-13 

topic question ex: topic_id=0 topic_name='elderly senior adult' question='Is the patient an elderly senior adult?' keywords=['elderly', 'senior', 'adult', 'middleaged', 'yo', 'years', 'old', 'age', 'young', 'older'] keyword_scores=[0.05966476998871776, 0.059362585114674474, 0.050684648743146696, 0.041141828700968594, 0.04084985157403527, 0.0373779493951345, 0.03722355930087635, 0.03665847953079638, 0.03298190066395977, 0.027227526765457755]
num topics 200


2025-03-13 15:04:44,475 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-13 15:04:44,519 - text_topic_extraction.cache.cache_manager - INFO - Stored response in cache with key f84484bb77...
2025-03-13 15:04:44,521 - text_topic_extraction.text_topic_processor - INFO - Generated 199 questions
2025-03-13 15:04:44,521 - text_topic_extraction.text_topic_processor - INFO - Question for topic 200: Is resuscitation being performed?
2025-03-13 15:04:44,521 - text_topic_extraction.text_topic_processor - INFO - Question for topic 201: Is there adequate blood flow in the circulatory system?
2025-03-13 15:04:44,521 - text_topic_extraction.text_topic_processor - INFO - Question for topic 202: Is radiation therapy being administered?
2025-03-13 15:04:44,522 - text_topic_extraction.text_topic_processor - INFO - Question for topic 203: Is Lasix (furosemide) being given to the patient?
2025-03-13 15:04:44,522 - text_topic_extraction.text_topic_proc

topic question ex: topic_id=200 topic_name='resuscitation resuscitate do' question='Is resuscitation being performed?' keywords=['resuscitation', 'resuscitate', 'do', 'not', 'resuscitated', 'cardiopulmonary', 'fluid', 'revived', 'actively', 'field'] keyword_scores=[0.5854071466114107, 0.35405664936809095, 0.29724377308394734, 0.13744134982272763, 0.1322701848574556, 0.1322701848574556, 0.11937424860421932, 0.07735244051303433, 0.07735244051303433, 0.0661350924287278]
num topics 199


In [None]:
NUM_EXTRACT = 400
patient_concepts_subset = {str(idx): patient_concepts[str(idx)] for idx in train_idxs[:NUM_EXTRACT]}
async_processor.patient_notes = {str(idx): patient_notes[str(idx)] for idx in train_idxs[:NUM_EXTRACT]}

patient_answers = await async_processor._generate_answers_async(
    patient_concepts=patient_concepts_subset,
    topic_questions=all_topic_questions,
    answer_generator=answer_generator,
)

2025-03-13 15:33:27,574 - text_topic_extraction.text_topic_processor - INFO - Generating answers for 400 patients and 399 questions
2025-03-13 15:33:27,576 - text_topic_extraction.text_topic_processor - INFO - Generating answers for patient 2200
2025-03-13 15:33:27,576 - text_topic_extraction.answer_generator.answer_generator - INFO - Generating answers asynchronously for patient 2200 for 399 questions
2025-03-13 15:33:27,577 - text_topic_extraction.answer_generator.answer_generator - INFO - Sending batch of 399 questions for patient 2200 asynchronously
2025-03-13 15:33:27,578 - text_topic_extraction.text_topic_processor - INFO - Generating answers for patient 4627
2025-03-13 15:33:27,578 - text_topic_extraction.answer_generator.answer_generator - INFO - Generating answers asynchronously for patient 4627 for 399 questions
2025-03-13 15:33:27,580 - text_topic_extraction.answer_generator.answer_generator - INFO - Sending batch of 399 questions for patient 4627 asynchronously
2025-03-13 1

In [None]:
# Create topic matrix
topic_matrix = answer_generator.create_topic_matrix(patient_answers)

# Create matrix DataFrames (always do this even if not saving)
binary_matrix_df = MatrixBuilder.create_binary_matrix(topic_matrix)
probability_matrix_df = MatrixBuilder.create_probability_matrix(
    topic_matrix
)

# Create return dictionary with DataFrames
results = {
    "binary_matrix_df": binary_matrix_df,
    "probability_matrix_df": probability_matrix_df,
    "topic_matrix": topic_matrix,
    "topics": topics,
    "topic_questions": all_topic_questions,
}

In [None]:
# with open("exp_mimic/_output/train.pkl", "wb") as f:
#     pickle.dump(results, f)

In [53]:
with open("exp_mimic/_output/train.pkl", "rb") as f:
    train_results = pickle.load(f)
    probability_matrix_df = train_results['probability_matrix_df']
    all_topic_questions = train_results['topic_questions']

In [54]:
# make fake labels
COEF = 4
true_prob = 1/(1 + np.exp(-(
    # -100 +
    llm_extract_df.label_employment_False * COEF + llm_extract_df.label_housing_False * COEF + (llm_extract_df.label_alcohol_Past + llm_extract_df.label_alcohol_Present) * COEF
    + (llm_extract_df.label_tobacco_Past + llm_extract_df.label_tobacco_Present) * COEF
    + (llm_extract_df.label_drugs_Past + llm_extract_df.label_drugs_Present) * (COEF + 1)
    )))
llm_extract_df['y'] =  np.random.binomial(n=1, p=true_prob, size=llm_extract_df.shape[0])

In [55]:
llm_extract_df.columns

Index(['label_community_present', 'label_community_absent', 'label_education',
       'label_employment_False', 'label_employment_None',
       'label_employment_True', 'label_housing_False', 'label_housing_None',
       'label_housing_True', 'label_alcohol_Never', 'label_alcohol_None',
       'label_alcohol_Past', 'label_alcohol_Present', 'label_alcohol_Unsure',
       'label_tobacco_Never', 'label_tobacco_None', 'label_tobacco_Past',
       'label_tobacco_Present', 'label_tobacco_Unsure', 'label_drugs_Never',
       'label_drugs_None', 'label_drugs_Past', 'label_drugs_Present',
       'label_drugs_Unsure', 'sentence', 'llm_output', 'y'],
      dtype='object')

In [56]:
NUM_EXTRACT = 400
lr = LogisticRegressionCV(penalty="l2", solver="saga", cv=10, n_jobs=5) #, l1_ratios=[0.1,.9])
lr.fit(probability_matrix_df, llm_extract_df.y.iloc[train_idxs[:NUM_EXTRACT]])

In [57]:
coef_df = pd.DataFrame({
    "coef": lr.coef_[0],
    "abs_coef": np.abs(lr.coef_[0]),
    "question": [topic_q.question for topic_q in all_topic_questions]
})
coef_df.sort_values(by="abs_coef", ascending=False).head(20)
# print(lr.intercept_)

Unnamed: 0,coef,abs_coef,question
104,1.261716,1.261716,Is the patient retired?
0,-0.916829,0.916829,Is the patient an elderly senior adult?
180,0.81547,0.81547,Is the patient a smoker?
4,0.657332,0.657332,Is the patient consuming alcohol?
26,0.617676,0.617676,Does the patient have unhealthy lifestyle fact...
58,0.597225,0.597225,Is the patient coughing?
203,0.584264,0.584264,Is Lasix (furosemide) being given to the patient?
220,-0.576725,0.576725,Is the patient responsive and mobile?
12,-0.533678,0.533678,Is the patient drug-free?
126,0.4904,0.4904,Does the patient have COPD?


In [58]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [59]:
rf = RandomForestClassifier(n_jobs=5)
rf.fit(probability_matrix_df, llm_extract_df.y.iloc[train_idxs[:NUM_EXTRACT]])
rf_df = pd.DataFrame({
    "feat_import": rf.feature_importances_,
    "question": [topic_q.question for topic_q in all_topic_questions]
})
rf_df.sort_values(by="feat_import", ascending=False).head(20)

Unnamed: 0,feat_import,question
0,0.03166,Is the patient an elderly senior adult?
104,0.02126,Is the patient retired?
60,0.02051,Is the patient consuming alcohol?
61,0.016481,Is the patient cohabitating with someone?
26,0.015763,Does the patient have unhealthy lifestyle fact...
4,0.015108,Is the patient consuming alcohol?
2,0.014113,Is the patient a woman?
45,0.013002,Does the patient have children?
367,0.012679,Is the patient's sex biological?
12,0.012449,Is the patient drug-free?


In [60]:
gb = GradientBoostingClassifier()
gb.fit(probability_matrix_df, llm_extract_df.y.iloc[train_idxs[:NUM_EXTRACT]])
gb_df = pd.DataFrame({
    "feat_import": gb.feature_importances_,
    "question": [topic_q.question for topic_q in all_topic_questions]
})
gb_df.sort_values(by="feat_import", ascending=False).head(20)

Unnamed: 0,feat_import,question
104,0.116339,Is the patient retired?
0,0.090566,Is the patient an elderly senior adult?
4,0.088487,Is the patient consuming alcohol?
26,0.05874,Does the patient have unhealthy lifestyle fact...
180,0.030939,Is the patient a smoker?
260,0.026064,Is the patient experiencing self-harm or suici...
2,0.02507,Is the patient a woman?
211,0.020435,Did the onset of symptoms occur suddenly?
72,0.020214,Is the patient's condition deteriorating?
118,0.020065,Has the patient denied substance use?


## TEST?

In [26]:
NUM_EXTRACT = 400
patient_concepts_subset = {str(idx): patient_concepts[str(idx)] for idx in test_idxs[:NUM_EXTRACT]}
async_processor.patient_notes = {str(idx): patient_notes[str(idx)] for idx in test_idxs[:NUM_EXTRACT]}

test_patient_answers = await async_processor._generate_answers_async(
    patient_concepts=patient_concepts_subset,
    topic_questions=all_topic_questions,
    answer_generator=answer_generator,
)

2025-03-13 16:02:45,658 - text_topic_extraction.text_topic_processor - INFO - Generating answers for 400 patients and 399 questions
2025-03-13 16:02:45,660 - text_topic_extraction.text_topic_processor - INFO - Generating answers for patient 3276
2025-03-13 16:02:45,660 - text_topic_extraction.answer_generator.answer_generator - INFO - Generating answers asynchronously for patient 3276 for 399 questions
2025-03-13 16:02:45,661 - text_topic_extraction.answer_generator.answer_generator - INFO - Sending batch of 399 questions for patient 3276 asynchronously
2025-03-13 16:02:45,661 - text_topic_extraction.text_topic_processor - INFO - Generating answers for patient 6688
2025-03-13 16:02:45,662 - text_topic_extraction.answer_generator.answer_generator - INFO - Generating answers asynchronously for patient 6688 for 399 questions
2025-03-13 16:02:45,662 - text_topic_extraction.answer_generator.answer_generator - INFO - Sending batch of 399 questions for patient 6688 asynchronously
2025-03-13 1

CancelledError: 

In [None]:
# Create topic matrix
test_topic_matrix = answer_generator.create_topic_matrix(test_patient_answers)

# Create matrix DataFrames (always do this even if not saving)
test_binary_matrix_df = MatrixBuilder.create_binary_matrix(test_topic_matrix)
test_probability_matrix_df = MatrixBuilder.create_probability_matrix(
    test_topic_matrix
)

# Create return dictionary with DataFrames
test_results = {
    "binary_matrix_df": test_binary_matrix_df,
    "probability_matrix_df": test_probability_matrix_df,
    "topic_matrix": test_topic_matrix,
    "topics": topics,
    "topic_questions": all_topic_questions,
}

In [None]:
# with open("exp_mimic/_output/test.pkl", "wb") as f:
#     pickle.dump(test_results, f)

In [61]:
with open("exp_mimic/_output/test.pkl", "rb") as f:
    test_results = pickle.load(f)
    test_probability_matrix_df = test_results['probability_matrix_df']

In [62]:
from sklearn.metrics import roc_auc_score

In [63]:
roc_auc_score(llm_extract_df.y.iloc[test_idxs[:NUM_EXTRACT]], rf.predict_proba(test_probability_matrix_df)[:,1])

0.6774193548387096

In [64]:
roc_auc_score(llm_extract_df.y.iloc[test_idxs[:NUM_EXTRACT]], gb.predict_proba(test_probability_matrix_df)[:,1])

0.7232057644588662

In [65]:
roc_auc_score(llm_extract_df.y.iloc[test_idxs[:NUM_EXTRACT]], lr.predict_proba(test_probability_matrix_df)[:,1])

0.707052872685627

In [66]:
roc_auc_score(llm_extract_df.y.iloc[test_idxs[:NUM_EXTRACT]], true_prob[test_idxs[:NUM_EXTRACT]])

0.8728526436342814

## check extractions

In [None]:
from scipy.stats import pearsonr
pearsonr(probability_matrix_df.iloc[:,1], llm_extract_df.label_tobacco_Present.iloc[train_idxs[:NUM_EXTRACT]])

PearsonRResult(statistic=-0.6790301721684652, pvalue=2.1958160898966095e-55)

In [None]:
len(set([topic_q.question for topic_q in all_topic_questions]))

397