In [None]:
# Generate term groupings: split the term list into chunks and loop over them, producing several JSON files

In [1]:
# Import the os package
import os

# Import the openai package
import openai

import os
import openai
from scipy.spatial import distance
import plotly.express as px
from sklearn.cluster import KMeans
import json
import re

In [2]:
from dotenv import load_dotenv

# Read key/value pairs from a local .env file into the process environment
load_dotenv()

# Give the OpenAI client its API key (None when the variable is not set)
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [3]:
import pandas as pd

# Load the concept synonym table and preview its first rows
concepts_syn_df = pd.read_csv("concept_synonyms_CVsubset.csv")
concepts_syn_df.head(n=5)

Unnamed: 0,general_ancestor_concept_name,general_ancestor_concept_id,descendant_concept_id,concept_name,rc,dbc,drc,ddbc,max_levels_of_separation,concept_id,concept_synonym_name
0,Disorder of cardiovascular system,134057,443784,Vascular disorder,186458,13,1408753835,22,1,443784,Angiopathy
1,Disorder of cardiovascular system,134057,443784,Vascular disorder,186458,13,1408753835,22,1,443784,Disorder of blood vessel
2,Disorder of cardiovascular system,134057,443784,Vascular disorder,186458,13,1408753835,22,1,443784,Vascular disease
3,Disorder of cardiovascular system,134057,443784,Vascular disorder,186458,13,1408753835,22,1,443784,Disorder of blood vessel (disorder)
4,Disorder of cardiovascular system,134057,44784217,Cardiac arrhythmia,84417590,19,809155077,22,2,44784217,Arrhythmia


# GPT Term Grouping Task Start

In [172]:
import openai

def chunk_list(lst, chunk_size=50):
    """Yield consecutive slices of `lst`, each holding at most `chunk_size` items."""
    starts = range(0, len(lst), chunk_size)
    yield from (lst[start:start + chunk_size] for start in starts)

def generate_term_grouping(term_list, ancestor_list, chunk_size=50):
    """Ask the chat model to sort terms into the given categories, one batch at a time.

    The terms are split into batches of ``chunk_size``. Each batch is sent in its
    own chat-completion request together with the full category list, and the raw
    text of every reply is collected.

    Args:
        term_list: Sized sequence of terms to group (list or array). Each item is
            converted with ``str()`` before being joined into the prompt.
        ancestor_list: Category names; interpolated into the prompt as-is.
        chunk_size: Number of terms sent per request. Defaults to 50, the value
            that was previously hard-coded.

    Returns:
        str: The reply texts of all batches joined with newlines, or ``""`` when
        ``term_list`` is empty (no request is made in that case).
    """
    # Nothing to group: skip the API entirely
    if len(term_list) == 0:
        return ""

    # Raw reply text of each batch, in request order
    all_responses = []

    # Iterate through the term list in batches
    for chunk in chunk_list(term_list, chunk_size):
        # str() guards against non-string items (e.g. missing values) in the input
        chunk_str = ", ".join(str(term) for term in chunk)

        # Construct the prompt for the current batch
        prompt = f"""Please group these terms: {chunk_str} into these categories: {ancestor_list}
        
        use all terms that is provided, and only place them in categories that are provided.
        If there is a term that does not belong, you can put it in a category called "other". 
        No category should have no terms. (ie. do not allow empty categories.)
        
        output only the json object.
        
        Please break your JSON output into smaller, manageable chunks as needed. Each chunk should represent a logical portion of the grouping. 
    Indicate which part of the hierarchy you're returning by including the part number, for example: 
    [start chunk 1/5] ```json ...  ``` [end chunk 1/5]
    
        """

        # One API call per batch
        response = openai.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system", 
                    "content": """Organize a list of cardiovascular diseases and conditions into meaningful groupings based on how medically similar they are.

# Steps

1. **List Cardiovascular Conditions**: Review the provided list of medical terms.

2. **Categorize Conditions**: Group these conditions based on their medical similarities, such as:
   - **Heart Diseases**: Conditions specifically related to heart function.
   - **Vascular Diseases**: Conditions affecting blood vessels.
   - **Congenital Conditions**: Heart or vessel conditions present at birth.
   - **Arrhythmias**: Disorders of heart rhythm.

3. **Refine Groupings**: Ensure accuracy in categorization, adjusting terms to refine group coherence and relevance.

# Output Format

Provide the output as a list of categories, each followed by their grouped cardiovascular conditions. Clearly label each category and list the terms in an organized manner using bullet points.

# Examples

**Example Start**

- **Heart Diseases**:
  - Myocardial Infarction
  - Coronary Artery Disease

- **Vascular Diseases**:
  - Atherosclerosis
  - Peripheral Artery Disease

- **Congenital Conditions**:
  - Tetralogy of Fallot
  - Ventricular Septal Defect

- **Arrhythmias**:
  - Atrial Fibrillation
  - Ventricular Tachycardia

**Example End**

# Notes

- Ensure terms are verified for correct categorization.
- Explore subcategories if needed for complex conditions to enhance understanding.
- The focus is solely on diseases and conditions. Ensure clarity for medical educational purposes."""
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            max_tokens=16000,  # Upper bound on reply length; adjust if necessary
            temperature=0.5  # Moderate sampling randomness; replies are not deterministic
        )

        # Reply text of the first choice; the content can be None, so fall back to ""
        full_response = response.choices[0].message.content or ""

        all_responses.append(full_response)

    # Combine the per-batch replies into one newline-separated string
    final_response = "\n".join(all_responses)

    return final_response

In [173]:
# Random 500-row sample of the synonym table (fixed seed, no replacement)
sampled_terms = concepts_syn_df.sample(n=500, replace=False, random_state=42)
# Every distinct concept name in the table
full_terms = concepts_syn_df["concept_name"].unique()
# sample_list = sampled_terms['concept_name'].unique()

# Categories the model is asked to sort the terms into
ancestors_list = [
    'acute disease of cardiovascular system',
    'aneurysm',
    'cardiac arrhythmia',
    'cardiovascular injury',
    'chronic disease of cardiovascular system',
    'congenital anomaly of cardiovascular structure of trunk',
    'congenital anomaly of cardiovascular system',
    'congenital vascular disorder',
    'embolism',
    'heart valve disorder',
    'hemorrhage of blood vessel',
    'injury of blood vessel',
    'myocardial disease',
    'thrombosis',
    'thrombosis of blood vessel',
    'vascular disease of abdomen',
    'vascular disorder',
    'vasculitis',
    'venous varices',
]

In [175]:
# Send every unique concept name to the model in batches and print the combined raw reply
result_chunked = generate_term_grouping(full_terms, ancestors_list)
print(result_chunked)

[start chunk 1/5]
```json
{
  "acute disease of cardiovascular system": [
    "Acute disease of cardiovascular system",
    "Acute heart disease",
    "Acute ischemic heart disease",
    "Myocardial infarction",
    "Angina pectoris"
  ],
  "cardiac arrhythmia": [
    "Cardiac arrhythmia",
    "Supraventricular arrhythmia",
    "Atrial arrhythmia",
    "Fibrillation",
    "Atrial fibrillation"
  ],
  "chronic disease of cardiovascular system": [
    "Chronic disease of cardiovascular system",
    "Chronic heart disease",
    "Chronic heart failure",
    "Ischemic heart disease",
    "Congestive heart failure",
    "Heart failure",
    "Hypertensive heart disease"
  ],
  "heart valve disorder": [
    "Heart valve disorder",
    "Mitral valve disorder",
    "Aortic valve disorder",
    "Non-rheumatic heart valve disorder"
  ]
}
```
[end chunk 1/5]

[start chunk 2/5]
```json
{
  "myocardial disease": [
    "Myocardial disease",
    "Cardiomyopathy",
    "Systolic heart failure",
    "Dias

# combine json files

In [176]:
import re

# Pull the body of every ```json ... ``` fenced block out of the combined reply.
# Whitespace around the fences is optional, so a fence followed by "\r\n" or by
# the JSON on the same line is still recognised.
chunks = re.findall(r'```json\s*(.*?)\s*```', result_chunked, re.DOTALL)

# Parsed JSON objects, in the order they appear in the reply
json_data = []
# Number of fenced blocks that were not valid JSON and were therefore skipped
failed_chunks = 0

# Parse each fenced block; invalid ones are reported and skipped, not repaired
for chunk in chunks:
    try:
        json_data.append(json.loads(chunk))
    except json.JSONDecodeError as e:
        failed_chunks += 1
        print(f"Error decoding JSON: {e}")
        print("Skipping invalid chunk:", chunk[:100])  # First 100 characters for debugging

# Save each parsed block as its own file (UTF-8, matching how the files are read back)
for i, data in enumerate(json_data, start=1):
    with open(f'CV_Grouping_chunk_{i}.json', 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=2)

# Make silent data loss visible: skipped blocks mean missing terms downstream
if failed_chunks:
    print(f"{failed_chunks} of {len(chunks)} chunks could not be parsed and were skipped.")

print("JSON files have been saved.")


JSON files have been saved.


In [177]:
def _chunk_sort_key(path):
    """Order chunk files by their numeric suffix so chunk_10 follows chunk_9."""
    match = re.search(r"(\d+)\.json$", os.path.basename(path))
    return (int(match.group(1)) if match else float("inf"), path)


def _flatten_terms(value):
    """Return the leaf values of `value` as a flat list (handles nested lists/dicts and scalars)."""
    if isinstance(value, dict):
        return [term for inner in value.values() for term in _flatten_terms(inner)]
    if isinstance(value, list):
        return [term for inner in value for term in _flatten_terms(inner)]
    return [value]


def merge_json_files(directory="."):
    """Merge all ``CV_Grouping_chunk_*.json`` files in `directory` into one dict.

    Each file is expected to hold a JSON object mapping a category name to its
    terms. Files are processed in chunk-number order, the terms of a category
    are concatenated across files, and duplicates are dropped while keeping
    first-seen order. A value that is not a plain list (a single string, or a
    nested object of sub-categories) is flattened to its leaf terms.

    Args:
        directory: Folder to search for chunk files. Defaults to the current one.

    Returns:
        dict: category name -> list of unique terms. Empty if no file matches.
    """
    # Imported here so the function does not depend on a module-level glob import
    import glob

    # Pattern to match the chunk files; glob gives no ordering guarantee, so sort
    pattern = os.path.join(directory, "CV_Grouping_chunk_*.json")
    json_files = sorted(glob.glob(pattern), key=_chunk_sort_key)

    merged_data = {}

    for fpath in json_files:
        with open(fpath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for key, value in data.items():
            # Always extend a list owned by merged_data (never alias the loaded one)
            merged_data.setdefault(key, []).extend(_flatten_terms(value))

    # Remove duplicates once per category while preserving order
    return {key: list(dict.fromkeys(terms)) for key, terms in merged_data.items()}

# Merge the chunk files found in the current directory (the function's default)
all_data = merge_json_files()
# Print the merged results as indented JSON
print(json.dumps(all_data, indent=2))

{
  "vascular disorder": [
    "Vascular hemostatic disease",
    "Aortic thromboembolism",
    "Carotid artery stenosis",
    "Carotid artery occlusion",
    "Stenosis of precerebral artery",
    "Vertebral artery obstruction",
    "Retinal artery occlusion",
    "Central retinal vein occlusion with macular edema",
    "Transient arterial retinal occlusion",
    "Telangiectasia disorder",
    "Cutaneous vascular malformation",
    "Angiodysplasia of colon",
    "Male genital organ vascular diseases",
    "Carotid atherosclerosis",
    "Dilatation of pulmonary artery",
    "Brainstem stroke syndrome",
    "Pulmonary hypertensive venous disease",
    "Pulmonary hypertension due to lung disease and/or hypoxia",
    "Pulmonary hypertension due to left heart disease",
    "Persistent pulmonary hypertension of the newborn",
    "Giant cell arteritis with polymyalgia rheumatica",
    "Polyarteritis",
    "Polyarteritis nodosa",
    "Cerebral vasculitis",
    "Arterial steal syndrome",
    "C

In [178]:
import json
import glob
import os

def diagnose_non_list_values(directory="."):
    """Print every key in the chunk JSON files whose value is not a list.

    Scans the ``CV_Grouping_chunk_*.json`` files in `directory`. For each
    offending key it prints the file path, the key with the value's type name,
    the value itself, and a ``---`` separator. Returns nothing.
    """
    chunk_paths = glob.glob(os.path.join(directory, "CV_Grouping_chunk_*.json"))

    for fpath in chunk_paths:
        with open(fpath, 'r', encoding='utf-8') as handle:
            contents = json.load(handle)
            for key, value in contents.items():
                # Lists are the expected shape; only report everything else
                if isinstance(value, list):
                    continue
                print(f"File: {fpath}")
                print(f"Key: '{key}' contains a {type(value).__name__} instead of a list.")
                print(f"Value: {value}")
                print("---")
                    
# Run the diagnostic function
diagnose_non_list_values()


In [179]:
# Destination for the merged groupings
output_file = "merged_CV_Grouping.json"

# Serialize the merged dictionary (non-ASCII characters kept as-is) and save it as UTF-8
merged_json_text = json.dumps(all_data, indent=2, ensure_ascii=False)
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(merged_json_text)

# extract relationships

In [216]:
# Reload the merged groupings. The file is written as UTF-8 with ensure_ascii=False,
# so read it back as UTF-8 explicitly instead of relying on the platform default.
with open("merged_CV_Grouping.json", "r", encoding="utf-8") as json_file:
    json_data = json.load(json_file)
    # print(json_data)

In [218]:
# Read the merged groupings back from disk
with open("merged_CV_Grouping.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# One row per category: its name and the list of terms assigned to it
group_rows = list(data.items())
gpt_groupings = pd.DataFrame(group_rows, columns=["ancestor_concept_name", "group_members"])


In [219]:
gpt_groupings

Unnamed: 0,ancestor_concept_name,group_members
0,vascular disorder,"[Vascular hemostatic disease, Aortic thromboem..."
1,vasculitis,"[Retinal vasculitis, Necrotizing vasculitis, C..."
2,venous varices,[Varicose veins of lower extremity with inflam...
3,other,"[Multi-infarct dementia, Postcardiotomy syndro..."
4,chronic disease of cardiovascular system,"[Chronic congestive heart failure, Systolic hy..."
5,congenital anomaly of cardiovascular structure...,"[Congenital anomaly of vena cava, Congenital a..."
6,congenital anomaly of cardiovascular system,"[Hypoplasia of right heart, Right hypoplastic ..."
7,congenital vascular disorder,"[Arteriovenous malformation of limb, Spinal ar..."
8,thrombosis,"[Thrombotic microangiopathy, Thrombus of cardi..."
9,thrombosis of blood vessel,"[Thrombosis of iliac artery, Thrombosis of abd..."


In [257]:
# Gather the members of every group into one flat list
all_grouped_terms = []
for members in gpt_groupings['group_members']:
    all_grouped_terms.extend(members)

# Number of distinct grouped terms
len(set(all_grouped_terms))

1516

In [222]:
# Lowercase both term collections so the comparison ignores case
abc = [term.lower() for term in all_grouped_terms]
bcd = [term.lower() for term in full_terms]

# Number of distinct grouped terms that also occur in the source term list
len(set(abc) & set(bcd))

1513

In [223]:
len(full_terms)

1533

In [254]:
# Keep only the rows whose category is one of the requested ancestors
is_known_ancestor = gpt_groupings['ancestor_concept_name'].isin(ancestors_list)
gpt_groupings_final = gpt_groupings[is_known_ancestor]

# Flatten the remaining member lists into one list of terms
filtered_grouped_terms = []
for members in gpt_groupings_final['group_members']:
    filtered_grouped_terms.extend(members)

# Number of distinct terms left after the filter
len(set(filtered_grouped_terms))

1220

In [227]:
# number of categories in the final filtered df
len(gpt_groupings_final['ancestor_concept_name'].unique())

19

In [228]:
# Lowercase both term collections so the comparison ignores case
abc = [term.lower() for term in filtered_grouped_terms]
bcd = [term.lower() for term in full_terms]

# Number of distinct filtered terms that also occur in the source term list
len(set(abc) & set(bcd))

1218

In [None]:
# Case-insensitive vocabulary of every source term
valid_terms = set(t.lower() for t in full_terms)

# Count the lowercased grouped terms in `abc` that belong to that vocabulary
# (uses valid_terms directly instead of leaving it computed but unused)
len(set(abc).intersection(valid_terms))

# Evaluation

In [196]:
# Gold-standard ancestor/descendant pairs

gold_standard = pd.read_csv("concept_subsumption_CVsubset.csv")

# Lowercase both name columns so matching is case-insensitive
for name_column in ("ancestor_concept_name", "descendant_concept_name"):
    gold_standard[name_column] = gold_standard[name_column].str.lower()

# Lowercased vocabulary of the terms in the full list
valid_terms = {t.lower() for t in full_terms}

# Keep a pair only when both its ancestor and its descendant are in the vocabulary
ancestor_known = gold_standard['ancestor_concept_name'].isin(valid_terms)
descendant_known = gold_standard['descendant_concept_name'].isin(valid_terms)
filtered_df = gold_standard[ancestor_known & descendant_known]

In [198]:
filtered_df.head()

Unnamed: 0,descendant_concept_id,descendant_concept_name,ancestor_concept_id,ancestor_concept_name,min_levels_of_separation
0,22340,esophageal varices without bleeding,22340,esophageal varices without bleeding,0
15,4111998,esophageal varices associated with another dis...,24966,esophageal varices,1
16,22340,esophageal varices without bleeding,24966,esophageal varices,1
17,28779,bleeding esophageal varices,24966,esophageal varices,1
18,24966,esophageal varices,24966,esophageal varices,0


In [284]:
# Level of separation at which the hierarchy is sliced into groups (e.g. 2, 3, 5)
N = 5

# 1. Keep only ancestor/descendant pairs exactly N levels apart
at_level = filtered_df['min_levels_of_separation'] == N
level_df = filtered_df[at_level].copy()

# 2. Leaves are descendants that never appear as an ancestor at this level
all_ancestors = set(level_df['ancestor_concept_name'].unique())
all_descendants = set(level_df['descendant_concept_name'].unique())
leaf_terms = all_descendants.difference(all_ancestors)

# Restrict the pairs to leaf descendants
is_leaf = level_df['descendant_concept_name'].isin(leaf_terms)
level_df = level_df[is_leaf]

# 3. One row per ancestor holding the list of its descendants
members_by_ancestor = level_df.groupby('ancestor_concept_name')['descendant_concept_name'].apply(list)
gold_levels = members_by_ancestor.reset_index(name='group_members')



In [285]:
# gold_levels has two columns:
# 'ancestor_concept_name' and 'group_members' (a list of descendants grouped under that ancestor)
gold_levels.head()

Unnamed: 0,ancestor_concept_name,group_members
0,acute disease of cardiovascular system,[acute st segment elevation myocardial infarct...
1,aneurysm,"[carotid artery aneurysm, aneurysm of poplitea..."
2,cardiac arrhythmia,"[bifascicular block, right bundle branch block..."
3,cardiovascular injury,[injury of radial artery at wrist and hand lev...
4,chronic disease of cardiovascular system,[acute on chronic combined systolic and diasto...


In [286]:
gold_levels["ancestor_concept_name"].unique()

array(['acute disease of cardiovascular system', 'aneurysm',
       'cardiac arrhythmia', 'cardiovascular injury',
       'chronic disease of cardiovascular system',
       'congenital anomaly of cardiovascular structure of trunk',
       'congenital anomaly of cardiovascular system',
       'congenital vascular disorder', 'embolism', 'heart valve disorder',
       'hemorrhage of blood vessel', 'injury of blood vessel',
       'myocardial disease', 'thrombosis', 'thrombosis of blood vessel',
       'vascular disease of abdomen', 'vascular disorder', 'vasculitis',
       'venous varices'], dtype=object)

In [291]:
ancestor_groups = gold_levels["ancestor_concept_name"].unique()

In [287]:
# Number of distinct ancestors in the gold groups. The original referenced `grouped`,
# which no visible cell defines; gold_levels is the frame built for this purpose.
len(gold_levels["ancestor_concept_name"].unique())

19

In [293]:
# Keep only the rows of filtered_df whose ancestor is one of ancestor_groups
has_group_ancestor = filtered_df['ancestor_concept_name'].isin(ancestor_groups)
filtered_df_2 = filtered_df[has_group_ancestor]

In [294]:
filtered_df_2

Unnamed: 0,descendant_concept_id,descendant_concept_name,ancestor_concept_id,ancestor_concept_name,min_levels_of_separation
2070,4263510,congenital stenosis of tricuspid valve,141124,congenital anomaly of cardiovascular system,5
2071,4174077,myocardial bridge of coronary artery,141124,congenital anomaly of cardiovascular system,5
2072,4035467,congenital insufficiency of pulmonary valve,141124,congenital anomaly of cardiovascular system,5
2073,4085405,congenital arterial aneurysm,141124,congenital anomaly of cardiovascular system,3
2074,4035494,congenital phlebectasia,141124,congenital anomaly of cardiovascular system,3
...,...,...,...,...,...
96890,315643,tachyarrhythmia,44784217,cardiac arrhythmia,1
96891,441872,supraventricular premature beats,44784217,cardiac arrhythmia,2
96892,443522,neonatal bradycardia,44784217,cardiac arrhythmia,2
96893,320744,complete atrioventricular block,44784217,cardiac arrhythmia,4


In [295]:
# Collect, per ancestor, the list of its descendants from filtered_df_2
descendants_by_ancestor = filtered_df_2.groupby('ancestor_concept_name')['descendant_concept_name']
gold_grouped = descendants_by_ancestor.apply(list).reset_index(name='group_members')


In [296]:
gold_grouped

Unnamed: 0,ancestor_concept_name,group_members
0,acute disease of cardiovascular system,[acute myocardial infarction due to right coro...
1,aneurysm,"[aneurysm of descending aorta, ruptured aneury..."
2,cardiac arrhythmia,"[cardiac arrest during surgery, premature atri..."
3,cardiovascular injury,[injury of blood vessels at ankle and foot lev...
4,chronic disease of cardiovascular system,"[chronic hypotension, chronic ischemic heart d..."
5,congenital anomaly of cardiovascular structure...,"[right ventricular outflow tract obstruction, ..."
6,congenital anomaly of cardiovascular system,"[congenital stenosis of tricuspid valve, myoca..."
7,congenital vascular disorder,"[patent ductus arteriosus, congenital anomaly ..."
8,embolism,"[cholesterol embolus syndrome, miscarriage com..."
9,heart valve disorder,"[congenital stenosis of mitral valve, mitral v..."


# code for precision and recall

In [297]:
# 1. Pair each predicted group with the gold group of the same ancestor.
#    With the default inner join, only ancestors present in both frames are kept.
merged = gpt_groupings_final.merge(
    gold_grouped,
    on="ancestor_concept_name",
    suffixes=("_pred", "_gold"),
)

merged

Unnamed: 0,ancestor_concept_name,group_members_pred,group_members_gold
0,vascular disorder,"[Vascular hemostatic disease, Aortic thromboem...","[phlebitis of portal vein, associated pulmonar..."
1,vasculitis,"[Retinal vasculitis, Necrotizing vasculitis, C...","[polyarteritis, cogan's syndrome, capillaritis..."
2,venous varices,[Varicose veins of lower extremity with inflam...,"[trunk varices, varicocele, bleeding gastric v..."
3,chronic disease of cardiovascular system,"[Chronic congestive heart failure, Systolic hy...","[chronic hypotension, chronic ischemic heart d..."
4,congenital anomaly of cardiovascular structure...,"[Congenital anomaly of vena cava, Congenital a...","[right ventricular outflow tract obstruction, ..."
5,congenital anomaly of cardiovascular system,"[Hypoplasia of right heart, Right hypoplastic ...","[congenital stenosis of tricuspid valve, myoca..."
6,congenital vascular disorder,"[Arteriovenous malformation of limb, Spinal ar...","[patent ductus arteriosus, congenital anomaly ..."
7,thrombosis,"[Thrombotic microangiopathy, Thrombus of cardi...","[thromboangiitis obliterans, obstetric blood-c..."
8,thrombosis of blood vessel,"[Thrombosis of iliac artery, Thrombosis of abd...","[superficial thrombophlebitis, thrombosis of r..."
9,vascular disease of abdomen,"[Dissection of abdominal aorta, Atherosclerosi...","[injury of abdominal aorta, injury of ovarian ..."


In [298]:
# 2. Define a function to calculate precision and recall

valid_terms = set(t.lower() for t in full_terms)

def calculate_metrics(row, vocabulary=None):
    """Compute precision and recall of one predicted group against its gold group.

    Comparison is case-insensitive. Predicted members that are not in the
    vocabulary are discarded before scoring; gold members are used as given.

    Args:
        row: Mapping (e.g. a DataFrame row) with the keys ``group_members_pred``
            and ``group_members_gold``, each an iterable of strings.
        vocabulary: Optional collection of lowercased terms that predictions are
            allowed to contain. When omitted, the module-level ``valid_terms``
            is used, which keeps ``df.apply(calculate_metrics, axis=1)`` working.

    Returns:
        pandas.Series: ``[precision, recall]``; a metric is 0 when its
        denominator is 0.
    """
    # Resolved at call time so the notebook-level set is picked up when defined
    allowed = valid_terms if vocabulary is None else vocabulary

    pred_set = {member.lower() for member in row["group_members_pred"] if member.lower() in allowed}
    gold_set = {member.lower() for member in row["group_members_gold"]}

    TP = pred_set.intersection(gold_set)  # predicted and in gold
    FP = pred_set.difference(gold_set)    # predicted but not in gold
    FN = gold_set.difference(pred_set)    # in gold but not predicted

    precision = len(TP) / (len(TP) + len(FP)) if (len(TP) + len(FP)) > 0 else 0
    recall = len(TP) / (len(TP) + len(FN)) if (len(TP) + len(FN)) > 0 else 0

    return pd.Series([precision, recall])

# 3. Score every ancestor group, one merged row at a time
metric_columns = ["precision", "recall"]
merged[metric_columns] = merged.apply(calculate_metrics, axis=1)

# merged now carries: ancestor_concept_name, group_members_pred, group_members_gold, precision, recall
print(merged[["ancestor_concept_name"] + metric_columns])

                                ancestor_concept_name  precision    recall
0                                   vascular disorder   0.842105  0.155340
1                                          vasculitis   0.800000  0.388889
2                                      venous varices   0.750000  0.714286
3            chronic disease of cardiovascular system   0.307692  0.631579
4   congenital anomaly of cardiovascular structure...   0.636364  0.328125
5         congenital anomaly of cardiovascular system   0.582090  0.433333
6                        congenital vascular disorder   0.600000  0.339623
7                                          thrombosis   0.893333  0.549180
8                          thrombosis of blood vessel   0.634615  0.282051
9                         vascular disease of abdomen   0.822222  0.243421
10                                           embolism   0.982143  0.833333
11                               heart valve disorder   0.924731  0.716667
12             acute dise

In [247]:
merged

Unnamed: 0,ancestor_concept_name,group_members_pred,group_members_gold,precision,recall
0,vascular disorder,"[Vascular hemostatic disease, Aortic thromboem...","[atherosclerosis of right carotid artery, acut...",0.052632,0.106667
1,vasculitis,"[Retinal vasculitis, Necrotizing vasculitis, C...","[syphilitic aortitis, syphilitic aortitis]",0.028571,1.0
2,venous varices,[Varicose veins of lower extremity with inflam...,[varicose veins of lower extremity with ulcer ...,0.05,1.0
3,chronic disease of cardiovascular system,"[Chronic congestive heart failure, Systolic hy...",[acute on chronic combined systolic and diasto...,0.0,0.0
4,congenital anomaly of cardiovascular structure...,"[Congenital anomaly of vena cava, Congenital a...","[right ventricular outflow tract obstruction, ...",0.060606,1.0
5,congenital anomaly of cardiovascular system,"[Hypoplasia of right heart, Right hypoplastic ...","[congenital stenosis of tricuspid valve, myoca...",0.119403,0.5
6,congenital vascular disorder,"[Arteriovenous malformation of limb, Spinal ar...","[congenital anomaly of coronary artery, total ...",0.0,0.0
7,thrombosis,"[Thrombotic microangiopathy, Thrombus of cardi...",[embolism and thrombosis of the thoracic aorta...,0.226667,0.435897
8,thrombosis of blood vessel,"[Thrombosis of iliac artery, Thrombosis of abd...","[cerebral venous thrombosis in the puerperium,...",0.019231,0.333333
9,vascular disease of abdomen,"[Dissection of abdominal aorta, Atherosclerosi...",[acute gastric ulcer with hemorrhage but witho...,0.088889,0.4


In [255]:
merged.to_csv("finalEval_GPTgroups.csv", index=False)