In [17]:
# imports
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pickle
import community as community_louvain  # pip install python-louvain
from collections import defaultdict

In [18]:
# Extract the dataframe (aggregated)
path1 = "../dataframes/df_aggregated.pk1"
with open(path1, "rb") as f:
    df = pickle.load(f)


# Extract Information for Gephi

### Using Cosine similarity
The goal is to measure the direct similarity between diseases based on their symptom vectors (or features),

In [19]:
# Get disease names
diseases = df.iloc[:, 0]


In [20]:
# Extract symptom matrix 
symptom_matrix = df.iloc[:, 1:].values  

# Check dimensions
print("Number of diseases:", len(diseases))
print("Number of symptoms:", symptom_matrix.shape[1])
print("Symptom matrix shape:", symptom_matrix.shape)

Number of diseases: 773
Number of symptoms: 377
Symptom matrix shape: (773, 377)


In [21]:
# Compute cosine similarity matrix
cosine_sim_matrix = cosine_similarity(symptom_matrix)

In [22]:
# Convert similarity matrix to df
cosine_df = pd.DataFrame(cosine_sim_matrix, index=diseases, columns=diseases)

In [23]:
cosine_df

diseases,abdominal aortic aneurysm,abdominal hernia,abscess of nose,abscess of the lung,abscess of the pharynx,acanthosis nigricans,acariasis,achalasia,acne,actinic keratosis,...,von willebrand disease,vulvar cancer,vulvar disorder,vulvodynia,wernicke korsakoff syndrome,white blood cell disease,whooping cough,wilson disease,yeast infection,zenker diverticulum
diseases,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
abdominal aortic aneurysm,1.000000,0.129099,0.000000,0.204124,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.235702,0.000000,0.353553,0.000000,0.136083,0.182574,0.000000,0.136083,0.000000
abdominal hernia,0.129099,1.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.182574,0.000000,0.105409,0.000000,0.000000,0.105409,0.000000
abscess of nose,0.000000,0.000000,1.000000,0.176777,0.500000,0.0,0.000000,0.158114,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.353553,0.632456,0.204124,0.000000,0.204124
abscess of the lung,0.204124,0.000000,0.176777,1.000000,0.176777,0.0,0.000000,0.223607,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.288675,0.166667,0.447214,0.288675,0.000000,0.000000
abscess of the pharynx,0.000000,0.000000,0.500000,0.176777,1.000000,0.0,0.000000,0.474342,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.235702,0.474342,0.204124,0.000000,0.408248
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
white blood cell disease,0.136083,0.105409,0.353553,0.166667,0.235702,0.0,0.000000,0.149071,0.000000,0.000000,...,0.0,0.000000,0.000000,0.192450,0.000000,1.000000,0.298142,0.192450,0.111111,0.000000
whooping cough,0.182574,0.000000,0.632456,0.447214,0.474342,0.0,0.000000,0.200000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.298142,1.000000,0.258199,0.000000,0.258199
wilson disease,0.000000,0.000000,0.204124,0.288675,0.204124,0.0,0.000000,0.258199,0.000000,0.000000,...,0.0,0.333333,0.000000,0.000000,0.000000,0.192450,0.258199,1.000000,0.000000,0.000000
yeast infection,0.136083,0.105409,0.000000,0.000000,0.000000,0.0,0.166667,0.000000,0.111111,0.100504,...,0.0,0.000000,0.408248,0.384900,0.000000,0.111111,0.000000,0.000000,1.000000,0.000000


In [24]:
# Set diagonal values to 0 (self-similarity)
np.fill_diagonal(cosine_sim_matrix, 0)

# Find max similarity value (off-diagonal)
max_sim = np.max(cosine_sim_matrix)

# Find disease pairs with max similarity
max_pairs = np.argwhere(cosine_sim_matrix == max_sim)

In [25]:
# Extract upper triangle (to avoid duplicate pairs)
upper_triangle_indices = np.triu_indices_from(cosine_sim_matrix, k=1)

# Get disease pairs and similarity scores
similarity_scores = cosine_sim_matrix[upper_triangle_indices]
disease_pairs = [(diseases[i], diseases[j]) for i, j in zip(*upper_triangle_indices)]

# df disease pairs and similarities
similarity_df = pd.DataFrame(disease_pairs, columns=["Disease 1", "Disease 2"])
similarity_df["Cosine Similarity"] = similarity_scores


In [26]:
similarity_df

Unnamed: 0,Disease 1,Disease 2,Cosine Similarity
0,abdominal aortic aneurysm,abdominal hernia,0.129099
1,abdominal aortic aneurysm,abscess of nose,0.000000
2,abdominal aortic aneurysm,abscess of the lung,0.204124
3,abdominal aortic aneurysm,abscess of the pharynx,0.000000
4,abdominal aortic aneurysm,acanthosis nigricans,0.000000
...,...,...,...
298373,whooping cough,yeast infection,0.000000
298374,whooping cough,zenker diverticulum,0.258199
298375,wilson disease,yeast infection,0.000000
298376,wilson disease,zenker diverticulum,0.000000


In [27]:
# Create a disease list from the top 50 pairs (unique diseases)
diseases_list = np.unique(disease_pairs)

In [28]:
# Create a zero matrix for the similarity values between diseases
similarity_matrix = np.zeros((len(diseases_list), len(diseases_list)))
similarity_values = similarity_df["Cosine Similarity"].values

## Translate Data for Gephi Import
Utilizing community analysis to create unique .csv files to import for our Gephi visual

In [29]:
disease_pairs = similarity_df[['Disease 1', 'Disease 2']].values
similarity_values = similarity_df['Cosine Similarity'].values

In [30]:
# Create edge list for Gephi (Source, Target, Weight)
edge_list = []
for (disease_1, disease_2), similarity in zip(disease_pairs, similarity_values):
    if similarity > 0:  # Only include edges with similarity > 0
        edge_list.append([disease_1, disease_2, similarity])

edge_df = pd.DataFrame(edge_list, columns=["Source", "Target", "Weight"])
edge_df.to_csv("disease_similarity_edge_list.csv", index=False)

In [31]:
# Create a graph from the edge list
G = nx.Graph()
for (disease_1, disease_2), similarity in zip(disease_pairs, similarity_values):
    if similarity > 0:
        G.add_edge(disease_1, disease_2, weight=similarity)

# Apply Louvain
partition = community_louvain.best_partition(G, weight='weight')

# Create a node list with correct ID and Label format
# Create a mapping of diseases to unique indices
disease_to_index = {disease: idx for idx, disease in enumerate(G.nodes())}

# Create a node list with unique ID, disease name, and community ID
node_list = []
for disease, community_id in partition.items():
    node_list.append([disease_to_index[disease], disease, community_id])  # Use unique index as ID

# Convert to DataFrame and save node list
node_df = pd.DataFrame(node_list, columns=["Id", "Label", "Community"])
node_df.to_csv("disease_nodes_with_communities.csv", index=False)

In [32]:
node_df

Unnamed: 0,Id,Label,Community
0,0,abdominal aortic aneurysm,6
1,1,abdominal hernia,6
2,2,abscess of the lung,3
3,3,acute bronchiolitis,1
4,4,acute bronchitis,1
...,...,...,...
768,768,trigger finger (finger disorder),4
769,769,open wound of the finger,4
770,770,open wound of the hand,4
771,771,fibrocystic breast disease,2


In [33]:
num_communities = len(set(partition.values()))
print(f"Number of Communities: {num_communities}")

Number of Communities: 7


In [34]:
# Create a dictionary to group diseases by community
community_groups = defaultdict(list)
for disease, community_id in partition.items():
    community_groups[community_id].append(disease)

# Open a file in write mode to save the communities and their diseases
with open("diseases_per_community.txt", "w") as f:
    for community_id, diseases in community_groups.items():
        f.write(f"Community {community_id}:\n")
        for disease in diseases:
            f.write(f"  - {disease}\n")
        f.write("\n")  # Add an empty line between communities for readability

# Optional: Print the output to the notebook as well
with open("diseases_per_community.txt", "r") as f:
    print(f.read())

Community 6:
  - abdominal aortic aneurysm
  - abdominal hernia
  - acute fatty liver of pregnancy (aflp)
  - acute kidney injury
  - acute pancreatitis
  - adrenal adenoma
  - alcoholic liver disease
  - anal fissure
  - appendicitis
  - ascending cholangitis
  - benign kidney cyst
  - benign vaginal discharge (leukorrhea)
  - bladder disorder
  - celiac disease
  - cervical disorder
  - cervicitis
  - chlamydia
  - cholecystitis
  - choledocholithiasis
  - chronic constipation
  - chronic pancreatitis
  - cirrhosis
  - colonic polyp
  - colorectal cancer
  - crohn disease
  - cystitis
  - diabetic ketoacidosis
  - diverticulitis
  - diverticulosis
  - drug withdrawal
  - ectopic pregnancy
  - endometriosis
  - epididymitis
  - esophageal cancer
  - esophagitis
  - fluid overload
  - foreign body in the gastrointestinal tract
  - gallstone
  - gas gangrene
  - gastritis
  - gastroduodenal ulcer
  - gastroesophageal reflux disease (gerd)
  - gastrointestinal hemorrhage
  - gastroparesi

In [35]:
# Compute Betweenness Centrality
betweenness = nx.betweenness_centrality(G, weight='weight', k=100)

df = pd.DataFrame([(disease_to_index[disease], bet) for disease, bet in betweenness.items()], 
                  columns=["Id", "Betweenness"])
df.to_csv("betweenness_values.csv", index=False)


In [36]:
# Load similarity data
disease_pairs = similarity_df[['Disease 1', 'Disease 2']].values
similarity_values = similarity_df['Cosine Similarity'].values

# Create a graph from the edge list
G = nx.Graph()
for (disease_1, disease_2), similarity in zip(disease_pairs, similarity_values):
    if similarity > 0:  # Only include edges with similarity > 0
        G.add_edge(disease_1, disease_2, weight=similarity)

# Apply Louvain
partition = community_louvain.best_partition(G, weight='weight')

# Create a mapping of diseases to unique indices
disease_to_index = {disease: idx for idx, disease in enumerate(G.nodes())}

# Compute degrees
degree_dict = dict(G.degree())

# Dictionary of diseases by community 
community_groups = defaultdict(list)
for disease, community_id in partition.items():
    community_groups[community_id].append(disease)

# Store top 4-5 highest-degree nodes per community
top_nodes_per_community = []

for community_id, diseases_in_community in community_groups.items():
    if len(diseases_in_community) > 1:  # Only consider meaningful communities
        # Sort diseases by degree in descending order
        sorted_diseases = sorted(diseases_in_community, key=lambda d: degree_dict.get(d, 0), reverse=True)
        # Take the top 4 or 5 (if at least 5 exist)
        top_diseases = sorted_diseases[:5]

        for disease in top_diseases:
            top_nodes_per_community.append([
                community_id,
                disease_to_index[disease], 
                disease, 
                degree_dict[disease]
            ])

# Convert to DataFrame and save
top_nodes_df = pd.DataFrame(top_nodes_per_community, columns=["Community", "Id", "Label", "Degree"])
top_nodes_df.to_csv("top_degree_nodes_per_community.csv", index=False)



In [41]:
# Create edge list with IDs
edge_list = []
for (disease_1, disease_2), similarity in zip(disease_pairs, similarity_values):
    if similarity > 0:
        edge_list.append([
            disease_to_index[disease_1],  # Convert Source to ID
            disease_to_index[disease_2],  # Convert Target to ID
            similarity
        ])

# Convert to DataFrame and save
edge_df = pd.DataFrame(edge_list, columns=["Source", "Target", "Weight"])
edge_df.to_csv("disease_similarity_edge_list.csv", index=False)

In [38]:
# Print the highest-degree nodes per community in Jupyter Notebook
for community_id, diseases_in_community in community_groups.items():
    if len(diseases_in_community) > 1:  # Only consider meaningful communities
        sorted_diseases = sorted(diseases_in_community, key=lambda d: degree_dict.get(d, 0), reverse=True)
        top_diseases = sorted_diseases[:5]  # Get top 4-5 nodes

        print(f"\nCommunity {community_id}:")
        for disease in top_diseases:
            print(f"  - {disease} (Degree: {degree_dict[disease]})")



Community 0:
  - hypokalemia (Degree: 403)
  - pain after an operation (Degree: 390)
  - acute kidney injury (Degree: 367)
  - gastritis (Degree: 363)
  - diabetic ketoacidosis (Degree: 355)

Community 1:
  - pleural effusion (Degree: 361)
  - strep throat (Degree: 353)
  - lymphadenitis (Degree: 352)
  - malignant hypertension (Degree: 346)
  - sarcoidosis (Degree: 341)

Community 5:
  - drug reaction (Degree: 392)
  - acute stress reaction (Degree: 381)
  - lyme disease (Degree: 357)
  - intracerebral hemorrhage (Degree: 355)
  - concussion (Degree: 354)

Community 4:
  - sickle cell crisis (Degree: 362)
  - chronic pain disorder (Degree: 346)
  - muscle spasm (Degree: 317)
  - fibromyalgia (Degree: 310)
  - injury to the trunk (Degree: 296)

Community 2:
  - shingles (herpes zoster) (Degree: 379)
  - food allergy (Degree: 284)
  - postoperative infection (Degree: 278)
  - allergy (Degree: 239)
  - impetigo (Degree: 232)

Community 3:
  - conjunctivitis due to virus (Degree: 196)
  

In [39]:
print(f"Total number of communities: {len(community_groups)}")


Total number of communities: 6


In [40]:
# Compute node degrees
degree_dict = dict(G.degree(weight=None))  # unweighted degree

# Create list to store scored pairs
scored_pairs = []

for idx, row in similarity_df.iterrows():
    d1 = row['Disease 1']
    d2 = row['Disease 2']
    similarity = row['Cosine Similarity']
    
    # Only consider connected pairs with similarity > 0
    if similarity > 0 and G.has_edge(d1, d2):
        deg1 = degree_dict.get(d1, 0)
        deg2 = degree_dict.get(d2, 0)
        avg_deg = (deg1 + deg2) / 2
        misdiagnosis_score = similarity * avg_deg
        
        scored_pairs.append({
            'Disease 1': d1,
            'Disease 2': d2,
            'Similarity': similarity,
            'Degree 1': deg1,
            'Degree 2': deg2,
            'Avg Degree': avg_deg,
            'Misdiagnosis Risk Score': misdiagnosis_score
        })

# Convert to DataFrame
risk_df = pd.DataFrame(scored_pairs)

# Sort by score in descending order
risk_df = risk_df.sort_values(by='Misdiagnosis Risk Score', ascending=False)

# Save or display top results
risk_df.to_csv('misdiagnosis_risk_scores.csv', index=False)
print(risk_df.head(10))  # Show top 10 highest-risk pairs


                        Disease 1  \
48552  infectious gastroenteritis   
38763                   gastritis   
18672               cholecystitis   
2657           acute bronchospasm   
10950                 atelectasis   
2179             acute bronchitis   
4254        acute stress reaction   
47148                 hypovolemia   
3577           acute pancreatitis   
52406                kidney stone   

                                          Disease 2  Similarity  Degree 1  \
48552                 noninfectious gastroenteritis    0.916667       333   
38763                                   hypokalemia    0.700000       363   
18672                                     gallstone    0.870388       302   
2657                                      pneumonia    0.870388       289   
10950                          poisoning due to gas    0.771517       335   
2179   chronic obstructive pulmonary disease (copd)    0.870388       323   
4254                                        anxiety  