# Prepare Analysis Dataset


In [54]:
import pandas as pd
import json
import igraph as ig

In [55]:
path = "../output/cluster-qualifications/cluster-label-tree/cluster_labels_tags_full_raw.json"
with open(path, "r") as f:
    cluster_labels_tags = json.load(f)

path = (
    "../output/cluster-qualifications/cluster-label-tree/cluster_labels_full_raw.json"
)
with open(path, "r") as f:
    cluster_labels = json.load(f)

path = "../output/cluster-qualifications/cluster-label-tree/cluster_tree_full_raw.json"
with open(path, "r") as f:
    cluster_tree = json.load(f)

### removing the procedural/noise clusters (tag 9), and those with less than 50 papers.


In [56]:
pg = "../data/07-clustered-graphs/alpha0.3_k10_res0.002.graphml"


g = ig.Graph.Read_GraphML(pg)

pdf = "../data/06-clustered-df/alpha0.3_k10_res0.002.pkl"
df = pd.read_pickle(pdf)

  g = ig.Graph.Read_GraphML(pg)


In [57]:
# cluster with less than 50 papers:
too_few_papers_50 = [
    "133",
    "134",
    "135",
    "136",
    "137",
    "138",
    "139",
    "140",
    "141",
    "142",
    "143",
    "144",
    "145",
    "146",
    "147",
    "148",
    "149",
]

noise_clusters = [
    cluster for cluster, [_, tag] in cluster_labels_tags.items() if tag == 9
]

print("Noise Clusters:", noise_clusters)
for i in noise_clusters:
    print(i, ":", cluster_labels_tags[i][0])
print("=" * 50)
print("Too Few Papers Clusters (<50):", too_few_papers_50)
for i in too_few_papers_50:
    print(i, ":", cluster_labels_tags[i][0])

clusters_to_remove = [int(cluster) for cluster in too_few_papers_50 + noise_clusters]

Noise Clusters: ['32', '39', '68', '76', '88', '94', '96', '112', '116', '147']
32 : Paroxetine
39 : Fluoxetine
68 : Sertraline for Depression
76 : Citalopram for Depression
88 : SSRI Use
94 : Gatways to Clinical Trials
96 : Sertraline
112 : Selective serotonin reuptake inhibitors
116 : Movement Disorders Associated with Fluvoxamine
147 : Gateways to Clinical Trials
Too Few Papers Clusters (<50): ['133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149']
133 : Exercise for Depression
134 : SSRIs for Narcolepsy
135 : SSRIs for Tourette's Syndrome
136 : Adulterants of SSRIs in Weight Loss Supplements
137 : Risk of Microscopic Colitis
138 : Psilocybin for Depression
139 : SSRIs for Psychotic Depression (STOP-PD Study)
140 : SSRIs for OCD-like behaviors
141 : SSRI Pharmacokinetics After Bariatric Surgery
142 : Risk of Priapism
143 : SSRIs for Depersonalization Disorder
144 : Risk of Reversible Cerebral Vasoconstriction Syndrome (

### update dataframe


In [58]:
print(df.shape)
df_analysis = df[~df["cluster_alpha0.3_k10_res0.002"].isin(clusters_to_remove)]
print(df_analysis.shape)

df_analysis.to_pickle("../data/08-analysis-data/df_analysis.pkl")

(40643, 38)
(38473, 38)


### update graph


In [59]:
initial_num_vertices = g.vcount()
initial_num_edges = g.ecount()

nodes_to_remove = [v.index for v in g.vs if v["cluster"] in clusters_to_remove]

# Delete the identified nodes from the graph
g.delete_vertices(nodes_to_remove)

# Get the number of vertices and edges after removal
final_num_vertices = g.vcount()
final_num_edges = g.ecount()

# Save the graph
g.write_graphml("../data/08-analysis-data/graph_analysis.graphml")

# Display the results
print("Before removal:")
print(f"Number of vertices: {initial_num_vertices}")
print(f"Number of edges: {initial_num_edges}")

print("\nAfter removal:")
print(f"Number of vertices: {final_num_vertices}")
print(f"Number of edges: {final_num_edges}")

Before removal:
Number of vertices: 40643
Number of edges: 602780

After removal:
Number of vertices: 38473
Number of edges: 559510


# New Qualifications


### remove clusters from filtered_cluster_labels_tags


In [60]:
# remove the clusters from the cluster label dict

filtered_cluster_labels_tags = {
    cluster: label
    for cluster, label in cluster_labels_tags.items()
    if cluster not in clusters_to_remove
}

with open(
    "../output/cluster-qualifications/cluster-label-tree/cluster_labels_tags_filtered.json",
    "w",
) as f:
    json.dump(filtered_cluster_labels_tags, f)

filtered_cluster_labels = {
    cluster: label
    for cluster, label in cluster_labels.items()
    if cluster not in clusters_to_remove
}

with open(
    "../output/cluster-qualifications/cluster-label-tree/cluster_labels_filtered.json",
    "w",
) as f:
    json.dump(filtered_cluster_labels, f)

### remove clusters from cluster_tree


In [61]:
def remove_clusters(data, remove_list):
    # Traverse the dictionary recursively
    for key, value in data.items():
        if isinstance(value, list):
            # Remove items in `remove_list` from this list
            data[key] = [num for num in value if num not in remove_list]
        elif isinstance(value, dict):
            # Recur for dictionaries
            remove_clusters(value, remove_list)


def remove_empty_dicts(data):
    # Store keys to delete after processing to avoid modifying dict while iterating
    keys_to_delete = []

    for key, value in data.items():
        if isinstance(value, dict):
            # Recur on the nested dictionary
            remove_empty_dicts(value)
            # Mark for deletion if the dictionary is empty after recursion
            if not value:
                keys_to_delete.append(key)
        elif isinstance(value, list) and not value:
            # Mark for deletion if it's an empty list
            keys_to_delete.append(key)

    # Delete all keys marked as empty
    for key in keys_to_delete:
        del data[key]


filtered_cluster_tree = cluster_tree.copy()
remove_clusters(filtered_cluster_tree, [int(c) for c in clusters_to_remove])
remove_empty_dicts(filtered_cluster_tree)

### put single clusters under the "Other" category


In [62]:
# put single clusters under the "Other" category

filtered_cluster_tree = {
    "Pharmacology": {
        "Pharmacodynamics": {
            "Mechanism of action": [
                0,
                6,
                10,
                11,
                26,
                28,
                36,
                55,
                60,
                64,
                73,
                79,
                86,
                98,
                106,
                107,
                115,
                119,
                120,
                124,
                131,
            ],
            "Animal Models of Disorders": [7, 13, 127],
            "Other": [69, 92],
        },
        "Pharmacokinetics": [5, 56],
        "Pharmacogenetics": [18, 80],
        "Analytical Methods": [3, 129],
        "Other": [2, 57, 66, 91],
    },
    "Indications": {
        "Disorders": {
            "Depressive Disorders": [
                7,
                8,
                9,
                13,
                17,
                18,
                19,
                22,
                24,
                30,
                34,
                42,
                47,
                49,
                59,
                61,
                74,
                77,
                83,
                84,
                85,
                90,
                92,
                95,
                93,
                109,
                102,
                103,
                105,
                105,
                106,
                110,
                114,
                117,
                118,
                119,
                122,
                124,
                127,
                131,
            ],
            "Anxiety": [26, 31, 37, 63, 72],
            "Weight and Eating Disorders": [29, 50],
            "Substance Abuse/Addictions": [51, 60, 104, 125],
            "Other": [
                4,
                14,
                23,
                25,
                35,
                40,
                41,
                45,
                48,
                52,
                53,
                54,
                58,
                62,
                69,
                70,
                89,
                99,
                100,
                101,
                111,
                121,
                123,
                130,
                126,
            ],
        }
    },
    "Safety": {
        "Special Populations": {
            "Pediatrics": [9, 72],
            "Geriatrics": [23, 38, 49, 61, 65, 67, 74],
            "Other": [19, 34, 47, 59, 84, 85, 93, 105, 114, 118, 122],
        },
        "Perinatal Exposure": [2, 12, 102],
        "Suicide": [7, 27],
        "Sexual Dysfunction": [15, 91],
        "Toxicity": [16, 66, 71, 82],
        "Other": [20, 43, 44, 46, 63, 75, 78, 86, 87, 92, 99, 132],
    },
    "Other": {
        "Ecotoxicology": [1, 81],
        "Alternative Treatments": [33, 103],
        "Other": [21, 30, 53, 97, 108, 128],
    },
}


path = f"../output/cluster-qualifications/cluster-label-tree/cluster_tree_filtered.json"

with open(path, "w") as f:
    json.dump(cluster_tree, f)

# create legend for ThreeJS


In [63]:
def transform_dict_to_legend(cluster_hierarchy_dict, cluster_label_dict):
    """
    Transforms the cluster hierarchy dictionary by adding cluster labels to create a legend.
    """
    # Ensure keys in cluster_label_dict are integers
    cluster_label_dict = {int(k): v for k, v in cluster_label_dict.items()}

    def transform(item):
        if isinstance(item, dict):
            return {k: transform(v) for k, v in item.items()}
        elif isinstance(item, list):
            return [transform(i) for i in item]
        elif isinstance(item, int) and item in cluster_label_dict:
            return {item: cluster_label_dict[item]}
        else:
            return item

    return transform(cluster_hierarchy_dict)


legend = transform_dict_to_legend(filtered_cluster_tree, filtered_cluster_labels)
legend

{'Pharmacology': {'Pharmacodynamics': {'Mechanism of action': [{0: 'Serotonin Receptor Modulation in SSRI Treatment'},
    {6: 'Impact of SSRIs on Neurogenesis'},
    {10: 'SSRIs Effect on Neural Processing of Emotional Cues'},
    {11: 'Paroxetine Binding'},
    {26: 'SSRIs Effect on Fear'},
    {28: 'Structural Basis of Serotonin Transporter Inhibition by SSRIs'},
    {36: 'SSRIs and Inflammation'},
    {55: 'SSRIs Effects on Neuroendocrine System'},
    {60: 'SSRIs for Substance Abuse (Cocaine)'},
    {64: 'SSRIs Effects on Ion Channels'},
    {73: 'SSRIs Effects on Antimicrobials and Gut Microbiome'},
    {79: 'Astrocyte Receptors as a Target for SSRIs'},
    {86: 'Repeated SSRIs Exposures Effects on Dopamine Receptors'},
    {98: 'SSRIs in Model Organisms ( C. Elegans and Drosophilia)'},
    {106: 'Predicting SSRI Response in Depression via miRNA and BDNF'},
    {107: 'SSRIs Effect on Reinforcement Learning in Rodents'},
    {115: 'Neurochemical and Electrophysiological Correlates

# D3 JS Structue

### Build Full Tree Incl Labels


In [96]:
class ClusterHierarchyTransformerD3JS:
    def __init__(self, tree_data, cluster_labels_dict):
        """
        Initialize the ClusterHierarchyTransformerD3JS with tree data and a cluster label mapping.

        :param tree_data: The initial tree structure (dictionary).
        :param cluster_labels_dict: Dictionary mapping of numbers to labels.
        """
        self.tree_data = tree_data
        self.cluster_labels_dict = {
            int(k): v for k, v in cluster_labels_dict.items()
        }  # Ensure keys are integers

    def replace_numbers_with_labels(self, node):
        """Recursively replace numbers with labels based on the mapping."""
        if isinstance(node, list):
            # Replace each number in the list with its label if it exists in the mapping
            return [self.cluster_labels_dict[int(item)] for item in node]
        elif isinstance(node, dict):
            # Recursively replace keys and values in the dictionary
            return {
                key: self.replace_numbers_with_labels(value)
                for key, value in node.items()
            }
        else:
            # Handle individual items
            try:
                return self.cluster_labels_dict[int(node)]
            except (ValueError, KeyError):
                return node

    def transform_structure(self, name, data):
        """Transform the structure into a detailed format with all leaves."""
        if isinstance(data, dict):
            children = [
                self.transform_structure(key, value) for key, value in data.items()
            ]
            return {"name": name, "children": children}
        elif isinstance(data, list):
            return {"name": name, "children": [{"name": item} for item in data]}
        else:
            return {"name": name}

    def transform_overview(self, name, data):
        """Transform the structure for overview, skipping leaf nodes."""
        if isinstance(data, dict):
            children = [
                self.transform_overview(key, value)
                for key, value in data.items()
                if isinstance(value, (dict, list))
            ]
            return {"name": name, "children": children}
        elif isinstance(data, list):
            return {"name": name}
        else:
            return {"name": name}

    def create_transformed_structure(self):
        """Generate the final transformed structure with overview and detailed sections."""
        self.tree_with_labels = self.replace_numbers_with_labels(self.tree_data)
        self.desired_structure = {
            "overview": self.transform_overview("Topic Overview", self.tree_with_labels)
        }

        # Add individual topics with full details
        for topic, content in self.tree_with_labels.items():
            self.desired_structure[topic.lower()] = self.transform_structure(
                topic, content
            )

        return self.desired_structure

    def save_to_json(self, file_path):
        """Save the transformed structure to a JSON file."""
        with open(file_path, "w") as f:
            json.dump(self.desired_structure, f, indent=2)
        print(f"Structure saved to {file_path}")


# Initialize the transformer
transformer = ClusterHierarchyTransformerD3JS(
    filtered_cluster_tree, filtered_cluster_labels
)

d3js_structure = transformer.create_transformed_structure()

# Save to JSON
transformer.save_to_json(
    "../output/cluster-qualifications/cluster-label-tree/D3JS_cluster_hierarchy_structure.json"
)
d3js_structure

Structure saved to ../output/cluster-qualifications/cluster-label-tree/D3JS_cluster_hierarchy_structure.json


{'overview': {'name': 'Topic Overview',
  'children': [{'name': 'Pharmacology',
    'children': [{'name': 'Pharmacodynamics',
      'children': [{'name': 'Mechanism of action'},
       {'name': 'Animal Models of Disorders'},
       {'name': 'Other'}]},
     {'name': 'Pharmacokinetics'},
     {'name': 'Pharmacogenetics'},
     {'name': 'Analytical Methods'},
     {'name': 'Other'}]},
   {'name': 'Indications',
    'children': [{'name': 'Disorders',
      'children': [{'name': 'Depressive Disorders'},
       {'name': 'Anxiety'},
       {'name': 'Weight and Eating Disorders'},
       {'name': 'Substance Abuse/Addictions'},
       {'name': 'Other'}]}]},
   {'name': 'Safety',
    'children': [{'name': 'Special Populations',
      'children': [{'name': 'Pediatrics'},
       {'name': 'Geriatrics'},
       {'name': 'Other'}]},
     {'name': 'Perinatal Exposure'},
     {'name': 'Suicide'},
     {'name': 'Sexual Dysfunction'},
     {'name': 'Toxicity'},
     {'name': 'Other'}]},
   {'name': 'Oth

In [89]:
filtered_cluster_labels

{0: 'Serotonin Receptor Modulation in SSRI Treatment',
 1: 'Aquatic Ecotoxicology',
 2: 'Risks of Prenatal Exposure',
 3: 'Quantification of SSRIs in Biological Samples',
 4: 'SSRIs for Obsessive-Compulsive Disorder (OCD)',
 5: 'SSRIs and the Cytochrome P450 System',
 6: 'Impact of SSRIs on Neurogenesis',
 7: 'The Chronic Unpredictable Mild Stress Model of Depression',
 8: 'Fluvoxamine for Depression',
 9: 'Pediatric Depression',
 10: 'SSRIs Effect on Neural Processing of Emotional Cues',
 11: 'Paroxetine Binding',
 12: 'Risk of Perinatal Exposure (Rodents)',
 13: 'SSRIs in Forced Swimming Test',
 14: 'SSRIs for PTSD',
 15: 'Sexual Dysfunction',
 16: 'Serotonin Syndrome',
 17: 'Sequenced Depression Treatment',
 18: 'Serotonin Transporter Gene and Antidepressant Response',
 19: 'Post-Stroke SSRI Use',
 20: 'Bleeding Risk',
 21: 'SSRI Utilization Patterns',
 22: 'SSRIs for Bipolar Depression',
 23: 'SSRIs in Dementias',
 24: 'Escitalopram for Depression',
 25: 'SSRIs for Pain',
 26: 'SSR

In [90]:
filtered_cluster_labels["0"]

filtered_cluster_labels = {int(k): v for k, v in filtered_cluster_labels.items()}

KeyError: '0'

In [None]:
filtered_cluster_labels = {int(k): v for k, v in filtered_cluster_labels.items()}


def replace_numbers_with_labels(node):
    """Recursively replace numbers with labels based on the mapping."""
    if isinstance(node, list):
        # Replace each number in the list with its label if it exists in the mapping
        return [filtered_cluster_labels[int(item)] for item in node]
    elif isinstance(node, dict):
        # Recursively replace keys and values in the dictionary
        return {key: replace_numbers_with_labels(value) for key, value in node.items()}
    else:
        # Handle individual items
        return node

In [92]:
twl = replace_numbers_with_labels(filtered_cluster_tree)

In [93]:
twl

{'Pharmacology': {'Pharmacodynamics': {'Mechanism of action': ['Serotonin Receptor Modulation in SSRI Treatment',
    'Impact of SSRIs on Neurogenesis',
    'SSRIs Effect on Neural Processing of Emotional Cues',
    'Paroxetine Binding',
    'SSRIs Effect on Fear',
    'Structural Basis of Serotonin Transporter Inhibition by SSRIs',
    'SSRIs and Inflammation',
    'SSRIs Effects on Neuroendocrine System',
    'SSRIs for Substance Abuse (Cocaine)',
    'SSRIs Effects on Ion Channels',
    'SSRIs Effects on Antimicrobials and Gut Microbiome',
    'Astrocyte Receptors as a Target for SSRIs',
    'Repeated SSRIs Exposures Effects on Dopamine Receptors',
    'SSRIs in Model Organisms ( C. Elegans and Drosophilia)',
    'Predicting SSRI Response in Depression via miRNA and BDNF',
    'SSRIs Effect on Reinforcement Learning in Rodents',
    'Neurochemical and Electrophysiological Correlates of SSRIs',
    'SSRIs Effect on Brain-derived Neurotrophic Factor (BDNF) Levels in Depressive Patient

In [None]:
twl = replace_numbers_with_labels(filtered_cluster_tree)

0
6
10
11
26
28
36
55
60
64
73
79
86
98
106
107
115
119
120
124
131
7
13
127
69
92
5
56
18
80
3
129
2
57
66
91
7
8
9
13
17
18
19
22
24
30
34
42
47
49
59
61
74
77
83
84
85
90
92
95
93
109
102
103
105
105
106
110
114
117
118
119
122
124
127
131
26
31
37
63
72
29
50
51
60
104
125
4
14
23
25
35
40
41
45
48
52
53
54
58
62
69
70
89
99
100
101
111
121
123
130
126
9
72
23
38
49
61
65
67
74
19
34
47
59
84
85
93
105
114
118
122
2
12
102
7
27
15
91
16
66
71
82
20
43
44
46
63
75
78
86
87
92
99
132
1
81
33
103
21
30
53
97
108
128
