# Prepare Analysis Dataset


### Things in this Notebook:

1. Remove all papers that belong to clusters that are either

   a. too small (fewer than 100 papers), or

   b. noise clusters (tag 9)

2. Remove these clusters from the tree hierarchy and cluster label dictionary
3. Remove these papers from the paper dictionary
4. Create a legend for Three.js
5. Create the file for the D3.js visualization (tree and dendrogram)


In [19]:
import copy
import json
import os

import igraph as ig
import pandas as pd
from dotenv import load_dotenv
# Read the project's .env file so its entries become environment variables.
load_dotenv()

# Project locations taken from the environment; a value is None when the
# corresponding variable is not set.
python_path = os.environ.get('PYTHONPATH')
data_dir = os.environ.get('DATA_DIR')
src_dir = os.environ.get('SRC_DIR')
output_dir = os.environ.get('OUTPUT_DIR')
threejs_dir = os.environ.get('THREEJS_OUTPUT_DIR')

In [8]:
# The three raw artefacts (labels with tags, labels only, hierarchy tree)
# are stored side by side in the same folder.
label_tree_dir = output_dir + "/cluster-qualifications_2025/cluster-label-tree"

# cluster id -> [label, tag]
path = label_tree_dir + "/cluster_labels_tags_full_raw.json"
with open(path) as raw_file:
    cluster_labels_tags = json.load(raw_file)

# cluster id -> label
path = label_tree_dir + "/cluster_labels_full_raw.json"
with open(path) as raw_file:
    cluster_labels = json.load(raw_file)

# nested category tree whose leaves are lists of cluster ids
path = label_tree_dir + "/cluster_tree_full_raw.json"
with open(path) as raw_file:
    cluster_tree = json.load(raw_file)

### Removing the procedural/noise clusters (tag 9) and those with fewer than 100 papers


In [10]:
# Both inputs come from the same clustering run, identified by this file stem.
clustering_run = "FINAL_alpha0.3_k10_res0.002_iterations400"

# Clustered graph (GraphML).
pg = data_dir + "/07-clustered-graphs/2025/" + clustering_run + ".graphml"
g = ig.Graph.Read_GraphML(pg)

# Clustered dataframe (pickle written by an earlier pipeline step).
pdf = data_dir + "/06-clustered-df/2025/" + clustering_run + ".pkl"
df = pd.read_pickle(pdf)

  g = ig.Graph.Read_GraphML(pg)


In [11]:
# Size-based exclusion: clusters 100-143 hold fewer than 100 papers each
# (a <50-paper cut-off would instead start at cluster 125).
# Ids are kept as strings so they match the keys of cluster_labels_tags.
clusters_too_few_papers = [str(cluster_id) for cluster_id in range(100, 144)]

# Tag-based exclusion: every cluster whose tag is 9 counts as noise.
noise_clusters = [
    cluster_id
    for cluster_id, (_label, tag) in cluster_labels_tags.items()
    if tag == 9
]

print("Noise Clusters:", noise_clusters)
for cluster_id in noise_clusters:
    print(cluster_id, ":", cluster_labels_tags[cluster_id][0])
print("=" * 50)
print("Too Few Papers Clusters (<100):", clusters_too_few_papers)
for cluster_id in clusters_too_few_papers:
    print(cluster_id, ":", cluster_labels_tags[cluster_id][0])

# Combined removal list (string ids; a cluster may appear in both groups).
clusters_to_remove = [*clusters_too_few_papers, *noise_clusters]

Noise Clusters: ['98', '140', '142', '143']
98 : Gatways to Clinical Trials
140 : Gateways to Clinical Trials
142 : Fewer than 10
143 : Fewer than 10
Too Few Papers Clusters (<100): ['100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143']
100 : Folate for Depression
101 : SSRIs for Depression in Patients with Kidney Disease
102 : Psilocybin for Depression
103 : SSRIs Effect on Reinforcement Learning in Rodents
104 : SSRIs for Behavioral Addictions
105 : St. John's Wort for Depression
106 : SSRIs for Migraine
107 : Vilazodone for Depression
108 : SSRIs for Dysthymia
109 : SSRIs for Enuresis
110 : Paroxetines as G Protein-coupled Receptor Kinase 2 (GRK2) Inhibitor
111 : SSRIs for Trichotillomania
112 : Transcranial Direct Current Stimulatio

### update dataframe


In [12]:
print("Before removal:", df.shape)

cluster_col = "cluster_alpha0.3_k10_res0.002"


def _cluster_id_as_str(value):
    """Render a cluster id as text, dropping the '.0' of whole-number floats."""
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# The ids in clusters_to_remove are strings, so the column is normalised to
# strings before the membership test.
df[cluster_col] = df[cluster_col].apply(_cluster_id_as_str)

keep_rows = ~df[cluster_col].isin(clusters_to_remove)
df_analysis = df[keep_rows]
print("After removal:", df_analysis.shape)
df_analysis.to_pickle(data_dir + "/08-analysis-data/2025/df_analysis.pkl")

Before removal: (38961, 38)
After removal: (36510, 38)


### update graph


In [13]:
initial_num_vertices = g.vcount()
initial_num_edges = g.ecount()

# The filter below needs a "cluster" attribute on the vertices; fail with a
# clear message instead of a bare KeyError from the vertex lookup.
if "cluster" not in g.vertex_attributes():
    raise KeyError("graph vertices have no 'cluster' attribute")

# Normalise all cluster ids to strings in one vectorised assignment
# (whole-number floats such as 19.0 become "19", everything else str(value)).
# This avoids building a per-vertex attribute dict for every vertex.
g.vs["cluster"] = [
    str(int(value)) if isinstance(value, float) and value.is_integer() else str(value)
    for value in g.vs["cluster"]
]

# Set membership keeps the scan over all vertices linear.
removed_ids = set(clusters_to_remove)
nodes_to_remove = [
    index for index, cluster in enumerate(g.vs["cluster"]) if cluster in removed_ids
]
# Delete the identified nodes (and their incident edges) from the graph.
# NOTE: this mutates `g` in place, so re-running the cell reports the
# already-filtered counts as "before".
g.delete_vertices(nodes_to_remove)

final_num_vertices = g.vcount()
final_num_edges = g.ecount()

# Save the filtered graph
g.write_graphml(data_dir + "/08-analysis-data/2025/graph_analysis.graphml")

# Display the results
print("Before removal:")
print(f"Number of vertices: {initial_num_vertices}")
print(f"Number of edges: {initial_num_edges}")
print("\nAfter removal:")
print(f"Number of vertices: {final_num_vertices}")
print(f"Number of edges: {final_num_edges}")

Before removal:
Number of vertices: 38961
Number of edges: 584576

After removal:
Number of vertices: 36510
Number of edges: 551227


# New Qualifications


### remove clusters from cluster_labels_tags


In [14]:

# Folder that receives both filtered label files.
label_tree_dir = output_dir + "/cluster-qualifications_2025/cluster-label-tree"

# Label+tag mapping without the removed clusters (keys are string ids).
filtered_cluster_labels_tags = {}
for cluster_id, label_and_tag in cluster_labels_tags.items():
    if cluster_id in clusters_to_remove:
        continue
    filtered_cluster_labels_tags[cluster_id] = label_and_tag

tags_out_path = label_tree_dir + "/cluster_labels_tags_filtered.json"
with open(tags_out_path, "w") as out_file:
    json.dump(filtered_cluster_labels_tags, out_file)

# Plain label mapping without the removed clusters.
filtered_cluster_labels = {}
for cluster_id, cluster_label in cluster_labels.items():
    if cluster_id in clusters_to_remove:
        continue
    filtered_cluster_labels[cluster_id] = cluster_label

labels_out_path = label_tree_dir + "/cluster_labels_filtered.json"
with open(labels_out_path, "w") as out_file:
    json.dump(filtered_cluster_labels, out_file)

### remove clusters from cluster_tree


In [15]:
def remove_clusters(data, remove_list):
    """Drop the ids in ``remove_list`` from every list inside a nested dict.

    The dict is modified in place: each list value is replaced by a new list
    without the removed entries, and nested dicts are processed recursively.
    Entries are compared by their string form, with whole-number floats
    rendered without the decimal part (``19.0`` -> ``"19"``), so
    ``remove_list`` is expected to hold string ids. Values that are neither
    lists nor dicts are left alone. Returns ``None``.
    """

    def _as_text(entry):
        # 19.0 -> "19"; anything else keeps its normal str() form.
        if isinstance(entry, float) and entry.is_integer():
            return str(int(entry))
        return str(entry)

    for key in data:
        node = data[key]
        if isinstance(node, dict):
            remove_clusters(node, remove_list)
        elif isinstance(node, list):
            # Re-assigning an existing key is safe while iterating the dict.
            data[key] = [
                entry for entry in node if _as_text(entry) not in remove_list
            ]

def remove_empty_dicts(data):
    """Prune empty branches from a nested dict, in place.

    Nested dicts are cleaned first, so a dict that only contained empty
    lists/dicts becomes empty itself and is removed too. Empty lists are
    removed as well; any other value (including empty strings) is kept.
    Returns ``None``.
    """
    # Iterate over a snapshot so keys can be deleted while looping.
    for key, child in list(data.items()):
        if isinstance(child, dict):
            remove_empty_dicts(child)
            prune = not child
        else:
            prune = isinstance(child, list) and not child
        if prune:
            del data[key]

# Work on a deep copy: remove_clusters / remove_empty_dicts modify nested
# dicts in place, and dict.copy() only copies the top level, so a shallow copy
# would prune the raw `cluster_tree` as well.
filtered_cluster_tree = copy.deepcopy(cluster_tree)
# clusters_to_remove holds string ids; remove_clusters compares by string form.
remove_clusters(filtered_cluster_tree, clusters_to_remove)
remove_empty_dicts(filtered_cluster_tree)
filtered_cluster_tree

{'Pharmacology': {'Pharmacodynamics': {'Mechanism of action': [1,
    7,
    9,
    34,
    53,
    60,
    63,
    68,
    73,
    76,
    77,
    84,
    85,
    88,
    89,
    92],
   'Animal Models of Disorders': [8, 12, 17, 71],
   'Other': [10, 93]},
  'Pharmacokinetics': [6],
  'Pharmacogenetics': [20, 72],
  'Analytical Methods': [2, 27, 55, 64, 91],
  'Other Pharmacology': [15, 35, 70]},
 'Indications': {'Depressive Disorders': [5,
   9,
   12,
   14,
   19,
   21,
   23,
   36,
   41,
   43,
   47,
   58,
   59,
   61,
   74,
   75,
   79,
   80,
   81,
   86,
   92,
   94,
   95,
   96],
  'Anxiety': [17, 33, 34, 65],
  'OCD': [3, 69],
  'Weight and Eating Disorders': [26, 57],
  'Substance Abuse/Addictions': [51, 71],
  'Other': [11, 22, 38, 40, 44, 46, 50, 52, 54, 56, 67, 68, 78, 87, 90]},
 'Safety': {'Special Populations': {'Pediatrics': [5, 31],
   'Geriatrics': [18, 23, 24, 47, 58, 75],
   'Other': [43, 59, 86, 94]},
  'Perinatal Exposure': [4, 10, 42, 95],
  'Other': 

### put single clusters under the "Other" category


In [16]:
# Hand-written version of the filtered tree. This assignment replaces the
# `filtered_cluster_tree` computed in the previous cell: several of the smaller
# categories are regrouped under an additional 'Other' level (compare with the
# output of the previous cell).
# NOTE(review): the cluster ids below are typed in by hand — confirm they still
# match the programmatically filtered tree whenever the clustering is re-run.
filtered_cluster_tree = {
    'Pharmacology': {
        'Pharmacodynamics': {
            'Mechanism of action': [1, 7, 9, 34, 53, 60, 63, 68, 73, 76, 77, 84, 85, 88, 89, 92],
            'Animal Models of Disorders': [8, 12, 17, 71],
            'Other': [10, 93]
        },
        'Other': {
            'Pharmacokinetics': [6],
            'Pharmacogenetics': [20, 72],
            'Other Pharmacology': [15, 35, 70],
            'Analytical Methods': [2, 27, 55, 64, 91]
        }
    },
    'Indications': {
        'Depressive Disorders': [5, 9, 12, 14, 19, 21, 23, 36, 41, 43, 47, 58, 59, 61, 74, 75, 79, 80, 81, 86, 92, 94, 95, 96],
        'Anxiety': [17, 33, 34, 65],
        'Other': {
            'OCD': [3, 69],
            'Weight and Eating Disorders': [26, 57],
            'Substance Abuse/Addictions': [51, 71],
            'Other': [11, 22, 38, 40, 44, 46, 50, 52, 54, 56, 67, 68, 78, 87, 90]
        }
    },
    'Safety': {
        'Special Populations': {
            'Pediatrics': [5, 31],
            'Geriatrics': [18, 23, 24, 47, 58, 75],
            'Other': [43, 59, 86, 94]
        },
        'Perinatal Exposure': [4, 10, 42, 95],
        'Other': [16, 25, 28, 30, 31, 32, 37, 45, 48, 49, 66, 82, 83, 99]
    },
    'Other': {
        'Ecotoxicology': [0, 62],
        'Other': [13, 29, 39, 97]
    }
}

# Persist the restructured tree next to the other filtered label files.
path = output_dir + "/cluster-qualifications_2025/cluster-label-tree/cluster_tree_filtered.json"

with open(path, "w") as f:
    json.dump(filtered_cluster_tree, f, indent=4)

# create legend for ThreeJS


In [20]:
def transform_dict_to_legend(cluster_hierarchy_dict, cluster_label_dict):
    """
    Build a legend by attaching labels to the cluster ids of a hierarchy.

    The hierarchy is walked recursively (dicts and lists keep their shape).
    Every integer that has an entry in ``cluster_label_dict`` is replaced by
    a one-item dict ``{cluster_id: label}``; anything else is returned as is.
    The keys of ``cluster_label_dict`` may be ints or int-like strings.
    """
    # Normalise the label keys to int so they match the ids in the tree.
    labels_by_id = {}
    for raw_id, label in cluster_label_dict.items():
        labels_by_id[int(raw_id)] = label

    def _annotate(node):
        if isinstance(node, dict):
            return {name: _annotate(child) for name, child in node.items()}
        if isinstance(node, list):
            return [_annotate(entry) for entry in node]
        if isinstance(node, int) and node in labels_by_id:
            return {node: labels_by_id[node]}
        return node

    return _annotate(cluster_hierarchy_dict)


legend = transform_dict_to_legend(filtered_cluster_tree, filtered_cluster_labels)

# The same legend JSON is written to three locations (analysis output folder,
# data folder, and the directory configured via THREEJS_OUTPUT_DIR).
# All paths are resolved before anything is written, so a missing directory
# setting fails before any file is produced.
legend_destinations = [
    output_dir + "/cluster-qualifications_2025/cluster-label-tree/legend_labels_2025.json",
    data_dir + "/09-threeJs/legend_labels_2025.json",
    threejs_dir + "/legend_labels_2025.json",
]

for legend_path in legend_destinations:
    with open(legend_path, "w") as f:
        # The integer cluster ids used as dict keys are written as JSON strings.
        json.dump(legend, f, indent=4)

In [21]:
# Show the nested legend (cluster id -> label at the leaves) for a visual check.
legend

{'Pharmacology': {'Pharmacodynamics': {'Mechanism of action': [{1: 'Serotonin Receptor Modulation in SSRI Treatment'},
    {7: 'Impact of SSRIs on Neurogenesis'},
    {9: 'SSRIs Effect on Neural Processing of Emotional Cues'},
    {34: 'SSRIs for Panic Disorder'},
    {53: 'Neuroimaging of Serotonin Transporters'},
    {60: 'SSRIs Effects on Ion Channels'},
    {63: 'Methylenedioxymethamphetamine (MDMA) Induced Neurotoxicity'},
    {68: 'SSRIs Effect on Pulmonary Hypertension'},
    {73: 'Non-SERT Transporters in Antidepressant Action'},
    {76: 'SERT Binding Mechanisms'},
    {77: 'Astrocyte Receptors as a Target for SSRIs'},
    {84: 'SSRIs Effect on Dopamine Receptors'},
    {85: 'SSRIs Effects on Antimicrobials and Gut Microbiome'},
    {88: 'Neurometabolite Changes with Antidepressants'},
    {89: 'Galanin System in Antidepressant Action'},
    {92: 'SSRIs Effect on Brain-derived Neurotrophic Factor (BDNF) Levels in Depressive Patients'}],
   'Animal Models of Disorders': [{8: 'T

# D3.js Structure

### Build Full Tree Incl Labels


In [27]:
class ClusterHierarchyTransformerD3JS:
    """Turn a nested cluster tree into name/children JSON for the D3.js views.

    The input tree is a nested dict whose leaves are lists of cluster ids.
    Ids are swapped for their labels, then two kinds of views are produced:
    an "overview" that keeps only the category levels, and one detailed
    sub-tree (with the cluster labels as leaves) per top-level topic.
    """

    def __init__(self, tree_data, cluster_labels_dict):
        """
        Initialize the ClusterHierarchyTransformerD3JS with tree data and a cluster label mapping.

        :param tree_data: The initial tree structure (dictionary).
        :param cluster_labels_dict: Dictionary mapping of numbers to labels
            (keys may be ints or int-like strings).
        """
        self.tree_data = tree_data
        self.cluster_labels_dict = {
            int(k): v for k, v in cluster_labels_dict.items()
        }  # Ensure keys are integers
        # Filled by create_transformed_structure().
        self.tree_with_labels = None
        self.desired_structure = None

    def replace_numbers_with_labels(self, node):
        """Recursively replace cluster ids with labels based on the mapping.

        Lists and dict values are processed element by element (dict keys are
        kept). An item that has no label in the mapping, or that cannot be
        read as an integer, is returned unchanged — the same lenient rule for
        list items and for single values.
        """
        if isinstance(node, list):
            return [self.replace_numbers_with_labels(item) for item in node]
        if isinstance(node, dict):
            return {
                key: self.replace_numbers_with_labels(value)
                for key, value in node.items()
            }
        try:
            return self.cluster_labels_dict[int(node)]
        except (TypeError, ValueError, OverflowError, KeyError):
            # Not an id (None, text, inf, ...) or an id without a label.
            return node

    def transform_structure(self, name, data):
        """Transform the structure into a detailed format with all leaves.

        Dicts become nodes with one child per key, lists become nodes whose
        children are ``{"name": item}`` leaves, anything else a bare node.
        """
        if isinstance(data, dict):
            children = [
                self.transform_structure(key, value) for key, value in data.items()
            ]
            return {"name": name, "children": children}
        if isinstance(data, list):
            return {"name": name, "children": [{"name": item} for item in data]}
        return {"name": name}

    def transform_overview(self, name, data):
        """Transform the structure for overview, skipping leaf nodes.

        Only dict levels get children; a list of clusters is reduced to a
        bare ``{"name": ...}`` node, and non-container values below a dict
        are dropped.
        """
        if isinstance(data, dict):
            children = [
                self.transform_overview(key, value)
                for key, value in data.items()
                if isinstance(value, (dict, list))
            ]
            return {"name": name, "children": children}
        # Lists and scalars carry no category levels of their own.
        return {"name": name}

    def create_transformed_structure(self):
        """Generate the final transformed structure with overview and detailed sections.

        :return: dict with an ``"overview"`` entry plus one entry per
            top-level topic, keyed by the lower-cased topic name.
        """
        self.tree_with_labels = self.replace_numbers_with_labels(self.tree_data)
        self.desired_structure = {
            "overview": self.transform_overview("Topic Overview", self.tree_with_labels)
        }

        # Add individual topics with full details
        for topic, content in self.tree_with_labels.items():
            self.desired_structure[topic.lower()] = self.transform_structure(
                topic, content
            )

        return self.desired_structure

    def save_to_json(self, file_path):
        """Save the transformed structure to a JSON file.

        The structure is built on demand if create_transformed_structure()
        has not been called yet.
        """
        if self.desired_structure is None:
            self.create_transformed_structure()
        with open(file_path, "w") as f:
            json.dump(self.desired_structure, f, indent=2)
        print(f"Structure saved to {file_path}")


# Initialize the transformer
# Build the D3.js structure from the filtered tree and its labels.
transformer = ClusterHierarchyTransformerD3JS(filtered_cluster_tree, filtered_cluster_labels)
d3js_structure = transformer.create_transformed_structure()

# Copy kept with the other cluster-label-tree outputs.
analysis_copy_path = (
    output_dir
    + "/cluster-qualifications_2025/cluster-label-tree/D3JS_cluster_hierarchy_structure.json"
)
transformer.save_to_json(analysis_copy_path)

# Copy read by the visualization in the web app.
webapp_copy_path = (
    src_dir + "/visualization/tree-hierachy/D3JS_cluster_hierarchy_structure_2025.json"
)
transformer.save_to_json(webapp_copy_path)

Structure saved to /Users/jlq293/Projects/Study-1-Bibliometrics/output/cluster-qualifications_2025/cluster-label-tree/D3JS_cluster_hierarchy_structure.json
Structure saved to /Users/jlq293/Projects/Study-1-Bibliometrics/src/visualization/tree-hierachy/D3JS_cluster_hierarchy_structure_2025.json


In [28]:
# Show the final D3.js structure ("overview" plus one entry per top-level topic) for a visual check.
d3js_structure

{'overview': {'name': 'Topic Overview',
  'children': [{'name': 'Pharmacology',
    'children': [{'name': 'Pharmacodynamics',
      'children': [{'name': 'Mechanism of action'},
       {'name': 'Animal Models of Disorders'},
       {'name': 'Other'}]},
     {'name': 'Other',
      'children': [{'name': 'Pharmacokinetics'},
       {'name': 'Pharmacogenetics'},
       {'name': 'Other Pharmacology'},
       {'name': 'Analytical Methods'}]}]},
   {'name': 'Indications',
    'children': [{'name': 'Depressive Disorders'},
     {'name': 'Anxiety'},
     {'name': 'Other',
      'children': [{'name': 'OCD'},
       {'name': 'Weight and Eating Disorders'},
       {'name': 'Substance Abuse/Addictions'},
       {'name': 'Other'}]}]},
   {'name': 'Safety',
    'children': [{'name': 'Special Populations',
      'children': [{'name': 'Pediatrics'},
       {'name': 'Geriatrics'},
       {'name': 'Other'}]},
     {'name': 'Perinatal Exposure'},
     {'name': 'Other'}]},
   {'name': 'Other',
    'childr