# Prepare Analysis Dataset


### Things in this Notebook:

Given the new analysis datasets:

1. Remove the clusters that are no longer part of the analysis dataset from the tree hierarchy and the cluster label dictionaries
2. Remove these papers from the paper dictionary
3. Create a legend for the Three.js visualization
4. Create the file for D3js visualization (tree and dendrogram)


In [29]:
import copy
import json
import os

import igraph as ig
import pandas as pd
from dotenv import load_dotenv

# Pull the project settings from the local .env file into the environment
load_dotenv()

# Project locations, looked up in one pass; a name is None when its
# environment variable is not set.
python_path, data_dir, src_dir, output_dir, threejs_dir = (
    os.environ.get(variable)
    for variable in (
        "PYTHONPATH",
        "DATA_DIR",
        "SRC_DIR",
        "OUTPUT_DIR",
        "THREEJS_OUTPUT_DIR",
    )
)

In [30]:
# Raw (unfiltered) outputs of the cluster-labelling step, all stored as JSON.

# cluster id -> label/tag information
path = output_dir + (
    "/cluster-qualifications_2025/cluster-label-tree/cluster_labels_tags_full_raw.json"
)
with open(path, "r") as f:
    cluster_labels_tags = json.loads(f.read())

# cluster id -> label
path = output_dir + (
    "/cluster-qualifications_2025/cluster-label-tree/cluster_labels_full_raw.json"
)
with open(path, "r") as f:
    cluster_labels = json.loads(f.read())

# nested topic hierarchy whose leaves are lists of cluster ids
path = output_dir + (
    "/cluster-qualifications_2025/cluster-label-tree/cluster_tree_full_raw.json"
)
with open(path, "r") as f:
    cluster_tree = json.loads(f.read())

### Removing the procedural/noise clusters (tag 9) and those with fewer than 50 papers


In [31]:
# Location of the analysis graph (GraphML format)
pg = data_dir + "/08-analysis-data/graph_analysis.graphml"


# Load the graph with igraph. `g` is not referenced again in the part of the
# notebook visible here.
g = ig.Graph.Read_GraphML(pg)

# Analysis DataFrame; its cluster column is used below to decide which
# clusters are kept.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# from a trusted source.
pdf = data_dir + "/08-analysis-data/df_analysis.pkl"
df = pd.read_pickle(pdf)

  g = ig.Graph.Read_GraphML(pg)


# New Qualifications


### remove clusters from cluster_labels_tags


In [32]:
# Clusters that are still present in the analysis DataFrame
final_cluster_list = df["cluster_alpha0.3_k10_res0.002"].unique().tolist()

# JSON object keys are always strings, while the DataFrame column may hold
# ints or floats. Compare against a string-normalised set (whole floats such as
# 12.0 become "12") so a dtype mismatch cannot silently empty the dicts below.
final_cluster_keys = {
    str(int(cluster))
    if isinstance(cluster, float) and cluster.is_integer()
    else str(cluster)
    for cluster in final_cluster_list
}

# Keep only the label/tag entries of clusters that are still present
filtered_cluster_labels_tags = {
    cluster: label
    for cluster, label in cluster_labels_tags.items()
    if cluster in final_cluster_keys
}
with open(
    output_dir
    + "/cluster-qualifications_2025/cluster-label-tree/cluster_labels_tags_filtered.json",
    "w",
) as f:
    json.dump(filtered_cluster_labels_tags, f)

# Same filter for the plain cluster labels
filtered_cluster_labels = {
    cluster: label
    for cluster, label in cluster_labels.items()
    if cluster in final_cluster_keys
}
with open(
    output_dir
    + "/cluster-qualifications_2025/cluster-label-tree/cluster_labels_filtered.json",
    "w",
) as f:
    json.dump(filtered_cluster_labels, f)

### remove clusters from cluster_tree


In [33]:
def remove_clusters(data, keep_list):
    """Prune, in place, every cluster id that is not listed in ``keep_list``.

    The nested dictionary ``data`` is walked recursively. Each list value is
    replaced by a new list holding only the elements whose normalised id is in
    ``keep_list``; nested dictionaries are processed the same way. Values that
    are neither lists nor dictionaries are left untouched. Lists may become
    empty; they are not removed here.

    Ids on both sides are compared as strings, and whole floats are written
    without the fractional part (``12.0`` -> ``"12"``), so ``12``, ``12.0`` and
    ``"12"`` all refer to the same cluster.

    :param data: Nested dictionary to prune; it is modified in place.
    :param keep_list: Iterable of cluster ids (str, int or float) to keep.
    :return: None.
    """

    def _as_key(cluster_id):
        # Whole floats are rendered like integers so that 12.0 matches "12"
        if isinstance(cluster_id, float) and cluster_id.is_integer():
            return str(int(cluster_id))
        return str(cluster_id)

    # Normalise once and use a set: constant-time lookups, and ids are matched
    # regardless of whether the caller passed strings or numbers.
    keep_keys = {_as_key(cluster_id) for cluster_id in keep_list}

    def _prune(node):
        for key, value in node.items():
            if isinstance(value, list):
                # Re-assigning an existing key is safe while iterating
                node[key] = [item for item in value if _as_key(item) in keep_keys]
            elif isinstance(value, dict):
                _prune(value)

    _prune(data)


def remove_empty_dicts(data):
    """Drop, in place, every entry whose value is an empty dict or empty list.

    Nested dictionaries are cleaned first, so a dictionary that only becomes
    empty through the clean-up of its children is removed as well. Values of
    any other type are always kept.
    """
    # Iterate over a snapshot of the keys so entries can be deleted on the fly
    for name in list(data):
        child = data[name]
        if isinstance(child, dict):
            remove_empty_dicts(child)
            is_empty = not child
        else:
            is_empty = isinstance(child, list) and not child
        if is_empty:
            del data[name]


# Work on a deep copy: dict.copy() is shallow, so pruning would also modify the
# nested dictionaries of cluster_tree and make this cell non-repeatable.
filtered_cluster_tree = copy.deepcopy(cluster_tree)
# Use strings consistently in remove_clusters
remove_clusters(filtered_cluster_tree, final_cluster_list)
# Drop the categories that lost all of their clusters
remove_empty_dicts(filtered_cluster_tree)
filtered_cluster_tree

{'Pharmacology': {'Pharmacodynamics': {'Mechanism of action': [1,
    7,
    9,
    53,
    60,
    63,
    68,
    73,
    76,
    77,
    84,
    85,
    88,
    89,
    92],
   'Animal Models of Disorders': [8, 12, 17, 71],
   'Other': [10, 93]},
  'Pharmacokinetics': [6],
  'Pharmacogenetics': [20, 72],
  'Analytical Methods': [2, 27, 55, 64, 91],
  'Other Pharmacology': [15, 35, 70]},
 'Indications': {'Depressive Disorders': [5,
   9,
   12,
   14,
   19,
   21,
   23,
   36,
   41,
   43,
   47,
   58,
   59,
   61,
   74,
   75,
   79,
   80,
   81,
   86,
   92,
   94,
   95,
   96],
  'Anxiety': [17, 33, 34, 65],
  'OCD': [3, 69],
  'Weight and Eating Disorders': [26, 57],
  'Substance Abuse/Addictions': [51, 71],
  'Other': [11, 22, 38, 40, 44, 46, 50, 52, 54, 56, 67, 68, 78, 87, 90]},
 'Safety': {'Special Populations': {'Pediatrics': [5, 31],
   'Geriatrics': [18, 23, 24, 47, 58, 75],
   'Other': [43, 59, 86, 94]},
  'Perinatal Exposure': [4, 10, 42, 95],
  'Other': [16, 25,

### put single clusters under the "Other" category


In [34]:
# Hand-curated hierarchy that replaces the computed tree from the previous cell.
# Adjustment noted by the author: alternative treatments moved to Other -> Other.
filtered_cluster_tree = {
    "Pharmacology": {
        "Pharmacodynamics": {
            "Mechanism of action": [
                1, 7, 9, 53, 60, 63, 68, 73, 76, 77, 84, 85, 88, 89, 92,
            ],
            "Animal Models of Disorders": [8, 12, 17, 71],
            "Other": [10, 93],
        },
        "Pharmacokinetics": [6],
        "Pharmacogenetics": [20, 72],
        "Analytical Methods": [2, 27, 55, 64, 91],
        "Other Pharmacology": [15, 35, 70],
    },
    "Indications": {
        "Depressive Disorders": [
            5, 9, 12, 14, 19, 21, 23, 36, 41, 43, 47, 58,
            59, 61, 74, 75, 79, 80, 81, 86, 92, 94, 95, 96,
        ],
        "Anxiety": [17, 33, 34, 65],
        "OCD": [3, 69],
        "Weight and Eating Disorders": [26, 57],
        "Substance Abuse/Addictions": [51, 71],
        "Other": [11, 22, 38, 40, 44, 46, 50, 52, 54, 56, 67, 68, 78, 87, 90],
    },
    "Safety": {
        "Special Populations": {
            "Pediatrics": [5, 31],
            "Geriatrics": [18, 23, 24, 47, 58, 75],
            "Other": [43, 59, 86, 94],
        },
        "Perinatal Exposure": [4, 10, 42, 95],
        "Other": [16, 25, 28, 30, 31, 32, 37, 45, 48, 49, 66, 82, 83, 99],
    },
    "Other": {
        "Ecotoxicology": [0, 62],
        "Other": [13, 29, 39, 97],
    },
}

# Persist the curated tree next to the other filtered cluster files
path = output_dir + (
    "/cluster-qualifications_2025/cluster-label-tree/cluster_tree_filtered.json"
)
with open(path, "w") as f:
    json.dump(filtered_cluster_tree, f, indent=4)

# create legend for ThreeJS


In [35]:
def transform_dict_to_legend(cluster_hierarchy_dict, cluster_label_dict):
    """
    Build a legend by attaching labels to the cluster ids of a hierarchy.

    Dictionaries and lists are rebuilt recursively. Every integer that has an
    entry in ``cluster_label_dict`` (keys are converted with ``int``) becomes a
    one-item dict ``{cluster_id: label}``; anything else is returned unchanged.
    The input hierarchy is not modified.
    """
    # Labels keyed by integer id, whatever key type the caller supplied
    labels_by_id = {int(cid): text for cid, text in cluster_label_dict.items()}

    def _annotate(node):
        if isinstance(node, dict):
            return {name: _annotate(child) for name, child in node.items()}
        if isinstance(node, list):
            return [_annotate(member) for member in node]
        if isinstance(node, int) and node in labels_by_id:
            return {node: labels_by_id[node]}
        return node

    return _annotate(cluster_hierarchy_dict)


legend = transform_dict_to_legend(filtered_cluster_tree, filtered_cluster_labels)

# The same legend is saved as JSON in three places. Each path is built right
# before its file is opened, in the order listed here.
legend_targets = (
    (
        output_dir,
        "/cluster-qualifications_2025/cluster-label-tree/legend_labels_2025.json",
    ),
    (data_dir, "/09-threeJs/legend_labels_2025.json"),
    (threejs_dir, "/legend_labels_2025.json"),
)
for base_dir, suffix in legend_targets:
    with open(base_dir + suffix, "w") as f:
        json.dump(legend, f, indent=4)

In [36]:
# Show the legend as the cell output for a visual check
legend

{'Pharmacology': {'Pharmacodynamics': {'Mechanism of action': [{1: 'Serotonin Receptor Modulation in SSRI Treatment'},
    {7: 'Impact of SSRIs on Neurogenesis'},
    {9: 'SSRIs Effect on Neural Processing of Emotional Cues'},
    {53: 'Neuroimaging of Serotonin Transporters'},
    {60: 'SSRIs Effects on Ion Channels'},
    {63: 'Methylenedioxymethamphetamine (MDMA) Induced Neurotoxicity'},
    {68: 'SSRIs Effect on Pulmonary Hypertension'},
    {73: 'Non-SERT Transporters in Antidepressant Action'},
    {76: 'SERT Binding Mechanisms'},
    {77: 'Astrocyte Receptors as a Target for SSRIs'},
    {84: 'SSRIs Effect on Dopamine Receptors'},
    {85: 'SSRIs Effects on Antimicrobials and Gut Microbiome'},
    {88: 'Neurometabolite Changes with Antidepressants'},
    {89: 'Galanin System in Antidepressant Action'},
    {92: 'SSRIs Effect on Brain-derived Neurotrophic Factor (BDNF) Levels in Depressive Patients'}],
   'Animal Models of Disorders': [{8: 'The Chronic Unpredictable Mild Stress M

# D3.js Structure

### Build Full Tree Incl Labels


In [37]:
class ClusterHierarchyTransformerD3JS:
    """Turn a nested cluster hierarchy into ``name``/``children`` dictionaries.

    The result holds one ``"overview"`` tree (categories only) plus one fully
    detailed tree per top-level topic, and can be written to a JSON file.
    """

    def __init__(self, tree_data, cluster_labels_dict):
        """
        Initialize the ClusterHierarchyTransformerD3JS with tree data and a cluster label mapping.

        :param tree_data: The initial tree structure (dictionary).
        :param cluster_labels_dict: Dictionary mapping of numbers to labels.
        """
        self.tree_data = tree_data
        self.cluster_labels_dict = {
            int(k): v for k, v in cluster_labels_dict.items()
        }  # Ensure keys are integers
        # Filled by create_transformed_structure(); None until then
        self.tree_with_labels = None
        self.desired_structure = None

    def replace_numbers_with_labels(self, node):
        """Recursively replace cluster numbers with their labels.

        Lists and dictionary values are processed element by element;
        dictionary keys are kept. A value without a label in the mapping, or
        one that cannot be converted to an integer, is returned unchanged.

        :param node: A dictionary, a list or a single value.
        :return: A new structure of the same shape with labels filled in.
        """
        if isinstance(node, list):
            # Every element goes through the scalar fallback below, so an
            # unlabelled id no longer aborts the whole transformation.
            return [self.replace_numbers_with_labels(item) for item in node]
        if isinstance(node, dict):
            return {
                key: self.replace_numbers_with_labels(value)
                for key, value in node.items()
            }
        try:
            return self.cluster_labels_dict[int(node)]
        except (TypeError, ValueError, OverflowError, KeyError):
            # Not a number (None, text, inf, ...) or no label known: keep as is
            return node

    def transform_structure(self, name, data):
        """Transform the structure into a detailed format with all leaves.

        :param name: Name of the node being built.
        :param data: A dictionary (sub-categories), a list (leaves) or any
            other value (treated as a node without children).
        :return: ``{"name": ..., "children": [...]}`` or ``{"name": ...}``.
        """
        if isinstance(data, dict):
            children = [
                self.transform_structure(key, value) for key, value in data.items()
            ]
            return {"name": name, "children": children}
        if isinstance(data, list):
            return {"name": name, "children": [{"name": item} for item in data]}
        return {"name": name}

    def transform_overview(self, name, data):
        """Transform the structure for overview, skipping leaf nodes.

        Only dictionaries get children, and only their dict/list entries are
        listed; a list is shown as a single node without its elements.

        :param name: Name of the node being built.
        :param data: Sub-tree belonging to ``name``.
        :return: ``{"name": ..., "children": [...]}`` or ``{"name": ...}``.
        """
        if isinstance(data, dict):
            children = [
                self.transform_overview(key, value)
                for key, value in data.items()
                if isinstance(value, (dict, list))
            ]
            return {"name": name, "children": children}
        return {"name": name}

    def create_transformed_structure(self):
        """Generate the final transformed structure with overview and detailed sections.

        :return: Dictionary with the key ``"overview"`` and one key per
            top-level topic (the topic name in lower case).
        """
        self.tree_with_labels = self.replace_numbers_with_labels(self.tree_data)
        self.desired_structure = {
            "overview": self.transform_overview("Topic Overview", self.tree_with_labels)
        }

        # Add individual topics with full details.
        # NOTE: a topic called "Overview" would overwrite the overview entry.
        for topic, content in self.tree_with_labels.items():
            self.desired_structure[topic.lower()] = self.transform_structure(
                topic, content
            )

        return self.desired_structure

    def save_to_json(self, file_path):
        """Save the transformed structure to a JSON file.

        The structure is built first if create_transformed_structure() has not
        been called yet.

        :param file_path: Destination path of the JSON file.
        """
        if self.desired_structure is None:
            self.create_transformed_structure()
        with open(file_path, "w") as f:
            json.dump(self.desired_structure, f, indent=2)
        print(f"Structure saved to {file_path}")


# Initialize the transformer
transformer = ClusterHierarchyTransformerD3JS(
    filtered_cluster_tree, filtered_cluster_labels
)

d3js_structure = transformer.create_transformed_structure()

# Save to JSON
transformer.save_to_json(
    output_dir
    + "/cluster-qualifications_2025/cluster-label-tree/D3JS_cluster_hierarchy_structure.json"
)

# this destination is for the visualization in the web app
# Save to JSON
transformer.save_to_json(
    src_dir + "/visualization/tree-hierachy/D3JS_cluster_hierarchy_structure_2025.json"
)

# this destination is for the visualization in the web app
# Save to JSON
transformer.save_to_json(threejs_dir + "D3JS_cluster_hierarchy_structure_2025.json")

Structure saved to /Users/jlq293/Projects/PhD project/02 Project/Study 1/BibliometricAnalysis/output/cluster-qualifications_2025/cluster-label-tree/D3JS_cluster_hierarchy_structure.json
Structure saved to /Users/jlq293/Projects/PhD project/02 Project/Study 1/BibliometricAnalysis/src/visualization/tree-hierachy/D3JS_cluster_hierarchy_structure_2025.json
Structure saved to /Users/jlq293/Projects/PhD project/02 Project/Study 1/Immersive-SSRI-Evolution-Viz/src/data/D3JS_cluster_hierarchy_structure_2025.json


In [38]:
# Show the generated structure as the cell output for a visual check
d3js_structure

{'overview': {'name': 'Topic Overview',
  'children': [{'name': 'Pharmacology',
    'children': [{'name': 'Pharmacodynamics',
      'children': [{'name': 'Mechanism of action'},
       {'name': 'Animal Models of Disorders'},
       {'name': 'Other'}]},
     {'name': 'Pharmacokinetics'},
     {'name': 'Pharmacogenetics'},
     {'name': 'Analytical Methods'},
     {'name': 'Other Pharmacology'}]},
   {'name': 'Indications',
    'children': [{'name': 'Depressive Disorders'},
     {'name': 'Anxiety'},
     {'name': 'OCD'},
     {'name': 'Weight and Eating Disorders'},
     {'name': 'Substance Abuse/Addictions'},
     {'name': 'Other'}]},
   {'name': 'Safety',
    'children': [{'name': 'Special Populations',
      'children': [{'name': 'Pediatrics'},
       {'name': 'Geriatrics'},
       {'name': 'Other'}]},
     {'name': 'Perinatal Exposure'},
     {'name': 'Other'}]},
   {'name': 'Other',
    'children': [{'name': 'Ecotoxicology'}, {'name': 'Other'}]}]},
 'pharmacology': {'name': 'Pharmac