# Prepare Analysis Dataset


### Things in this Notebook:

1. Remove all papers that belong to clusters that are either

   a. too small (fewer than 100 papers), or

   b. noise clusters (procedural clusters, tagged 9)

2. Remove these clusters from the tree hierarchy and cluster label dictionary
3. Remove these papers from the paper dictionary
4. Create a legend for three.js
5. Create the file for the D3.js visualization (tree and dendrogram)


In [7]:
import json
import os

import igraph as ig
import pandas as pd
from dotenv import load_dotenv

# Pull the settings stored in the project's .env file into the process environment
load_dotenv()

# Directory settings used throughout the notebook.
# Each name is None when the corresponding variable is not set.
python_path = os.environ.get("PYTHONPATH")
data_dir = os.environ.get("DATA_DIR")
src_dir = os.environ.get("SRC_DIR")
output_dir = os.environ.get("OUTPUT_DIR")
threejs_dir = os.environ.get("THREEJS_OUTPUT_DIR")


In [8]:
def _read_json(json_path):
    """Parse the JSON file at ``json_path`` and return the decoded object.

    The file is opened explicitly as UTF-8 (the encoding JSON text is
    conventionally stored in, see RFC 8259) so that the result does not depend
    on the platform's default encoding.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        return json.load(f)


# All three raw cluster artefacts live in the same directory.
label_tree_dir = output_dir + "/cluster-qualifications_2025/cluster-label-tree"

# Maps a cluster id (string) to a [label, tag] pair.
cluster_labels_tags = _read_json(label_tree_dir + "/cluster_labels_tags_full_raw.json")

# NOTE(review): presumably cluster id -> label, without the tag -- confirm.
cluster_labels = _read_json(label_tree_dir + "/cluster_labels_full_raw.json")

# NOTE(review): presumably the nested cluster hierarchy -- confirm.
cluster_tree = _read_json(label_tree_dir + "/cluster_tree_full_raw.json")


### Removing the procedural/noise clusters (tag 9) and those with fewer than 100 papers


In [9]:
# Locations of the clustered graph and of the paper table from the same run.
pg = data_dir + "/07-clustered-graphs/2025/FINAL_alpha0.3_k10_res0.002_iterations400.graphml"
pdf = data_dir + "/06-clustered-df/2025/FINAL_alpha0.3_k10_res0.002_iterations400.pkl"

# Read the graph from GraphML.
g = ig.Graph.Read_GraphML(pg)

# Read the paper table. Unpickling can execute arbitrary code, so this file
# must come from a trusted source (here: an earlier step of this pipeline).
df = pd.read_pickle(pdf)


  g = ig.Graph.Read_GraphML(pg)


In [10]:
# Cluster ids from this value upwards are treated as too small to analyse.
# NOTE(review): this assumes ids are ordered by descending cluster size, so that
# id 100 is the first cluster with fewer than 100 papers (use 125 instead for a
# threshold of fewer than 50 papers) -- confirm against the clustering step.
first_small_cluster_id = 100
# Exclusive end of the id range, i.e. ids up to and including 143 are covered.
end_cluster_id = 144
# Tag value that marks a procedural/noise cluster in cluster_labels_tags.
noise_tag = 9

# Ids are kept as strings because the keys of cluster_labels_tags are strings.
clusters_too_few_papers = [
    str(i) for i in range(first_small_cluster_id, end_cluster_id)
]

# Each value of cluster_labels_tags is a [label, tag] pair; select on the tag.
noise_clusters = [
    cluster for cluster, (_, tag) in cluster_labels_tags.items() if tag == noise_tag
]

print("Noise Clusters:", noise_clusters)
for i in noise_clusters:
    print(i, ":", cluster_labels_tags[i][0])
print("=" * 50)
print("Too Few Papers Clusters (<100):", clusters_too_few_papers)
for i in clusters_too_few_papers:
    print(i, ":", cluster_labels_tags[i][0])

# A noise cluster can also be a small cluster, so the two lists may overlap.
# Merge them without duplicates (first occurrence wins, order preserved) so that
# any later per-id removal touches each cluster exactly once.
clusters_to_remove = list(dict.fromkeys(clusters_too_few_papers + noise_clusters))


Noise Clusters: ['98', '140', '142', '143']
98 : Gatways to Clinical Trials
140 : Gateways to Clinical Trials
142 : Fewer than 10
143 : Fewer than 10
Too Few Papers Clusters (<100): ['100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143']
100 : Folate for Depression
101 : SSRIs for Depression in Patients with Kidney Disease
102 : Psilocybin for Depression
103 : SSRIs Effect on Reinforcement Learning in Rodents
104 : SSRIs for Behavioral Addictions
105 : St. John's Wort for Depression
106 : SSRIs for Migraine
107 : Vilazodone for Depression
108 : SSRIs for Dysthymia
109 : SSRIs for Enuresis
110 : Paroxetines as G Protein-coupled Receptor Kinase 2 (GRK2) Inhibitor
111 : SSRIs for Trichotillomania
112 : Transcranial Direct Current Stimulatio

### update dataframe


In [None]:
cluster_col = "cluster_alpha0.3_k10_res0.002"


def _cluster_id_as_str(value):
    """Render a cluster id as text, writing whole-number floats without ".0"."""
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


print("Before removal:", df.shape)

# The ids in clusters_to_remove are strings, so the cluster column is rewritten
# as strings as well (e.g. 19.0 becomes "19") before the membership test.
df[cluster_col] = df[cluster_col].apply(_cluster_id_as_str)

# Keep every paper whose cluster is not scheduled for removal.
keep_mask = ~df[cluster_col].isin(clusters_to_remove)
df_analysis = df[keep_mask]
print("After removal:", df_analysis.shape)

df_analysis.to_pickle(data_dir + "/08-analysis-data/2025/df_analysis.pkl")


Before removal: (38961, 38)
After removal: (36510, 38)


### update graph


In [None]:
# Record the size of the graph before anything is removed.
initial_num_vertices = g.vcount()
initial_num_edges = g.ecount()

# Rewrite every vertex's cluster id as a string in one bulk read/write of the
# attribute, instead of inspecting and assigning vertex by vertex.
# Whole-number floats such as 19.0 become "19"; anything else goes through str().
# A graph without a "cluster" vertex attribute raises KeyError here.
vertex_clusters = [
    str(int(value)) if isinstance(value, float) and value.is_integer() else str(value)
    for value in g.vs["cluster"]
]
g.vs["cluster"] = vertex_clusters

# Vertex indices follow the order of the attribute list, so the positions of the
# matching labels are exactly the indices to delete. A set keeps each membership
# test constant-time.
removal_ids = set(clusters_to_remove)
nodes_to_remove = [
    index for index, label in enumerate(vertex_clusters) if label in removal_ids
]

# Drop the selected vertices; the graph `g` is modified in place.
g.delete_vertices(nodes_to_remove)

# Record the size of the graph after removal.
final_num_vertices = g.vcount()
final_num_edges = g.ecount()

# Save the reduced graph.
g.write_graphml(data_dir + "/08-analysis-data/2025/graph_analysis.graphml")

# Report the before/after sizes.
print("Before removal:")
print(f"Number of vertices: {initial_num_vertices}")
print(f"Number of edges: {initial_num_edges}")
print("\nAfter removal:")
print(f"Number of vertices: {final_num_vertices}")
print(f"Number of edges: {final_num_edges}")


Before removal:
Number of vertices: 38961
Number of edges: 584576

After removal:
Number of vertices: 36510
Number of edges: 551227
