# Prepare Analysis Dataset


### Things in this Notebook:

1. Remove all papers that belong to clusters that are either

   a. too small (less than 50 papers)

   b. belonging to a noise cluster

2. Remove these clusters from the tree hierarchy and cluster label dictionary
3. Remove these papers from the paper dictionary
4. Create a legend for three js
5. Create the file for D3js visualization (tree and dendrogram)


In [1]:
import json
import os

import igraph as ig
import pandas as pd
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Access environment variables
python_path = os.getenv("PYTHONPATH")
data_dir = os.getenv("DATA_DIR")
src_dir = os.getenv("SRC_DIR")
output_dir = os.getenv("OUTPUT_DIR")
threejs_dir = os.getenv("THREEJS_OUTPUT_DIR")

In [2]:
path = (
    output_dir
    + "/cluster-qualifications/cluster-label-tree/cluster_labels_tags_full_raw.json"
)
with open(path, "r") as f:
    cluster_labels_tags = json.load(f)

path = (
    output_dir
    + "/cluster-qualifications/cluster-label-tree/cluster_labels_full_raw.json"
)
with open(path, "r") as f:
    cluster_labels = json.load(f)

path = (
    output_dir
    + "/cluster-qualifications/cluster-label-tree/cluster_tree_full_raw.json"
)
with open(path, "r") as f:
    cluster_tree = json.load(f)

### removing the procedural/noise clusters (tag 9), and those with less than 50 papers.


In [3]:
pg = data_dir + "/07-clustered-graphs/FINAL_alpha0.3_k10_res0.002_iterations400.graphml"


g = ig.Graph.Read_GraphML(pg)

pdf = data_dir + "/06-clustered-df/FINAL_alpha0.3_k10_res0.002_iterations400.pkl"
df = pd.read_pickle(pdf)

  g = ig.Graph.Read_GraphML(pg)


In [4]:
authyear = "Turkmen_2024"
cluster = df[df["unique_auth_year"] == authyear][
    "cluster_alpha0.3_k10_res0.002"
].values[0]
title = df[df["unique_auth_year"] == authyear]["title"].values[0]
print(title)
print(cluster)

Determination of acute sertraline intoxication by high-performance thin-layer chromatography
2


In [5]:
authyear = "Xie_2025_2"
cluster = df[df["unique_auth_year"] == authyear][
    "cluster_alpha0.3_k10_res0.002"
].values[0]
title = df[df["unique_auth_year"] == authyear]["title"].values[0]
print(title)
print(cluster)

Deciphering the inhibitory mechanisms of didecyldimethylammonium chloride on microalgal removal of fluoxetine: Insights from the alterations in cell surface properties and the physio-biochemical and molecular toxicity
0


In [6]:
authyear = "Santobuono_2025"
cluster = df[df["unique_auth_year"] == authyear][
    "cluster_alpha0.3_k10_res0.002"
].values[0]
title = df[df["unique_auth_year"] == authyear]["title"].values[0]
print(title)
print(cluster)

Long-term exposure to sediment-associated antidepressants impacts life-history traits in an estuarine deposit-feeding worm
0


In [7]:
authyear = "Gao_2024"
cluster = df[df["unique_auth_year"] == authyear][
    "cluster_alpha0.3_k10_res0.002"
].values[0]
title = df[df["unique_auth_year"] == authyear]["title"].values[0]
print(title)
print(cluster)

Effects of sertraline hydrochloride with As(III) or Cd on rhizosphere micro-environment and root endophytes in rice
0


In [8]:
# cluster with less than 100 (from 100) or 50 (from 125) papers:
# clusters_too_few_papers = list(range(125,144))
clusters_too_few_papers = list(range(100, 144))
# Convert to strings to match the keys in cluster_labels_tags
clusters_too_few_papers = [str(i) for i in clusters_too_few_papers]

noise_clusters = [
    cluster for cluster, [_, tag] in cluster_labels_tags.items() if tag == 9
]
print("Noise Clusters:", noise_clusters)
for i in noise_clusters:
    print(i, ":", cluster_labels_tags[i][0])
print("=" * 50)
print("Too Few Papers Clusters (<100):", clusters_too_few_papers)
for i in clusters_too_few_papers:
    print(i, ":", cluster_labels_tags[i][0])

# Convert all to strings consistently for comparison
clusters_to_remove = clusters_too_few_papers + noise_clusters

Noise Clusters: ['98', '140', '142', '143']
98 : Gatways to Clinical Trials
140 : Gateways to Clinical Trials
142 : Fewer than 10
143 : Fewer than 10
Too Few Papers Clusters (<100): ['100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143']
100 : Folate for Depression
101 : SSRIs for Depression in Patients with Kidney Disease
102 : Psilocybin for Depression
103 : SSRIs Effect on Reinforcement Learning in Rodents
104 : SSRIs for Behavioral Addictions
105 : St. John's Wort for Depression
106 : SSRIs for Migraine
107 : Vilazodone for Depression
108 : SSRIs for Dysthymia
109 : SSRIs for Enuresis
110 : Paroxetines as G Protein-coupled Receptor Kinase 2 (GRK2) Inhibitor
111 : SSRIs for Trichotillomania
112 : Transcranial Direct Current Stimulatio

### update dataframe


In [9]:
print("Before removal:", df.shape)

# Convert column values to strings for proper comparison with string cluster IDs
# Handle float values by converting to integer first if they're whole numbers
df["cluster_alpha0.3_k10_res0.002"] = df["cluster_alpha0.3_k10_res0.002"].apply(
    lambda x: str(int(x)) if isinstance(x, float) and x.is_integer() else str(x)
)
df_analysis = df[~df["cluster_alpha0.3_k10_res0.002"].isin(clusters_to_remove)]
print("After removal:", df_analysis.shape)
df_analysis.to_pickle(data_dir + "/08-analysis-data/df_analysis.pkl")

Before removal: (38961, 38)
After removal: (36510, 38)


### update graph


In [10]:
initial_num_vertices = g.vcount()
initial_num_edges = g.ecount()

# Convert the graph vertex cluster properties to strings if needed
# Handle both int and float cluster values
for v in g.vs:
    if "cluster" in v.attributes():
        # Convert float values like 19.0 to "19" (no decimal)
        if isinstance(v["cluster"], float) and v["cluster"].is_integer():
            v["cluster"] = str(int(v["cluster"]))
        else:
            v["cluster"] = str(v["cluster"])

# Use string comparison consistently
nodes_to_remove = [v.index for v in g.vs if v["cluster"] in clusters_to_remove]
# Delete the identified nodes from the graph
g.delete_vertices(nodes_to_remove)
# Get the number of vertices and edges after removal
final_num_vertices = g.vcount()
final_num_edges = g.ecount()

### Add Author and DOI to the graph nodes


get apa format for authors


In [11]:
import ast


# Step 1: Define a function to extract author names from the JSON string
def extract_authors(authors_str):
    if isinstance(authors_str, float) or authors_str is None:
        return []

    try:
        authors_list = ast.literal_eval(authors_str)
        return [
            auth["authname"]
            for auth in authors_list
            if isinstance(auth, dict) and "authname" in auth
        ]
    except:
        return []


# Step 2: Define a function to format authors in APA style
def format_apa_citation(authors_list):
    if not authors_list:
        return ""

    # Clean up author names
    cleaned_authors = []
    for author in authors_list:
        # The authname looks like "Surname X.Y." format
        name_parts = author.split()
        if name_parts:
            # First part is typically the surname
            surname = name_parts[0].rstrip(".,")
            cleaned_authors.append(surname)

    # Apply APA style (7th edition):
    # 1-2 authors: list all names
    # 3+ authors: first author followed by 'et al.'
    if len(cleaned_authors) == 1:
        return cleaned_authors[0]
    elif len(cleaned_authors) == 2:
        return f"{cleaned_authors[0]} & {cleaned_authors[1]}"
    else:  # 3 or more authors
        return f"{cleaned_authors[0]} et al."


# Step 3: Apply both functions to your dataframe
df_analysis.loc[:, "authors_list"] = df_analysis["authors_json"].apply(extract_authors)
df_analysis.loc[:, "authors_apa_string"] = df_analysis["authors_list"].apply(
    format_apa_citation
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_analysis.loc[:, "authors_list"] = df_analysis["authors_json"].apply(extract_authors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_analysis.loc[:, "authors_apa_string"] = df_analysis["authors_list"].apply(


In [12]:
# save the dataframe
df_analysis.to_pickle(data_dir + "/08-analysis-data/df_analysis.pkl")

Add attributes to the graph nodes


In [13]:
# Create lookup dictionaries for DOI and authors using EID as the key
doi_lookup = dict(
    zip(df_analysis["eid"], df_analysis["doi"].fillna("No DOI available"))
)
authors_lookup = dict(zip(df_analysis["eid"], df_analysis["authors_apa_string"]))

# Use the dictionaries to update node attributes (much faster)
for node in g.vs:
    eid = node["eid"]
    # Get values from dictionaries with O(1) lookup time
    # Use get() with a default value to handle missing keys
    node["doi"] = doi_lookup.get(eid, "")
    node["authors"] = authors_lookup.get(eid, "")

# Save analysis graph


In [16]:
# Save the graph
g.write_graphml(data_dir + "/08-analysis-data/graph_analysis.graphml")
# Display the results
print("Before removal:")
print(f"Number of vertices: {initial_num_vertices}")
print(f"Number of edges: {initial_num_edges}")
print("\nAfter removal:")
print(f"Number of vertices: {final_num_vertices}")
print(f"Number of edges: {final_num_edges}")

Before removal:
Number of vertices: 38961
Number of edges: 584576

After removal:
Number of vertices: 36510
Number of edges: 551227


In [17]:
# show an edge
print(g.es[30000])

# show a vertex
print(g.vs[30000])

igraph.Edge(<igraph.Graph object at 0x120b36a50>, 30000, {'weight': 0.300000011920929})
igraph.Vertex(<igraph.Graph object at 0x120b36a50>, 30000, {'eid': '2-s2.0-85078531894', 'title': 'Antidepressants effects of Rhodiola capsule combined with sertraline for major depressive disorder: A randomized double-blind placebo-controlled clinical trial', 'year': 2020.0, 'id': '31925', 'cluster': '81', 'centrality_alpha0.3_k10_res0.002': 0.0135757188479586, 'doi': '10.1016/j.jad.2020.01.065', 'authors': 'Gao et al.'})


# Create a directed citation graph with only the remaining nodes


In [15]:
# rear in
path = data_dir + "/05-graphs/citation-graph/directed_citation_graph.graphml"
g_dir = ig.Graph.Read_GraphML(path)

In [16]:
def filter_graph_by_node_ids(original_graph, reference_graph, id_attribute="eid"):
    """
    Filter the original graph to keep only nodes that exist in the reference graph,
    based on a specific node attribute.

    Parameters:
    -----------
    original_graph : igraph.Graph
        The graph to be filtered
    reference_graph : igraph.Graph
        The graph containing the reference nodes
    id_attribute : str, default='eid'
        The node attribute to use for matching nodes between graphs

    Returns:
    --------
    igraph.Graph
        A subgraph of the original graph containing only nodes present in the reference graph
    """
    # Extract IDs from reference graph
    reference_ids = set(v[id_attribute] for v in reference_graph.vs)

    # Create a list of vertices to keep in the original graph
    vertices_to_keep = [
        v.index for v in original_graph.vs if v[id_attribute] in reference_ids
    ]

    # Create a subgraph with only the vertices to keep
    filtered_graph = original_graph.subgraph(vertices_to_keep)

    print(
        f"Original graph: {original_graph.vcount()} nodes, {original_graph.ecount()} edges"
    )
    print(
        f"Filtered graph: {filtered_graph.vcount()} nodes, {filtered_graph.ecount()} edges"
    )

    return filtered_graph

In [17]:
G_directed_ana = filter_graph_by_node_ids(g_dir, g, id_attribute="eid")

# store the filtered directed graph
G_directed_ana.write_graphml(
    data_dir + "/08-analysis-data/graph_analysis_directed_citation_only.graphml"
)

Original graph: 38961 nodes, 355263 edges
Filtered graph: 36510 nodes, 338084 edges


In [19]:
import numpy as np

# Check if your directed network has reciprocal edges
print(f"Reciprocity: {G_directed_ana.reciprocity()}")
print(f"Number of mutual edges: {G_directed_ana.count_multiple()}")

# Check individual node degrees
in_degrees = G_directed_ana.indegree()
out_degrees = G_directed_ana.outdegree()
print(f"In-degree range: {min(in_degrees)} - {max(in_degrees)}")
print(f"Out-degree range: {min(out_degrees)} - {max(out_degrees)}")
print(f"Correlation: {np.corrcoef(in_degrees, out_degrees)[0,1]}")

Reciprocity: 0.00043780246825932107
Number of mutual edges: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

# Descriptive Statistics of Analysis Graphs


In [1]:
from src.network.analysis.NetworkDescriptives import NetworkDescriptives

In [21]:
nd = NetworkDescriptives(g)
stats_hybrid = nd.get_stats()

output_dir + "/paper-output/descriptive_graph_hybrid_2025.json"
with open(filepath, "w") as f:
    json.dump(stats_hybrid, f, indent=2, default=str)

In [22]:
nd = NetworkDescriptives(G_directed_ana)
stats_directed = nd.get_stats()

filepath = output_dir + "/paper-output/descriptive_graph_directed_2025.json"
with open(filepath, "w") as f:
    json.dump(stats_directed, f, indent=2, default=str)