In [1]:
from pathlib import Path
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
def get_graph_data_from_topo(filepath=None):
    """
    Read a whitespace-delimited .topo file into a weighted directed graph.

    The file must start with a header row that contains the columns
    'Source', 'Target' and 'Type'. Every data row becomes one directed edge
    Source -> Target whose 'weight' attribute is that row's 'Type' value.
    A DiGraph holds at most one edge per (Source, Target) pair, so if a pair
    is repeated in the file the weight of the last occurrence wins.

    :param filepath: path (str or Path) or file-like object accepted by
        ``pandas.read_csv``. Required, despite the ``None`` default.
    :return: ``(G, gene_to_idx)`` where ``G`` is a NetworkX DiGraph with gene
        names as nodes, and ``gene_to_idx`` maps each gene name to its
        position in the sorted list of all names found in 'Source' or
        'Target' (useful for ML models like PyG).
    :raises ValueError: if ``filepath`` is None.
    :raises KeyError: if a required column is missing from the file.
    """
    # Fail early with an actionable message instead of pandas' generic
    # "Invalid file path or buffer object type" error.
    if filepath is None:
        raise ValueError("filepath is required: pass the path to a .topo file")

    df = pd.read_csv(filepath, sep=r"\s+")

    # Validate the header up front so a malformed file is reported with the
    # columns that were actually found, not just the first missing key.
    required = ("Source", "Target", "Type")
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(
            f"topology file is missing required column(s) {missing}; "
            f"found columns {list(df.columns)}"
        )

    # Gene-to-index mapping for optional ML use: indices follow sorted order
    # of every name that appears as a source or a target.
    genes = sorted(set(df['Source']).union(df['Target']))
    gene_to_idx = {gene: idx for idx, gene in enumerate(genes)}

    # Build the DiGraph; each (source, target, type) triple is stored as an
    # edge with 'Type' in the 'weight' attribute.
    G = nx.DiGraph()
    G.add_weighted_edges_from(zip(df['Source'], df['Target'], df['Type']))

    return G, gene_to_idx

In [9]:
# Build the directed graph from the topology file. The gene -> index mapping
# returned alongside it is not needed in this cell, so it is discarded.
G, _ = get_graph_data_from_topo(Path("dorothea_150.topo"))
# Node labels of G, in the graph's own iteration order.
nodes = list(G)


In [10]:
# pandas is already imported as `pd` in the notebook's first cell, so it is
# not re-imported here.
# Load the per-gene metrics table and pull out its "gene" column as a list
# (one entry per row, so repeated names are preserved).
df = pd.read_csv("XXL_run_gene_metrics.csv")
genes_XXL = df["gene"].tolist()

In [14]:
# Raw lengths: graph nodes vs. "gene" entries in the metrics table.
print(f"{len(nodes)} {len(genes_XXL)}")
# Sizes after de-duplication; a drop relative to the line above means the
# list contains repeated names. This compares sizes only, not membership.
print(f"{len(set(nodes))} {len(set(genes_XXL))}")
# How many rows each gene name occupies in the metrics table.
print(df["gene"].value_counts())

150 228
150 125
gene
PGR      2
FOS      2
JUN      2
STAT1    2
HIF1A    2
        ..
NR1H2    1
HNF4A    1
MAFB     1
KLF13    1
HBP1     1
Name: count, Length: 125, dtype: int64
