# Identifying key genes for network analysis

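This script identifies key genes using network centrality measures computed on a protein–protein interaction (PPI) network: degree, eigenvector, betweenness, and personalised PageRank. It takes as input four lists of differentially expressed genes (DEGs) and scores each gene on these measures.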

In [20]:
import networkx as nx
import pandas as pd
import scipy
import random
import numpy as np

## Network measure calculations

In [None]:
# ========== LOAD PPI NETWORK ==========
# Read the interaction edge table and build a graph in which each row
# contributes one edge between its "GeneA" and "GeneB" entries.
ppi_df = pd.read_csv("../data/networks/combined_PPI.csv")
ppi_graph = nx.from_pandas_edgelist(ppi_df, source="GeneA", target="GeneB")

print(f"PPI graph has {ppi_graph.number_of_nodes()} nodes and {ppi_graph.number_of_edges()} edges.")


# ========== LOAD DEG LIST ==========
# The file is read with header=None, so its first row is treated as data.
# Gene identifiers come from the first column, de-duplicated with NaNs removed.
deg_df = pd.read_csv("../data/networks/step1_deg.csv", header=None)
first_column = deg_df.iloc[:, 0]
deg_list_raw = first_column.dropna().unique().tolist()

# Only DEGs that are nodes of the PPI graph can be scored, so drop the rest
deg_list = list(filter(ppi_graph.has_node, deg_list_raw))

print(f"Original DEG list: {len(deg_list_raw)} genes")
print(f"DEGs in PPI network: {len(deg_list)} genes")


# ========== COMPUTE DEGREE CENTRALITY ==========
# Node view of the PPI graph. Kept as a module-level name because other
# sections may reference it.
ppi_graph_nodes = ppi_graph.nodes()

# Best-effort computation: if it fails, every node gets 0.0 so the rest of the
# pipeline can still run. Only ordinary errors are caught (a bare `except:`
# would also swallow KeyboardInterrupt/SystemExit), and the fallback is
# reported so all-zero scores are never produced silently.
try:
    degree_centrality_dict = nx.degree_centrality(ppi_graph)
except Exception as err:
    print(f"WARNING: degree centrality failed ({err!r}); using 0.0 for all nodes.")
    degree_centrality_dict = dict.fromkeys(ppi_graph_nodes, 0.0)

# Keep only degree centrality for DEGs
deg_degree_centrality = {gene: degree_centrality_dict[gene] for gene in deg_list}

# Convert to a two-column DataFrame ("Gene", "DegreeCentrality")
deg_degree_centrality_df = pd.DataFrame.from_dict(deg_degree_centrality, orient='index', columns=['DegreeCentrality'])
deg_degree_centrality_df.index.name = "Gene"
deg_degree_centrality_df.reset_index(inplace=True)

# Preview
print(deg_degree_centrality_df.head())


# ========== COMPUTE EIGENVECTOR CENTRALITY ==========
# Best-effort computation: networkx computes this iteratively and can raise
# (e.g. when the iteration does not converge). On failure every node gets 0.0
# so the pipeline can continue. Only ordinary errors are caught (a bare
# `except:` would also swallow KeyboardInterrupt/SystemExit), and the fallback
# is reported so all-zero scores are never produced silently.
try:
    eigen_centrality_dict = nx.eigenvector_centrality(ppi_graph)
except Exception as err:
    print(f"WARNING: eigenvector centrality failed ({err!r}); using 0.0 for all nodes.")
    eigen_centrality_dict = dict.fromkeys(ppi_graph.nodes(), 0.0)

# Keep only eigenvector centrality for DEGs
deg_eigen_centrality = {gene: eigen_centrality_dict[gene] for gene in deg_list}

# Convert to a two-column DataFrame ("Gene", "EigenvectorCentrality")
deg_eigen_centrality_df = pd.DataFrame.from_dict(deg_eigen_centrality, orient='index', columns=['EigenvectorCentrality'])
deg_eigen_centrality_df.index.name = "Gene"
deg_eigen_centrality_df.reset_index(inplace=True)

# Preview
print(deg_eigen_centrality_df.head())


# ========== COMPUTE BETWEENNESS CENTRALITY ==========
# Subset betweenness: only paths that start and end at a DEG are considered
# (sources and targets are both the DEG list). `normalized` is left at the
# networkx default, so values are not rescaled to [0, 1].
# Best-effort computation: on failure every node gets 0.0 so the pipeline can
# continue. Only ordinary errors are caught (a bare `except:` would also
# swallow KeyboardInterrupt/SystemExit), and the fallback is reported so
# all-zero scores are never produced silently.
try:
    betw_centrality_dict = nx.betweenness_centrality_subset(ppi_graph, sources=deg_list, targets=deg_list)
except Exception as err:
    print(f"WARNING: betweenness centrality failed ({err!r}); using 0.0 for all nodes.")
    betw_centrality_dict = dict.fromkeys(ppi_graph.nodes(), 0.0)

# Keep only betweenness centrality for DEGs
deg_betw_centrality = {gene: betw_centrality_dict[gene] for gene in deg_list}

# Convert to a two-column DataFrame ("Gene", "BetweennessCentrality")
deg_betw_centrality_df = pd.DataFrame.from_dict(deg_betw_centrality, orient='index', columns=['BetweennessCentrality'])
deg_betw_centrality_df.index.name = "Gene"
deg_betw_centrality_df.reset_index(inplace=True)

# Preview
print(deg_betw_centrality_df.head())


# ========== COMPUTE PERSONALISED PAGERANK ==========
# Personalisation vector: every DEG carries the same weight, so the random
# walker is equally likely to start at any DEG.
personalization = {gene: 1 / len(deg_list) for gene in deg_list}
page_rank = nx.pagerank(
    ppi_graph,
    alpha=0.85,
    personalization=personalization,
)

# PageRank is returned for every node of the graph; keep the DEGs only
deg_page_rank = {gene: page_rank[gene] for gene in deg_list}

# Two-column table ("Gene", "PageRank"), one row per DEG in deg_list order
deg_page_rank_df = pd.DataFrame(
    list(deg_page_rank.items()),
    columns=["Gene", "PageRank"],
)

# Preview
print(deg_page_rank_df.head())


# ========== COMBINE ALL CENTRALITY SCORES ==========
# Build the table in one construction (one column per measure) rather than
# growing an empty DataFrame row by row with .loc, which re-allocates the frame
# on every insertion. Rows follow the order of deg_list and "Gene" is an
# ordinary column, matching the layout previously obtained via reset_index().
network_property_df = pd.DataFrame(
    {
        "Gene": deg_list,
        "Degree": [degree_centrality_dict[gene] for gene in deg_list],
        "Eigen": [eigen_centrality_dict[gene] for gene in deg_list],
        "Between": [betw_centrality_dict[gene] for gene in deg_list],
        "PageRank": [page_rank[gene] for gene in deg_list],
    }
)

# Preview
print(network_property_df.head())

# Save (the target directory is not created here, so it must already exist)
network_property_df.to_csv("../results/humanPVATsn/network_analysis/step1_centrality_scores.csv", index=False)


PPI graph has 18451 nodes and 345547 edges.
Original DEG list: 70 genes
DEGs in PPI network: 70 genes
     Gene  DegreeCentrality
0  ADAM10          0.003252
1    JAG1          0.000976
2    RHOA          0.016043
3     FN1          0.015610
4  FERMT2          0.001843
     Gene  EigenvectorCentrality
0  ADAM10               0.001466
1    JAG1               0.000359
2    RHOA               0.012218
3     FN1               0.011425
4  FERMT2               0.002940
     Gene  BetweennessCentrality
0  ADAM10               0.000000
1    JAG1               0.000000
2    RHOA              15.354881
3     FN1               4.546381
4  FERMT2               0.023682
     Gene  PageRank
0  ADAM10  0.002316
1    JAG1  0.002257
2    RHOA  0.002820
3     FN1  0.002746
4  FERMT2  0.002233
     Gene    Degree     Eigen    Between  PageRank
0  ADAM10  0.003252  0.001466   0.000000  0.002316
1    JAG1  0.000976  0.000359   0.000000  0.002257
2    RHOA  0.016043  0.012218  15.354881  0.002820
3     FN1 

## Permutations

In [None]:
# Number of permutations
n_permutations = 100            # Increase to 1000 for more robust results

# Precompute the gene-list-independent centrality scores for the entire PPI
# graph once; compute_centrality_scores() looks them up instead of recomputing.
precomputed_degree = nx.degree_centrality(ppi_graph)
precomputed_eigen = nx.eigenvector_centrality(ppi_graph)

# Size of DEG list for step1
deg_list_size = len(deg_list)

# Materialise the node list once; it is identical for every permutation, so
# rebuilding it inside the loop would only repeat the same work.
candidate_nodes = list(ppi_graph.nodes())

# Generate n_permutations random DEG-like gene lists: each has the same length
# as the DEG list and is sampled without replacement from the graph's nodes.
# NOTE: the sampling is unseeded, so the null distributions differ between
# runs; call random.seed(...) beforehand if reproducibility is required.
random_gene_lists = [
    random.sample(candidate_nodes, deg_list_size)
    for _ in range(n_permutations)
]

def compute_centrality_scores(gene_list, graph):
    """Score every gene in *gene_list* on four centrality measures.

    Degree and eigenvector centrality are looked up in the module-level
    ``precomputed_degree`` and ``precomputed_eigen`` dictionaries; they are
    NOT recomputed from *graph*, so those dictionaries must have been built
    for the same graph. Subset betweenness (sources and targets both set to
    *gene_list*) and personalised PageRank (equal weight on every gene in
    *gene_list*, ``alpha=0.85``) are computed on *graph* at each call.

    Args:
        gene_list: Sized collection of node identifiers to score.
        graph: networkx graph used for betweenness and PageRank.

    Returns:
        dict with keys ``"degree"``, ``"eigen"``, ``"between"`` and
        ``"pagerank"``. Each value is a list of scores in the same order as
        *gene_list*; a gene missing from a score dictionary gets ``0.0``.
        An empty *gene_list* yields four empty lists.
    """
    # Nothing to score: return early rather than handing PageRank an empty
    # personalisation dictionary.
    if len(gene_list) == 0:
        return {"degree": [], "eigen": [], "between": [], "pagerank": []}

    # Use precomputed values for speed
    degree_scores = [precomputed_degree.get(g, 0.0) for g in gene_list]
    eigen_scores = [precomputed_eigen.get(g, 0.0) for g in gene_list]

    # Betweenness (best-effort). Only ordinary errors trigger the zero
    # fallback: a bare `except:` would also swallow KeyboardInterrupt, turning
    # an attempt to stop a long permutation run into silent all-zero scores.
    try:
        betweenness_centrality = nx.betweenness_centrality_subset(graph, sources=gene_list, targets=gene_list)
    except Exception as err:
        print(f"WARNING: betweenness centrality failed ({err!r}); using 0.0 for all nodes.")
        betweenness_centrality = dict.fromkeys(graph.nodes, 0.0)
    between_scores = [betweenness_centrality.get(g, 0.0) for g in gene_list]

    # Personalized PageRank: the same weight for every gene in the list
    seed_weight = 1 / len(gene_list)
    personalization = {gene: seed_weight for gene in gene_list}
    pagerank = nx.pagerank(graph, personalization=personalization, alpha=0.85)
    pagerank_scores = [pagerank.get(g, 0.0) for g in gene_list]

    return {
        "degree": degree_scores,
        "eigen": eigen_scores,
        "between": between_scores,
        "pagerank": pagerank_scores,
    }

# Null distributions: one mean score per random gene list, per measure
null_degree, null_eigen, null_between, null_pagerank = [], [], [], []

# Maps each key returned by compute_centrality_scores() to the list that
# collects its per-permutation mean
null_by_metric = {
    "degree": null_degree,
    "eigen": null_eigen,
    "between": null_between,
    "pagerank": null_pagerank,
}

# Score every random gene list and record the mean of each measure
for i, random_genes in enumerate(random_gene_lists):
    scores = compute_centrality_scores(random_genes, ppi_graph)

    for metric, null_values in null_by_metric.items():
        null_values.append(np.mean(scores[metric]))

    # Progress report after every 10th permutation
    if not (i + 1) % 10:
        print(f"{i+1}/{n_permutations} permutations complete.")



10/100 permutations complete.
20/100 permutations complete.
