In [42]:
import pandas as pd
import numpy as np

import collections

import networkx as nx

import statistics 

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

In [43]:
def candidates_in_neighbourhood(G, candidates, node):
    sub_G = G.subgraph([n for n in G.neighbors(node)])
    candidates_in_neighbourhood = [x for x in [n for n in sub_G] if x in candidates]
    if candidates_in_neighbourhood:
        return candidates_in_neighbourhood
    else:
        return []

In [68]:
def candidates_at_distance(dict_distances, node, d):
    return len([dist for dist in dict_distances.get(node).values() if dist == d])

In [44]:
interactome_df = pd.read_csv('./data/Interactome_human.tsv', sep='\t', header=None)
candidates = pd.read_pickle("./data/candidateGenesList_MMAF.p")

G = nx.from_pandas_edgelist(interactome_df, 0, 1, edge_attr=True)

candidates_MMAF = list(set([c for c in candidates if c in G.nodes()]))
nonCandidates_MMAF = [n for n in G.nodes() if n not in candidates_MMAF] 


print(f"Interactome size: {len(G.nodes())}, number of MMAF candidates in interactome: {len(candidates_MMAF)}")

Interactome size: 14465, number of MMAF candidates in interactome: 40


In [45]:
# find the longest distance between MMAF and non-MMAF genes
dict_distances = {}
longest_distance = 0

for source in tqdm(nonCandidates_MMAF):
    dict_tmp = {}

    for target in candidates_MMAF:
        try:
            distance = nx.shortest_path_length(G, source, target)
            dict_tmp[target] = distance

            if distance > longest_distance:
                longest_distance = distance
        except:
            continue

    dict_distances[source] = dict_tmp

print(f"Longest distance between MMAF and non-MMAF gene: {longest_distance}")

100%|██████████| 14425/14425 [00:25<00:00, 568.57it/s]

Longest distance between MMAF and non-MMAF gene: 10





### Calculate new centrality for every non-MMAF gene

In [46]:
# set alpha parameter
alpha = 0.5

In [80]:
# calculate new centrality for every non-MMAF gene
dict_scores = {}

for i in tqdm(range(len(nonCandidates_MMAF))):
    node = nonCandidates_MMAF[i]
    score = 0

    candidates_MMAF_tmp = [node] + list(set(candidates_MMAF))
    A = nx.adjacency_matrix(G, candidates_MMAF_tmp).todense()

    for j in range(len(candidates_MMAF)):
        candidate = candidates_MMAF[j]
        d = dict_distances.get(node, 0).get(candidate, 0)
        score += alpha ** d * np.linalg.matrix_power(A, d)[0][j]
    
    dict_scores[node] = score
    
dict_scores_sorted = dict(sorted(dict_scores.items(), key=lambda v: v[1], reverse=True))

100%|██████████| 14425/14425 [01:07<00:00, 212.90it/s]


In [81]:
# get more info about each node
for n, score in dict_scores_sorted.items():
    dict_scores_sorted[n] = [score, 
                             G.degree(n), 
                             candidates_at_distance(dict_distances, n, 1), 
                             candidates_at_distance(dict_distances, n, 2),
                             candidates_at_distance(dict_distances, n, 3),
                             candidates_at_distance(dict_distances, n, 4)]

df = pd.DataFrame.from_dict(dict_scores_sorted, 
                            orient='index', 
                            columns=['score', 'degree', 'candidates at d=1', 'candidates at d=2', 'candidates at d=3', 'candidates at d=4'])

In [83]:
# get top 20 genes
df_top = df.iloc[:20]
df_top.sort_values(by='candidates at d=2', ascending=False)

Unnamed: 0,score,degree,candidates at d=1,candidates at d=2,candidates at d=3,candidates at d=4
ENSG00000101004,1.5,103,2,6,15,16
ENSG00000111057,1.625,62,2,5,20,12
ENSG00000084652,3.25,81,3,4,19,12
ENSG00000198883,1.6875,45,2,4,16,17
ENSG00000196544,1.5,52,2,4,14,19
ENSG00000142698,1.0,62,1,3,16,18
ENSG00000149089,1.0,16,1,3,11,17
ENSG00000131149,1.0,21,1,3,16,18
ENSG00000103202,1.0,40,1,3,17,14
ENSG00000108021,1.0,32,1,2,19,15
