In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import networkx as nx
import mygene

In [63]:
current_directory = os.getcwd()

# Obtain list of genes of interest from the tsv downloaded in disgenet
disgenet_target = os.path.abspath(
    os.path.join(current_directory, '..', 'Data', 'DisGenet', 'Group_Diseases_15_5496genes.tsv'))

G = pd.read_csv(disgenet_target, sep='\t')
genes = list(set(G['Gene'].values))

expression_target = os.path.abspath(
    os.path.join(current_directory, '..', 'Data', 'output_GeneticData.csv'))
clinical_target = os.path.abspath(
    os.path.join(current_directory, '..', 'Data', 'output_ClinicalData.csv'))

# expression data
expression_data = pd.read_csv(expression_target, sep = ',', index_col=0)
clinical_data = pd.read_csv(clinical_target, sep=',', index_col=0)

# We only consider genes present in our expression dataframe
filtered_genes = [x for x in genes if x in list(expression_data.columns)]

# We only keep columns (genes) that are present in the network
expression_data = expression_data[expression_data.columns.intersection(filtered_genes)]

R = [1,2,3,4,5]
for r in R:
    print("With radius:", r)
    ppi = get_snap(filtered_genes, True, r)
    print()

With radius: 1
SNAP
Whole network: 859 nodes
Added nodes through neighbor look up:  0 nodes
Biggest connected component: 819 nodes
Percentage of lost genes/nodes: 40 (4.66%)

With radius: 2
SNAP
Whole network: 3051 nodes
Added nodes through neighbor look up:  2194 nodes
Biggest connected component: 3045 nodes
Percentage of lost genes/nodes: 6 (0.2%)

With radius: 3
SNAP
Whole network: 3219 nodes
Added nodes through neighbor look up:  2362 nodes
Biggest connected component: 3213 nodes
Percentage of lost genes/nodes: 6 (0.19%)

With radius: 4
SNAP
Whole network: 3239 nodes
Added nodes through neighbor look up:  2382 nodes
Biggest connected component: 3233 nodes
Percentage of lost genes/nodes: 6 (0.19%)

With radius: 5
SNAP
Whole network: 3245 nodes
Added nodes through neighbor look up:  2388 nodes
Biggest connected component: 3239 nodes
Percentage of lost genes/nodes: 6 (0.18%)



In [62]:
def recursive_neighbour_gather(node, ppi, R, result = [], looked_nodes = []):
    if R == 1:
        return result
    else:
        N = int(node)
        if (ppi.has_node(N)):
            neighbors = list(ppi.neighbors(N))
            looked_nodes += [N]
            if len(neighbors) > 0:
                result.extend([item for item in neighbors])
                result = list(set(result))
            for n in neighbors:
                if (int(n) not in looked_nodes) and (N != int(n)):
                    return recursive_neighbour_gather(n, ppi, R-1, result, looked_nodes)
        return result

def get_snap(genes, remove_components, RADIUS):

    # We remove first line in csv with column names
    file_address = '../Data/PPT-Ohmnet/PPT-Ohmnet_tissues-combined.edgelist'
    with open(file_address, 'r') as f:
        next(f)
        tissues_edgelist = pd.read_csv(file_address, sep='\t')

    # We prepare target to save the tissues in PPT Ohmnet that are ONLY specific to kidney
    current_directory = os.getcwd()
    kidney_specific_target = os.path.abspath(
        os.path.join(current_directory, '..', 'Data', 'PPT-Ohmnet', 'PPT-Ohmnet-tissues-kidney.csv'))

    kidney_specific = tissues_edgelist[tissues_edgelist['tissue'] == 'kidney']
    kidney_specific.to_csv(kidney_specific_target, sep='\t', index=False)


    with open(kidney_specific_target, 'r') as f:
        next(f)
        G_kidney = nx.read_edgelist(f, nodetype=int, data=(('tissue', str),))

    # Genes in PPT-Ohmnet are Entrez IDs (e.g 7157), it is necessary to convert them to gene Symbols (e.g TP53, the gene name).
    # Initialize mygene object
    mg = mygene.MyGeneInfo()
    # Query gene information for list 'genes'. It specifies that the gene symbols are provided as input (scopes = symbol)
    # requests the entrezgene field to be included in the output, specifies that the genes are from the human species.
    out_entrez = list(mg.querymany(genes, scopes='symbol', fields='entrezgene', species='human', verbose=False))

    entrezgenes = []
    mapping = {}
    nodesToAdd = []
    for o in out_entrez:
        if 'entrezgene' in o:
            entrezgenes.append(int(o['entrezgene']))
            mapping[int(o['entrezgene'])] = o['query']
            res = recursive_neighbour_gather(o['entrezgene'], G_kidney, RADIUS, [], [])
            if res != []:
                nodesToAdd += [res]
    
    nodesToAdd = list(set([item for sublist in nodesToAdd for item in sublist]))
    addedCount = len([x for x in nodesToAdd if x not in entrezgenes])
    
    out_symbol = list(mg.querymany(nodesToAdd, scopes='entrezgene', fields='symbol', species='human', verbose=False))
    for o in out_symbol:
        if 'symbol' in o:
            entrezgenes.append(int(o['query']))
            mapping[int(o['query'])] = o['symbol']

    # we keep only nodes that are relevant to our specified genes
    A_kidney_frozen = G_kidney.subgraph(entrezgenes)
    A_kidney = nx.Graph(A_kidney_frozen)
    original = A_kidney.number_of_nodes()

    if remove_components == True:
        # Delete nodes from components with less than 5 nodes
        nodes_to_remove = []
        for component in list(nx.connected_components(A_kidney)):
            if len(component)<5:
                for node in component:
                    A_kidney.remove_node(node)

    # Remove self-loops
    A_kidney.remove_edges_from(list(nx.selfloop_edges(A_kidney)))

    largest = A_kidney.number_of_nodes()
    lost = original - largest
    lost_percent = round((lost/original), 4)

    print('SNAP')
    print('Whole network:', original, 'nodes')
    print('Added nodes through neighbor look up: ', addedCount, 'nodes')
    print('Biggest connected component:', largest, 'nodes')
    print('Percentage of lost genes/nodes:', lost, f'({lost_percent*100}%)')

    A_kidney_relabeled = nx.relabel_nodes(A_kidney, mapping)
    #nx.write_edgelist(A_brain_relabeled, f'data/networks/PPI_SNAP_brain_{remove_components}.edgelist')

    return A_kidney_relabeled


In [64]:
G = get_snap(filtered_genes, True, 2)
G

SNAP
Whole network: 3051 nodes
Added nodes through neighbor look up:  2194 nodes
Biggest connected component: 3045 nodes
Percentage of lost genes/nodes: 6 (0.2%)


<networkx.classes.graph.Graph at 0x25bdde3e6d0>

In [66]:
list(G.nodes)

['NFKB1',
 'TNIP2',
 'AMOT',
 'VASP',
 'SS18L1',
 'SMARCA4',
 'SMURF1',
 'HSPA5',
 'SKIL',
 'UBE2I',
 'RAC1',
 'ARL2BP',
 'SNRPC',
 'EIF4A3',
 'TIPRL',
 'PPP6C',
 'SETDB1',
 'H4C16',
 'PIN1',
 'TSC22D4',
 'CDK12',
 'TRA2B',
 'DVL1',
 'AXIN1',
 'REST',
 'TRIM13',
 'ARHGDIA',
 'NPC2',
 'NPC1',
 'MAP2K1',
 'CFLAR',
 'SIAH1',
 'RBBP8',
 'SEPTIN9',
 'VAPB',
 'PKD1',
 'ACTN1',
 'LGR4',
 'NCAPD3',
 'PCMT1',
 'TIMP2',
 'RC3H1',
 'UBE2G2',
 'RYK',
 'PRKDC',
 'MAPK14',
 'ELK3',
 'TERF2',
 'NAIF1',
 'CTCF',
 'HMGB1',
 'RNF20',
 'WAC',
 'NUDT21',
 'HDAC2',
 'MOV10',
 'USP11',
 'BTRC',
 'CDC26',
 'ANAPC7',
 'NKX3-2',
 'TLE4',
 'FBXW7',
 'MED4',
 'BAG6',
 'WDR83',
 'ETV4',
 'NID1',
 'MED1',
 'HDAC5',
 'TFRC',
 'YWHAH',
 'PAK2',
 'GNA13',
 'TGFBR1',
 'CSNK2A1',
 'SRSF10',
 'PHGDH',
 'GRB2',
 'SMARCD1',
 'GIGYF1',
 'ELAVL1',
 'DICER1',
 'RBFOX2',
 'ATXN2',
 'CCND1',
 'ORC4',
 'GATA2',
 'ZFPM1',
 'ALDOA',
 'CAND1',
 'DSP',
 'PPP1CC',
 'APP',
 'ACAA2',
 'KDM4C',
 'HDAC1',
 'MYH9',
 'DNAJC2',
 'XPO1',
 '

In [13]:
somepath = os.path.abspath(
    os.path.join(current_directory, '..', 'Data', 'PPT-Ohmnet', 'PPT-Ohmnet-tissues-kidney.csv'))

# expression data
ppi_network = pd.read_csv(somepath, sep = '\t')
ppi_network

Unnamed: 0,protein1,protein2,tissue
0,4790,79155,kidney
1,154796,7408,kidney
2,26039,6597,kidney
3,57154,3309,kidney
4,6498,7329,kidney
...,...,...,...
52121,324,1457,kidney
52122,79228,9775,kidney
52123,662,8878,kidney
52124,57521,253260,kidney


In [53]:
N = 8771 
ppi_network[(ppi_network['protein1'] == N) | (ppi_network['protein2'] == N)]

Unnamed: 0,protein1,protein2,tissue
17474,9966,8771,kidney


In [41]:
kidney_specific_target = os.path.abspath(
        os.path.join(current_directory, '..', 'Data', 'PPT-Ohmnet', 'PPT-Ohmnet-tissues-kidney.csv'))

with open(kidney_specific_target, 'r') as f:
        next(f)
        G_kidney = nx.read_edgelist(f, nodetype=int, data=(('tissue', str),))
    
node_data = G_kidney.nodes[4790]
if node_data is not None:
    print("Node 4790 data:", node_data)
else:
    print("Node 4790 does not exist in the graph.")

Node 4790 data: {}


In [140]:
with open(somepath, 'r') as f:
        next(f)
        G_kidney2 = nx.read_edgelist(f, nodetype=int, data=(('tissue', str),))

In [151]:
nodoe= G_kidney2.nodes()
for i in nodoe:
    neighs = G_kidney2.neighbors(i)
    print(i)
    for n in neighs:
        print("NNNNEEE:   " + str(n))
print(len(list(G_kidney.edges())))

4790
NNNNEEE:   79155
NNNNEEE:   1213
NNNNEEE:   3301
NNNNEEE:   5607
NNNNEEE:   8837
NNNNEEE:   1999
NNNNEEE:   4791
NNNNEEE:   801
NNNNEEE:   71
NNNNEEE:   5062
NNNNEEE:   7528
NNNNEEE:   9252
NNNNEEE:   8841
NNNNEEE:   4089
NNNNEEE:   2734
NNNNEEE:   1191
NNNNEEE:   3064
NNNNEEE:   8451
NNNNEEE:   7175
NNNNEEE:   1499
NNNNEEE:   29110
NNNNEEE:   25855
NNNNEEE:   2317
NNNNEEE:   156
NNNNEEE:   5451
NNNNEEE:   1147
NNNNEEE:   1831
NNNNEEE:   92609
NNNNEEE:   3326
NNNNEEE:   3312
NNNNEEE:   23054
NNNNEEE:   50649
NNNNEEE:   8202
NNNNEEE:   1832
NNNNEEE:   3308
NNNNEEE:   9043
NNNNEEE:   9559
NNNNEEE:   10891
NNNNEEE:   10498
NNNNEEE:   5578
NNNNEEE:   9611
NNNNEEE:   7518
NNNNEEE:   3148
NNNNEEE:   6667
NNNNEEE:   6722
NNNNEEE:   8452
NNNNEEE:   6207
NNNNEEE:   8945
NNNNEEE:   672
NNNNEEE:   5591
NNNNEEE:   7514
NNNNEEE:   2113
NNNNEEE:   808
NNNNEEE:   5371
NNNNEEE:   4628
NNNNEEE:   4221
NNNNEEE:   3185
NNNNEEE:   51773
NNNNEEE:   467
NNNNEEE:   688
NNNNEEE:   9372
NNNNEEE:   7323
NN

In [161]:
iter= G_kidney2.neighbors(4790)
for i in iter:
    print(i)

79155
1213
3301
5607
8837
1999
4791
801
71
5062
7528
9252
8841
4089
2734
1191
3064
8451
7175
1499
29110
25855
2317
156
5451
1147
1831
92609
3326
3312
23054
50649
8202
1832
3308
9043
9559
10891
10498
5578
9611
7518
3148
6667
6722
8452
6207
8945
672
5591
7514
2113
808
5371
4628
4221
3185
51773
467
688
9372
7323
142
10270
7532
3309
5966
3320
25942
8517
6774
334
3840
10856
667
3516
9612
1051
2074
7415
2752
10524
6500
4792
6778
150684
3551
149951
23118
4163
10379
8454
7536
6885
805
2908
6711
10014
3659
65123
5566
5531
3313
5970
8607
6188
3066
6256
522
3303
2932
23291
2033
5716
367
7538
7295
1997
3065
5568
4627
9093
203068
8450
4670
5094
330
10987
7431
324
6421
4851
6776
9759
2969
2353
3093
8626
3146
5296
4790
3838
10273
9641
