In [1]:
import pandas as pd
import numpy as np
from igraph import Graph
import sys
sys.path.insert(0, '../features')
from disease_process_proteins import get_protein_index
from steiner_tree import sca
from tqdm.notebook import tqdm
from IPython.display import display

# STRING PPI Conversion

In [2]:
# Read the raw STRING tables: all links, physical links only, and the
# protein alias table used later for identifier mapping.
string_ppi = pd.read_csv('../../data/raw/9606.protein.links.v11.5.txt', sep=' ')
string_phys_ppi = pd.read_csv('../../data/raw/9606.protein.physical.links.v11.5.txt', sep=' ')
string_aliases = pd.read_csv('../../data/raw/9606.protein.aliases.v11.5.txt', sep='\t')

# Preview the first two rows of each table, in load order.
for raw_table in (string_ppi, string_phys_ppi, string_aliases):
    display(raw_table.head(2))

Unnamed: 0,protein1,protein2,combined_score
0,9606.ENSP00000000233,9606.ENSP00000379496,155
1,9606.ENSP00000000233,9606.ENSP00000314067,197


Unnamed: 0,protein1,protein2,combined_score
0,9606.ENSP00000000233,9606.ENSP00000264718,156
1,9606.ENSP00000000233,9606.ENSP00000346046,177


Unnamed: 0,#string_protein_id,alias,source
0,9606.ENSP00000000233,2B6H,BLAST_UniProt_DR_PDB
1,9606.ENSP00000000233,2B6H,Ensembl_HGNC_UniProt_ID(supplied_by_UniProt)_D...


In [3]:
# Number of edges (rows) in the full and in the physical-only link tables.
for link_table in (string_ppi, string_phys_ppi):
    print(len(link_table))

11938498
1991832


## Process ID mapping table

In [4]:
# Rows with source 'Ensembl_HGNC' pair each STRING (Ensembl protein) ID with
# an HGNC gene symbol, which is what the ID mapping below relies on.
string_aliases.loc[string_aliases['source'].eq('Ensembl_HGNC')].head()

Unnamed: 0,#string_protein_id,alias,source
32,9606.ENSP00000000233,ARF5,Ensembl_HGNC
244,9606.ENSP00000000412,M6PR,Ensembl_HGNC
516,9606.ENSP00000001008,FKBP4,Ensembl_HGNC
712,9606.ENSP00000001146,CYP26B1,Ensembl_HGNC
976,9606.ENSP00000002125,NDUFAF7,Ensembl_HGNC


In [5]:
# Keep only the Ensembl_HGNC alias rows and count them, to see how many
# STRING-ID/symbol pairs are available for the mapping.
is_hgnc_alias = string_aliases['source'] == 'Ensembl_HGNC'
string_aliases_hgnc = string_aliases[is_hgnc_alias]
len(string_aliases_hgnc)

19142

In [6]:
# Remove exact repeats of the same (STRING ID, symbol) pair; the row count
# shows whether any were present.
pair_columns = ['#string_protein_id', 'alias']
filtered_alias = string_aliases_hgnc.drop_duplicates(subset=pair_columns)
len(filtered_alias)

19142

In [7]:
# STRING IDs that are associated with more than one HGNC symbol.
# keep=False flags every row of such an ID, not only the repeats.
has_several_symbols = filtered_alias.duplicated(subset='#string_protein_id', keep=False)
duplicates = filtered_alias[has_several_symbols]
print(len(duplicates))
duplicates

17


Unnamed: 0,#string_protein_id,alias,source
3917207,9606.ENSP00000451768,KIAA0408,Ensembl_HGNC
3917264,9606.ENSP00000451768,SOGA3,Ensembl_HGNC
4027534,9606.ENSP00000469970,OPN1MW,Ensembl_HGNC
4027544,9606.ENSP00000469970,OPN1MW2,Ensembl_HGNC
4035263,9606.ENSP00000471017,MAGEA9,Ensembl_HGNC
4035282,9606.ENSP00000471017,MAGEA9B,Ensembl_HGNC
4045792,9606.ENSP00000472749,TMSB15A,Ensembl_HGNC
4045796,9606.ENSP00000472749,TMSB15B,Ensembl_HGNC
4061631,9606.ENSP00000475814,HIST2H2AA3,Ensembl_HGNC
4061634,9606.ENSP00000475814,HIST2H2AA4,Ensembl_HGNC


In [8]:
# Manual curation of the STRING IDs that carry more than one HGNC symbol,
# using information from pharos.nih.gov/targets/: the symbols listed here
# are discarded so that one symbol remains per STRING ID.
discarded_symbols = [
    'KIAA0408', 'OPN1MW2', 'MAGEA9B', 'TMSB15B', 'HIST2H2AA4',
    'TBC1D3', 'CT45A4', 'CT45A6', 'C1QTNF5',
]
aliases = filtered_alias.loc[
    ~filtered_alias['alias'].isin(discarded_symbols),
    ['#string_protein_id', 'alias'],
]

# The ID-mapping merges join on '#string_protein_id'. A STRING ID that still
# had two symbols would silently duplicate every edge of that protein, so
# fail loudly here if the curation list is incomplete.
assert aliases['#string_protein_id'].is_unique, (
    "Some STRING IDs still map to more than one HGNC symbol; "
    "extend the manual curation list."
)
aliases.head(2)

Unnamed: 0,#string_protein_id,alias
32,9606.ENSP00000000233,ARF5
244,9606.ENSP00000000412,M6PR


## ID mapping

In [9]:
# Full network: translate both endpoints of every edge to HGNC symbols.
# The inner joins drop edges whose protein1 or protein2 has no symbol in
# `aliases`; the two printed counts show how many rows survive.
hgnc_string_ppi = (
    string_ppi
    .merge(aliases, how='inner', left_on='protein1', right_on='#string_protein_id')
    .rename(columns={'alias': 'gene1'})
    .merge(aliases, how='inner', left_on='protein2', right_on='#string_protein_id')
    .rename(columns={'alias': 'gene2'})
)

print(len(string_ppi))
print(len(hgnc_string_ppi))
hgnc_string_ppi.head(2)

11938498
11741350


Unnamed: 0,protein1,protein2,combined_score,#string_protein_id_x,gene1,#string_protein_id_y,gene2
0,9606.ENSP00000000233,9606.ENSP00000379496,155,9606.ENSP00000000233,ARF5,9606.ENSP00000379496,PDE1C
1,9606.ENSP00000013807,9606.ENSP00000379496,255,9606.ENSP00000013807,ERCC1,9606.ENSP00000379496,PDE1C


In [10]:
# Physical network: same symbol translation as for the full network.
# The inner joins drop edges whose protein1 or protein2 has no symbol in
# `aliases`; the two printed counts show how many rows survive.
hgnc_string_phys_ppi = (
    string_phys_ppi
    .merge(aliases, how='inner', left_on='protein1', right_on='#string_protein_id')
    .rename(columns={'alias': 'gene1'})
    .merge(aliases, how='inner', left_on='protein2', right_on='#string_protein_id')
    .rename(columns={'alias': 'gene2'})
)

print(len(string_phys_ppi))
print(len(hgnc_string_phys_ppi))
hgnc_string_phys_ppi.head(2)

1991832
1951472


Unnamed: 0,protein1,protein2,combined_score,#string_protein_id_x,gene1,#string_protein_id_y,gene2
0,9606.ENSP00000000233,9606.ENSP00000264718,156,9606.ENSP00000000233,ARF5,9606.ENSP00000264718,GPN1
1,9606.ENSP00000005257,9606.ENSP00000264718,156,9606.ENSP00000005257,RALA,9606.ENSP00000264718,GPN1


In [11]:
# Keep only the symbol pair and its score; the helper ID columns left over
# from the merges are no longer needed.
edge_columns = ['gene1', 'gene2', 'combined_score']
hgnc_string_ppi = hgnc_string_ppi[edge_columns]
hgnc_string_phys_ppi = hgnc_string_phys_ppi[edge_columns]

display(hgnc_string_ppi.head(2))
hgnc_string_phys_ppi.head(2)

Unnamed: 0,gene1,gene2,combined_score
0,ARF5,PDE1C,155
1,ERCC1,PDE1C,255


Unnamed: 0,gene1,gene2,combined_score
0,ARF5,GPN1,156
1,RALA,GPN1,156


In [12]:
# Write both symbol-level edge tables to the processed data folder
# (without the pandas index column).
for edge_table, csv_path in (
    (hgnc_string_ppi, '../../data/processed/ppis/string.csv'),
    (hgnc_string_phys_ppi, '../../data/processed/ppis/string_phys.csv'),
):
    edge_table.to_csv(csv_path, index=False)

# STRING Graph Creation

In [2]:
# Reload the processed edge tables so that graph creation can start from
# the CSV files rather than from the raw STRING dumps.
string_full = pd.read_csv('../../data/processed/ppis/string.csv')
string_phys = pd.read_csv('../../data/processed/ppis/string_phys.csv')

print(len(string_full))
display(string_full.head(2))
print(len(string_phys))
string_phys.head(2)

11741350


Unnamed: 0,gene1,gene2,combined_score
0,ARF5,PDE1C,155
1,ERCC1,PDE1C,255


1951472


Unnamed: 0,gene1,gene2,combined_score
0,ARF5,GPN1,156
1,RALA,GPN1,156


In [3]:
# Evaluate different combined_score filters.
# For every threshold and for both edge tables, build an undirected graph
# from the edges scoring strictly above the threshold, collapse duplicate
# edges / self-loops, keep the largest connected component, and record its
# size and density.
score_thresholds = [0, 400, 600, 700]
edge_tables = {'Full PPI Graph': string_full, 'Physical PPI Graph': string_phys}

stats_records = []
for threshold in score_thresholds:
    for label, edges in edge_tables.items():
        candidate_graph = Graph.DataFrame(
            edges.loc[edges.combined_score > threshold, ['gene1', 'gene2']],
            directed=False, use_vids=False
        )
        candidate_graph = candidate_graph.simplify()
        # components()[0] is only the first component listed, which is not
        # guaranteed to be the largest one; giant() always returns the
        # largest connected component.
        candidate_graph = candidate_graph.components().giant()

        stats_records.append({
            'filter': threshold,
            'graph': label,
            'nodes': candidate_graph.vcount(),
            'edges': candidate_graph.ecount(),
            'density': candidate_graph.density()
        })

stats = pd.DataFrame(stats_records)
stats

Unnamed: 0,filter,graph,nodes,edges,density
0,0,Full PPI Graph,19035,5849499,0.03229
1,0,Physical PPI Graph,18092,972696,0.005944
2,400,Full PPI Graph,18975,879691,0.004887
3,400,Physical PPI Graph,14409,205749,0.001982
4,600,Full PPI Graph,18274,386481,0.002315
5,600,Physical PPI Graph,11600,117600,0.001748
6,700,Full PPI Graph,16381,248982,0.001856
7,700,Physical PPI Graph,9508,80326,0.001777


In [15]:
# Density of the Net4 graph (Picart-Armada et al 2019), used as a reference:
# edges divided by the number of possible undirected node pairs n*(n-1)/2.
net4_edges = 236963
net4_nodes = 11748
net4_edges / (net4_nodes * (net4_nodes - 1) / 2)

0.0034341529081280946

We will use the Full PPI graph restricted to edges with `combined_score > 700`.

It offers a good balance between the number of nodes and the number of edges when compared with the Net4 graph (Picart-Armada et al. 2019).

In [16]:
# Build the undirected gene-gene graph from the full edge table, keeping
# only edges whose combined_score is strictly above 700. Vertices are
# identified by gene symbol (use_vids=False).
above_cutoff = string_full['combined_score'] > 700
string_graph = Graph.DataFrame(
    string_full.loc[above_cutoff, ['gene1', 'gene2']],
    directed=False, use_vids=False
)

In [4]:
# Alternative: build the graph from ALL edges, with no combined_score filter.
# NOTE(review): this rebinds `string_graph`, so when it runs after the
# filtered build the > 700 graph is discarded and everything saved below is
# the unfiltered network. The edge counts printed by the next cell
# (11741350 before simplify) match the unfiltered table - confirm which
# variant is intended and remove the other cell.
all_edges = string_full[['gene1', 'gene2']]
string_graph = Graph.DataFrame(all_edges, directed=False, use_vids=False)

In [5]:
# Edge count before and after simplify(), which collapses parallel edges
# and self-loops (the edge table lists each pair more than once, as the
# drop in the count shows).
edges_before = string_graph.ecount()
print(edges_before)
string_graph = string_graph.simplify()
edges_after = string_graph.ecount()
print(edges_after)

11741350
5849499


In [6]:
# Restrict the graph to its largest connected component and report
# connectivity before and after, plus the remaining edge count.
print(string_graph.is_connected())
# components()[0] is only the first component listed, not necessarily the
# largest; giant() returns the largest connected component as a new graph.
string_graph = string_graph.components().giant()
print(string_graph.is_connected())
print(string_graph.ecount())

True
True
5849499


In [7]:
# Persist the graph in GML format; the modules section reads this file back
# with Graph.Read_GML.
string_graph.write_gml("../../data/processed/graph_string")

In [8]:
# Dense adjacency matrix of the graph as a NumPy array, saved next to the
# GML file.
# NOTE(review): this materialises vcount x vcount entries in memory -
# confirm this is acceptable for the chosen graph size.
adjacency_path = '../../data/processed/string_adjacency_matrix.npy'
adj_matrix = np.array(string_graph.get_adjacency().data)
np.save(adjacency_path, adj_matrix, allow_pickle=True, fix_imports=True)

# STRING Process and Disease Modules

In [9]:
# Reload the graph saved by write_gml so this section can start from disk.
# NOTE(review): the module protein indices below presumably refer to the
# vertex order of this file - confirm in get_protein_index.
graph = Graph.Read_GML("../../data/processed/graph_string")

In [10]:
# Load the adjacency matrix saved with np.save in the graph-creation
# section; it is passed to sca() together with `graph` further down.
adj_matrix = np.load("../../data/processed/string_adjacency_matrix.npy")

In [11]:
# Load the interim Reactome (process) and DisGeNET (disease) gene
# annotation tables and preview two rows of each.
reactome = pd.read_csv('../../data/interim/ReactomeReactions.csv')
disgenet = pd.read_csv('../../data/interim/disgenet.csv')

for annotation_table in (reactome, disgenet):
    display(annotation_table.head(2))

Unnamed: 0,NCBI ID,Reactome ID,URL,Event,Evidence Code,Species,HGNC ID
0,1,R-HSA-481007,https://reactome.org/PathwayBrowser/#/R-HSA-48...,Exocytosis of platelet alpha granule contents,TAS,Homo sapiens,A1BG
1,1,R-HSA-6798748,https://reactome.org/PathwayBrowser/#/R-HSA-67...,Exocytosis of secretory granule lumen proteins,TAS,Homo sapiens,A1BG


Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
0,1,A1BG,0.7,0.538,C0019209,Hepatomegaly,phenotype,C23;C06,Finding,0.3,1.0,2017.0,2017.0,1,0,CTD_human
1,1,A1BG,0.7,0.538,C0036341,Schizophrenia,disease,F03,Mental or Behavioral Dysfunction,0.3,1.0,2015.0,2015.0,1,0,CTD_human


In [29]:
# Keep only annotations whose gene symbol is a vertex of the graph.
graph_gene_names = graph.vs['name']
in_graph_reactome = reactome['HGNC ID'].isin(graph_gene_names)
in_graph_disgenet = disgenet['geneSymbol'].isin(graph_gene_names)

filtered_reactome = reactome[in_graph_reactome]
filtered_disgenet = disgenet[in_graph_disgenet]
# An additional restriction to rows with diseaseType == 'disease' is
# currently disabled.

In [30]:
# Group Reactome annotations into modules, then keep modules with between
# 50 and 300 proteins (both bounds inclusive).
reactome_modules = get_protein_index(filtered_reactome, 'reactome', graph)
reactome_modules['module_size'] = reactome_modules['protein_index'].apply(len)
reactome_size_ok = reactome_modules['module_size'].between(50, 300)
reactome_modules = reactome_modules[reactome_size_ok].reset_index(drop=True)

In [31]:
# Group DisGeNET annotations into modules, then keep modules with between
# 50 and 300 proteins (both bounds inclusive). The size column is named
# 'len' here, and later cells depend on that name.
disgenet_modules = get_protein_index(filtered_disgenet, 'disgenet', graph)
disgenet_modules['len'] = disgenet_modules['protein_index'].apply(len)
disgenet_size_ok = disgenet_modules['len'].between(50, 300)
disgenet_modules = disgenet_modules[disgenet_size_ok].reset_index(drop=True)

In [32]:
# Show the size-filtered Reactome modules (one row per process).
reactome_modules

Unnamed: 0,process,proteins_ids,protein_index,module_size
0,R-HSA-1031716,"[TRIM10, TRIM22, IRF9, IFI30, TRIM38, TRIM3, T...","[13891, 6259, 6308, 6325, 6149, 5598, 16269, 2...",71
1,R-HSA-112379,"[CDK7, CDK9, SUPT16H, LEO1, ERCC2, ERCC3, RTF1...","[786, 6625, 681, 996, 4031, 3268, 2755, 6574, ...",52
2,R-HSA-112385,"[CDK7, CDK9, SUPT16H, LEO1, ERCC2, ERCC3, RTF1...","[786, 6625, 681, 996, 4031, 3268, 2755, 6574, ...",52
3,R-HSA-1168640,"[PSME3, PSMD14, PSMB11, PSMA8, PSME4, NFKB1, N...","[967, 1572, 1569, 1048, 1564, 716, 2960, 10724...",50
4,R-HSA-1234159,"[PSME3, PSMD14, EGLN3, PSMB11, WTIP, PSMA8, EP...","[967, 1572, 3047, 1569, 1746, 1048, 6497, 1564...",59
...,...,...,...,...
239,R-HSA-983147,"[GPR75-ASB3, HUWE1, RNF41, STUB1, TRAIP, KLHL4...","[10661, 1183, 7912, 7774, 12581, 14383, 2905, ...",210
240,R-HSA-983156,"[GPR75-ASB3, HUWE1, RNF41, STUB1, TRAIP, KLHL4...","[10661, 1183, 7912, 7774, 12581, 14383, 2905, ...",247
241,R-HSA-983157,"[GPR75-ASB3, HUWE1, RNF41, STUB1, TRAIP, KLHL4...","[10661, 1183, 7912, 7774, 12581, 14383, 2905, ...",247
242,R-HSA-983259,"[KIF20A, TUBA1B, TUBB3, TUBB4A, TUBB4B, CENPE,...","[1501, 3588, 2327, 3156, 3619, 884, 4602, 5798...",59


## SCA & Conservative Modules

In [15]:
# Show the size-filtered DisGeNET modules (one row per disease ID) before
# the SCA columns are added.
disgenet_modules

Unnamed: 0,process,proteins_ids,protein_index,len
0,C0000786,"[AGTR1, AHR, APOE, ARNT, CEACAM1, CD7, CD8A, C...","[9810, 10634, 4866, 7455, 7208, 6890, 6332, 59...",107
1,C0000822,"[AGTR1, AHR, APOE, ARNT, CEACAM1, CD7, CD8A, C...","[9810, 10634, 4866, 7455, 7208, 6890, 6332, 59...",107
2,C0001418,"[ABL1, ALOX5, ALOX12B, APC, BIRC5, APOA1, APOE...","[1371, 12169, 15208, 7267, 3369, 8223, 4866, 1...",114
3,C0001787,"[ACTG1, ADCY5, ANXA2, ATIC, CA2, CALCR, COL1A1...","[1735, 550, 3656, 7785, 13478, 9600, 7234, 735...",61
4,C0001973,"[NAT1, ABO, ADCY5, ADCY7, ADH1A, ADH1B, ADH1C,...","[16166, 14222, 550, 461, 13093, 9473, 13196, 9...",260
...,...,...,...,...
296,C4552766,"[AGTR1, AHR, APOE, ARNT, CEACAM1, CD7, CD8A, C...","[9810, 10634, 4866, 7455, 7208, 6890, 6332, 59...",107
297,C4704862,"[ADCY7, ALOX12, AQP9, AREG, RERE, BTG1, C8B, C...","[461, 9928, 9291, 7568, 11337, 10419, 12162, 7...",125
298,C4707243,"[ACTA2, ACVRL1, JAG1, BGN, CBS, COL1A1, COL1A2...","[1623, 5420, 778, 7414, 4105, 7234, 7356, 7372...",53
299,C4721453,"[ABCA1, ALOX12, ATF3, CACNA1B, CASP9, CDKN2A, ...","[8291, 9928, 6140, 5326, 1128, 4272, 10812, 58...",54


In [16]:
# Run sca() once per disease module, with a progress bar.
# result_type='expand' spreads the three values returned by sca() over
# three columns; transposing gives one array per returned value, which is
# then stored as its own column.
tqdm.pandas()
sca_output = disgenet_modules.progress_apply(
    lambda row: sca(row['protein_index'], graph, adj_matrix),
    axis=1, result_type='expand'
)
main_component, conservative_module, added_nodes = sca_output.T.values

disgenet_modules['main_component'] = main_component
disgenet_modules['conservative_module'] = conservative_module
disgenet_modules['added_nodes'] = added_nodes

  0%|          | 0/301 [00:00<?, ?it/s]

In [17]:
# Size of each SCA output, one length column per result column.
length_columns = {
    'len_sca': 'main_component',
    'len_conservative': 'conservative_module',
    'len_added_nodes': 'added_nodes',
}
for length_col, source_col in length_columns.items():
    disgenet_modules[length_col] = disgenet_modules[source_col].apply(len)

In [18]:
# Translate vertex indices back to gene symbols for each SCA output column.
# graph.vs['name'] builds the complete list of vertex names on every access,
# so fetch it once instead of once per index inside the comprehension.
vertex_names = graph.vs['name']
for index_col in ('main_component', 'conservative_module', 'added_nodes'):
    disgenet_modules[f'{index_col}_ids'] = disgenet_modules[index_col].apply(
        lambda indices: [vertex_names[i] for i in indices]
    )

In [19]:
# Relative growth of each module: nodes added by SCA divided by the
# original module size. Computed column-wise instead of with a row-wise
# apply, which gives the same values and also works on an empty table.
disgenet_modules['increase'] = disgenet_modules['len_added_nodes'] / disgenet_modules['len']

In [20]:
# Largest relative module growth caused by SCA across all disease modules.
disgenet_modules['increase'].max()

0.03636363636363636

The STRING network is far more densely connected than the APID and HuRI networks: here SCA increases the size of a module by at most about 3.6%. Consequently, no SCA modules need to be discarded.

In [21]:
# Preview the disease modules with all SCA-derived columns.
disgenet_modules.head()

Unnamed: 0,process,proteins_ids,protein_index,len,main_component,conservative_module,added_nodes,len_sca,len_conservative,len_added_nodes,main_component_ids,conservative_module_ids,added_nodes_ids,increase
0,C0000786,"[AGTR1, AHR, APOE, ARNT, CEACAM1, CD7, CD8A, C...","[9810, 10634, 4866, 7455, 7208, 6890, 6332, 59...",107,"[9810, 10634, 4866, 7455, 7208, 6890, 6332, 59...","[11272, 6160, 12816, 6684, 4645, 7208, 7214, 7...",[16513],108,107,1,"[AGTR1, AHR, APOE, ARNT, CEACAM1, CD7, CD8A, C...","[PRLR, CD163, SPAG5, IGF2, TFRC, CEACAM1, ITGB...",[ADAM12],0.009346
1,C0000822,"[AGTR1, AHR, APOE, ARNT, CEACAM1, CD7, CD8A, C...","[9810, 10634, 4866, 7455, 7208, 6890, 6332, 59...",107,"[9810, 10634, 4866, 7455, 7208, 6890, 6332, 59...","[11272, 6160, 12816, 6684, 4645, 7208, 7214, 7...",[16513],108,107,1,"[AGTR1, AHR, APOE, ARNT, CEACAM1, CD7, CD8A, C...","[PRLR, CD163, SPAG5, IGF2, TFRC, CEACAM1, ITGB...",[ADAM12],0.009346
2,C0001418,"[ABL1, ALOX5, ALOX12B, APC, BIRC5, APOA1, APOE...","[1371, 12169, 15208, 7267, 3369, 8223, 4866, 1...",114,"[1371, 12169, 15208, 7267, 3369, 8223, 4866, 1...","[1029, 13318, 11783, 5128, 1543, 8735, 8223, 6...",[],114,114,0,"[ABL1, ALOX5, ALOX12B, APC, BIRC5, APOA1, APOE...","[EEF2, HOXA10, MSLN, PGR, BCL2, PAWR, APOA1, P...",[],0.0
3,C0001787,"[ACTG1, ADCY5, ANXA2, ATIC, CA2, CALCR, COL1A1...","[1735, 550, 3656, 7785, 13478, 9600, 7234, 735...",61,"[1735, 550, 3656, 7785, 13478, 9600, 7234, 735...","[9600, 6402, 387, 3588, 516, 2568, 907, 6925, ...",[],61,61,0,"[ACTG1, ADCY5, ANXA2, ATIC, CA2, CALCR, COL1A1...","[CALCR, SOD2, CAP1, TUBA1B, ZDHHC13, TLN1, PSM...",[],0.0
4,C0001973,"[NAT1, ABO, ADCY5, ADCY7, ADH1A, ADH1B, ADH1C,...","[16166, 14222, 550, 461, 13093, 9473, 13196, 9...",260,"[16166, 14222, 550, 461, 13093, 9473, 13196, 9...","[1538, 9219, 10757, 3080, 6154, 5643, 15885, 3...",[],260,260,0,"[NAT1, ABO, ADCY5, ADCY7, ADH1A, ADH1B, ADH1C,...","[MBP, CNR1, ANKRD7, ALDH3B2, RFX4, CDH8, LILRA...",[],0.0


In [22]:
# Save the disease modules, including the SCA columns, without the index.
# NOTE(review): list-valued columns are written as their string
# representation - presumably readers parse them back; confirm.
disgenet_modules.to_csv('../../data/processed/string_disgenet_modules.csv', index=False)

In [23]:
# Save the size-filtered Reactome modules without the index.
# NOTE(review): list-valued columns are written as their string
# representation - presumably readers parse them back; confirm.
reactome_modules.to_csv('../../data/processed/string_reactome_modules.csv', index=False)