In [35]:
import pandas as pd
import numpy as np
from igraph import Graph
import sys
sys.path.insert(0, '../features')
from disease_process_proteins import get_protein_index
from steiner_tree import sca
from tqdm.notebook import tqdm

# STRING PPI Conversion

In [64]:
# Load the raw STRING v11.5 files (space-separated links, tab-separated aliases)
# and the interim HGNC symbol table.
string_ppi = pd.read_csv('../../data/raw/9606.protein.links.v11.5.txt', sep=' ')
string_aliases = pd.read_csv('../../data/raw/9606.protein.aliases.v11.5.txt', sep='\t')
# NOTE(review): conversion_df is not referenced again in the cells shown here — confirm it is still needed.
conversion_df = pd.read_csv('../../data/interim/HGNC symbols.txt')

In [65]:
# Preview the first rows of the STRING links table (protein1, protein2, combined_score).
string_ppi.head()

Unnamed: 0,protein1,protein2,combined_score
0,9606.ENSP00000000233,9606.ENSP00000379496,155
1,9606.ENSP00000000233,9606.ENSP00000314067,197
2,9606.ENSP00000000233,9606.ENSP00000263116,222
3,9606.ENSP00000000233,9606.ENSP00000361263,181
4,9606.ENSP00000000233,9606.ENSP00000409666,270


In [66]:
# Rows whose source is Ensembl_HGNC give the conversion from an ENSP protein ID
# to its HGNC gene name.
string_aliases.loc[string_aliases['source'].eq('Ensembl_HGNC')].head()

Unnamed: 0,#string_protein_id,alias,source
32,9606.ENSP00000000233,ARF5,Ensembl_HGNC
244,9606.ENSP00000000412,M6PR,Ensembl_HGNC
516,9606.ENSP00000001008,FKBP4,Ensembl_HGNC
712,9606.ENSP00000001146,CYP26B1,Ensembl_HGNC
976,9606.ENSP00000002125,NDUFAF7,Ensembl_HGNC


In [70]:
# Keep only the Ensembl_HGNC aliases (ENSP protein ID -> HGNC gene name).
# Per the original author's note, this source is expected to cover all identifiers.
is_ensembl_hgnc = string_aliases['source'] == 'Ensembl_HGNC'
string_aliases_hgnc = string_aliases[is_ensembl_hgnc]

In [71]:
# Remove repeated (protein ID, alias) pairs, keeping the first occurrence of each.
repeated_pair = string_aliases_hgnc.duplicated(subset=['#string_protein_id', 'alias'])
filtered_alias = string_aliases_hgnc[~repeated_pair]

In [73]:
# STRING IDs with more than one HGNC alias — manual curation with information
# from pharos.nih.gov/targets/.
# `duplicated(keep=False)` flags every row of a repeated ID in one vectorized pass.
# Unlike concatenating groupby groups, it yields an empty frame (instead of raising)
# when no ID is ambiguous. The stable sort by ID reproduces the groupby ordering:
# groups in key order, original row order within each group.
has_multiple_aliases = filtered_alias.duplicated(subset='#string_protein_id', keep=False)
filtered_alias[has_multiple_aliases].sort_values('#string_protein_id', kind='mergesort')

Unnamed: 0,#string_protein_id,alias,source
3917207,9606.ENSP00000451768,KIAA0408,Ensembl_HGNC
3917264,9606.ENSP00000451768,SOGA3,Ensembl_HGNC
4027534,9606.ENSP00000469970,OPN1MW,Ensembl_HGNC
4027544,9606.ENSP00000469970,OPN1MW2,Ensembl_HGNC
4035263,9606.ENSP00000471017,MAGEA9,Ensembl_HGNC
4035282,9606.ENSP00000471017,MAGEA9B,Ensembl_HGNC
4045792,9606.ENSP00000472749,TMSB15A,Ensembl_HGNC
4045796,9606.ENSP00000472749,TMSB15B,Ensembl_HGNC
4061631,9606.ENSP00000475814,HIST2H2AA3,Ensembl_HGNC
4061634,9606.ENSP00000475814,HIST2H2AA4,Ensembl_HGNC


In [78]:
# Aliases excluded after manual curation (presumably so that each ambiguous
# STRING ID keeps a single gene name — verify against the curation notes).
excluded_aliases = [
    'KIAA0408', 'OPN1MW2', 'MAGEA9B', 'TMSB15B', 'HIST2H2AA4',
    'TBC1D3', 'CT45A4', 'CT45A6', 'C1QTNF5',
]
keep_row = ~filtered_alias['alias'].isin(excluded_aliases)
aliases = filtered_alias[keep_row][['#string_protein_id', 'alias']]

In [86]:
# Index the alias table by STRING protein ID.
# The guard keeps this cell idempotent: the in-place version consumed the column on
# the first run and raised KeyError when the cell was executed again. A missing
# column on a frame that is not yet indexed by ID still raises, as before.
if aliases.index.name != '#string_protein_id':
    aliases = aliases.set_index('#string_protein_id', drop=True)

In [89]:
# Lookup table keyed by the frame's index (STRING protein ID) -> gene name.
aliases_dict = aliases['alias'].to_dict()

In [95]:
# Translate both interaction partners from STRING protein IDs to gene names.
# `Series.map` with a dict performs the lookup in bulk and leaves NaN for IDs that
# have no alias — the same result as the previous per-row lambda, without one
# Python call per row of the links table.
string_ppi['gene1'] = string_ppi['protein1'].map(aliases_dict)
string_ppi['gene2'] = string_ppi['protein2'].map(aliases_dict)

In [107]:
# Keep the gene-level edge columns and discard interactions in which either
# partner could not be mapped to a gene name.
gene_columns = ['gene1', 'gene2']
edge_columns = gene_columns + ['combined_score']
hgnc_string_ppi = string_ppi[edge_columns].dropna(subset=gene_columns)

In [108]:
# Edge weight is the STRING combined score divided by 1000.
hgnc_string_ppi['weight'] = hgnc_string_ppi['combined_score'].div(1000)

In [120]:
# Persist the gene-level edge list (gene1, gene2, combined_score, weight) without the row index.
hgnc_string_ppi.to_csv('../../data/processed/ppis/string.csv', index=False)

In [36]:
# Reload the processed edge list so the cells below can run without redoing the ID conversion.
hgnc_string_ppi = pd.read_csv('../../data/processed/ppis/string.csv')

# STRING Graph Creation

In [None]:
# Build the undirected gene-level graph from the processed edge list. Without this
# step `graph` is undefined on a fresh kernel and every later cell fails with NameError.
# NOTE(review): the original construction cell is not present in the notebook. This
# rebuilds the graph from `hgnc_string_ppi`; vertex order may differ from previously
# saved artifacts, so regenerate the GML files and the adjacency matrix together.
edge_tuples = hgnc_string_ppi[['gene1', 'gene2', 'weight']].itertuples(index=False, name=None)
# TupleList stores the gene symbols in the `name` vertex attribute; weights=True
# keeps the third tuple item as the `weight` edge attribute.
graph = Graph.TupleList(edge_tuples, directed=False, weights=True)
graph.is_connected()

True

In [26]:
# Simplify the graph; when several edges are merged into one, keep the largest weight.
graph = graph.simplify(combine_edges={'weight': 'max'})

In [31]:
# Report the size of the current graph.
node_count, edge_count = graph.vcount(), graph.ecount()
print('Graph has {} nodes connected by {} edges.'.format(node_count, edge_count))

Graph has 19035 nodes connected by 5849499 edges.


In [32]:
# Save the current graph in GML format under the "weighted" file name.
graph.write_gml("../../data/processed/graph_string_weighted")

In [121]:
# Save the graph in GML format; this is the file reloaded further down.
# NOTE(review): when the cells run top to bottom this writes the same `graph` object as
# the "weighted" file in the previous cell — confirm whether two distinct files are intended.
graph.write_gml("../../data/processed/graph_string")

In [38]:
# Dense adjacency matrix of the graph, converted to a NumPy array and saved for reuse.
# The igraph Matrix gets its own name so `adj_matrix` only ever refers to the ndarray.
igraph_adjacency = graph.get_adjacency()
adj_matrix = np.array(igraph_adjacency.data)
# `fix_imports` is no longer passed: it only concerned Python 2 compatible pickles and
# is deprecated in recent NumPy releases. The saved file is unchanged.
np.save('../../data/processed/string_adjacency_matrix.npy', adj_matrix, allow_pickle=True)

In [2]:
# Reload the saved graph so the module analysis can start from the stored artifact.
graph = Graph.Read_GML("../../data/processed/graph_string")

In [3]:
# Reload the adjacency matrix saved earlier in this notebook.
# NOTE(review): it must have been generated from the same graph file as `graph`
# so that row/column positions match the vertex indices — confirm.
adj_matrix = np.load("../../data/processed/string_adjacency_matrix.npy")

# STRING Process and Disease Modules

In [4]:
# Load the interim Reactome and DisGeNET annotation tables.
reactome = pd.read_csv('../../data/interim/ReactomeReactions.csv')
disgenet = pd.read_csv('../../data/interim/disgenet.csv')

In [5]:
# Restrict both annotation tables to genes that are vertices of the STRING graph.
graph_gene_names = graph.vs['name']
in_graph_reactome = reactome['HGNC ID'].isin(graph_gene_names)
in_graph_disgenet = disgenet['geneSymbol'].isin(graph_gene_names)
filtered_reactome = reactome[in_graph_reactome]
filtered_disgenet = disgenet[in_graph_disgenet]
#filtered_disgenet = filtered_disgenet[filtered_disgenet['diseaseType']=='disease']

In [6]:
# Build the Reactome modules (get_protein_index presumably attaches each module's
# graph vertex indices as `protein_index` — verify in disease_process_proteins),
# then keep only modules with between 50 and 300 proteins, inclusive.
reactome_modules = get_protein_index(filtered_reactome, 'reactome', graph)
reactome_modules['module_size'] = reactome_modules['protein_index'].apply(len)
size_in_range = reactome_modules['module_size'].between(50, 300)
reactome_modules = reactome_modules[size_in_range].reset_index(drop=True)

In [7]:
# Build the DisGeNET modules the same way and keep only those with between
# 50 and 300 proteins, inclusive. The size column is named `len` here.
disgenet_modules = get_protein_index(filtered_disgenet, 'disgenet', graph)
disgenet_modules['len'] = disgenet_modules['protein_index'].apply(len)
size_in_range = disgenet_modules['len'].between(50, 300)
disgenet_modules = disgenet_modules[size_in_range].reset_index(drop=True)

## SCA & Conservative Modules

In [8]:
# Inspect the size-filtered DisGeNET modules before running SCA.
disgenet_modules

Unnamed: 0,process,proteins_ids,protein_index,len
0,C0000786,"[AGTR1, AHR, APOE, ARNT, CEACAM1, CD7, CD8A, C...","[418, 431, 844, 1027, 2906, 2747, 2759, 2745, ...",107
1,C0000822,"[AGTR1, AHR, APOE, ARNT, CEACAM1, CD7, CD8A, C...","[418, 431, 844, 1027, 2906, 2747, 2759, 2745, ...",107
2,C0001418,"[ABL1, ALOX5, ALOX12B, APC, BIRC5, APOA1, APOE...","[101, 559, 556, 800, 1511, 821, 844, 1318, 262...",114
3,C0001787,"[ACTG1, ADCY5, ANXA2, ATIC, CA2, CALCR, COL1A1...","[203, 306, 748, 1177, 2199, 2266, 3466, 3467, ...",61
4,C0001973,"[NAT1, ABO, ADCY5, ADCY7, ADH1A, ADH1B, ADH1C,...","[10339, 106, 306, 308, 317, 318, 319, 320, 321...",260
...,...,...,...,...
296,C4552766,"[AGTR1, AHR, APOE, ARNT, CEACAM1, CD7, CD8A, C...","[418, 431, 844, 1027, 2906, 2747, 2759, 2745, ...",107
297,C4704862,"[ADCY7, ALOX12, AQP9, AREG, RERE, BTG1, C8B, C...","[308, 555, 876, 885, 13601, 1650, 2138, 2268, ...",125
298,C4707243,"[ACTA2, ACVRL1, JAG1, BGN, CBS, COL1A1, COL1A2...","[199, 234, 7971, 1491, 2408, 3466, 3467, 3478,...",53
299,C4721453,"[ABCA1, ALOX12, ATF3, CACNA1B, CASP9, CDKN2A, ...","[31, 555, 1151, 2224, 2375, 2878, 9183, 3738, ...",54


In [9]:
# Register `progress_apply` on pandas objects so the SCA run shows a progress bar.
tqdm.pandas()
# Run SCA once per module; each call yields three values, expanded into three columns.
sca_output = disgenet_modules.progress_apply(
    lambda row: sca(row['protein_index'], graph, adj_matrix),
    axis=1,
    result_type='expand',
)
# Transposing gives one array per SCA output, each aligned with the module rows.
main_component, conservative_module, added_nodes = sca_output.T.values
disgenet_modules['main_component'] = main_component
disgenet_modules['conservative_module'] = conservative_module
disgenet_modules['added_nodes'] = added_nodes

  0%|          | 0/301 [00:00<?, ?it/s]

In [11]:
# Record the number of vertices in each SCA output column.
length_columns = {
    'len_sca': 'main_component',
    'len_conservative': 'conservative_module',
    'len_added_nodes': 'added_nodes',
}
for length_col, source_col in length_columns.items():
    disgenet_modules[length_col] = disgenet_modules[source_col].apply(len)

In [13]:
# Translate the vertex indices of each SCA output back to vertex names.
# `graph.vs['name']` builds a list of every vertex name on each call, so it is
# fetched once here instead of once per looked-up index as before.
vertex_names = graph.vs['name']
for index_col in ('main_component', 'conservative_module', 'added_nodes'):
    disgenet_modules[index_col + '_ids'] = disgenet_modules[index_col].apply(
        lambda indices: [vertex_names[i] for i in indices]
    )

In [16]:
# Relative growth of each module: nodes added by SCA divided by the original module size.
added_count = disgenet_modules['len_added_nodes']
original_size = disgenet_modules['len']
disgenet_modules['increase'] = added_count.div(original_size)

In [22]:
# Largest relative size increase across all DisGeNET modules.
disgenet_modules['increase'].max()

0.03636363636363636

The connectivity of the STRING network differs markedly from that of the APID and HuRI networks: here SCA increases module size by at most about 3.6% (the maximum shown above), so no SCA modules need to be discarded.

In [23]:
# Persist the DisGeNET modules together with their SCA results.
# NOTE(review): the list-valued columns are written as text in the CSV and will
# need parsing when the file is read back — confirm downstream readers handle this.
disgenet_modules.to_csv('../../data/processed/string_disgenet_modules.csv', index=False)

In [24]:
# Persist the size-filtered Reactome modules.
# NOTE(review): no SCA columns are computed for reactome_modules in the cells shown — confirm this is intended.
reactome_modules.to_csv('../../data/processed/string_reactome_modules.csv', index=False)