In [1]:
import ast
import importlib

import numpy as np
import pandas as pd
from igraph import Graph
from tqdm.notebook import tqdm

import disease_process_proteins
import steiner_tree
# Register tqdm's pandas integration so that `progress_apply` (used later in
# this notebook) is available on pandas objects.
tqdm.pandas()

# APID & HuRI Graph

In [119]:
# Load the combined APID + HuRI interaction table (comma-separated, header in
# the first row -- both are the pandas defaults) and report its row count.
apid_huri = pd.read_csv("../../data/processed/ppis/apid_huri_graph.csv")
print(len(apid_huri))
apid_huri.head()

282394


Unnamed: 0,protein_A,protein_B
0,SCRIB,ARHGEF7
1,SCRIB,NET1
2,KCNA5,SCRIB
3,VWCE,SCRIB
4,DNM1L,SCRIB


In [120]:
# Build an undirected graph from the interaction table; with use_vids=False the
# first two columns are treated as vertex names rather than integer ids.
apid_huri_graph = Graph.DataFrame(apid_huri, directed=False, use_vids=False)
print(apid_huri_graph.ecount())  # edge count before simplification

# Collapse the graph to a simple graph (igraph's default simplify() drops
# multi-edges and self-loops) and report the new edge count.
apid_huri_graph = apid_huri_graph.simplify()
print(apid_huri_graph.ecount())

282394
260627


In [121]:
# Restrict the graph to its largest connected component so that every pair of
# proteins is reachable (required for finite shortest-path distances later).
# `components()[0]` is merely the *first* component found, which is not
# guaranteed to be the largest one; `giant()` explicitly selects the largest.
if not apid_huri_graph.is_connected():
    apid_huri_graph = apid_huri_graph.components().giant()

# Sanity check: the graph must now be connected; also report the edge count.
print(apid_huri_graph.is_connected())
print(apid_huri_graph.ecount())

True
260624


In [122]:
# Persist the graph in GML format; it is read back with `Graph.Read_GML`
# further down in this notebook.
apid_huri_graph.write_gml("../../data/processed/apid_huri_graph")

In [123]:
# One row per graph vertex, holding the vertex name as `protein_id`; written
# to CSV so later sections can filter annotations to proteins in the graph.
apid_huri_gene_ids = pd.DataFrame({'protein_id': apid_huri_graph.vs['name']})
apid_huri_gene_ids.to_csv(
    '../../data/processed/apid_huri_gene_ids.csv',
    index=False,
)
apid_huri_gene_ids.head()

Unnamed: 0,protein_id
0,SCRIB
1,ARHGEF7
2,NET1
3,KCNA5
4,VWCE


In [None]:
# Materialise the graph's adjacency matrix as a NumPy array and persist it as
# .npy so it can be reloaded without recomputation.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt --
# confirm the .npy file on disk matches the current graph.
apid_huri_adj_matrix = np.array(apid_huri_graph.get_adjacency().data)
np.save(
    '../../data/processed/apid_huri_adjacency_matrix.npy',
    apid_huri_adj_matrix,
    allow_pickle=True,
    fix_imports=True,
)

KeyboardInterrupt: 

# Pathways and Diseases

## Load Data

In [2]:
# Load the interim DisGeNET gene-disease association table and report its size.
disgenet = pd.read_csv(filepath_or_buffer="../../data/interim/disgenet.csv")
print(len(disgenet))
disgenet.head(2)

83953


Unnamed: 0,entrez_id,protein_id,diseaseId,diseaseName,diseaseType,score
0,1,A1BG,C0019209,Hepatomegaly,phenotype,0.3
1,1,A1BG,C0036341,Schizophrenia,disease,0.3


In [3]:
# Load the interim Reactome reaction-to-protein table and report its size.
reactome = pd.read_csv(filepath_or_buffer="../../data/interim/reactome_reactions.csv")
print(len(reactome))
reactome.head()

33205


Unnamed: 0,Reactome_ID,protein_id,Event
0,R-HSA-481007,A1BG,Exocytosis of platelet alpha granule contents
1,R-HSA-6798748,A1BG,Exocytosis of secretory granule lumen proteins
2,R-HSA-6800434,A1BG,Exocytosis of ficolin-rich granule lumen proteins
3,R-HSA-8952289,CDH2,FAM20C phosphorylates FAM20C substrates
4,R-HSA-560473,MED6,Expression of ANGPTL4


In [4]:
# Reload the list of proteins present in the APID/HuRI graph (the CSV written
# in the graph-construction section).
gene_ids = pd.read_csv(filepath_or_buffer='../../data/processed/apid_huri_gene_ids.csv')
gene_ids.head()

Unnamed: 0,protein_id
0,SCRIB
1,ARHGEF7
2,NET1
3,KCNA5
4,VWCE


In [5]:
# Reload the persisted adjacency matrix and the GML graph so this section can
# run without re-executing the graph-construction cells.
apid_huri_adj_matrix = np.load("../../data/processed/apid_huri_adjacency_matrix.npy")
apid_huri_graph = Graph.Read_GML("../../data/processed/apid_huri_graph")

In [6]:
# Shortest-path lengths between all pairs of vertices, wrapped in a DataFrame
# whose row/column labels are the default integer positions.
sp = apid_huri_graph.distances()
sp_df = pd.DataFrame(data=sp)

## Reactome Pathway Modules

In [48]:
# Keep only Reactome annotations whose protein is a vertex of the graph, and
# only the two columns needed to define modules.
reactome_in_graph = reactome['protein_id'].isin(gene_ids['protein_id'])
reactome_modules = reactome.loc[reactome_in_graph, ['Reactome_ID', 'protein_id']]
print(len(reactome_modules))
reactome_modules.head(2)

32069


Unnamed: 0,Reactome_ID,protein_id
0,R-HSA-481007,A1BG
1,R-HSA-6798748,A1BG


In [49]:
# Pick up any edits to the helper module made since it was first imported.
importlib.reload(disease_process_proteins)

# Build the per-module table; judging by the displayed output it has one row
# per Reactome_ID with protein names, graph indices and module size.
reactome_proteins_indices = disease_process_proteins.get_protein_index(
    reactome_modules,
    apid_huri_graph,
)
print(len(reactome_proteins_indices))
reactome_proteins_indices.head(2)

433


Unnamed: 0,Reactome_ID,protein_id,protein_index,module_size
0,R-HSA-1031716,"[TRIM10, TRIM22, IRF9, IFI30, TRIM38, TRIM3, T...","[10480, 6234, 7001, 11737, 4192, 8083, 7685, 8...",73
1,R-HSA-112379,"[CDK7, CDK9, SUPT16H, LEO1, ERCC2, ERCC3, RTF1...","[3063, 2790, 512, 5493, 2565, 2775, 5494, 4148...",56


In [50]:
# Keep modules with between 50 and 300 proteins (both bounds inclusive).
reactome_size_ok = reactome_proteins_indices['module_size'].between(50, 300)
reactome_proteins_indices = reactome_proteins_indices[reactome_size_ok]

print(len(reactome_proteins_indices))
reactome_proteins_indices.head(2)

426


Unnamed: 0,Reactome_ID,protein_id,protein_index,module_size
0,R-HSA-1031716,"[TRIM10, TRIM22, IRF9, IFI30, TRIM38, TRIM3, T...","[10480, 6234, 7001, 11737, 4192, 8083, 7685, 8...",73
1,R-HSA-112379,"[CDK7, CDK9, SUPT16H, LEO1, ERCC2, ERCC3, RTF1...","[3063, 2790, 512, 5493, 2565, 2775, 5494, 4148...",56


In [51]:
# Use the generic `module_id` column name for the Reactome identifier.
reactome_proteins_indices = reactome_proteins_indices.rename(
    columns={'Reactome_ID': 'module_id'}
)
reactome_proteins_indices.head(2)

Unnamed: 0,module_id,protein_id,protein_index,module_size
0,R-HSA-1031716,"[TRIM10, TRIM22, IRF9, IFI30, TRIM38, TRIM3, T...","[10480, 6234, 7001, 11737, 4192, 8083, 7685, 8...",73
1,R-HSA-112379,"[CDK7, CDK9, SUPT16H, LEO1, ERCC2, ERCC3, RTF1...","[3063, 2790, 512, 5493, 2565, 2775, 5494, 4148...",56


In [52]:
# Persist the size-filtered Reactome modules (row index not written).
reactome_proteins_indices.to_csv(
    '../../data/processed/reactome_modules_apid_huri.csv',
    index=False,
)

### Add false annotations to modules

In [53]:
# Pick up any edits to the helper module made since it was last loaded.
importlib.reload(disease_process_proteins)

# Run the project helper that adds false annotations to each Reactome module,
# passing the shortest-path table and the graph; n_jobs=4 is forwarded as-is.
reactome_proteins_indices_fa = disease_process_proteins.add_false_annotations(
    reactome_proteins_indices,
    sp_df,
    apid_huri_graph,
    n_jobs=4,
)
reactome_proteins_indices_fa.head(2)

  0%|          | 0/426 [00:00<?, ?it/s]

100%|██████████| 426/426 [10:30<00:00,  1.48s/it]


Unnamed: 0,module_id,protein_id,protein_index
0,R-HSA-1031716,"[TRIM10, TRIM22, IRF9, IFI30, TRIM38, TRIM3, T...","[10480, 6234, 7001, 11737, 4192, 8083, 7685, 8..."
1,R-HSA-112379,"[CDK7, CDK9, SUPT16H, LEO1, ERCC2, ERCC3, RTF1...","[3063, 2790, 512, 5493, 2565, 2775, 5494, 4148..."


In [54]:
# Persist the Reactome modules with false annotations (row index not written).
reactome_proteins_indices_fa.to_csv(
    '../../data/processed/reactome_modules_fa_apid_huri.csv',
    index=False,
)

## Disgenet Disease Modules

In [7]:
# Keep only DisGeNET associations whose protein is a vertex of the graph, and
# only the two columns needed to define disease modules.
disgenet_in_graph = disgenet['protein_id'].isin(gene_ids['protein_id'])
disgenet_diseases = disgenet.loc[disgenet_in_graph, ['diseaseId', 'protein_id']]
print(len(disgenet_diseases))
disgenet_diseases.head(2)

78830


Unnamed: 0,diseaseId,protein_id
0,C0019209,A1BG
1,C0036341,A1BG


In [36]:
# Pick up any edits to the helper module made since it was last loaded.
importlib.reload(disease_process_proteins)

# Same helper as for Reactome, with 'diseaseId' passed as the third argument
# (presumably the grouping column -- confirm against the helper's signature).
disgenet_protein_indices = disease_process_proteins.get_protein_index(
    disgenet_diseases,
    apid_huri_graph,
    'diseaseId',
)
print(len(disgenet_protein_indices))
disgenet_protein_indices.head(2)

10859


Unnamed: 0,diseaseId,protein_id,protein_index,module_size
0,C0000737,[IFNA2],[9754],1
1,C0000744,[MTTP],[2991],1


In [38]:
# Keep disease modules with between 50 and 300 proteins (both bounds inclusive).
disgenet_size_ok = disgenet_protein_indices['module_size'].between(50, 300)
disgenet_protein_indices = disgenet_protein_indices[disgenet_size_ok]

print(len(disgenet_protein_indices))
disgenet_protein_indices.head(2)

300


Unnamed: 0,diseaseId,protein_id,protein_index,module_size
5,C0000786,"[AGTR1, AHR, ALPG, APOE, ARNT, CEACAM1, CD7, C...","[2375, 3514, 14913, 2195, 3758, 4206, 9816, 25...",104
8,C0000822,"[AGTR1, AHR, ALPG, APOE, ARNT, CEACAM1, CD7, C...","[2375, 3514, 14913, 2195, 3758, 4206, 9816, 25...",104


In [39]:
# Vertex names of the graph, indexable by vertex index; used below to map
# vertex indices back to protein identifiers.
ids = apid_huri_graph.vs['name']

In [41]:
# Pick up any edits to the steiner_tree module made since it was imported.
importlib.reload(steiner_tree)

# Work on a copy so the size-filtered module table itself is left unchanged.
disgenet_protein_indices_sca = disgenet_protein_indices.copy()

# Run steiner_tree.sca once per module row (with a tqdm progress bar) on the
# module's vertex indices, the graph and the adjacency matrix.
# result_type='expand' spreads each returned sequence over columns; transposing
# and taking .values yields one array per returned element, which the tuple
# assignment below stores positionally into the three new columns.
# NOTE(review): this assumes sca returns exactly three list-likes in the order
# (main_component, conservative_module, added_proteins) -- confirm in steiner_tree.
disgenet_protein_indices_sca['main_component'],\
    disgenet_protein_indices_sca['conservative_module'],\
        disgenet_protein_indices_sca['added_proteins'] = disgenet_protein_indices_sca.progress_apply(
            lambda row: steiner_tree.sca(row['protein_index'], apid_huri_graph, apid_huri_adj_matrix), axis=1, result_type='expand'
            ).T.values

  0%|          | 0/300 [00:00<?, ?it/s]

In [42]:
# Record the length of each per-module list produced by the SCA step.
for list_col, size_col in (
    ('main_component', 'main_component_size'),
    ('conservative_module', 'conservative_module_size'),
    ('added_proteins', 'n_added_proteins'),
):
    disgenet_protein_indices_sca[size_col] = disgenet_protein_indices_sca[list_col].map(len)

In [43]:
# Translate each list of vertex indices into the corresponding vertex names
# via the `ids` lookup, one new column per index column.
for index_col, name_col in (
    ('main_component', 'main_component_ids'),
    ('conservative_module', 'conservative_module_ids'),
    ('added_proteins', 'added_protein_ids'),
):
    disgenet_protein_indices_sca[name_col] = disgenet_protein_indices_sca[index_col].apply(
        lambda vertex_indices: [ids[i] for i in vertex_indices]
    )

In [45]:
# Ratio of added proteins to the original module size. Despite the column
# name this is a fraction (e.g. 0.44), not a percentage.
disgenet_protein_indices_sca['increase_pct'] = (
    disgenet_protein_indices_sca['n_added_proteins']
    .div(disgenet_protein_indices_sca['module_size'])
)

In [55]:
# Use the generic `module_id` column name for the disease identifier.
disgenet_protein_indices_sca = disgenet_protein_indices_sca.rename(
    columns={'diseaseId': 'module_id'}
)
disgenet_protein_indices_sca.head(2)

Unnamed: 0,module_id,protein_id,protein_index,module_size,main_component,conservative_module,added_proteins,main_component_size,conservative_module_size,n_added_proteins,main_component_ids,conservative_module_ids,added_protein_ids,increase_pct
5,C0000786,"[AGTR1, AHR, ALPG, APOE, ARNT, CEACAM1, CD7, C...","[2375, 3514, 14913, 2195, 3758, 4206, 9816, 25...",104,"[2375, 3514, 14913, 2195, 3758, 4206, 9816, 25...","[12291, 13317, 9737, 9739, 1548, 11791, 16923,...","[145, 1243, 1453, 2613, 7084, 1838, 932, 4497,...",148,102,46,"[AGTR1, AHR, ALPG, APOE, ARNT, CEACAM1, CD7, C...","[IFI44, MMP12, CD226, CEACAM6, MMP15, FLRT3, I...","[EGFR, KANK2, APP, TGM2, HSCB, UBQLN2, EBP, CU...",0.442308
8,C0000822,"[AGTR1, AHR, ALPG, APOE, ARNT, CEACAM1, CD7, C...","[2375, 3514, 14913, 2195, 3758, 4206, 9816, 25...",104,"[2375, 3514, 14913, 2195, 3758, 4206, 9816, 25...","[12291, 13317, 9737, 9739, 1548, 11791, 16923,...","[145, 1243, 1453, 2613, 7084, 1838, 932, 4497,...",148,102,46,"[AGTR1, AHR, ALPG, APOE, ARNT, CEACAM1, CD7, C...","[IFI44, MMP12, CD226, CEACAM6, MMP15, FLRT3, I...","[EGFR, KANK2, APP, TGM2, HSCB, UBQLN2, EBP, CU...",0.442308


In [47]:
# Persist the size-filtered DisGeNET modules (row index not written).
# NOTE(review): this frame still uses the `diseaseId` column name; the rename
# to `module_id` is applied only to `disgenet_protein_indices_sca`. Confirm
# that readers of this file expect `diseaseId`.
disgenet_protein_indices.to_csv(
    '../../data/processed/disgenet_modules_apid_huri.csv',
    index=False,
)

In [59]:
# SCA modules: keep diseases whose module grew by less than 40 % (fraction of
# added proteins < 0.4) and expose the main component under the common
# module_id / protein_id / protein_index / module_size schema.
small_increase = disgenet_protein_indices_sca['increase_pct'] < 0.4
disgenet_sca_modules = (
    disgenet_protein_indices_sca
    .loc[small_increase, ['module_id', 'main_component_ids', 'main_component', 'main_component_size']]
    .rename(columns={
        'main_component_ids': 'protein_id',
        'main_component': 'protein_index',
        'main_component_size': 'module_size',
    })
)
disgenet_sca_modules.head(2)

Unnamed: 0,module_id,protein_id,protein_index,module_size
24,C0001418,"[ABL1, ALOX5, ALOX12B, APC, BIRC5, APOA1, APOE...","[2510, 3877, 10381, 853, 2600, 2518, 2195, 789...",134
48,C0001973,"[NAT1, ABO, ADCY5, ADCY7, ADH1A, ADH1B, ADH1C,...","[14914, 16411, 13350, 9792, 16667, 9783, 14841...",325


In [63]:
# Conservative modules: all diseases are kept; the conservative module is
# exposed under the common module_id / protein_id / protein_index /
# module_size schema.
disgenet_conservative_modules = (
    disgenet_protein_indices_sca
    .loc[:, ['module_id', 'conservative_module_ids', 'conservative_module', 'conservative_module_size']]
    .rename(columns={
        'conservative_module_ids': 'protein_id',
        'conservative_module': 'protein_index',
        'conservative_module_size': 'module_size',
    })
)
disgenet_conservative_modules.head(2)

Unnamed: 0,module_id,protein_id,protein_index,module_size
5,C0000786,"[IFI44, MMP12, CD226, CEACAM6, MMP15, FLRT3, I...","[12291, 13317, 9737, 9739, 1548, 11791, 16923,...",102
8,C0000822,"[IFI44, MMP12, CD226, CEACAM6, MMP15, FLRT3, I...","[12291, 13317, 9737, 9739, 1548, 11791, 16923,...",102


In [65]:
# Persist both DisGeNET module variants (row index not written).
disgenet_sca_modules.to_csv(
    '../../data/processed/disgenet_sca_modules_apid_huri.csv',
    index=False,
)
disgenet_conservative_modules.to_csv(
    '../../data/processed/disgenet_conservative_modules_apid_huri.csv',
    index=False,
)

### Add false annotations

In [7]:
# The list-valued columns were written to CSV as their string representation,
# so they must be parsed back into Python lists on load.
# `ast.literal_eval` accepts only Python literals; unlike the previous `eval`,
# a corrupted or tampered CSV cannot execute arbitrary code. It raises
# ValueError/SyntaxError if a cell is not a plain literal.
list_column_converters = {
    'protein_id': ast.literal_eval,
    'protein_index': ast.literal_eval,
}

# SCA modules (main components of diseases with a small relative increase).
disgenet_sca_modules = pd.read_csv(
    '../../data/processed/disgenet_sca_modules_apid_huri.csv',
    converters=list_column_converters
    )
print(disgenet_sca_modules.shape[0])
display(disgenet_sca_modules.head(2))

# Conservative modules.
disgenet_conservative_modules = pd.read_csv(
    '../../data/processed/disgenet_conservative_modules_apid_huri.csv',
    converters=list_column_converters
    )
print(disgenet_conservative_modules.shape[0])
disgenet_conservative_modules.head(2)

204


Unnamed: 0,module_id,protein_id,protein_index,module_size
0,C0001418,"[ABL1, ALOX5, ALOX12B, APC, BIRC5, APOA1, APOE...","[2510, 3877, 10381, 853, 2600, 2518, 2195, 789...",134
1,C0001973,"[NAT1, ABO, ADCY5, ADCY7, ADH1A, ADH1B, ADH1C,...","[14914, 16411, 13350, 9792, 16667, 9783, 14841...",325


300


Unnamed: 0,module_id,protein_id,protein_index,module_size
0,C0000786,"[IFI44, MMP12, CD226, CEACAM6, MMP15, FLRT3, I...","[12291, 13317, 9737, 9739, 1548, 11791, 16923,...",102
1,C0000822,"[IFI44, MMP12, CD226, CEACAM6, MMP15, FLRT3, I...","[12291, 13317, 9737, 9739, 1548, 11791, 16923,...",102


In [None]:
# Pick up any edits to the helper module made since it was last loaded.
importlib.reload(disease_process_proteins)

# Add false annotations to the SCA modules using the shortest-path table and
# the graph; n_jobs=4 is forwarded to the helper as-is.
disgenet_sca_modules_fa = disease_process_proteins.add_false_annotations(
    disgenet_sca_modules,
    sp_df,
    apid_huri_graph,
    n_jobs=4,
)
disgenet_sca_modules_fa.head(2)

In [16]:
# Pick up any edits to the helper module made since it was last loaded.
importlib.reload(disease_process_proteins)

# Add false annotations to the conservative modules using the shortest-path
# table and the graph; n_jobs=4 is forwarded to the helper as-is.
disgenet_conservative_modules_fa = disease_process_proteins.add_false_annotations(
    disgenet_conservative_modules,
    sp_df,
    apid_huri_graph,
    n_jobs=4,
)
disgenet_conservative_modules_fa.head(2)

100%|██████████| 300/300 [06:35<00:00,  1.32s/it]


Unnamed: 0,module_id,protein_id,protein_index
0,C0000786,"[IFI44, MMP12, CD226, CEACAM6, MMP15, FLRT3, I...","[12291, 13317, 9737, 9739, 1548, 11791, 16923,..."
1,C0000822,"[IFI44, MMP12, CD226, CEACAM6, MMP15, FLRT3, I...","[12291, 13317, 9737, 9739, 1548, 11791, 16923,..."


In [18]:
# Persist both false-annotation variants (row index not written).
disgenet_sca_modules_fa.to_csv(
    '../../data/processed/disgenet_sca_modules_fa_apid_huri.csv',
    index=False,
)
disgenet_conservative_modules_fa.to_csv(
    '../../data/processed/disgenet_conservative_modules_fa_apid_huri.csv',
    index=False,
)