In [1]:
from interactome_funcs import *

In [2]:
# reviewed homo sapiens proteome from UniProt 2022_04
# this dataset is filtered according to certain thresholds
proteome = pd.read_excel("processed_data/uniprot/30aa_nounchar_noputative_ref_proteome_protein_existence_filtered_02.xlsx", header=0)
len(proteome)

18401

## HuRI

http://www.interactome-atlas.org/download

In [41]:
huri = pd.read_csv('raw_data/interactomes/huri/HuRI.tsv', sep='\t', header=None, names=['protein1', 'protein2']) # 52548

In [5]:
# extract unique ENSEMBL IDs for mapping to UniProt IDs
huri_all_genes = huri[['protein1', 'protein2']].to_numpy().flatten()
huri_unique_genes = pd.DataFrame(np.unique(huri_all_genes), columns=['Ensembl']) # 8272
huri_unique_genes.to_csv('raw_data/interactomes/huri/huri_ensemble_genes.txt', sep=' ', index=False, header=None)

In [48]:
# uniprot id mapping results
huri_mapping = pd.read_csv('raw_data/interactomes/huri/uniprot-download_true_fields_accession_2Cid_2Cgene_names_2Corganism_-2022.12.05-09.18.01.83.tsv', sep='\t')
huri_mapping = huri_mapping[['From', 'Entry']]
len(huri_mapped)

8158

In [54]:
# sanity check. for 1 by 1 mapping.
# huri_mapping.From.str.split(',').apply(len).sum()

In [7]:
# duplicate check
# same ENSG IDs are mapped to different UniProt IDs
# keep the file
huri_duplicate_ensembl = huri_mapping[huri_mapping.duplicated('From', keep=False)] # len=20, 10 IDs duplicated
huri_duplicate_ensembl.to_excel("raw_data/interactomes/huri/huri_duplicate_ensembl.xlsx", index=False)

In [40]:
# duplicates are investigated. from huri_psi file, mapping is checked. incorrect ones are excluded from mapped list.
# for ENSG00000186184, both mappings were incorrect according to huri_psi file but the correct one was not included in the ref. proteome 
# we selected P0DPB6 as it was the canonical one from ref protein and excluded P0DPB5.
exclude = ['Q9Y4C0', 'Q9UII6', 'Q5TFQ8', 'Q9BXH1', 'P06881', 'Q9P0M2', 'P42166', 'Q6ZVN7', 'L0R6Q1', 'P0DPB5']  
huri_mapping = huri_mapping[huri_mapping["Entry"].str.contains('|'.join(exclude)) == False]

In [49]:
# map ENSG to UniProt and drop if NA
huri['protein1_uniprot'] = huri.protein1.map(huri_mapping.set_index('From')['Entry'].to_dict())
huri['protein2_uniprot'] = huri.protein2.map(huri_mapping.set_index('From')['Entry'].to_dict())
huri = huri.dropna() # 51343

In [61]:
# map to reference proteome data
huri_proteome_mapped = huri[(huri.protein1_uniprot.isin(proteome.Entry)) & huri.protein2_uniprot.isin(proteome.Entry)] # 49050

In [64]:
# remove redundant ones (A-B, B-A)
huri_final = remove_duplicates(huri_proteome_mapped) # 48763 interactions
huri_final.to_excel('processed_data/interactomes/huri/huri_final.xlsx', index=False)

In [66]:
# check the number of nodes/proteins
np.unique(huri_final[['protein1_uniprot', 'protein2_uniprot']].to_numpy().flatten()).shape # 7889 proteins

(7889,)

## STRING

https://string-db.org/cgi/download?sessionId=beWWyYG7kyNt&species_text=Homo+sapiens

In [3]:
string = pd.read_csv('raw_data/interactomes/string/9606.protein.physical.links.detailed.v11.5.txt', sep=' ') # 1991832

### adjust the scoring

http://version10.string-db.org/help/faq/#how-are-the-scores-computed <br>
https://stringdb-static.org/download/combine_subscores.py python script

In [102]:
prior = 0.041

def compute_prior_away(score, prior=0.041):
    if score < prior: score = prior
    score_no_prior = (score - prior) / (1 - prior)
    return score_no_prior

string[['experimental', 'database']] = string[['experimental', 'database']].div(1000)
string['experimental_nop'] = string['experimental'].apply(compute_prior_away)
string['database_nop'] = string['database'].apply(compute_prior_away)
string['expdtb_combined'] = 1.0 - ((1.0 - string['experimental_nop']) * (1 - string['database_nop']))
string['expdtb_combined'] *= (1.0 - prior)
string['expdtb_combined'] += prior

In [128]:
# get high-conf interactions
string = string[['protein1', 'protein2', 'experimental', 'database', 'expdtb_combined']]
string_high_conf = string[string['expdtb_combined'] > 0.7] # 137718
string_high_conf.to_excel('processed_data/interactomes/string/string_high_conf.xlsx', index=False)

In [None]:
# we need unique ENSEMBL IDs for mapping to UniProt IDs
string_high_conf.protein1 = string_high_conf.protein1.str.lstrip("9606.")
string_high_conf.protein2 = string_high_conf.protein2.str.lstrip("9606.")
string_all_ensprots = string_high_conf[['protein1', 'protein2']].to_numpy().flatten()
string_uniq_ensprots = pd.DataFrame(np.unique(string_all_ensprots), columns=['Ensembl']) # 8220
#string_uniq_ensprots.to_csv('raw_data/interactomes/string/string_ensemble_prots.txt', sep=' ', header=None, index=False)

In [115]:
# uniprot id mapping results
string_mapping = pd.read_csv('raw_data/interactomes/string/uniprot-download_true_fields_accession_2Cid_2Cgene_names_2Corganism_-2022.12.05-11.31.44.62.tsv', sep='\t')
string_mapping = string_mapping[['From', 'Entry']]
len(string_mapping)

7632

In [113]:
# string_mapping[string_mapping.duplicated('From', keep=False)] # no dups

In [None]:
# map ENSP to UniProt
string_high_conf['protein1_uniprot'] = string_high_conf.protein1.map(string_mapping.set_index('From')['Entry'].to_dict())
string_high_conf['protein2_uniprot'] = string_high_conf.protein2.map(string_mapping.set_index('From')['Entry'].to_dict())
string_high_conf = string_high_conf.dropna() # 121618

In [134]:
# map to reference proteome data
string_high_conf_proteome_mapped = string_high_conf[(string_high_conf.protein1_uniprot.isin(proteome.Entry)) & string_high_conf.protein2_uniprot.isin(proteome.Entry)] # 120410

In [136]:
# remove redundant ones (A-B, B-A)
string_high_conf_final = remove_duplicates(string_high_conf_proteome_mapped) # 57192 interactions
string_high_conf_final.to_excel('processed_data/interactomes/string/string_high_conf_final.xlsx', index=False)

In [138]:
# check the number of nodes/proteins
np.unique(string_high_conf_final[['protein1_uniprot', 'protein2_uniprot']].to_numpy().flatten()).shape # 7327 proteins

(7327,)

## BioPlex

https://bioplex.hms.harvard.edu/interactions.php 2021 datasets

In [3]:
bioplex_293t = pd.read_csv('raw_data/interactomes/bioplex/BioPlex_293T_Network_10K_Dec_2019.tsv', sep='\t') # 118162
bioplex_hct116 = pd.read_csv('raw_data/interactomes/bioplex/BioPlex_HCT116_Network_5.5K_Dec_2019.tsv', sep='\t') # 70966
bioplex = pd.concat([bioplex_293t, bioplex_hct116]) # 189128

In [5]:
# map to reference proteome data (both cases)
bioplex_proteome_mapped = bioplex[(bioplex.UniprotA.isin(proteome.Entry)) & bioplex.UniprotB.isin(proteome.Entry)] # 87568, no NA values
bioplex_293t_proteome_mapped = bioplex_293t[(bioplex_293t.UniprotA.isin(proteome.Entry)) & bioplex_293t.UniprotB.isin(proteome.Entry)] # 53136, no NA values

In [8]:
# remove redundant ones (A-B, B-A)
bioplex_final = remove_duplicates(bioplex_proteome_mapped, int_dtb_ids=['UniprotA', 'UniprotB']) # 77713 interactions
bioplex_final = bioplex_final.rename(columns={'UniprotA': 'protein1_uniprot', 'UniprotB': 'protein2_uniprot'})
bioplex_final.to_excel('processed_data/interactomes/bioplex/bioplex_final.xlsx', index=False)

bioplex_293t_final = remove_duplicates(bioplex_293t_proteome_mapped, int_dtb_ids=['UniprotA', 'UniprotB']) # 53136 interactions
bioplex_293t_final = bioplex_293t_final.rename(columns={'UniprotA': 'protein1_uniprot', 'UniprotB': 'protein2_uniprot'})
bioplex_293t_final.to_excel('processed_data/interactomes/bioplex/bioplex_293t_final.xlsx', index=False)

In [11]:
# check the number of nodes/proteins
print(np.unique(bioplex_final[['protein1_uniprot', 'protein2_uniprot']].to_numpy().flatten()).shape) # 9615 proteins
print(np.unique(bioplex_293t_final[['protein1_uniprot', 'protein2_uniprot']].to_numpy().flatten()).shape) # 8806 proteins

(9615,)
(8806,)


## HIPPIE

http://cbdm-01.zdv.uni-mainz.de/~mschaefer/hippie/download.php

current release (v2.3)	last updated: 04/29/22

In [3]:
hippie = pd.read_csv('raw_data/interactomes/hippie/hippie_current.txt', sep='\t', header=None, names=['protein1', 'unk1', 'protein2', 'unk2', 'score', 'information'])
# there are NAs only in protein1 & protein2, drop them
hippie = hippie.dropna() # 819719

In [4]:
# select binary PPI detection methods
hippie_binary = hippie[hippie['information'].str.contains('Two-hybrid|atomic force microscopy|fluorescent resonance energy transfer')] 

In [7]:
# select high conf PPIs
hippie_binary_hq = hippie_binary[hippie_binary['score'] >= 0.73] # 25269

In [None]:
# map to reference proteome data
hippie_binary_hq['protein1_uniprot'] = hippie_binary_hq.protein1.map(proteome.set_index('Entry Name')['Entry'].to_dict())
hippie_binary_hq['protein2_uniprot'] = hippie_binary_hq.protein2.map(proteome.set_index('Entry Name')['Entry'].to_dict())
hippie_binary_hq = hippie_binary_hq.dropna() # 22602

In [14]:
# remove redundant ones and check the number of nodes and proteins
hippie_binary_hq = remove_duplicates(hippie_binary_hq)
hippie_binary_hq.to_excel('processed_data/interactomes/hippie/hippie_binary_hq_final.xlsx', index=False)
print(len(hippie_binary_hq)) # 22280 interactions
print(np.unique(hippie_binary_hq[['protein1_uniprot', 'protein2_uniprot']].to_numpy().flatten()).shape) # 7640 proteins

22280
(7640,)


## APID

http://cicblade.dep.usal.es:8080/APID/init.action

APID version: March 2021

Level 2: interactions proven by at least 1 binary method (binary interactomes)

If you want to filter out inter-species interactions select "YES" and download again

In [3]:
apid = pd.read_csv('raw_data/interactomes/apid/9606_noISI_Q1.txt', sep='\t') # 135055

In [22]:
# apid[['UniprotID_A', 'UniprotID_B']].dropna() # no NAs

In [14]:
apid[apid['CurationEvents'] >= 2]

Unnamed: 0,InteractionID,UniprotID_A,UniprotName_A,GeneName_A,UniprotID_B,UniprotName_B,GeneName_B,ExpEvidences,Methods,Publications,3DStructures,CurationEvents
0,1205000,Q14160,SCRIB_HUMAN,SCRIB,B7Z2Y1,B7Z2Y1_HUMAN,,1,1,1,0,3
1,1205001,Q14160,SCRIB_HUMAN,SCRIB,Q14155,ARHG7_HUMAN,ARHGEF7,11,8,8,0,20
2,1205002,Q14160,SCRIB_HUMAN,SCRIB,Q7Z628,ARHG8_HUMAN,NET1,2,2,2,0,2
3,1205003,P22460,KCNA5_HUMAN,KCNA5,Q14160,SCRIB_HUMAN,SCRIB,1,1,1,0,2
5,1205005,O00429,DNM1L_HUMAN,DNM1L,Q14160,SCRIB_HUMAN,SCRIB,1,1,1,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
134615,2424643,Q13361,MFAP5_HUMAN,MFAP5,P35556,FBN2_HUMAN,FBN2,2,2,2,0,2
134768,2426823,P24386,RAE1_HUMAN,CHM,P20338,RAB4A_HUMAN,RAB4A,2,2,2,0,2
134941,2427806,Q9NY26,S39A1_HUMAN,SLC39A1,Q13303,KCAB2_HUMAN,KCNAB2,1,2,1,0,2
134946,2427812,Q9NP94,S39A2_HUMAN,SLC39A2,Q13303,KCAB2_HUMAN,KCNAB2,1,2,1,0,2


In [79]:
# map to reference proteome data
apid_proteome_mapped = apid[(apid.UniprotID_A.isin(proteome.Entry)) & apid.UniprotID_B.isin(proteome.Entry)] # 125722

In [26]:
# remove redundant ones (A-B, B-A)
apid_final = remove_duplicates(apid_proteome_mapped, int_dtb_ids=['UniprotID_A', 'UniprotID_B']) # 125722 interactions
apid_final.to_excel('processed_data/interactomes/apid/apid_final.xlsx', index=False)

In [28]:
# check the number of nodes/proteins
np.unique(apid_final[['UniprotID_A', 'UniprotID_B']].to_numpy().flatten()).shape # 14854 proteins

(14854,)

## PICKLE

PICKLE Release 3.3 (Oct 1, 2021)

http://www.pickle.gr/Downloads#HUMAN-3-3

PPI network (based on IntAct release 239, BioGRID release 4.4.198 and HPRD release 9) > Normalized at the Protein (UniProt) level > Cross-checked (Default) (UniProt IDs: 16420, Interactions: 218025)

In [15]:
pickle = pd.read_csv('raw_data/interactomes/pickle/UniProtNormalizedTabular-default.txt', sep='\t') # 218025

In [31]:
# map to reference proteome data
pickle_proteome_mapped = pickle[(pickle.InteractorA.isin(proteome.Entry)) & pickle.InteractorB.isin(proteome.Entry)] # 211943, no NAs

In [34]:
# remove redundant ones (A-B, B-A)
pickle_final = remove_duplicates(pickle_proteome_mapped, int_dtb_ids=['InteractorA', 'InteractorB']) # 211943 interactions
pickle_final.to_excel('processed_data/interactomes/pickle/pickle_final.xlsx', index=False)

In [36]:
# check the number of nodes/proteins
np.unique(pickle_final[['InteractorA', 'InteractorB']].to_numpy().flatten()).shape # 15922 proteins

(15922,)

## BioGRID

https://downloads.thebiogrid.org/File/BioGRID/Release-Archive/BIOGRID-4.4.216/BIOGRID-ORGANISM-4.4.216.tab3.zip

Release Version: 4.4.216

Last Modified: November 29th, 2022

In [3]:
biogrid = pd.read_csv('raw_data/interactomes/biogrid/BIOGRID-ORGANISM-Homo_sapiens-4.4.216.tab3.txt', sep='\t') # 1120006

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
# removing 'genetic' system type
biogrid = biogrid[biogrid['Experimental System Type'] == 'physical'] # 1103242

In [6]:
# map to reference proteome data
biogrid_proteome_mapped = biogrid[(biogrid['SWISS-PROT Accessions Interactor A'].isin(proteome.Entry)) & biogrid['SWISS-PROT Accessions Interactor B'].isin(proteome.Entry)] # 974465

In [47]:
# remove redundant ones (A-B, B-A)
biogrid_final = remove_duplicates(biogrid_proteome_mapped, int_dtb_ids=['SWISS-PROT Accessions Interactor A', 'SWISS-PROT Accessions Interactor B']) # 719566 interactions
biogrid_final.to_csv('processed_data/interactomes/biogrid/biogrid_final.tsv', sep='\t', index=False)

In [49]:
# check the number of nodes/proteins
np.unique(biogrid_final[['SWISS-PROT Accessions Interactor A', 'SWISS-PROT Accessions Interactor B']].to_numpy().flatten()).shape # 17100 proteins

(17100,)

## IID

http://iid.ophid.utoronto.ca/ downloads section

In [3]:
iid = pd.read_csv('raw_data/interactomes/iid/human_annotated_PPIs.txt', sep='\t') # 1209534

  iid = pd.read_csv('raw_data/interactomes/iid/human_annotated_PPIs.txt', sep='\t') #


In [11]:
iid_exp = iid[iid.evidence_type.str.contains('exp')] # 560628, no NA ids

In [14]:
# map to reference proteome data
iid_exp_proteome_mapped = iid_exp[(iid_exp['uniprot1'].isin(proteome.Entry)) & iid_exp['uniprot2'].isin(proteome.Entry)] # 542157

In [22]:
# remove redundant ones (A-B, B-A)
iid_exp_final = remove_duplicates(iid_exp_proteome_mapped, int_dtb_ids=['uniprot1', 'uniprot2']) # 542157 interactions
iid_exp_final.to_csv('processed_data/interactomes/iid/iid_exp_final.tsv', sep='\t', index=False)

In [24]:
# check the number of nodes/proteins
np.unique(iid_exp_final[['uniprot1', 'uniprot2']].to_numpy().flatten()).shape # 17331 proteins

(17331,)

# Final Dataset

In [None]:
concatted = pd.concat([huri_final, string_high_conf_final, hippie_binary_hq])

In [None]:
interactome = remove_duplicates(concatted) # 117897 interactions
# np.unique(interactome[['protein1_uniprot', 'protein2_uniprot']].to_numpy().flatten()).shape # 12748 proteins

In [None]:
interactome.to_excel('processed_data/interactomes/interactome_final.xlsx', index=False)