In [2]:
from interactome_funcs import *

In [3]:
# Reviewed Homo sapiens proteome from UniProt release 2022_04.
# This dataset has already been filtered according to certain thresholds.
proteome = pd.read_excel(
    "processed_data/uniprot/30aa_nounchar_noputative_ref_proteome_protein_existence_filtered_02.xlsx",
    header=0,
)

# Filtered PDB structures for UniProt entries.
pdb = pd.read_csv(
    "processed_data/uniprot/proteome_have_pdb_begin_end_missing_consec_greater_30pdb.csv"
)

# Filtered homology-modelling datasets (all data). The identifier columns
# ('UniprotID' / 'UniProtKB_ac') are renamed to the common name 'Entry'.
modbase = pd.read_csv(
    'processed_data/modbase/modbase_30aa_hq.tsv', sep='\t'
).rename(columns={'UniprotID': 'Entry'})
swissmodel = pd.read_csv(
    'processed_data/swissmodel/swissmodel_30aa_hq.tsv', sep='\t'
).rename(columns={'UniProtKB_ac': 'Entry'})

# Filtered AlphaFold structures: keep only the rows whose 'Situation' is 'Yes'.
af_85 = pd.read_excel('processed_data/alphafold/all_af_acc_85.xlsx').loc[
    lambda frame: frame['Situation'] == 'Yes'
]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
# interactome databases
huri = pd.read_excel('processed_data/interactomes/huri/huri_final.xlsx')
string = pd.read_excel('processed_data/interactomes/string/string_high_conf_final.xlsx')
bioplex = pd.read_excel('processed_data/interactomes/bioplex/bioplex_293t_final.xlsx')
hippie = pd.read_excel('processed_data/interactomes/hippie/hippie_binary_hq_final.xlsx')
apid = pd.read_excel('processed_data/interactomes/apid/apid_final.xlsx')
pickle = pd.read_excel('processed_data/interactomes/pickle/pickle_final.xlsx')
biogrid = pd.read_csv('processed_data/interactomes/biogrid/biogrid_final.tsv', sep='\t')
iid = pd.read_csv('processed_data/interactomes/iid/iid_exp_final.tsv', sep='\t')

# Coverage of human interactome databases

In this part, we use all available high-quality (HQ) models from the homology-modelling databases (SWISS-MODEL and ModBase).

Each coverage result is printed as a tuple in this order: (number of proteins with a structure, number of interactions in which both partners have a structure).

## HuRI

In [5]:
# HuRI summary: dataset totals first, then coverage per structure source.

# Sanity check: row count, row count without missing values, and row count
# after remove_duplicates, printed together as one tuple.
huri_sizes = (len(huri), len(huri.dropna()), len(remove_duplicates(huri)))
print(f"Total number of interactions: {huri_sizes}")

# Distinct identifiers appearing in either interactor column.
huri_proteins = np.unique(huri[['protein1_uniprot', 'protein2_uniprot']].to_numpy().flatten())
print(f"Total number of proteins: {huri_proteins.shape}")

# Per the notebook text, each coverage result is ordered as
# (num of proteins, num of interactions where both sides have structures).
for source_label, structure_table in (
    ("PDB", pdb),
    ("SWISS-MODEL", swissmodel),
    ("ModBase", modbase),
    ("AlphaFold (85% higher than 70%)", af_85),
):
    print(f"{source_label}: {get_coverage(huri, structure_table)}")

Total number of interactions: (48763, 48763, 48763)
Total number of proteins: (7889,)
PDB: (3317, 7046)
SWISS-MODEL: (3414, 7014)
ModBase: (3729, 8938)
AlphaFold (85% higher than 70%): (2026, 2737)


## BioPlex

In [7]:
# BioPlex summary: dataset totals first, then coverage per structure source.

# Sanity check: row count, row count without missing values, and row count
# after remove_duplicates, printed together as one tuple.
bioplex_sizes = (len(bioplex), len(bioplex.dropna()), len(remove_duplicates(bioplex)))
print(f"Total number of interactions: {bioplex_sizes}")

# Distinct identifiers appearing in either interactor column.
bioplex_proteins = np.unique(bioplex[['protein1_uniprot', 'protein2_uniprot']].to_numpy().flatten())
print(f"Total number of proteins: {bioplex_proteins.shape}")

# Per the notebook text, each coverage result is ordered as
# (num of proteins, num of interactions where both sides have structures).
for source_label, structure_table in (
    ("PDB", pdb),
    ("SWISS-MODEL", swissmodel),
    ("ModBase", modbase),
    ("AlphaFold (85% higher than 70%)", af_85),
):
    print(f"{source_label}: {get_coverage(bioplex, structure_table)}")

Total number of interactions: (53136, 53136, 53136)
Total number of proteins: (8806,)
PDB: (3888, 14036)
SWISS-MODEL: (4242, 13945)
ModBase: (4615, 15162)
AlphaFold (85% higher than 70%): (2819, 5802)


## STRING

In [6]:
# STRING summary: dataset totals first, then coverage per structure source.

# Sanity check: row count, row count without missing values, and row count
# after remove_duplicates, printed together as one tuple.
string_sizes = (len(string), len(string.dropna()), len(remove_duplicates(string)))
print(f"Total number of interactions: {string_sizes}")

# Distinct identifiers appearing in either interactor column.
string_proteins = np.unique(string[['protein1_uniprot', 'protein2_uniprot']].to_numpy().flatten())
print(f"Total number of proteins: {string_proteins.shape}")

# Per the notebook text, each coverage result is ordered as
# (num of proteins, num of interactions where both sides have structures).
for source_label, structure_table in (
    ("PDB", pdb),
    ("SWISS-MODEL", swissmodel),
    ("ModBase", modbase),
    ("AlphaFold (85% higher than 70%)", af_85),
):
    print(f"{source_label}: {get_coverage(string, structure_table)}")

Total number of interactions: (57192, 57192, 57192)
Total number of proteins: (7327,)
PDB: (4518, 38005)
SWISS-MODEL: (4214, 24829)
ModBase: (4095, 21442)
AlphaFold (85% higher than 70%): (2085, 9941)


## HIPPIE

In [8]:
# HIPPIE summary: dataset totals first, then coverage per structure source.

# Sanity check: row count, row count without missing values, and row count
# after remove_duplicates, printed together as one tuple.
hippie_sizes = (len(hippie), len(hippie.dropna()), len(remove_duplicates(hippie)))
print(f"Total number of interactions: {hippie_sizes}")

# Distinct identifiers appearing in either interactor column.
hippie_proteins = np.unique(hippie[['protein1_uniprot', 'protein2_uniprot']].to_numpy().flatten())
print(f"Total number of proteins: {hippie_proteins.shape}")

# Per the notebook text, each coverage result is ordered as
# (num of proteins, num of interactions where both sides have structures).
for source_label, structure_table in (
    ("PDB", pdb),
    ("SWISS-MODEL", swissmodel),
    ("ModBase", modbase),
    ("AlphaFold (85% higher than 70%)", af_85),
):
    print(f"{source_label}: {get_coverage(hippie, structure_table)}")

Total number of interactions: (22280, 22280, 22280)
Total number of proteins: (7640,)
PDB: (4164, 10193)
SWISS-MODEL: (3971, 7795)
ModBase: (4231, 8412)
AlphaFold (85% higher than 70%): (1910, 1586)


## APID

In [10]:
# APID summary: dataset totals first, then coverage per structure source.

# The two interactor-ID columns of this table. A fresh list is built from this
# tuple for every call so the helpers never share one mutable argument.
apid_id_cols = ('UniprotID_A', 'UniprotID_B')

# Sanity check: row count, row count without missing values, and row count
# after remove_duplicates, printed together as one tuple.
apid_sizes = (
    len(apid),
    len(apid.dropna()),
    len(remove_duplicates(apid, int_dtb_ids=list(apid_id_cols))),
)
print(f"Total number of interactions: {apid_sizes}")

# Distinct identifiers appearing in either interactor column.
apid_proteins = np.unique(apid[list(apid_id_cols)].to_numpy().flatten())
print(f"Total number of proteins: {apid_proteins.shape}")

# Per the notebook text, each coverage result is ordered as
# (num of proteins, num of interactions where both sides have structures).
for source_label, structure_table in (
    ("PDB", pdb),
    ("SWISS-MODEL", swissmodel),
    ("ModBase", modbase),
    ("AlphaFold (85% higher than 70%)", af_85),
):
    print(f"{source_label}: {get_coverage(apid, structure_table, int_dtb_ids=list(apid_id_cols))}")

Total number of interactions: (125722, 125722, 125722)
Total number of proteins: (14854,)
PDB: (6508, 39922)
SWISS-MODEL: (6774, 32453)
ModBase: (7236, 36390)
AlphaFold (85% higher than 70%): (3897, 8394)


## PICKLE

In [11]:
# PICKLE summary: dataset totals first, then coverage per structure source.

# The two interactor-ID columns of this table. A fresh list is built from this
# tuple for every call so the helpers never share one mutable argument.
pickle_id_cols = ('InteractorA', 'InteractorB')

# Sanity check: row count, row count without missing values, and row count
# after remove_duplicates, printed together as one tuple.
pickle_sizes = (
    len(pickle),
    len(pickle.dropna()),
    len(remove_duplicates(pickle, int_dtb_ids=list(pickle_id_cols))),
)
print(f"Total number of interactions: {pickle_sizes}")

# Distinct identifiers appearing in either interactor column.
pickle_proteins = np.unique(pickle[list(pickle_id_cols)].to_numpy().flatten())
print(f"Total number of proteins: {pickle_proteins.shape}")

# Per the notebook text, each coverage result is ordered as
# (num of proteins, num of interactions where both sides have structures).
for source_label, structure_table in (
    ("PDB", pdb),
    ("SWISS-MODEL", swissmodel),
    ("ModBase", modbase),
    ("AlphaFold (85% higher than 70%)", af_85),
):
    print(f"{source_label}: {get_coverage(pickle, structure_table, int_dtb_ids=list(pickle_id_cols))}")

Total number of interactions: (211943, 211943, 211943)
Total number of proteins: (15922,)
PDB: (6852, 88408)
SWISS-MODEL: (7213, 68328)
ModBase: (7719, 71761)
AlphaFold (85% higher than 70%): (4191, 14794)


## BioGRID

In [12]:
# BioGRID summary: dataset totals first, then coverage per structure source.

# The two interactor-ID columns of this table. A fresh list is built from this
# tuple for every call so the helpers never share one mutable argument.
biogrid_id_cols = (
    'SWISS-PROT Accessions Interactor A',
    'SWISS-PROT Accessions Interactor B',
)

# Sanity check: row count, row count without missing values, and row count
# after remove_duplicates, printed together as one tuple.
biogrid_sizes = (
    len(biogrid),
    len(biogrid.dropna()),
    len(remove_duplicates(biogrid, int_dtb_ids=list(biogrid_id_cols))),
)
print(f"Total number of interactions: {biogrid_sizes}")

# Distinct identifiers appearing in either interactor column.
biogrid_proteins = np.unique(biogrid[list(biogrid_id_cols)].to_numpy().flatten())
print(f"Total number of proteins: {biogrid_proteins.shape}")

# Per the notebook text, each coverage result is ordered as
# (num of proteins, num of interactions where both sides have structures).
for source_label, structure_table in (
    ("PDB", pdb),
    ("SWISS-MODEL", swissmodel),
    ("ModBase", modbase),
    ("AlphaFold (85% higher than 70%)", af_85),
):
    print(f"{source_label}: {get_coverage(biogrid, structure_table, int_dtb_ids=list(biogrid_id_cols))}")

Total number of interactions: (719566, 719566, 719566)
Total number of proteins: (17100,)
PDB: (6914, 292081)
SWISS-MODEL: (7499, 233361)
ModBase: (8174, 239226)
AlphaFold (85% higher than 70%): (4587, 70455)


## IID

In [13]:
# IID summary: dataset totals first, then coverage per structure source.

# The two interactor-ID columns of this table. A fresh list is built from this
# tuple for every call so the helpers never share one mutable argument.
iid_id_cols = ('uniprot1', 'uniprot2')

# Sanity check: row count, row count without missing values, and row count
# after remove_duplicates, printed together as one tuple.
iid_sizes = (
    len(iid),
    len(iid.dropna()),
    len(remove_duplicates(iid, int_dtb_ids=list(iid_id_cols))),
)
print(f"Total number of interactions: {iid_sizes}")

# Distinct identifiers appearing in either interactor column.
iid_proteins = np.unique(iid[list(iid_id_cols)].to_numpy().flatten())
print(f"Total number of proteins: {iid_proteins.shape}")

# Per the notebook text, each coverage result is ordered as
# (num of proteins, num of interactions where both sides have structures).
for source_label, structure_table in (
    ("PDB", pdb),
    ("SWISS-MODEL", swissmodel),
    ("ModBase", modbase),
    ("AlphaFold (85% higher than 70%)", af_85),
):
    print(f"{source_label}: {get_coverage(iid, structure_table, int_dtb_ids=list(iid_id_cols))}")

Total number of interactions: (542157, 542157, 542157)
Total number of proteins: (17331,)
PDB: (7015, 235632)
SWISS-MODEL: (7636, 181337)
ModBase: (8250, 185030)
AlphaFold (85% higher than 70%): (4600, 53251)
