In [1]:
import requests
from tqdm import tqdm
import pandas as pd
import id_conversion
from IPython.display import display

# APID (apid2hgnc.py)

In [5]:
apid = pd.read_csv(
    "../../data/raw/9606_noISI_Q2.txt",
      sep='\t', header=0, dtype={'UniprotID_A': 'string', 'UniprotID_B': 'string', 'GeneName_A':'string','GeneName_B':'string'}
      )
display(apid.head(5))
hgnc_symbols = pd.read_csv(
    "../../data/raw/hgnc_complete_set.tsv",
      sep='\t', header=0, dtype='string'
      )
hgnc_symbols.head(2)

Unnamed: 0,InteractionID,UniprotID_A,UniprotName_A,GeneName_A,UniprotID_B,UniprotName_B,GeneName_B,ExpEvidences,Methods,Publications,3DStructures,CurationEvents
0,1205000,Q14160,SCRIB_HUMAN,SCRIB,B7Z2Y1,B7Z2Y1_HUMAN,,1,1,1,0,3
1,1205001,Q14160,SCRIB_HUMAN,SCRIB,Q14155,ARHG7_HUMAN,ARHGEF7,11,8,8,0,20
2,1205002,Q14160,SCRIB_HUMAN,SCRIB,Q7Z628,ARHG8_HUMAN,NET1,2,2,2,0,2
3,1205003,P22460,KCNA5_HUMAN,KCNA5,Q14160,SCRIB_HUMAN,SCRIB,1,1,1,0,2
4,1205004,Q96DN2,VWCE_HUMAN,VWCE,Q14160,SCRIB_HUMAN,SCRIB,1,1,1,0,1


Unnamed: 0,hgnc_id,symbol,name,locus_group,locus_type,status,location,location_sortable,alias_symbol,alias_name,...,cd,lncrnadb,enzyme_id,intermediate_filament_db,rna_central_ids,lncipedia,gtrnadb,agr,mane_select,gencc
0,HGNC:5,A1BG,alpha-1-B glycoprotein,protein-coding gene,gene with protein product,Approved,19q13.43,19q13.43,,,...,,,,,,,,HGNC:5,ENST00000263100.8|NM_130786.4,
1,HGNC:37133,A1BG-AS1,A1BG antisense RNA 1,non-coding RNA,"RNA, long non-coding",Approved,19q13.43,19q13.43,FLJ23569,,...,,,,,,A1BG-AS1,,HGNC:37133,,


In [6]:
# all genes in apid
genenames = set(list(apid['GeneName_A'].dropna().unique()) + list(apid['GeneName_B'].dropna().unique()))
# all genes with approved HGNC symbol
hgnc_ids = list(hgnc_symbols['symbol'].unique())
# apid genes not found in hgnc table
missing_apid_ids = [protein for protein in genenames if protein not in hgnc_ids]

In [7]:
# search for deprecated or alternative symbols
resolved_ids = {}
not_found = []
for protein in tqdm(missing_apid_ids):

    for id_type in ['alias_symbol', 'prev_symbol']:
        new_symbol = hgnc_symbols.loc[hgnc_symbols[id_type].str.contains(protein), 'symbol'].values
        if len(new_symbol) > 0:
            resolved_ids[protein] = new_symbol[0]
            break
    else:
        not_found.append(protein)
print(f'{len(not_found)} symbols were not found.' )

100%|██████████| 751/751 [00:08<00:00, 93.21it/s]

105 symbols were not found.





In [8]:
a = apid[['GeneName_A', 'UniprotID_A']].rename(columns={'GeneName_A': 'GeneName', 'UniprotID_A': 'UniprotID'})
b = apid[['GeneName_B', 'UniprotID_B']].rename(columns={'GeneName_B': 'GeneName', 'UniprotID_B': 'UniprotID'})
uniprot_ids = pd.concat([a, b]).drop_duplicates()
not_found_uniprot_ids = uniprot_ids.loc[uniprot_ids.GeneName.isin(not_found)].to_dict('records')

In [9]:
# search in Uniprot IDs
remaining = []
for protein in tqdm(not_found_uniprot_ids):

    new_symbol = hgnc_symbols.loc[hgnc_symbols['uniprot_ids'] == protein['UniprotID'], 'symbol'].values
    if len(new_symbol) > 0:
        resolved_ids[protein['GeneName']] = new_symbol[0]
        
    else:
        remaining.append(protein['GeneName'])
        
print(f'{len(remaining)} symbols were not found.' )

100%|██████████| 108/108 [00:00<00:00, 582.36it/s]

104 symbols were not found.





In [10]:
# replace alternative IDs with Approved IDs
apid_graph = apid[['GeneName_A', 'GeneName_B']].replace(resolved_ids).dropna(how='any')

In [11]:
# remove IDs that could not be converted
apid_graph = apid_graph[
    ~apid_graph['GeneName_A'].isin(remaining)&
    ~apid_graph['GeneName_B'].isin(remaining)
    ].rename(columns={'GeneName_A': 'protein_A', 'GeneName_B': 'protein_B'})

In [12]:
apid_graph.to_csv('../../data/interim/apid_graph.csv', header=-1, index=False)

# HuRI (ensembl2hgnc.py)

In [13]:
huri = pd.read_csv("../../data/raw/HuRI.tsv", sep='\t',header=None, names=['ENSG_A', 'ENSG_B'])
print(huri.shape[0])
huri.head(2)

52548


Unnamed: 0,ENSG_A,ENSG_B
0,ENSG00000000005,ENSG00000061656
1,ENSG00000000005,ENSG00000099968


In [15]:
huri_hgnc = pd.merge(
    huri, 
    hgnc_symbols[['ensembl_gene_id', 'symbol']].set_index('ensembl_gene_id'),
    how='left',
    left_on='ENSG_A',
    right_on='ensembl_gene_id'
    )
huri_hgnc = pd.merge(
    huri_hgnc, 
    hgnc_symbols[['ensembl_gene_id', 'symbol']].set_index('ensembl_gene_id'),
    how='left',
    left_on='ENSG_B',
    right_on='ensembl_gene_id',
    suffixes=['_A', '_B']
    )

huri_hgnc.rename(columns={'symbol_A': 'protein_A', 'symbol_B': 'protein_B'}, inplace=True)

In [16]:
huri_graph = huri_hgnc[['protein_A', 'protein_B']].dropna()

In [17]:
huri_graph.to_csv('../../data/interim/huri_graph.csv', index=False)

# Reactome

In [19]:
reactome = pd.read_csv(
    "../../data/raw/NCBI2ReactomeReactions.txt", sep='\t', header=None, dtype='string',
      names = ['NCBI_ID', 'Reactome_ID', 'URL', 'Event', 'Evidence_Code', 'Species']
      )
reactome = reactome[reactome['Species'] == 'Homo sapiens']
print(reactome.shape[0])
reactome.head(2)

112796


Unnamed: 0,NCBI_ID,Reactome_ID,URL,Event,Evidence_Code,Species
0,1,R-HSA-481007,https://reactome.org/PathwayBrowser/#/R-HSA-48...,Exocytosis of platelet alpha granule contents,TAS,Homo sapiens
1,1,R-HSA-6798748,https://reactome.org/PathwayBrowser/#/R-HSA-67...,Exocytosis of secretory granule lumen proteins,TAS,Homo sapiens


In [20]:
# convert entrez IDs (NCBI) to HGNC Gene Symbols
reactome_hgnc = pd.merge(
    reactome, hgnc_symbols[['entrez_id', 'symbol']].set_index('entrez_id'),
    left_on='NCBI_ID', right_on='entrez_id'
    ).rename(columns={'symbol': 'protein_id'})

In [21]:
# keep only reactome modules with sizes between 50 and 300 proteins
reactome_final = reactome_hgnc.groupby('Reactome_ID').filter(lambda x: 50 <= len(x) <= 300)
reactome_final = reactome_final[['Reactome_ID', 'protein_id', 'Event']]

In [22]:
reactome_final.to_csv('../../data/interim/reactome_reactions.csv', index=False)

# Disgenet

In [23]:
disgenet = pd.read_csv(
    "../../data/raw/curated_gene_disease_associations.tsv", sep='\t', header=0)

print(disgenet.shape[0])
disgenet.head(2)

84038


Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
0,1,A1BG,0.7,0.538,C0019209,Hepatomegaly,phenotype,C23;C06,Finding,0.3,1.0,2017.0,2017.0,1,0,CTD_human
1,1,A1BG,0.7,0.538,C0036341,Schizophrenia,disease,F03,Mental or Behavioral Dysfunction,0.3,1.0,2015.0,2015.0,1,0,CTD_human


In [24]:
# all genes in apid
disgenet_symbols = list(disgenet.geneSymbol.unique())
# all genes with approved HGNC symbol
hgnc_ids = list(hgnc_symbols['symbol'].unique())
# apid genes not found in hgnc table
missing_disgenet_ids = [protein for protein in disgenet_symbols if protein not in hgnc_ids]

In [25]:
# search for deprecated or alternative symbols
resolved_ids = {}
not_found = []
for protein in tqdm(missing_disgenet_ids):

    for id_type in ['alias_symbol', 'prev_symbol']:
        new_symbol = hgnc_symbols.loc[hgnc_symbols[id_type].str.contains(protein), 'symbol'].values
        if len(new_symbol) > 0:
            resolved_ids[protein] = new_symbol[0]
            break
    else:
        not_found.append(protein)
        
print(f'{len(not_found)} symbols were not found.' )

100%|██████████| 146/146 [00:01<00:00, 92.62it/s]

60 symbols were not found.





In [28]:
# replace alternative IDs with Approved IDs
disgenet_hgnc = disgenet.copy()
disgenet_hgnc['geneSymbol'] = disgenet_hgnc.geneSymbol.replace(resolved_ids)

In [29]:
# remove IDs that could not be converted
disgenet_hgnc = disgenet_hgnc.loc[
    ~disgenet_hgnc['geneSymbol'].isin(not_found),
    ['geneId', 'geneSymbol', 'diseaseId', 'diseaseName', 'diseaseType', 'score']
    ].rename(columns={'geneId': 'entrez_id', 'geneSymbol': 'protein_id'})

In [30]:
disgenet_hgnc.to_csv('../../data/interim/disgenet.csv', index=False)

# PPI Integration

In [31]:
apid = pd.read_csv("../../data/interim/apid_graph.csv")
print(apid.shape[0])
display(apid.head(2))

huri = pd.read_csv("../../data/interim/huri_graph.csv")
print(huri.shape[0])
display(huri.head(2))

263595


Unnamed: 0,protein_A,protein_B
0,SCRIB,ARHGEF7
1,SCRIB,NET1


52225


Unnamed: 0,protein_A,protein_B
0,TNMD,SPAG4
1,TNMD,BCL2L13


In [32]:
apid_huri = pd.concat([apid, huri]).drop_duplicates()

In [33]:
apid_huri.to_csv('../../data/processed/ppis/apid_huri_graph.csv', index=False)