In [None]:
import numpy as np
import pandas as pd
import os
import sys
from tqdm.notebook import tqdm
import pickle as pkl
from re import search
from itertools import combinations

### Define paths

In [None]:
# ---- Input files ----
STRING_DATA = 'data/STRING/9606.protein.links.full.v11.0.txt'
HURI_DATA = 'data/HuRI/HuRI.tsv'
PARALOG_DATA = 'data/DGD/duplicate_genes_Hsapiens.tsv'

# ---- Output files ----
# The folders named below have to exist before the notebook is run.
GENE_INFO = 'data/NCBI/gene_map.csv'
SIGNOR_DATA = 'data/SIGNOR/geneID_interactions.pkl'
# see uniprot_enrez.csv in supp.mat.
PANTHER_PARALOGS = 'data/PANTHER/paralogs-GeneID'
# Pickled Ensembl -> Entrez lookups (protein IDs and gene IDs respectively)
ENSP_ENTREZ_MAP = 'data/ens_entrez_maps/ensp_entrez_mapping.pkl'
ENSG_ENTREZ_MAP = 'data/ens_entrez_maps/ensg_entrez_mapping.pkl'

### Load STRING database

In [None]:
# The STRING links file is parsed as a space-separated table; preview the first rows.
string_data = pd.read_csv(STRING_DATA, sep=' ')
string_data.head()

### Load HuRI database

In [None]:
# The HuRI file is tab-separated and is read without a header row; preview the first rows.
huri_data = pd.read_csv(HURI_DATA, sep='\t', header=None)
huri_data.head()

### Load duplicate gene data

In [None]:
# The duplicate-gene table is tab-separated; preview the first rows.
dup_gens = pd.read_csv(PARALOG_DATA, sep='\t')
dup_gens.head()

In [None]:
# Run to obtain paralog file: awk -F '[=|\t]' '{if ($1 ~ /HUMAN/ && $11 == "P") print $5, $10}' file > output
# Use the above bash code to extract the paralogs list from the Panther database
# Then convert to entrez gene ids using the gene_id_map below
# This file is also included in the supplementary files as gene_map.csv

In [None]:
# Used to update the PANTHER paralog list to only include actual gene paralogs:
# an entry of `panther_map` is kept only if at least one of its paralog partners
# is itself present in `panther_map`; all other entries are collected in `wrong`.
# NOTE(review): `panther_map` and `paralogs` are not created in any visible cell --
# they have to be loaded before this cell is run.

# Build the lookup once instead of scanning the uniprot column for every partner.
mapped_uniprots = set(panther_map.uniprot.dropna())

wrong = []
for GeneID, uniprot in tqdm(panther_map.iterrows(), total=len(panther_map)):
    if type(uniprot) != str:
        # iterrows() yields the row as a Series; its first value is used as the ID
        uniprot = uniprot.values[0]
    # Partners of `uniprot`, whichever of the two columns it appears in.
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    partners = pd.concat([
        paralogs.loc[paralogs['paralog'] == uniprot, 'gene'],
        paralogs.loc[paralogs['gene'] == uniprot, 'paralog'],
    ])
    exists = any(i != uniprot and i in mapped_uniprots for i in partners)
    if not exists:
        wrong.append(uniprot)
len(set(wrong))
# panther_map.loc[~panther_map.uniprot.isin(wrong)].to_csv(PANTHER_MAP, sep=' ')

In [None]:
# Sorry if the following is a bit messy...

In [None]:
### Used to create GeneInfo mapping ###
gen_inf = 'data/NCBI/gene_info'
ens_inf = 'data/NCBI/gene2ensembl'

# Both NCBI files are read as tab-separated tables.
gen_inf_df = pd.read_csv(gen_inf, sep='\t')
ens_inf_df = pd.read_csv(ens_inf, sep='\t')

# Join symbols with Ensembl gene/protein identifiers on GeneID
# (merge's default join, so only GeneIDs present in both tables remain).
gene_id_map = gen_inf_df[['GeneID', 'Symbol']].merge(
    ens_inf_df[['GeneID', 'Ensembl_gene_identifier', 'Ensembl_protein_identifier']],
    on='GeneID',
)

# Persist the combined table.
gene_id_map.to_csv(GENE_INFO)

In [None]:
def ens_to_entrez(ensID):
    """Translate an Ensembl identifier into an Entrez GeneID.

    Relies on two module-level names that the calling cell must set up first:
    ``ens_id_map`` (a Series of GeneIDs indexed by Ensembl identifier) and
    ``not_in_db`` (a list collecting identifiers for which nothing was found).

    Lookup order:
      1. exact match on the index of ``ens_id_map``;
      2. index entries that contain ``ensID`` as a literal substring;
      3. if ``ensID`` contains a '.', retry with the text before the first '.'
         (the identifier is printed when this fallback is taken).

    Parameters
    ----------
    ensID : str
        Identifier to look up.

    Returns
    -------
    int
        The matching GeneID. When several entries match, the identifier and all
        candidates are printed and the first candidate is returned. When nothing
        matches, ``ensID`` is appended to ``not_in_db`` and -1 is returned.
    """
    if ensID in ens_id_map.index:
        # A scalar for a unique label, a Series when the label is duplicated.
        entrez = ens_id_map.loc[ensID]
    else:
        # regex=False: the identifier is matched literally, so '.' does not act
        # as a wildcard and characters such as '(' cannot raise a regex error.
        partial = ens_id_map.index.str.contains(ensID, regex=False, na=False)
        entrez = ens_id_map.loc[partial].values
        if entrez.size == 0:
            if '.' in ensID:
                print(ensID)
                return ens_to_entrez(ensID.split('.')[0])
            not_in_db.append(ensID)
            return -1

    # Normalise scalar / Series / ndarray to a flat array so the first candidate
    # is always picked by position, never by label.
    candidates = np.asarray(entrez).ravel()
    if candidates.size > 1:
        # Ambiguous lookup: report it, then fall back to the first candidate.
        print(ensID)
        print(entrez)
    return int(candidates[0])

In [None]:
# Map every Ensembl gene ID of the duplicate-gene table to an Entrez GeneID.
dup_gens = pd.read_csv(PARALOG_DATA, sep='\t')

# Globals read by ens_to_entrez: one GeneID per Ensembl gene ID (first row wins)
# and the list that collects IDs it cannot resolve.
ens_id_map = (
    gene_id_map
    .set_index('Ensembl_gene_identifier')['GeneID']
    .groupby(level=0)
    .first()
)
not_in_db = []
ens_ids = np.unique(dup_gens['ENS_ID'].values)

# NOTE(review): ensg_entrez_map is not created in any visible cell -- it has to
# exist before this loop runs.
for ens_id in tqdm(ens_ids):
    if ens_id in ensg_entrez_map:
        continue  # already mapped
    ensg_entrez_map[ens_id] = ens_to_entrez(ens_id)

# with open(ENSG_ENTREZ_MAP, 'wb') as f:
#     pkl.dump(ensg_entrez_map, f)

In [None]:
# Map every Ensembl ID that occurs in HuRI to an Entrez GeneID.
huri_data = pd.read_csv(HURI_DATA, sep='\t', header=None).values

# Globals read by ens_to_entrez: one GeneID per Ensembl gene ID (first row wins)
# and the list that collects IDs it cannot resolve.
ens_id_map = (
    gene_id_map
    .set_index('Ensembl_gene_identifier')['GeneID']
    .groupby(level=0)
    .first()
)
not_in_db = []
ens_ids = np.unique(huri_data.ravel())

# NOTE(review): ensg_entrez_map is not created in any visible cell -- it has to
# exist before this loop runs.
for ens_id in tqdm(ens_ids):
    if ens_id in ensg_entrez_map:
        continue  # already mapped
    ensg_entrez_map[ens_id] = ens_to_entrez(ens_id)

# Write the unresolved IDs to disk, one per line.
with open('not_in_db.txt', 'w') as f:
    f.writelines("%s\n" % missing_id for missing_id in not_in_db)

# with open(ENSG_ENTREZ_MAP, 'wb') as f:
#     pkl.dump(ensg_entrez_map, f)

In [None]:
# Merge the extra ID mappings stored in not_in_db_map.pkl into ensg_entrez_map.
# NOTE(review): presumably these cover the IDs listed in not_in_db.txt -- confirm.
# SECURITY: pickle.load can execute arbitrary code; only load a file you created
# yourself or otherwise trust.
with open('data/ens_entrez_maps/not_in_db_map.pkl', 'rb') as f:
    ncbi_ids = pkl.load(f)
ensg_entrez_map.update(ncbi_ids)

In [None]:
# Map every protein ID that occurs in the STRING interactions to an Entrez GeneID.
# NOTE(review): prot_interactions and ensp_entrez_map are not created in any
# visible cell -- they have to exist before this cell runs.
protein_columns = prot_interactions[['protein1', 'protein2']]
ens_ids = np.unique(protein_columns.values.ravel())

# Globals read by ens_to_entrez: GeneID indexed by Ensembl protein ID, with the
# rows whose protein identifier is '-' dropped, and the list of unresolved IDs.
ens_id_map = (
    gene_id_map
    .set_index('Ensembl_protein_identifier')
    .drop('-')
    ['GeneID']
)
not_in_db = []

for ens_id in tqdm(ens_ids):
    if ens_id in ensp_entrez_map:
        continue  # already mapped
    ensp_entrez_map[ens_id] = ens_to_entrez(ens_id)

# Write the unresolved IDs to disk, one per line.
with open('not_in_db.txt', 'w') as f:
    f.writelines("%s\n" % missing_id for missing_id in not_in_db)

# with open(ENSP_ENTREZ_MAP, 'wb') as f:
#     pkl.dump(ensp_entrez_map, f)

In [None]:
# To obtain the SIGNOR interactions in Entrez GeneID format, the following was done:

In [None]:
# The possible protein complexes were split into individual interactions (rows)

# See: https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows
def explode(df, lst_cols, fill_value='', preserve_index=False):
    """Turn list-valued cells into one row per list element.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    lst_cols : str or list-like of str
        Column(s) whose cells hold lists. The row multiplicity is taken from the
        first of these columns, so all of them must hold lists of equal length
        row by row.
    fill_value : scalar, default ''
        Rows whose list is empty are kept as a single row; the missing values
        that this creates are replaced by ``fill_value``.
    preserve_index : bool, default False
        Keep the (repeated) original index labels instead of a fresh 0..n-1 index.

    Returns
    -------
    pandas.DataFrame
        The non-list columns (in sorted name order) followed by ``lst_cols``,
        with rows ordered by the original index.
    """
    # make sure `lst_cols` is list-alike
    if (lst_cols is not None and len(lst_cols) > 0 and
            not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)
    # number of list elements per row
    lens = df[lst_cols[0]].str.len()
    non_empty = lens > 0
    # original index labels, repeated once per list element
    idx = np.repeat(df.index.values, lens)

    def _flatten(col):
        """Concatenate the non-empty lists of `col` into one flat array."""
        cells = df.loc[non_empty, col].values
        # np.concatenate raises on an empty sequence, so guard that case
        return np.concatenate(cells) if len(cells) else np.array([], dtype=object)

    # create the "exploded" frame
    res = (pd.DataFrame({col: np.repeat(df[col].values, lens) for col in idx_cols},
                        index=idx)
             .assign(**{col: _flatten(col) for col in lst_cols}))
    # re-attach the rows that have empty lists (np.repeat dropped them above)
    if (lens == 0).any():
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
        res = (pd.concat([res, df.loc[lens == 0, idx_cols]], sort=False)
                 .fillna(fill_value))
    # restore the original row order; a stable sort keeps the elements of each
    # list in their original order
    res = res.sort_index(kind='mergesort')
    # reset index if requested
    if not preserve_index:
        res = res.reset_index(drop=True)
    return res

def split_complexes(df, cols, sep='/'):
    """Split `sep`-joined entries in `cols` and give each part its own row.

    Every column in `cols` is first split on `sep` into a list, then the
    columns are exploded one after another, so a row with several parts in
    more than one column yields every combination of those parts.
    """
    as_lists = {name: df[name].str.split(sep) for name in cols}
    result = df.assign(**as_lists)
    for name in cols:
        result = explode(result, [name])
    return result

In [32]:
# Unique (IDA, ENTITYA, IDB, ENTITYB) rows, with '/'-joined entity names split
# into one row per combination of members.
# NOTE(review): signor_data is not created in any visible cell.
signor_interactions = split_complexes(
    signor_data[['IDA', 'ENTITYA', 'IDB', 'ENTITYB']].drop_duplicates(),
    ['ENTITYA', 'ENTITYB'],
)
signor_interactions
# interactions = set()
# for _, row in tqdm(signor_interactions.iterrows(), total=len(signor_interactions)):
#     if row.ENTITYA in name_id_map.index and row.ENTITYB in name_id_map.index:
#         interactions.update([(min(i,j), max(i,j)) for i in name_id_map.loc[row.ENTITYA].values for j in name_id_map.loc[row.ENTITYB].values])
# print(len(interactions))
# interactions = set(interactions)
# print(len(interactions))

Unnamed: 0,ENTITYA,IDA,IDB,ENTITYB
0,PDPK1,O15530,O15530,PDPK1
1,PHLPP1,O60346,P31749,AKT1
2,OXGR1,Q96P68,P08754,GNAI3
3,ACVR1B,P36896,P69905,HBA1
4,GPR119,Q8TDV5,P08754,GNAI3
...,...,...,...,...
18064,NEDD4L,Q96PU5,Q9UI33,SCN11A
18065,BRCA1,P38398,P07339,CTSD
18066,CREB1,P16220,P98177,FOXO4
18067,P2RY13,Q9BPV8,P30679,GNA15


In [None]:
# The following are some experiments on converting the gene names to geneIDs

In [31]:
# How many distinct SIGNOR entity names are also NCBI gene symbols?
signor_names = set(signor_interactions[['ENTITYA', 'ENTITYB']].values.flatten())
names_with_symbol = signor_names.intersection(gene_id_map.Symbol.values)

print(len(signor_names))       # distinct entity names
print(len(names_with_symbol))  # ... of which are known gene symbols

print(len(names_with_symbol)/len(signor_names))  # fraction covered

5696
4290
0.7531601123595506


In [93]:
# pd.set_option("display.max_columns", 30)

# --- Entity names that are also NCBI gene symbols ---
symbols = gene_id_map.Symbol.values
signor_names = set(signor_interactions[['ENTITYA', 'ENTITYB']].values.flatten())
names_with_symbol = signor_names.intersection(symbols)

print(len(signor_names))
print(len(names_with_symbol))

print(len(names_with_symbol)/len(signor_names))
print()

# --- Rows / identifiers that come from UniProt ---
# NOTE(review): DATABASEA / DATABASEB are not among the columns kept by the
# visible cell that builds signor_interactions -- this cell needs a
# signor_interactions that still carries them.
a_is_uniprot = signor_interactions.DATABASEA == 'UNIPROT'
b_is_uniprot = signor_interactions.DATABASEB == 'UNIPROT'

print(len(signor_interactions.loc[a_is_uniprot | b_is_uniprot]))
uniprot_ids = set(signor_interactions.loc[a_is_uniprot & b_is_uniprot, ['IDA', 'IDB']].values.flatten())
print(len(uniprot_ids))
print(len(uniprot_ids.intersection(paralogs.uniprot)))


print()
# print(len(set(signor_interactions[['ENTITYA', 'ENTITYB']].values.flatten()).intersection([i.split()[0] for i, _ in list(ncbi_gene_names.values())])))
# print(signor_interactions.columns)
# signor_interactions

# --- Interactions whose two sides can both be resolved ---
print(len(signor_interactions))
a_has_symbol = signor_interactions.ENTITYA.isin(symbols)
b_has_symbol = signor_interactions.ENTITYB.isin(symbols)
both_have_symbol = signor_interactions.loc[a_has_symbol & b_has_symbol]
print(len(both_have_symbol))

print(len(both_have_symbol)/len(signor_interactions))

# resolvable through a gene symbol or through a UniProt ID listed in `paralogs`
a_in_paralogs = signor_interactions.IDA.isin(paralogs.uniprot)
b_in_paralogs = signor_interactions.IDB.isin(paralogs.uniprot)
print(len(signor_interactions.loc[(a_has_symbol | a_in_paralogs) & (b_has_symbol | b_in_paralogs)]))
print(len(signor_interactions.loc[a_in_paralogs & b_in_paralogs]))

5746
4282
0.7452140619561434

22672
4019
1991

23146
16683
0.7207724876868573
16698
4561
