In [1]:
import sys
print('python', sys.version)

import numpy as np
print('numpy', np.__version__)

import pandas as pd
print('pandas', pd.__version__)

import matplotlib as mpl
print('matplotlib', mpl.__version__)

import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as sci
import glob
import networkx as nx
import Bio.KEGG.KGML.KGML_parser as keg

import pickle

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

python 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:27:36) [GCC 11.2.0]
numpy 1.26.4
pandas 2.2.3
matplotlib 3.10.0


In [2]:
# mapping human gene symbol to entrez
dat=pd.read_table('data/Sym2Entrez.txt',sep='\t')
dat.columns=['sym','entrez']
dat=dat.loc[dat['entrez'].notnull()]
dat=dat.loc[dat['sym'].notnull()]
dat['entrez']=dat['entrez'].astype(int).astype(str)
s2e_dic=dat.set_index('sym')['entrez']
e2s_dic=dat.set_index('entrez')['sym']

# prot_to_gene mapping

In [5]:
import gzip
import requests

# # 1. UniProt FTP URL
# url = "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz"
# local_gz = "data/HUMAN_9606_idmapping.dat.gz"

# # 2. file download
# print("Downloading from UniProt FTP...")
# with requests.get(url, stream=True) as r:
#     r.raise_for_status()
#     with open(local_gz, 'wb') as f:
#         for chunk in r.iter_content(chunk_size=8192):
#             _=f.write(chunk)
# print("Download complete.")

# # 3. UniProt ID â†” Entrez Gene ID
# mapping_file = "data/uniprot2entrez.tsv"
# with gzip.open(local_gz, 'rt') as f_in, open(mapping_file, 'w') as f_out:
#     for line in f_in:
#         parts = line.strip().split("\t")
#         if len(parts) >= 3 and parts[1] == "GeneID":
#             uniprot_id = parts[0]
#             entrez_id = parts[2]
#             _=f_out.write(f"{uniprot_id}\t{entrez_id}\n")

# print(f"Mapping table saved to {mapping_file}")

prot2gene=pd.read_table('data/uniprot2entrez.tsv', header=None)
prot2gene.columns=['prot','gene']
prot2gene_dic=prot2gene.set_index('prot')['gene']
prot2gene_dic
len(set(prot2gene_dic.index))
#pd.Series(prot2gene_dic.index).value_counts().head(20)

prot
P31946          7529
P62258          7531
Q04917          7533
P61981          7532
P31947          2810
               ...  
B4DWC1          3176
A0A1S5UZ17      9209
Q96HD8         57338
B4DN61        283298
A0A0S2Z4T5      9610
Name: gene, Length: 33969, dtype: int64

33678

prot
P62805        14
Q5JQC4        12
P68431        10
Q9ULZ0         8
Q0WX57         7
P62807         6
P86496         5
P86481         5
P86479         5
P86480         5
P86478         5
B2R4S9         5
P0C0S8         5
A4FTV9         5
P0DMU8         4
A1L429         4
P0DMV2         3
P0DMV0         3
Q9UEU5         3
A0A1S5UZ02     3
Name: count, dtype: int64

# omnipath and trrust network (only activation)

## omnipath network

In [9]:
import pandas as pd
import omnipath as op

def construct_ominpath_net(source):
    print('###### ', source)
    if source=='all':
        omni_ppi = op.interactions.AllInteractions.get(organism='human')
    if source=='omni':
        omni_ppi = op.interactions.OmniPath.get(organism='human')
        
    omni_ppi=omni_ppi[['source', 'target']].copy()
    print(omni_ppi.head(2))
    
    omni_net=nx.Graph()
    for ii in range(len(omni_ppi)):
        prot_a, prot_b=omni_ppi.iloc[ii]
    
        if not {prot_a, prot_b}.issubset(prot2gene_dic.index):
            continue
            
        for gene_a in prot2gene_dic.loc[[prot_a]]:
            for gene_b in prot2gene_dic.loc[[prot_b]]:
                omni_net.add_edge(str(gene_a), str(gene_b))
    
    print(len(omni_net.nodes()))
    print(len(omni_net.edges()))
    
    with open(f"./data_preproc/omnipath_{source}_ppi.pickle", 'wb') as file:
        pickle.dump(omni_net, file)

construct_ominpath_net('all')

######  all
   source  target
0  P0DP25  P48995
1  P0DP23  P48995
16394
183644


## string ppi

In [10]:
string_p2g=pd.read_table('data/9606.protein.aliases.v12.0.txt')
string_p2g=string_p2g.loc[string_p2g['source']=='Ensembl_HGNC_entrez_id']
string_p2g=string_p2g.set_index('#string_protein_id')['alias']
string_p2g = string_p2g[~string_p2g.index.duplicated(keep='first')]
string_p2g

string_ppi=pd.read_table('data/9606.protein.links.v12.0.txt',sep=' ')
string_ppi['combined_score']=string_ppi['combined_score'].astype(int)
string_sig_ppi=string_ppi.loc[string_ppi['combined_score']>=700]
string_sig_ppi.head()



string_sig_ppi_geneid=string_sig_ppi.copy()
string_sig_ppi_geneid['gene1']=string_sig_ppi_geneid['protein1'].map(string_p2g)
string_sig_ppi_geneid['gene2']=string_sig_ppi_geneid['protein2'].map(string_p2g)
string_sig_ppi_geneid=string_sig_ppi_geneid[['gene1','gene2']]

string_net=nx.Graph()
for ii in range(len(string_sig_ppi_geneid)):
    gene_a, gene_b=string_sig_ppi_geneid.iloc[ii]
    string_net.add_edge(str(gene_a), str(gene_b))

print(len(string_net.nodes()))
print(len(string_net.edges()))

with open("./data_preproc/string_ppi.pickle", 'wb') as file:
    pickle.dump(string_net, file)

#string_protein_id
9606.ENSP00000000233       381
9606.ENSP00000000412      4074
9606.ENSP00000001008      2288
9606.ENSP00000001146     56603
9606.ENSP00000002125     55471
                         ...  
9606.ENSP00000501254       222
9606.ENSP00000501259    642968
9606.ENSP00000501265    284434
9606.ENSP00000501277      8861
9606.ENSP00000501317     64864
Name: alias, Length: 19197, dtype: object

Unnamed: 0,protein1,protein2,combined_score
85,9606.ENSP00000000233,9606.ENSP00000158762,825
130,9606.ENSP00000000233,9606.ENSP00000357048,718
160,9606.ENSP00000000233,9606.ENSP00000262305,952
197,9606.ENSP00000000233,9606.ENSP00000329419,752
268,9606.ENSP00000000233,9606.ENSP00000469035,795


15970
235130
