In [22]:
import numpy as np
import pandas as pd

## Get the coip and y2h networks and get all the relevant proteins

In [23]:
# Both edge lists are tab-separated with no header row, so pandas labels the
# columns 0, 1, 2.
tsv_options = dict(sep = "\t", header = None)
dc = pd.read_csv("../data/networks/coip_hc_full.tsv", **tsv_options)
dy = pd.read_csv("../data/networks/y2h_hc_full.tsv", **tsv_options)
dc

Unnamed: 0,0,1,2
0,uniprotkb:A0A0B4J1S8,uniprotkb:Q9H3P7,0.57
1,uniprotkb:Q9H3P7,uniprotkb:O43493,0.55
2,uniprotkb:Q9H3P7,uniprotkb:Q8WUA7,0.64
3,uniprotkb:Q9H3P7,uniprotkb:Q08378,0.40
4,uniprotkb:Q9H3P7,uniprotkb:Q9UBF8,0.54
...,...,...,...
29228,uniprotkb:Q9Y3C0,uniprotkb:Q9Y2V7,0.67
29229,uniprotkb:Q9Y3D3,uniprotkb:Q9Y4X4,0.42
29230,uniprotkb:Q9Y4X4,uniprotkb:Q8NCR0,0.42
29231,uniprotkb:Q9Y6I4,uniprotkb:Q8N5D0,0.40


In [24]:
# Collect every identifier that appears in column 0 or column 1 of either
# network, then report how many distinct ones there are.
all_nodes = set()
for edges in (dc, dy):
    all_nodes.update(edges[0], edges[1])
len(all_nodes)

11526

## Get the FASTA 25k sequences

In [25]:
from Bio import SeqIO

In [26]:
# Identifier mapping table (tab-separated, first row is the header).
namespace = pd.read_csv(
    "../data/rsingh/Biomart_Entrez-to-Uniprot_mapping.tsv",
    sep = "\t",
)
# Transpose the preview so each mapping column is shown as a row.
namespace.head().transpose()

Unnamed: 0,0,1,2,3,4
Gene stable ID,ENSG00000198888,ENSG00000198888,ENSG00000198888,ENSG00000198763,ENSG00000198763
Gene stable ID version,ENSG00000198888.2,ENSG00000198888.2,ENSG00000198888.2,ENSG00000198763.3,ENSG00000198763.3
UniProtKB/TrEMBL ID,U5Z754,U5Z754,U5Z754,A0A1X7RBG6,A0A1X7RBG6
UniProtKB/Swiss-Prot ID,P03886,P03886,P03886,P03891,P03891
Gene Synonym,MTND1,NAD1,ND1,MTND2,NAD2
NCBI gene (formerly Entrezgene) ID,4535.0,4535.0,4535.0,4536.0,4536.0
Gene name,MT-ND1,MT-ND1,MT-ND1,MT-ND2,MT-ND2


## This shows that almost all of the proteins in the two networks are Swiss-Prot proteins

In [27]:
def _with_prefix(accessions):
    """Return the accessions rewritten in the networks' "uniprotkb:<id>" form."""
    return {f"uniprotkb:{p}" for p in accessions}

# Distinct non-null accessions from the TrEMBL and Swiss-Prot mapping columns.
trembl = set(namespace["UniProtKB/TrEMBL ID"].dropna())
sprot  = set(namespace["UniProtKB/Swiss-Prot ID"].dropna())
trembl1 = _with_prefix(trembl)
sprot1  = _with_prefix(sprot)

# Number of network proteins that are Swiss-Prot accessions.
len(all_nodes & sprot1)

11375

In [28]:
# Records whose sequence length is >= this value are left out.
MAX_SEQ_LEN = 800

records = SeqIO.parse("../data/networks/seqs/uniprot_human_proteome_UP000005640_20k_genes.fa", "fasta")

sp_record = []   # retained records, with their id rewritten to "uniprotkb:<accession>"
sp_prots  = []   # the rewritten ids, in the same order as sp_record
for record in records:
    # Record ids are expected to look like "<db>|<accession>|<entry>"; the
    # unpacking raises ValueError for an id with fewer than three fields.
    # (A dedicated name is used instead of `id` so the builtin is not shadowed
    # for the rest of the kernel session.)
    db, accession, _ = record.id.split("|")[:3]
    uprotname = f"uniprotkb:{accession}"
    # Keep only "sp" entries that occur in the networks and are short enough.
    if db == "sp" and uprotname in all_nodes and len(record) < MAX_SEQ_LEN:
        # Rename in place so the FASTA written later uses the network identifiers.
        record.id = uprotname
        sp_prots.append(uprotname)
        sp_record.append(record)

## Filter the original networks and save

In [29]:
# Set membership makes the isin() lookups below cheap and drops repeated ids.
sp_prots = set(sp_prots)

# An edge survives only when BOTH endpoints have a retained sequence.
coip_keep = dc[0].isin(sp_prots) & dc[1].isin(sp_prots)
y2h_keep  = dy[0].isin(sp_prots) & dy[1].isin(sp_prots)
dc1 = dc[coip_keep]
dy1 = dy[y2h_keep]

# Write the filtered networks as headerless, index-free TSV files.
for filtered, destination in (
    (dc1, "../data/networks/dscript-tt/coip_hc_full.tsv"),
    (dy1, "../data/networks/dscript-tt/y2h_hc_full.tsv"),
):
    filtered.to_csv(destination, sep = "\t", header = None, index = None)

## Save the fasta file

In [30]:
# Write the retained records as FASTA; the call's return value is shown as the cell output.
fasta_out = "../data/networks/dscript-tt/y2h-coip-human-sequences.fasta"
SeqIO.write(sp_record, fasta_out, "fasta")

9198

## Compute test and training cases

In [31]:
def _canonical_pairs(edges):
    """Return the two endpoint columns of `edges` with each row ordered so that column 0 <= column 1.

    Column 2 is dropped. The swap assigns raw values (`.to_numpy()`) on purpose:
    assigning a DataFrame through `.loc` aligns on column labels first, which
    silently undoes the reordering and leaves the rows unchanged.
    """
    pairs = edges.drop([2], axis = 1)
    swap  = pairs[0] > pairs[1]
    pairs.loc[swap, [0, 1]] = pairs.loc[swap, [1, 0]].to_numpy()
    return pairs

# Endpoint pairs in sorted order, plus the list of distinct proteins per network.
dc2     = _canonical_pairs(dc1)
dcnodes = list(set(dc1[0]).union(set(dc1[1])))

dy2     = _canonical_pairs(dy1)
dynodes = list(set(dy1[0]).union(set(dy1[1])))

In [32]:
# Map each (column 0, column 1) pair to its row position; when a pair occurs
# more than once the last position wins.
dcedges = dict(zip(zip(dc2[0], dc2[1]), range(len(dc2))))
dyedges = dict(zip(zip(dy2[0], dy2[1]), range(len(dy2))))

In [33]:
import numpy as np

# Positive examples: the endpoint pairs of each network as plain arrays.
dcpos, dypos = dc2.values, dy2.values

## Split to train-test samples, generate negatives

In [34]:
def getnegatives(edgemap, nodes, negratio = 10):
    """Randomly sample node pairs that are not edges of the network.

    Parameters
    ----------
    edgemap : dict (or any sized iterable) whose keys are 2-tuples of node ids
        The known edges. An edge blocks its pair in either orientation.
    nodes : sequence of node ids
        Pool the pair endpoints are drawn from (uniformly, via `np.random`).
    negratio : number, default 10
        Negatives requested per known edge.

    Returns
    -------
    list of (n1, n2) tuples with n1 < n2, of length `len(edgemap) * negratio`,
    containing no repeated pair and no pair present in `edgemap`.

    Raises
    ------
    ValueError
        If fewer distinct non-edge pairs exist than were requested (the
        sampling loop could otherwise never finish).
    """
    n_nodes    = len(nodes)
    n_negative = len(edgemap) * negratio

    # Pairs that may not be emitted, stored in sorted order. Seeded with the
    # known edges and extended with every negative as it is accepted, so no
    # pair is returned twice.
    node_set = set(nodes)
    taken    = set()
    for a, b in edgemap:
        if a != b and a in node_set and b in node_set:
            taken.add((b, a) if a > b else (a, b))

    n_unique  = len(node_set)
    available = n_unique * (n_unique - 1) // 2 - len(taken)
    if n_negative > available:
        raise ValueError(
            f"requested {n_negative} negative pairs but only {available} distinct non-edge pairs exist"
        )

    negedges = []
    while len(negedges) < n_negative:
        p, q   = np.random.randint(n_nodes, size = 2)
        n1, n2 = nodes[p], nodes[q]
        if n1 == n2:
            # Same node drawn twice (or `nodes` holds a repeated id).
            continue
        if n1 > n2:
            n1, n2 = n2, n1
        pair = (n1, n2)
        if pair in taken:
            continue
        taken.add(pair)
        negedges.append(pair)
    return negedges

In [35]:
# Sample negatives for each network (default ratio), coip first and then y2h.
dcneg, dyneg = (
    getnegatives(known_edges, node_pool)
    for known_edges, node_pool in ((dcedges, dcnodes), (dyedges, dynodes))
)

In [36]:
# Number of sampled negatives per network: (coip, y2h).
tuple(map(len, (dcneg, dyneg)))

(174420, 306860)

# Generate Train and Test in 4:1 ratio

In [37]:
def _with_label(pairs, label):
    """Return the first two columns of `pairs` with a constant label column appended.

    Only the first two columns are read, so re-running this cell on arrays that
    already carry a label column replaces that column instead of adding another.
    """
    endpoints = np.asarray(pairs)[:, :2]
    labels    = np.full((len(endpoints), 1), float(label))
    return np.concatenate((endpoints, labels), axis = 1)

# Positives get label 1, sampled negatives get label 0.
dcpos = _with_label(dcpos, 1)
dypos = _with_label(dypos, 1)
dcneg = _with_label(dcneg, 0)
dyneg = _with_label(dyneg, 0)
dyneg

array([['uniprotkb:O75348', 'uniprotkb:Q9BWX1', '0.0'],
       ['uniprotkb:Q9BW92', 'uniprotkb:Q9NS71', '0.0'],
       ['uniprotkb:P08729', 'uniprotkb:Q8WW24', '0.0'],
       ...,
       ['uniprotkb:H3BRN8', 'uniprotkb:Q8TE77', '0.0'],
       ['uniprotkb:O00757', 'uniprotkb:P06681', '0.0'],
       ['uniprotkb:Q0VAQ4', 'uniprotkb:Q5MJ70', '0.0']], dtype='<U32')

In [38]:
# One random row ordering per array, drawn in the order:
# coip positives, y2h positives, coip negatives, y2h negatives.
pcp, pyp, ncp, nyp = (
    np.random.permutation(len(samples))
    for samples in (dcpos, dypos, dcneg, dyneg)
)

In [39]:
# Sizes of the four shuffled index arrays and the cut point at 4/5 of each.
pc, py, nc, ny = map(len, (pcp, pyp, ncp, nyp))
pctr, pytr, nctr, nytr = (int(size * 4/5) for size in (pc, py, nc, ny))

# Rows before the cut go to train, the remainder to test; positives are
# stacked on top of negatives in every split.
c_train = np.vstack((dcpos[pcp[:pctr]], dcneg[ncp[:nctr]]))
c_test  = np.vstack((dcpos[pcp[pctr:]], dcneg[ncp[nctr:]]))
y_train = np.vstack((dypos[pyp[:pytr]], dyneg[nyp[:nytr]]))
y_test  = np.vstack((dypos[pyp[pytr:]], dyneg[nyp[nytr:]]))

In [40]:
# Row counts: (coip train, coip test, y2h train, y2h test).
tuple(map(len, (c_train, c_test, y_train, y_test)))

(153489, 38373, 270036, 67510)

In [41]:
# Wrap each split in a DataFrame so it can be written with to_csv.
dctr, dcte, dytr, dyte = map(pd.DataFrame, (c_train, c_test, y_train, y_test))

In [42]:
# Write each split as a headerless, index-free TSV.
for split, destination in (
    (dctr, "../data/networks/dscript-tt/coip_train.tsv"),
    (dcte, "../data/networks/dscript-tt/coip_test.tsv"),
    (dytr, "../data/networks/dscript-tt/y2h_train.tsv"),
    (dyte, "../data/networks/dscript-tt/y2h_test.tsv"),
):
    split.to_csv(destination, sep = "\t", header = None, index = None)

# Generate TEST cases for the new model

In [50]:
import pandas as pd

# Reload the filtered networks and overwrite column 2 with a source label:
# 1 for rows from the coip file, 0 for rows from the y2h file.
relabelled = []
for source, label in (
    ("../data/networks/dscript-tt/coip_hc_full.tsv", 1),
    ("../data/networks/dscript-tt/y2h_hc_full.tsv", 0),
):
    frame = pd.read_csv(source, sep = "\t", header = None)
    frame[2] = label
    relabelled.append(frame)
dcoip, dy2h = relabelled
dcoip, dy2h

(                          0                 1  2
 0          uniprotkb:Q9H3P7  uniprotkb:O43493  1
 1          uniprotkb:Q9H3P7  uniprotkb:Q8WUA7  1
 2          uniprotkb:Q9H3P7  uniprotkb:Q9NU19  1
 3      uniprotkb:A0A0U1RRE5  uniprotkb:Q9NPI6  1
 4          uniprotkb:Q9NPI6  uniprotkb:O95429  1
 ...                     ...               ... ..
 17437      uniprotkb:Q9Y3C0  uniprotkb:O75506  1
 17438      uniprotkb:Q9Y3C0  uniprotkb:Q9Y2V7  1
 17439      uniprotkb:Q9Y3D3  uniprotkb:Q9Y4X4  1
 17440      uniprotkb:Q9Y4X4  uniprotkb:Q8NCR0  1
 17441      uniprotkb:Q9Y6I4  uniprotkb:Q8N5D0  1
 
 [17442 rows x 3 columns],
                       0                 1  2
 0      uniprotkb:P37268  uniprotkb:P48165  0
 1      uniprotkb:P37268  uniprotkb:Q13520  0
 2      uniprotkb:P37268  uniprotkb:Q14973  0
 3      uniprotkb:P37268  uniprotkb:Q3SXY8  0
 4      uniprotkb:P37268  uniprotkb:Q5JX71  0
 ...                 ...               ... ..
 30681  uniprotkb:Q9UL46  uniprotkb:Q9UL46  0
 30

In [51]:
# Distinct proteins per network, and how many are shared by both.
coipnodes = set(dcoip[0]) | set(dcoip[1])
y2hnodes  = set(dy2h[0]) | set(dy2h[1])
len(coipnodes), len(y2hnodes), len(coipnodes & y2hnodes)

(6011, 6233, 3333)

In [49]:
import os

out_folder = "../data/networks/dscript-tt/dataset_coip+y2h_predict"
# to_csv does not create missing directories.
os.makedirs(out_folder, exist_ok = True)

# sample Train : 80, Test : 20
# Use the same number of rows from each network, taken from the data rather
# than hard-coded.
n  = min(len(dcoip), len(dy2h))
nt = int(n * 0.8)
for i in range(5):
    # Draw n rows without replacement from each network. sample() returns them
    # in random order, so the first nt rows form a random 80% subset.
    # (Previously the y2h sample was computed but never used, and rows were
    # picked by label from the first n rows of dy2h only.)
    dy2h1  = dy2h.sample(n = n)
    dcoip1 = dcoip.sample(n = n)
    dtr    = pd.concat([dy2h1.iloc[:nt], dcoip1.iloc[:nt]])
    dte    = pd.concat([dy2h1.iloc[nt:], dcoip1.iloc[nt:]])
    dtr.to_csv(f"{out_folder}/train_{i}.tsv", sep = "\t", header = None, index = None)
    dte.to_csv(f"{out_folder}/test_{i}.tsv", sep = "\t", header = None, index = None)