In [2]:
import pandas as pd
from Bio import SeqIO
import numpy as np
import networkx as nx
import scipy.sparse as sp
from Bio import SeqIO
import matplotlib.pyplot as plt

np.random.seed(seed=2)

In [3]:
def split_train_test(edges, fasta, df_mmseq, train_frac=1, test_frac=0.1, out=False):
    
    np.random.shuffle(edges)
    
    num_test = int(len(edges) * test_frac)
    train_edges = edges[num_test:]
    np.random.shuffle(train_edges)
    train_edges = train_edges[:int(len(train_edges)*train_frac)]
    test_edges = edges[:num_test]
    
    train_nodes = set(train_edges.flatten())
    test_nodes = set(test_edges.flatten())
    
    more_40_seq_id = df_mmseq[df_mmseq[2] > 0.4]
    node_sim_dict = {}
    for x in more_40_seq_id[[0,1]].values:
        if x[0] not in node_sim_dict.keys():
            node_sim_dict[x[0]] = [x[0]]
        if x[1] not in node_sim_dict.keys():
            node_sim_dict[x[1]] = [x[1]]

        node_sim_dict[x[0]].append(x[1])
        node_sim_dict[x[1]].append(x[0])

    node_sim_dict = {x:list(set(y)) for x,y in node_sim_dict.items()}
    
    sim_train_nodes = []
    for x in train_nodes:
        if x in node_sim_dict.keys():
            sim_train_nodes = sim_train_nodes + node_sim_dict[x]
        else:
            sim_train_nodes = sim_train_nodes + [x]
    
    sim_train_nodes = list(set(sim_train_nodes))
    c1 = []
    c2 = []
    c3 = []
    
    for x in test_edges:
        if x[0] not in sim_train_nodes and x[1] not in sim_train_nodes:
            c3.append(x)
        elif x[0] in sim_train_nodes and x[1] in sim_train_nodes:
            c1.append(x)  
        else:
            c2.append(x)
    
    if out:
        np.savez('../Data/train_test_data/train_edges.npz', train_edges)
        np.savez('../Data/train_test_data/test_edges.npz', test_edges)
    
    print('Train:',len(train_edges),'C1:',len(c1),'C2:', len(c2),'C3:', len(c3))
    print(sum([len(train_edges), len(c1), len(c2), len(c3)]))
    return [len(train_edges), len(c1), len(c2), len(c3)]

In [4]:
fasta = list(SeqIO.parse("../Data/Fasta/huri_apid_proteins.fasta", "fasta"))
# fasta = list(SeqIO.parse("../Data/test/huri_apid40.fasta", "fasta"))

df_edges = pd.read_csv('../Data/Interactome/huri_apid_merge_ppis_edgelist.csv', sep='\t', header=None)
df_nodes = pd.read_csv('../Data/Interactome/huri_apid_merge_ppis_nodelist.csv', sep='\t', header=None)
df_mmseq = pd.read_csv('../Data/mmseqs/align.m8', sep='\t', header=None)
red_prot = [x.id.split('|')[1] for x in fasta]

edges = np.array([x for x in df_edges.values if x[0] in red_prot and x[1] in red_prot])
edges = np.array([x for x in edges if x[0] != x[1]])
print('edges',len(edges))

k = {}
for x in range(9,0,-1):
    l = []
    for y in range(5):
        print(x/10)
        l.append(split_train_test(edges, fasta, df_mmseq, 1, x/10))
    k[x/10] = l

edges 109905
0.9


KeyboardInterrupt: 

In [None]:
cs = [4,3,2,1]
x = list(k.keys())[-9:]

for c in cs: 
    dy = [np.std(np.array(k[i])[:,-c]) for i in x]
    y = [np.mean(np.array(k[i])[:,-c]) for i in x]
    print(y[0])
    plt.title('Number of PPIs in test set')
    if c == 4:
        plt.errorbar(x, y, fmt='-o').set_label('Train')
    else:
        plt.errorbar(x, y, fmt='-o').set_label('C'+str(4+-c))
    plt.xlabel('Test set (fraction)')
    plt.ylabel('Frequency')
plt.legend()

In [None]:
for c in [2]: 
    dy = [np.std(np.array(k[i])[:,-c]) for i in x]
    y = [np.mean(np.array(k[i])[:,-c]) for i in x]
    print(y[0])
    plt.title('Number of PPIs in C3')
    plt.errorbar(x, y, yerr=0, fmt='-o', color='#d62728').set_label('C'+str(4+-c))
    plt.xlabel('Test set (fraction)')
    plt.ylabel('Frequency')
plt.legend()

In [32]:
fasta = list(SeqIO.parse("../Data/test/huri_apid100.fasta", "fasta"))
df_edges = pd.read_csv('../Data/Interactome/huri_apid_merge_ppis_edgelist.csv', sep='\t', header=None)
df_nodes = pd.read_csv('../Data/Interactome/huri_apid_merge_ppis_nodelist.csv', sep='\t', header=None)
df_mmseq = pd.read_csv('../Data/mmseqs/align.m8', sep='\t', header=None)

red_prot = [x.id.split('|')[1] for x in fasta]
edges = np.array([x for x in df_edges.values if x[0] in red_prot and x[1] in red_prot])

In [38]:
np.random.shuffle(edges)

num_test = int(len(edges) * 0.25)
train_edges = edges[num_test:]
np.random.shuffle(train_edges)
train_edges = train_edges[:int(len(train_edges)*1)]
test_edges = edges[:num_test]
    
G = nx.read_edgelist('../Data/Interactome/huri_apid_merge_ppis_edgelist.csv')

nr_train_edges = []
for edge in train_edges:
    if not G.has_edge(edge[0], edge[1]):
        continue
    nr_train_edges.append(edge)   
    G.remove_nodes_from(edge)

train_nodes = set(np.array(nr_train_edges).flatten())
test_nodes = set(test_edges.flatten())

In [39]:
more_40_seq_id = df_mmseq[df_mmseq[2] > 0.4]
node_sim_dict = {}
for x in more_40_seq_id[[0,1]].values:
    if x[0] not in node_sim_dict.keys():
        node_sim_dict[x[0]] = [x[0]]
    if x[1] not in node_sim_dict.keys():
        node_sim_dict[x[1]] = [x[1]]
    
    node_sim_dict[x[0]].append(x[1])
    node_sim_dict[x[1]].append(x[0])

node_sim_dict = {x:list(set(y)) for x,y in node_sim_dict.items()}

In [40]:
sim_train_nodes = []
for x in train_nodes:
    if x in node_sim_dict.keys():
        sim_train_nodes = sim_train_nodes + node_sim_dict[x]
    else:
        sim_train_nodes = sim_train_nodes + [x]

In [41]:
c1 = [x for x in test_edges if x[0] in sim_train_nodes and x[1] in sim_train_nodes]
c2 = [x for x in test_edges if x[0] not in sim_train_nodes or x[1] not in sim_train_nodes]
c3 = [x for x in test_edges if x[0] not in sim_train_nodes and x[1] not in sim_train_nodes]

In [42]:
print('Train:',len(nr_train_edges),'C1:',len(c1),'C2:', len(c2),'C3:', len(c3))

Train: 4110 C1: 22942 C2: 3749 C3: 127
