In [2]:
import glidetools.algorithm.dsd as dsd
import pandas as pd
import numpy as np
import sys
import json
from scipy.spatial.distance import cdist
sys.path.append("../src/")
%load_ext autoreload
%autoreload 2

# Compute DSD matrix of rat and yeast

In [3]:
flydf = pd.read_csv("../data/intact_output/fly.s.tsv", sep = "\t", header = None)
yeastdf = pd.read_csv("../data/intact_output/bakers_add.s.tsv", sep = "\t", header = None)
yeastdf

Unnamed: 0,0,1
0,P38989,P39078
1,P39966,P25294
2,P41056,P10591
3,Q03220,P23292
4,Q12373,Q05024
...,...,...
80395,P32340,P38217
80396,Q06677,P53741
80397,P38786,P53218
80398,P04786,Q05024


In [4]:
from io_utils import compute_adjacency, compute_pairs
Ay, yeastmap = compute_adjacency(yeastdf)
Af, flymap = compute_adjacency(flydf)
dfb = pd.read_csv("../data/intact_output/fly-bakers.tsv", sep = "\t")
Efb = compute_pairs(dfb, flymap, yeastmap, "fly", "bakers")

# (df, nmapA, nmapB, orgA, orgB)
import json
with open("yeast.json", "w") as yj, open("fly.json", "w") as fj:
    json.dump(yeastmap, yj)
    json.dump(flymap, fj)

       fly  bakers     score
0     3943    2679  0.033223
1     3943    2772  0.033033
2     3943    6213  0.044461
3     3462    1788  0.041991
4     7511    2616  0.032133
...    ...     ...       ...
8353  9165    4224  0.031137
8354  9210    1112  0.031469
8355  9210    2018  0.032654
8356  9210    4293  0.035592
8357  9210    1345  0.033863

[8358 rows x 3 columns]


In [5]:
ryeastmap = {val: key for key, val in yeastmap.items()}
rflymap = {val: key for key, val in flymap.items()}

In [6]:
def compute_isorank(A1, A2, E, alpha, maxiter = 20, get_R0 = False, get_R1 = False):
    """
    Compute the isorank using the eigendecomposition
    """
    d1 = np.sum(A1, axis = 1).reshape(-1, 1)
    d2 = np.sum(A2, axis = 1).reshape(-1, 1)
    
    P1 = A1 / d1.T
    P2 = A2 / d2.T
    E = E / np.sum(E)
    
    d = d1 @ d2.T 
    d = d / (np.sum(d1) * np.sum(d2))
    
    R = (1-alpha) * d + alpha * E
    
    if maxiter <= 0:
        return R
    
    if get_R0:
        R0 = R.copy()
    
    # Reshape R and E
    R = R.T
    E = E.T
    
    for i in range(maxiter):
        R = (1-alpha) * (P2 @ R @ P1.T) + alpha * E
        if get_R1 and i == 0:
            R1 = R.T.copy()
            
    payload = [R.T]
    if get_R1:
        payload = [R1] + payload
    if get_R0:
        payload = [R0] + payload
    return payload

def compute_greedy_assignment(R1, n_align):
    """
    Compute greedy assignment
    """
    aligned = []
    R = R1.copy()
    
    n_align = min(n_align, *R.shape)
    
    itr = 1
    while(len(aligned) < n_align):
        itr   += 1
        maxcols = np.argmax(R, axis = 1) # best y ids
        maxid = np.argmax(np.max(R, axis = 1)) # best x id
        maxcol = maxcols[maxid]
        aligned.append((maxid, maxcol))
        R[:, maxcol] = -1
        R[maxid, :]  = -1
    return aligned


R0 = compute_isorank(Af, Ay, Efb, 0.6, -1, get_R0 = False, get_R1 = False)
pairs = compute_greedy_assignment(R0, 1000)

In [7]:
diso_fly_yeast = pd.DataFrame(pairs, columns = ["fly", "bakers"])
diso_fly_yeast.iloc[:, 0] = diso_fly_yeast.iloc[:, 0].apply(lambda x : rflymap[x])
diso_fly_yeast.iloc[:, 1] = diso_fly_yeast.iloc[:, 1].apply(lambda x : ryeastmap[x])
diso_fly_yeast.to_csv("isorank_fly_bakersPlus.tsv", sep = "\t", index = None)

# Load JSON from file

In [68]:
list(yeastmap.items())[:10]

[('P38207', 0),
 ('Q01919', 1),
 ('P42840', 2),
 ('P36069', 3),
 ('P35187', 4),
 ('P53273', 5),
 ('Q07410', 6),
 ('P36009', 7),
 ('P11076', 8),
 ('P22135', 9)]

In [10]:
with open("yeast.json", "r") as yj, open("rat.json", "r") as rj:
    yeastmap = json.load(yj)
    ratmap = json.load(rj)
list(yeastmap.items())[:10]

[('P38207', 0),
 ('Q01919', 1),
 ('P42840', 2),
 ('P36069', 3),
 ('P35187', 4),
 ('P53273', 5),
 ('Q07410', 6),
 ('P36009', 7),
 ('P11076', 8),
 ('P22135', 9)]

# Compute DSD matrix

In [34]:
ADyeast = dsd.compute_dsd_embedding(Ay, is_normalized=False)

In [8]:
# np.save("yeast-dsd-emb.npy", ADyeast)
ADfly = dsd.compute_dsd_embedding(Af, is_normalized = False)

In [10]:
#np.save("fly-dsd-emb.npy", ADfly)

# Load the DSD matrix

In [12]:
ADfly = np.load("fly-dsd-emb.npy")
ADyeast = np.load("yeast-dsd-emb.npy")

In [13]:
from scipy.spatial.distance import pdist, squareform

Dfly = squareform(pdist(ADfly))
Dyeast = squareform(pdist(ADyeast))

In [6]:
#np.save("fly-dsd-dist.npy", Dfly)
#np.save("yeast-dsd-dist.npy", Dyeast)
Dfly = np.load("fly-dsd-dist.npy")
Dyeast = np.load("yeast-dsd-dist.npy")

# Load the GO files for the Yeast network

In [7]:
ynet = pd.read_csv("../data/go/bakers.output.mapping.gaf", sep = "\t")
ynet

Unnamed: 0,GO,type,sgd,swissprot
0,GO:0000006,molecular_function,S000003224,P32804
1,GO:0000006,molecular_function,S000003224,P32804
2,GO:0000007,molecular_function,S000004120,Q12436
3,GO:0000007,molecular_function,S000004120,Q12436
4,GO:0000009,molecular_function,S000000208,P38211
...,...,...,...,...
61177,GO:1990468,cellular_component,S000004447,Q06188
61178,GO:1990468,cellular_component,S000006235,Q12311
61179,GO:1990537,cellular_component,S000003345,P53267
61180,GO:1990860,cellular_component,S000005952,P17157


In [8]:
fnet = pd.read_csv("../data/go/fly.output.mapping.gaf", sep = "\t")
fnet

Unnamed: 0,GO,type,fbid,swissprot
0,GO:0000009,molecular_function,FBgn0037743,Q9VH78
1,GO:0000009,molecular_function,FBgn0037743,Q9VH78
2,GO:0000009,molecular_function,FBgn0265174,Q9V7W1
3,GO:0000010,molecular_function,FBgn0037044,Q9VP87
4,GO:0000010,molecular_function,FBgn0051005,Q8SY08
...,...,...,...,...
161232,GO:1990498,cellular_component,FBgn0002924,Q5UHE2
161233,GO:1990635,cellular_component,FBgn0032593,Q9VJJ7
161234,GO:1990635,cellular_component,FBgn0032593,B7YZW4
161235,GO:1990635,cellular_component,FBgn0086656,Q8T0Q4


In [9]:
fnet.iloc[:, [0, 3]].groupby("swissprot", as_index = False).aggregate(list).values

array([['A0A021WW32',
        list(['GO:0010628', 'GO:0016322', 'GO:0070193', 'GO:1990414', 'GO:0005634'])],
       ['A0A021WW37',
        list(['GO:0005262', 'GO:0008273', 'GO:0006874', 'GO:0070588'])],
       ['A0A021WWX0',
        list(['GO:0005262', 'GO:0008273', 'GO:0006874', 'GO:0070588'])],
       ...,
       ['X2JLM6',
        list(['GO:0004843', 'GO:1990380', 'GO:0002785', 'GO:0006511', 'GO:0016579', 'GO:0016579', 'GO:0045805', 'GO:0051926', 'GO:0061060', 'GO:0071108', 'GO:1901800'])],
       ['X2JLN4',
        list(['GO:0000827', 'GO:0000827', 'GO:0000828', 'GO:0000829', 'GO:0000832', 'GO:0000832', 'GO:0005524', 'GO:0033857', 'GO:0033857', 'GO:0033857', 'GO:0032958'])],
       ['X4YX01',
        list(['GO:0000132', 'GO:0045938', 'GO:0045938', 'GO:0061172', 'GO:0005819'])]],
      dtype=object)

In [10]:
def topk_acc(golists = {}, go = None, k = 1):
    if len(golists) == 0:
        return None
    elif go == None:
        return 0
    else:
        go_k = set(go[: min(k, len(go))])
        return len(go_k.intersection(golists)) / k
    
def topk_accs(prots, pred_go_map, true_go_map, k = 1):
    accs = [topk_acc(true_go_map[p], pred_go_map[p], k) for p in prots]
    accs = list(filter(lambda x : x is not None, accs))
    return np.average(accs)

def compute_metric(prediction_func, scoring_func, allprots, true_go_map, kfold = 5):
    np.random.seed(137)
    permprots = np.random.permutation(allprots)
    blocksize = int(len(allprots) / kfold)
    scores = []
    for i in range(kfold):
        predictprots = permprots[i * blocksize: (i+1) * blocksize]
        trainprots = np.concatenate([permprots[: i * blocksize], permprots[(i+1) * blocksize : ]])
        go_map_training = {tprots: true_go_map[tprots] for tprots in trainprots}
        go_map_training.update({pprots : -1 for pprots in predictprots})
        pred_map = prediction_func(go_map_training)
        scores.append(scoring_func(predictprots, pred_map, true_go_map))
    return scores, np.average(scores)

def predict_dsd(D_mat, train_go_maps, k = 10):
    predprot = [x for x in train_go_maps if train_go_maps[x] == -1]
    D_mat1 = D_mat.copy()
    D_mat1[range(len(D_mat)), range(len(D_mat))] = np.inf
    D_mat1[:, predprot] = np.inf
    sortedD = np.argsort(D_mat1, axis = 1)[:, 1:k+1]
    def vote(neighbors, go_maps):
        gos = {}
        for n in neighbors:
            for g in go_maps[n]:
                if g not in gos:
                    gos[g] = 0
                gos[g] += 1 
        return sorted(gos, key = lambda x : gos[x], reverse=True)
    for p in predprot:
        train_go_maps[p] = vote(sortedD[p], train_go_maps)
    return train_go_maps

def predict_dsd_mundo(D_mat, D_other_species, train_go_maps, go_other, k = 10, k_other = 20):
    predprot = [x for x in train_go_maps if train_go_maps[x] == -1]
    D_mat1 = D_mat.copy()
    D_other = D_other_species.copy()
    D_mat1[range(len(D_mat)), range(len(D_mat))] = np.inf
    D_mat1[:, predprot] = np.inf
    sortedD = np.argsort(D_mat1, axis = 1)[:, 1: k+1]
    sortedDoth = np.argsort(D_other, axis = 1)[:, 1: k_other+1]
    def vote(neighbors, oth_neighbors,  go_maps, vote_other = 0.4):
        gos = {}
        for n in neighbors:
            for g in go_maps[n]:
                if g not in gos:
                    gos[g] = 0
                gos[g] += 1 
        for n in oth_neighbors:
            for g in go_other[n]:
                if g not in gos:
                    gos[g] = 0
                gos[g] += vote_other  
        return sorted(gos, key = lambda x : gos[x], reverse=True)
    for p in predprot:
        train_go_maps[p] = vote(sortedD[p], sortedDoth[p], train_go_maps)
    return train_go_maps

def dsd_func(D_mat, k = 10):
    def pred(train_go_maps):
        return predict_dsd(D_mat, train_go_maps, k = k)
    return pred

def dsd_func_mundo(D_mat, D_other, go_other, k = 10, k_other = 20):
    def pred(train_go_maps):
        return predict_dsd_mundo(D_mat, D_other, train_go_maps, go_other, k, k_other)
    return pred
    
def get_go_maps(nmap, gofile):
    df = pd.read_csv(gofile, sep = "\t")
    gomaps = df.iloc[:, [0, 3]].groupby("swissprot", as_index = False).aggregate(list).values
    go_outs = {}
    for prot, gos in gomaps:
        if prot in nmap:
            go_outs[nmap[prot]] = set(gos)
    return go_outs

In [19]:
yeastgomap = get_go_maps(yeastmap, "../data/go/bakers.output.mapping.gaf")
flygomap = get_go_maps(flymap, "../data/go/fly.output.mapping.gaf")
for i in range(len(yeastmap)):
    if i not in yeastgomap:
        yeastgomap[i] = {}
for i in range(len(flymap)):
    if i not in flygomap:
        flygomap[i] = {}
len(yeastgomap), len(flygomap)

(6478, 11247)

# Function Prediction: Without MUNDO output

In [20]:
Dfly[range(len(Dfly)), range(len(Dfly))] = np.inf
Dyeast[range(len(Dyeast)), range(len(Dyeast))] = np.inf

In [21]:
compute_metric(dsd_func(Dfly, k=10), topk_accs, list(range(len(flymap))), flygomap, kfold = 5)

([0.16839378238341968,
  0.14817190506735087,
  0.1378627968337731,
  0.14576493762311227,
  0.15994623655913978],
 0.15202793169335913)

In [118]:
compute_metric(dsd_func(Dyeast, k=10), topk_accs, list(range(len(yeastmap))), yeastgomap, kfold = 5)

([0.4046692607003891,
  0.3892100192678227,
  0.3346228239845261,
  0.4182547642928786,
  0.34408602150537637],
 0.3781685779501986)

# Self Attention Model 

In [25]:
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.manifold import Isomap


In [4]:

Dyeast = np.load("yeast-dsd-dist.npy")
embedding = Isomap(n_components=100, metric = "precomputed")
Yeastemb = embedding.fit_transform(Dyeast)

In [32]:
# np.save("yeast-isomap.npy", Yeastemb)
Yeastemb = np.load("yeast-isomap.npy")
# Yeastemb.shape, embedding.reconstruction_error()

In [28]:
Dfly = np.load("fly-dsd-dist.npy")
Dfly = np.where(Dfly > 10, 10, Dfly)
flyemb = Isomap(n_components=100, metric = "precomputed")
Flyemb = flyemb.fit_transform(Dfly)

In [29]:
np.save("fly-isomap.npy", Flyemb)
flyemb.reconstruction_error()

5.89909145774511

In [30]:
class MHA(nn.Module):
    def __init__(self, n_channel, n_head, n_dim, n_out, activation = "identity"):
        super(MHA, self).__init__()
        activations = {"relu" : nn.ReLU(), "sigmoid" : nn.Sigmoid(), "tanh": nn.Tanh()}
        self.Wq = nn.Parameter(torch.randn(n_head, n_dim, n_channel, dtype = torch.float32))
        self.Wk = nn.Parameter(torch.randn(n_head, n_dim, n_channel, dtype = torch.float32))
        self.Wv = nn.Parameter(torch.randn(n_head, n_dim, n_channel, dtype = torch.float32))
        self.drop = nn.Dropout(p=0.2)
        self.activation = activations[activation] if activation in activations else nn.Identity()
        self.n_channel  = nn.Parameter(torch.tensor(n_channel, dtype = torch.float32), requires_grad = False)
        self.Wo = nn.Parameter(torch.randn(n_head * n_channel, n_out))
        
    def forward(self, X):
        # X is batch x n_seq = 100 x n_dim = 1
        X1 = X.unsqueeze(1)
        q  = torch.matmul(X1, self.Wq) # batch x n_head x n_seq x n_channel
        k  = torch.matmul(X1, self.Wk)
        v  = torch.matmul(X1, self.Wv)
        
        att = F.softmax(torch.matmul(q, torch.transpose(k, 2, 3)) / torch.sqrt(self.n_channel), dim = -1) 
        v = torch.matmul(att, v) # batch x n_head x n_seq x n_channel
        v = self.drop(v)
        vc = torch.concatenate(torch.unbind(v, 1), axis = -1)
        return self.activation(torch.matmul(vc, self.Wo))
        
        
class AttentionModel(nn.Module):
    def __init__(self):
        super(AttentionModel, self).__init__()
        self.mha1 = MHA(10, 4, 1, 4, activation = "sigmoid")
        self.layer = nn.LayerNorm(4)
        self.mha2 = MHA(5, 3, 4, 1)
    def forward(self, x):
        out = self.mha1(x)
        out = self.layer(out)
        return self.mha2(out)

class Data(Dataset):
    def __init__(self, matchfile, no_matches, Xa, Xb, nA, nB):
        self.no_matches = no_matches
        self.matchdf = pd.read_csv(matchfile, sep = "\t")
        
        if "scores" in self.matchdf.columns:
            self.matchdf = self.matchdf.sort_values(by = "score", ascending = False).reset_index(drop = True)[: no_matches]
        else:
            self.matchdf = self.matchdf.loc[: no_matches, :]
            # for compatibility
            self.matchdf["scores"] = 1
        self.nA = nA
        self.nB = nB
        self.Xa = Xa
        self.Xb = Xb
        
    def __len__(self):
        return self.no_matches
    
    def __getitem__(self, idx):
        pa, pb, _= self.matchdf.iloc[idx, :].values
        ia, ib = self.nA[pa], self.nB[pb]
        return torch.tensor(self.Xa[ia], dtype = torch.float32).unsqueeze(-1), torch.tensor(self.Xb[ib], dtype = torch.float32).unsqueeze(-1)
        

In [33]:
#Ratemb = np.load("rat-isomap.npy")
#Yeastemb = np.load("yeast-isomap.npy")
data = Data("isorank_fly_bakers.tsv", 500, Flyemb, Yeastemb, flymap, yeastmap)

In [34]:
train, test = torch.utils.data.random_split(data, [450, 50])
trainloader = DataLoader(train, shuffle = True, batch_size = 10)
testloader = DataLoader(test, shuffle = True, batch_size = 10)

In [35]:
loss_fn = nn.MSELoss()
loss_fn(data[0][0],data[0][1])

tensor(0.5517)

In [36]:
model = AttentionModel()
model.train()
optim = torch.optim.Adam(model.parameters(), lr = 0.001)

ep = 100
losses = []
for e in range(ep):
    loss = 0
    for i, data in enumerate(trainloader):
        y, x = data # y = rat, x = yeast
        optim.zero_grad()
        yhat = model(x)
        closs = loss_fn(y, yhat)
        closs.backward()
        optim.step()
        loss += closs.item()
    loss = loss / (i+1)
    if e % 10 == 0:
        print(f"Epoch {e+1}: Loss : {loss}")

Epoch 1: Loss : 9.104500341415406
Epoch 11: Loss : 1.3834210011694166
Epoch 21: Loss : 0.873081910610199
Epoch 31: Loss : 0.7774844209353129
Epoch 41: Loss : 0.7336471690071954
Epoch 51: Loss : 0.7168675939242045
Epoch 61: Loss : 0.7079815652635363
Epoch 71: Loss : 0.7017793907059564
Epoch 81: Loss : 0.6997297419442071
Epoch 91: Loss : 0.6941977156533136


In [37]:
testloss = 0
model.eval()
for i, data in enumerate(testloader):
    y, x = data # y = rat, x = yeast
    yhat = model(x)
    loss = loss_fn(y, yhat)
    testloss += loss.item()
testloss /= (i+1)
print(f"Validation loss: {testloss}")

Validation loss: 0.6522220134735107


In [38]:
allYeastEmb = torch.tensor(Yeastemb, dtype = torch.float32).unsqueeze(-1)
with torch.no_grad():
    Yeast_FlyT = model(allYeastEmb)

In [39]:
yeast_fly = Yeast_FlyT.squeeze().numpy()
np.save("yeast->fly_emb.npy", yeast_fly)

In [40]:
yeast_fly.shape

(6478, 100)

In [41]:
fly_yeast_dist = cdist(Flyemb, yeast_fly) # If rat => m yeast has size n  => m x n

## $\color{red}{\text{With MUNDO output (MASSIVE IMPROVEMENT)!}}$

In [42]:
Dfly[range(len(Dfly)), range(len(Dfly))] = np.inf
Dyeast[range(len(Dyeast)), range(len(Dyeast))] = np.inf
compute_metric(dsd_func_mundo(Dfly, fly_yeast_dist, yeastgomap, k_other=5),
               topk_accs, list(range(len(flymap))), flygomap, kfold = 5)

([0.22215025906735753,
  0.22065426555484285,
  0.2025065963060686,
  0.211424819435325,
  0.22311827956989247],
 0.21597084398669728)

In [43]:
compute_metric(dsd_func_mundo(Dfly, fly_yeast_dist, yeastgomap, k_other=20),
               topk_accs, list(range(len(flymap))), flygomap, kfold = 5)

([0.2648963730569948,
  0.2514432328415651,
  0.25,
  0.247537754432042,
  0.260752688172043],
 0.254926009700529)