# Running Tests between Different Isorank Approximations

In [1]:
# in order to use netalign package, go to the base folder 
# and run:
#           python setup.py build; python setup.py install;
import pandas as pd
import numpy as np
from netalign.approx_isorank.io_utils import compute_adjacency, compute_pairs
from netalign.approx_isorank.isorank_compute import compute_isorank, compute_greedy_assignment, pair_acc
from netalign.approx_isorank.pair_evaluations import compute_edge_correctness, semantic_sim, symmetric_substructure, lccs
from numpy.linalg import norm
import json

## Input parameters:

1. Net1 => Tab delimited file containing the first PPI network to be aligned
2. Net2 => Tab delimited file containing the second PPI network to be aligned
3. Rblast => Reciprocal blast tab-delimited file containing the sequence similarity score between proteins.
4. Alpha => Isorank Parameter. Usually set to 0.7
5. Niter => Number of IsoRank iterations. Set this to 0 for the R0 approximation and to 1 for the R1 approximation;
otherwise, set it to some value > 10 to get the original IsoRank matrix.
6. Npairs => How many aligned pairs to output
7. Annot1 => GOA annotation file for the first species
8. Annot2 => GOA annotation file for the second species.

***Both annotation files are obtained from the official GO website.***

In [18]:
# --- PPI networks to align (tab-delimited edge lists) ---
net1, net2 = "../data/intact/bakers.s.tsv", "../data/intact/rat.s.tsv"

# --- Reciprocal BLAST table with sequence-similarity scores ---
rblast = "../data/intact/rat-bakers.tsv"

# --- GOA annotation files, one per species (same order as net1 / net2) ---
annot1, annot2 = "../data/go/bakers.output.mapping.gaf", "../data/go/rat.output.mapping.gaf"

# --- IsoRank / alignment parameters ---
alpha = 0.7           # IsoRank parameter
niters = [0, 1, 10]   # iteration counts: 0 -> R0, 1 -> R1, 10 -> (near) full IsoRank
npairs = 1000         # number of aligned pairs to output

## Run IsoRank approximations

In [7]:
# Read the two PPI edge lists (no header row) and the reciprocal-BLAST table.
df1, df2 = (pd.read_csv(path, sep="\t", header=None) for path in (net1, net2))
dpairs = pd.read_csv(rblast, sep="\t")

# Organism tag = file name up to its first dot.
org1 = net1.rsplit("/", 1)[-1].partition(".")[0]  # gives "bakers"
org2 = net2.rsplit("/", 1)[-1].partition(".")[0]  # gives "rat"

print("Computing adjacency matrix ...")
# Each call returns a pair; the second item (nA*) is used later as a
# node-name -> index mapping (presumably a dict; see compute_adjacency).
Af1, nA1 = compute_adjacency(df1)
Af2, nA2 = compute_adjacency(df2)

print("Getting the sequence similarity matrix")
E = compute_pairs(dpairs, nA1, nA2, org1, org2)

Computing adjacency matrix ...
Getting the sequence similarity matrix
      bakers   rat     score
0       4976  2254  0.120443
1         35  2254  0.123143
2       3911  2254  0.088307
3       6206  2254  0.066973
4       3893  2254  0.045153
...      ...   ...       ...
8601    4619  9595  0.047313
8602    5940  9595  0.039103
8603    4934  9595  0.037213
8604    6106  9595  0.048879
8605    5304  4187  0.154739

[8606 rows x 3 columns]


In [8]:
# Computing the IsoRank matrices
# Keyword options for the IsoRank run; get_R0 / get_R1 request the R0 and R1
# approximations alongside the iterated result (R2).
# NOTE(review): maxiter is hard-coded to 5 here, whereas the parameter notes
# above suggest > 10 iterations for the original IsoRank matrix and `niters`
# is not used in the visible cells -- confirm this is intended.
isorank_options = dict(alpha=alpha, maxiter=5, get_R0=True, get_R1=True)
R0, R1, R2 = compute_isorank(Af1, Af2, E, **isorank_options)

### Computing the norms

In [9]:
# Norm of the difference between each approximation (R0, R1) and R2.
norm0, norm1 = (norm(approx - R2) for approx in (R0, R1))
norm0, norm1

(0.0033744466169843444, 0.00024736547389922654)

### Doing Greedy alignment

In [10]:
def _greedy_pairs(label, scores):
    """Announce which matrix is being processed, then greedily pick `npairs` pairs from it."""
    print(f"for {label}...")
    return compute_greedy_assignment(scores, npairs)


pairs0 = _greedy_pairs("R0", R0)
pairs1 = _greedy_pairs("R1", R1)
pairs2 = _greedy_pairs("R2", R2)

for R0...
for R1...
for R2...


## Evaluations

1. Edge Correctness

In [13]:
# First index the edgelist by their indexes
# Replace the node names in both endpoint columns (0 and 1) of each edge list
# with their indexes from the corresponding nA* mapping.
# NOTE(review): df1 / df2 are overwritten in place, so re-running this cell
# looks up already-converted values in nA* and will likely raise KeyError --
# re-run the loading cell first.
for edges, node_index in ((df1, nA1), (df2, nA2)):
    for endpoint in (0, 1):
        edges[endpoint] = edges[endpoint].apply(lambda node: node_index[node])

In [14]:
# Edge correctness of the R0, R1 and R2 alignments, in that order.
ec0, ec1, ec2 = (
    compute_edge_correctness(aligned, df1, df2)
    for aligned in (pairs0, pairs1, pairs2)
)
ec0, ec1, ec2

(0.04032047941636269, 0.04758102395490841, 0.04894268491053488)

2. Symmetric Substructure

In [15]:
# Symmetric substructure score of the R0, R1 and R2 alignments, in that order.
sstructure0, sstructure1, sstructure2 = (
    symmetric_substructure(aligned, df1, df2)
    for aligned in (pairs0, pairs1, pairs2)
)
sstructure0, sstructure1, sstructure2

(0.03357743422836995, 0.041219075520833336, 0.04225875743555849)

3. LCCS

In [16]:
# LCCS value of the R0, R1 and R2 alignments, in that order.
lc0, lc1, lc2 = (lccs(aligned, df1, df2) for aligned in (pairs0, pairs1, pairs2))
lc0, lc1, lc2

(408, 545, 552)

4. Functional Similarities

In order to run FC experiments, convert the pairs and the pandas dataframes to the Gene namespace.
This code section requires an additional package `goatools`. It can be installed by using the command:

```
pip install goatools
```

In [None]:
# Compute reverse dictionaries
rnA1 = {v: k for k, v in nA1.items()}
rnA2 = {v: k for k, v in nA2.items()}

df1.iloc[:, 0] = df1.iloc[:, 0].apply(lambda x: rnA1[x])
df1.iloc[:, 1] = df1.iloc[:, 1].apply(lambda x: rnA1[x])


df2.iloc[:, 0] = df2.iloc[:, 0].apply(lambda x: rnA2[x])
df2.iloc[:, 1] = df2.iloc[:, 1].apply(lambda x: rnA2[x])

In [20]:
# Alignments to score, keyed by the matrix they were derived from.
alignments = {"R0": pairs0, "R1": pairs1, "R2": pairs2}
go_namespaces = ("molecular_function", "biological_process", "cellular_component")

# FC collects one score per (GO namespace, alignment) combination.
FC = {}
for label, aligned in alignments.items():
    print(f"For {label}:")
    # Translate index pairs back to names via the reverse dictionaries.
    named_pairs = [(rnA1[i], rnA2[j]) for i, j in aligned]
    for namespace in go_namespaces:
        score = semantic_sim(
            named_pairs,
            df1,
            df2,
            obofile="../data/go/go-basic.obo",
            annot1file=annot1,
            annot2file=annot2,
            type=namespace,
        )
        print(f"\t For GO={namespace}, FC = {score}")
        FC[f"FC-{namespace}({label})"] = score

For R0:
  EXISTS: ../data/go/go-basic.obo
../data/go/go-basic.obo: fmt(1.2) rel(2022-12-04) 46,763 Terms; optional_attrs(relationship)
	 For GO=molecular_function, FC = 0.49946807698617535
  EXISTS: ../data/go/go-basic.obo
../data/go/go-basic.obo: fmt(1.2) rel(2022-12-04) 46,763 Terms; optional_attrs(relationship)
	 For GO=biological_process, FC = 0.22385897901631305
  EXISTS: ../data/go/go-basic.obo
../data/go/go-basic.obo: fmt(1.2) rel(2022-12-04) 46,763 Terms; optional_attrs(relationship)
	 For GO=cellular_component, FC = 0.4195773878742229
For R1:
  EXISTS: ../data/go/go-basic.obo
../data/go/go-basic.obo: fmt(1.2) rel(2022-12-04) 46,763 Terms; optional_attrs(relationship)
	 For GO=molecular_function, FC = 0.4551539284983664
  EXISTS: ../data/go/go-basic.obo
../data/go/go-basic.obo: fmt(1.2) rel(2022-12-04) 46,763 Terms; optional_attrs(relationship)
	 For GO=biological_process, FC = 0.20266467894311752
  EXISTS: ../data/go/go-basic.obo
../data/go/go-basic.obo: fmt(1.2) rel(2022-12-0