# Install and Download

In [None]:
# Install pyrwr: clone the repository, move into it, and install its requirements.
!git clone https://github.com/jinhongjung/pyrwr.git
# NOTE: %cd changes the kernel's working directory, so every relative path used
# in later cells (e.g. "../data/...") is resolved from inside ./pyrwr.
# Re-running this cell in the same kernel would try to change directory again.
%cd ./pyrwr
# Use %pip (not !pip) so the packages are installed into the environment of the
# running kernel rather than whichever `pip` happens to be first on PATH.
%pip install -r requirements.txt

In [None]:
#Downloading data
#FunCoup5
# Fetch the FunCoup 5.0 H. sapiens full network, save it as funcoup5.tab.gz,
# and decompress it to funcoup5.tab (the file the FunCoup cells read).
!wget "https://funcoup5.scilifelab.se/downloads/download.action?type=network&instanceID=24480085&fileName=FC5.0_H.sapiens_full.gz" -O funcoup5.tab.gz
!gunzip funcoup5.tab.gz

#STRINGv10
# Fetch the STRING v10 protein links for taxon 9606 and decompress them to
# ./protein.links.v10.txt (the file the STRING cells read).
!wget http://version10.string-db.org/download/protein.links.v10/9606.protein.links.v10.txt.gz -O ./protein.links.v10.txt.gz
!gunzip ./protein.links.v10.txt.gz

In [None]:
import json
# Lookup tables for ID conversion.
# Each JSON file is loaded into a sym_to_* dictionary, and the matching *_to_sym
# dictionary is its inverse (values become keys). If several keys share a value,
# the last one in file order wins.
path = "../data/ensg.json"
with open(path, mode="r") as f:
    sym_to_ensg = json.load(f)
ensg_to_sym = dict(zip(sym_to_ensg.values(), sym_to_ensg.keys()))

path = "../data/ensp.json"
with open(path, mode="r") as f:
    sym_to_ensp = json.load(f)
ensp_to_sym = dict(zip(sym_to_ensp.values(), sym_to_ensp.keys()))

# FunCoup

Comparison is performed using the code in Evaluation.ipynb

In [None]:
import tqdm

# Give every gene ID in the FunCoup file a consecutive integer index (stored as
# a string), numbered in order of first appearance.
gene_list = []  # index -> gene ID
numbd = {}      # gene ID -> index (as str)
c = 0

with open("funcoup5.tab", "r") as f:
    next(f)  # Skip header
    for line in tqdm.tqdm(f):
        # Columns 2 and 3 of each tab-separated row hold the two gene IDs.
        gene1, gene2 = line.split("\t")[2:4]
        for gene in (gene1, gene2):
            if gene in numbd:
                continue
            numbd[gene] = str(c)
            gene_list.append(gene)
            c += 1

In [None]:
import tqdm

# Paths
input_path = "funcoup5.tab"
output_path = "funcoup_graph.tsv"

# Rewrite the FunCoup network as a tab-separated edge list
# (<index of gene1>, <index of gene2>, <score>), using the indices from numbd.
with open(input_path, "r") as input_file, open(output_path, "w") as output_file:
    next(input_file)  # Skip header
    for line in tqdm.tqdm(input_file):
        fields = line.strip().split("\t")
        # Column 0 is used as the edge score; columns 2 and 3 are the gene IDs.
        score, gene1, gene2 = fields[0], fields[2], fields[3]
        output_file.write(f"{numbd[gene1]}\t{numbd[gene2]}\t{score}\n")


In [None]:
from pyrwr.rwr import RWR

# Build the RWR model from the FunCoup edge list, read as an undirected graph
rwr = RWR()
rwr.read_graph("./funcoup_graph.tsv", "undirected")

# Smoke test: run RWR from node 0 with the parameter settings that the later
# cells reuse, and display the resulting score vector
seed = 0
result = rwr.compute(
    seed,
    c=0.15,
    epsilon=1e-9,
    max_iters=100,
    device="cpu",
)
result

In [None]:
# Calculate RWR similarity for comparison
import pandas as pd
import numpy as np
symbols = ["PLK4","SASS6","CEP152","CEP192","CEP63","PCNT"] #For demonstration

# For each query symbol: seed RWR at the symbol's graph node and save the
# resulting per-gene scores to ../result/funcoup_rwr/<symbol>.csv
for symbol in symbols:
    ensg = sym_to_ensg[symbol]
    seed = numbd[ensg]  # node index (stored as a string)
    result = rwr.compute(int(seed), c=0.15, epsilon=1e-9, max_iters=100, device="cpu")

    # Genes with no entry in ensg_to_sym get None and are removed by dropna()
    gene_symbols = [ensg_to_sym.get(gene) for gene in gene_list]
    df = pd.DataFrame({"Symbol": gene_symbols, "Score": result, "ENSG": gene_list})
    df.dropna().to_csv(f"../result/funcoup_rwr/{symbol}.csv", index=None)

#Evaluation is performed in Evaluation.ipynb

# STRING

Comparison is performed using the code in Evaluation.ipynb

In [None]:
import tqdm

# Give every protein ID in the STRING links file a consecutive integer index
# (stored as a string), numbered in order of first appearance.
protein_list = []  # index -> protein ID
numbd2 = {}        # protein ID -> index (as str)
c = 0

with open("./protein.links.v10.txt", "r") as f:
    next(f)  # Skip header
    for line in tqdm.tqdm(f):
        # The first two space-separated fields are the two protein IDs.
        protein1, protein2 = line.strip().split(" ")[:2]
        for protein in (protein1, protein2):
            if protein in numbd2:
                continue
            numbd2[protein] = str(c)
            protein_list.append(protein)
            c += 1

In [None]:
import tqdm

#Paths
input_path = "./protein.links.v10.txt"
output_path = "string_graph.tsv"

# Rewrite the STRING links as a tab-separated edge list
# (<index of protein1>, <index of protein2>, <score>), using the indices from numbd2.
with open(input_path, "r") as input_file, open(output_path, "w") as output_file:
    next(input_file)  # Skip header
    for line in tqdm.tqdm(input_file):
        # Each row must have exactly three space-separated fields.
        gene1, gene2, score = line.strip().split(" ")
        edge = (numbd2[gene1], numbd2[gene2], score)
        output_file.write("\t".join(edge) + "\n")

In [None]:
from pyrwr.rwr import RWR

# Build the RWR model from the STRING edge list, read as an undirected graph
rwr2 = RWR()
rwr2.read_graph("./string_graph.tsv", "undirected")

# Smoke test: run RWR from node 0 with the parameter settings that the later
# cells reuse, and display the resulting score vector
seed = 0
result = rwr2.compute(
    seed,
    c=0.15,
    epsilon=1e-9,
    max_iters=100,
    device="cpu",
)
result

In [None]:
# Calculate RWR similarity for comparison

import pandas as pd
import numpy as np
symbols = ["PLK4","SASS6","CEP152","CEP192","PCNT"] #For demonstration

# For each query symbol: seed RWR at the symbol's STRING node and save the
# resulting per-protein scores to ../result/string_rwr/<symbol>.csv
#
# NOTE: the per-symbol CSVs are the only outputs of this cell. The loop is
# deliberately NOT wrapped in `open(output_path, "w")`: that handle was never
# written to, and when the cells run in order output_path still names the
# STRING edge list ("string_graph.tsv"), so opening it for writing only
# truncated that file.
for symbol in symbols:
    ensp = sym_to_ensp[symbol]
    # Node IDs in the STRING file carry a "9606." prefix in front of the ENSP ID.
    seed = numbd2["9606."+ensp]
    result = rwr2.compute(int(seed), c=0.15, epsilon=1e-9, max_iters=100, device="cpu")

    df = pd.DataFrame({"ENSP":protein_list,"Score":result})
    # prot[5:] drops the 5-character "9606." prefix before the symbol lookup;
    # proteins with no entry in ensp_to_sym get None and are removed by dropna().
    df["Symbol"] = [ensp_to_sym[prot[5:]] if prot[5:] in ensp_to_sym else None for prot in protein_list]
    df = df.reindex(columns=['Symbol', 'Score', 'ENSP'])
    df.dropna().to_csv(f"../result/string_rwr/{symbol}.csv",index=None)

#Evaluation is performed in Evaluation.ipynb

# For training LEXAS-plus (with cutoffs)

In [None]:
# Calculate RWR similarity for preparing data for training LEXAS-plus
import numpy as np
output_path = "funcoup_rwr.tsv"
cutoff_score = 0.001  # Use a threshold due to vast number of gene combinations

# Write one "<seed gene>\t<other gene>\t<score*100 rounded to 3 decimals>" row for
# every gene whose RWR score from the seed exceeds the cutoff.
with open(output_path, "w") as f:
    for ensg, seed in numbd.items():
        result = rwr.compute(int(seed), c=0.15, epsilon=1e-9, max_iters=100, device="cpu")

        for n, score in enumerate(result):
            # Keep only scores above the cutoff ...
            if not (score > cutoff_score):
                continue
            # ... and skip the seed gene itself (self-relation).
            if ensg == gene_list[n]:
                continue
            f.write(f"{ensg}\t{gene_list[n]}\t{round(score*100,3)}\n")

        break # This is for testing, processing just one gene. Remove for processing all genes.

In [None]:
# Calculate RWR similarity
import numpy as np

output_path = "string_rwr.tsv"
cutoff_score = 0.0005  # Use a threshold due to vast number of gene combinations

# Write one "<seed protein>\t<other protein>\t<score*100 rounded to 3 decimals>"
# row for every protein whose RWR score from the seed exceeds the cutoff.
with open(output_path, "w") as f:
    for ensp, seed in numbd2.items():
        # Seeds come from numbd2 (STRING indices), so the STRING model rwr2 must be
        # used here; the FunCoup model `rwr` has a different node numbering.
        result = rwr2.compute(int(seed), c=0.15, epsilon=1e-9, max_iters=100, device="cpu")

        # Filter by score, exclude self-relations, and write to file.
        # Position n in the result is looked up in protein_list (index -> STRING
        # protein ID), and the self-relation check compares against the current
        # seed protein `ensp`.
        for n, score in enumerate(result):
            if score > cutoff_score and ensp != protein_list[n]:
                f.write(f"{ensp}\t{protein_list[n]}\t{round(score*100,3)}\n")

        break # This is for testing, processing just one gene. Remove for processing all genes.

# STRING-raw and FunCoup-raw
Evaluation is performed in Evaluation.ipynb

In [None]:
# Nested lookup of raw FunCoup scores: funcoup5_dict[symbol_a][symbol_b] = score.
# Every symbol in sym_to_ensg starts with its own empty inner dictionary.
funcoup5_dict = {symbol: dict() for symbol in sym_to_ensg}
with open("funcoup5.tab", "r") as f:
    next(f)  # Skip header
    for line in tqdm.tqdm(f):
        fields = line.strip().split("\t")
        # Column 0 is used as the score; columns 2 and 3 are the gene IDs.
        score, gene1, gene2 = fields[0], fields[2], fields[3]
        # Skip pairs where either gene has no symbol in ensg_to_sym.
        if gene1 not in ensg_to_sym or gene2 not in ensg_to_sym:
            continue
        symbol1, symbol2 = ensg_to_sym[gene1], ensg_to_sym[gene2]
        # Store the score in both directions; it is kept as the string read
        # from the file.
        funcoup5_dict[symbol1][symbol2] = score
        funcoup5_dict[symbol2][symbol1] = score

In [None]:
symbols = ["PLK4","SASS6","CEP152","CEP192","PCNT"]
# Save each query symbol's raw FunCoup partners and scores to
# ../result/funcoup_raw/<symbol>.csv
for symbol in symbols:
    partners = funcoup5_dict[symbol]
    df = pd.DataFrame({
        "Symbol": list(partners.keys()),
        "Score": list(partners.values()),
    })
    df.dropna().to_csv(f"../result/funcoup_raw/{symbol}.csv", index=None)

In [None]:
# Nested lookup of raw STRING scores: string_dict[symbol_a][symbol_b] = score.
# Every symbol in sym_to_ensp starts with its own empty inner dictionary.
string_dict = {symbol: dict() for symbol in sym_to_ensp}
with open("./protein.links.v10.txt", "r") as f:
    next(f)  # Skip header
    for line in tqdm.tqdm(f):
        # Each row must have exactly three space-separated fields.
        gene1, gene2, score = line.strip().split(" ")
        # Drop the first 5 characters of each ID before the symbol lookup
        # (elsewhere in this notebook the node IDs are built as "9606." + ENSP).
        ensp1, ensp2 = gene1[5:], gene2[5:]
        # Skip pairs where either protein has no symbol in ensp_to_sym.
        if ensp1 not in ensp_to_sym or ensp2 not in ensp_to_sym:
            continue
        symbol1, symbol2 = ensp_to_sym[ensp1], ensp_to_sym[ensp2]
        # Store the score in both directions; it is kept as the string read
        # from the file.
        string_dict[symbol1][symbol2] = score
        string_dict[symbol2][symbol1] = score

In [None]:
symbols = ["PLK4","SASS6","CEP152","CEP192","PCNT"]
# Save each query symbol's raw STRING partners and scores to
# ../result/string_raw/<symbol>.csv
for symbol in symbols:
    partners = string_dict[symbol]
    df = pd.DataFrame({
        "Symbol": list(partners.keys()),
        "Score": list(partners.values()),
    })
    df.dropna().to_csv(f"../result/string_raw/{symbol}.csv", index=None)