Network Calibration of Yeast

In [1]:
import numpy as np
import pandas as pd
import calibration
import matplotlib.pyplot as plt
import networkx as nx
import math
import time
import copy
from itertools import combinations
import random
import os
from scipy import stats
import sys
from matplotlib import cm
from tqdm import trange
import seaborn as sns


# Import Data

In [None]:

# ---------------------------------------------------------------------------
# Load every yeast network used in this notebook.
# Each source is read into a DataFrame and its pair columns are handed to
# calibration.sort_elist to obtain the "<name>Elist" variable; several sources
# additionally get a "<name>Dict" built by calibration.formatting.
# NOTE(review): sort_elist / edgelist_to_neighborhood are project helpers whose
# exact semantics are not visible here — see the calibration package.
# ---------------------------------------------------------------------------

# ABBI-21 table (tab-separated); the listed annotation columns are dropped.
df_abbi = pd.read_csv("datasets/yeast/ito_uetz_ccsb_yeri/ABBI-21.tsv",sep = "\t").drop(columns=["gene_name_a","gene_name_b","Zone","GI_PCC","CS_PCC","GE_PCC"])
# Per-screen subsets: rows flagged True for the screen, first two columns only.
df_ito = df_abbi[df_abbi["Ito-core"]==True].iloc[:,:2]
df_uetz = df_abbi[df_abbi["Uetz-screen"]==True].iloc[:,:2]
df_ccsb = df_abbi[df_abbi["CCSB-YI1"]==True].iloc[:,:2]
df_yeri = df_abbi[df_abbi["YeRI"]==True].iloc[:,:2]
# NOTE(review): this file has a .tsv extension but is parsed with a single-space
# separator — confirm the delimiter matches the file on disk.
df_litbm = pd.read_csv("datasets/yeast/Lit-BM-20/Lit-BM-20.tsv",sep = " ").drop(columns=["gene_name_a","gene_name_b"]).iloc[:,:2]
df_tarassov = pd.read_csv("datasets/yeast/Tarassov/tarassovS3.csv").drop(columns=["NAME1","NAME2"])
df_prs = pd.read_csv("datasets/yeast/scPRS-v2.tsv",sep="\t")
df_rrs = pd.read_csv("datasets/yeast/scRRS-v2.tsv",sep="\t")

# Edge lists for the tables loaded above (abbiElist uses the first two columns
# of the full, unfiltered ABBI-21 table).
prsElist = calibration.sort_elist(np.array(df_prs))
rrsElist = calibration.sort_elist(np.array(df_rrs))
itoElist = calibration.sort_elist(np.array(df_ito))
uetzElist = calibration.sort_elist(np.array(df_uetz))
ccsbElist = calibration.sort_elist(np.array(df_ccsb))
abbiElist = calibration.sort_elist(np.array(df_abbi.iloc[:, :2]))
yeriElist = calibration.sort_elist(np.array(df_yeri))
litbmElist = calibration.sort_elist(np.array(df_litbm))
tarassovElist = calibration.sort_elist(np.array(df_tarassov))

# Neighborhood structures built from the same edge lists.
prsDict = calibration.formatting.edgelist_to_neighborhood(prsElist)
rrsDict = calibration.formatting.edgelist_to_neighborhood(rrsElist)
itoDict = calibration.formatting.edgelist_to_neighborhood(itoElist)
uetzDict = calibration.formatting.edgelist_to_neighborhood(uetzElist)
ccsbDict = calibration.formatting.edgelist_to_neighborhood(ccsbElist)
abbiDict = calibration.formatting.edgelist_to_neighborhood(abbiElist)
yeriDict = calibration.formatting.edgelist_to_neighborhood(yeriElist)
litbmDict = calibration.formatting.edgelist_to_neighborhood(litbmElist)
tarassovDict = calibration.formatting.edgelist_to_neighborhood(tarassovElist)

# Genetic-interaction similarity files. df_Gsim is rebound for the second file,
# so after this section it holds the NxN table only.
df_Gsim = pd.read_csv("datasets/yeast/genetic_interaction/similarity_ExE_PCC02.txt")
GsimEEElist = calibration.sort_elist(np.array(df_Gsim))
df_Gsim = pd.read_csv("datasets/yeast/genetic_interaction/similarity_NxN_PCC02.txt")
GsimNNElist = calibration.sort_elist(np.array(df_Gsim))
df = pd.read_csv('datasets/yeast/genetic_interaction/GI-PSN_02.txt')
gipsnElist = calibration.sort_elist(df.values)

# Filtered genetic-interaction files (ExE / NxN, pos / neg). df_GI is reused for
# each file and ends up holding the NxN-neg table.
df_GI = pd.read_csv("datasets/yeast/genetic_interaction/ExE_inter_filter_pos.txt")
GIEEposElist = calibration.sort_elist(np.array(df_GI))
df_GI = pd.read_csv("datasets/yeast/genetic_interaction/ExE_inter_filter_neg.txt")
GIEEnegElist = calibration.sort_elist(np.array(df_GI))
df_GI = pd.read_csv("datasets/yeast/genetic_interaction/NxN_inter_filter_pos.txt")
GINNposElist = calibration.sort_elist(np.array(df_GI))
df_GI = pd.read_csv("datasets/yeast/genetic_interaction/NxN_inter_filter_neg.txt")
GINNnegElist = calibration.sort_elist(np.array(df_GI))

# Co-complex table (tab-separated); only the first two columns are used.
df_complex = pd.read_csv("datasets/yeast/co-complex/co_complex_costanzo_2016.txt",sep="\t")
complexElist = calibration.sort_elist(np.array(df_complex.iloc[:,:2]))

# BioGRID PPI.
df_biogrid = pd.read_csv("datasets/yeast/BioGRID_PPI/biogrid_PPI.txt")
biogridElist = calibration.sort_elist(df_biogrid.values)
biogridDict = calibration.formatting.edgelist_to_neighborhood(biogridElist)

# GRIFn file (registered under the "co-annotation" label further down).
df_grifn = pd.read_csv("datasets/yeast/GRIFn/GRIFn_pos.txt")
grifnElist = calibration.sort_elist(df_grifn.values)

# Co-localization (threshold1 file).
df_coloc = pd.read_csv("datasets/yeast/localization/co-localization_threshold1.txt")
colocElist = calibration.sort_elist(df_coloc.values)

# I3D: keep the two ORF-name columns, then remove pairs whose two members are equal.
df_i3d = pd.read_csv("datasets/yeast/I3D-exp-20/I3D-exp-20.tsv",sep='\t')[['orf_name_a','orf_name_b']]
i3dElist = calibration.sort_elist(df_i3d.values)
i3dElist=[pair for pair in i3dElist if pair[0]!=pair[1]]
i3dDict = calibration.formatting.edgelist_to_neighborhood(i3dElist)
# Co-localization (biogrid file).
df_coloc_bio = pd.read_csv("datasets/yeast/localization/co-localization_biogrid.txt")
colocbioElist = calibration.sort_elist(df_coloc_bio.values)

# Co-expression: the first 5000 entries of the sorted edge list are kept.
# NOTE(review): the slice is applied after sort_elist, so which 5000 pairs survive
# depends on that helper's ordering — confirm this is the intended "top 5000".
df_coexpCox = pd.read_csv("datasets/yeast/expression/coxpress/co-express_top5000.txt")
coexpCoxElist = calibration.sort_elist(df_coexpCox.values)[:5000]
# NOTE(review): every other *Dict here uses edgelist_to_neighborhood; this one uses
# edgelist_to_adjacency — confirm the different structure is intentional.
coexpCoxDict = calibration.formatting.edgelist_to_adjacency(coexpCoxElist)

# STRING physical links, one file per score band named in the file name.
df_string = pd.read_csv("datasets/yeast/STRING/STRING_physical_threshold900.txt")
stringElist = calibration.sort_elist(df_string.values)

df_stringLC = pd.read_csv('datasets/yeast/STRING/STRING_physical_threshold150_400.txt')
stringLCElist = calibration.sort_elist(df_stringLC.values)

df_stringMC = pd.read_csv('datasets/yeast/STRING/STRING_physical_threshold400_700.txt')
stringMCElist = calibration.sort_elist(df_stringMC.values)

df_stringHC = pd.read_csv('datasets/yeast/STRING/STRING_physical_threshold700_900.txt')
stringHCElist = calibration.sort_elist(df_stringHC.values)
# Sys-NB-06 (pre-processed file).
df_sysnb = pd.read_csv("datasets/yeast/sys-nb-06/Sys-NB-06_processed.txt")
sysnbElist = calibration.sort_elist(df_sysnb.values)

# AlphaFold + RoseTTAFold (pre-processed file).
df_alpha = pd.read_csv('datasets/yeast/AlphaFold_RoseTTAFold/AlphaFoldRoseTTAFold_processed.txt')
alphaElist = calibration.sort_elist(df_alpha.values)


# All by all network of networks

In [16]:
# Networks that take part in the all-by-all comparison, in processing order.
# Every entry must be a key of dictElist below.
labels = [
    "YeRI",
    "Ito-core",
    "Uetz-screen",
    "CCSB-YI1",
    "Lit-BM-20",
    "Tarassov",
    "Sys-NB-06",
    "PRS",
    "RRS",
    "BioGRID",
    "STRING(HsC)",
    'STRING(HC)',
    'STRING(MC)',
    "STRING(LC)",
    "AF+RF",
    "co-complex",
    "co-expression",
    "co-localization",
    "co-annotation",
    "I3D",
    "GI-PSN",
    "GI_ExE_pos",
    "GI_ExE_neg",
    "GI_NxN_pos",
    "GI_NxN_neg",
]

# Display name -> edge list. This registry holds more networks than `labels`
# ("co-localization(SGD)", "Gsim_ExE" and "Gsim_NxN" are registered but not listed above).
dictElist = {
    "YeRI": yeriElist,
    "Ito-core": itoElist,
    "Uetz-screen": uetzElist,
    "CCSB-YI1": ccsbElist,
    "Lit-BM-20": litbmElist,
    "Tarassov": tarassovElist,
    "Sys-NB-06": sysnbElist,
    "PRS": prsElist,
    "RRS": rrsElist,
    "BioGRID": biogridElist,
    "STRING(HsC)": stringElist,
    'STRING(HC)': stringHCElist,
    'STRING(MC)': stringMCElist,
    "STRING(LC)": stringLCElist,
    "AF+RF": alphaElist,
    "co-complex": complexElist,
    "co-expression": coexpCoxElist,
    "co-localization(SGD)": colocElist,
    "co-localization": colocbioElist,
    "co-annotation": grifnElist,
    "I3D": i3dElist,
    "Gsim_ExE": GsimEEElist,
    "Gsim_NxN": GsimNNElist,
    "GI-PSN": gipsnElist,
    "GI_ExE_pos": GIEEposElist,
    "GI_ExE_neg": GIEEnegElist,
    "GI_NxN_pos": GINNposElist,
    "GI_NxN_neg": GINNnegElist,
}

## Statistics

In [18]:
%%time
res = []
for label in labels:
    test = dictElist[label]
    res.append([label,len(set(np.array(test).flatten())),len(test)])
df_stat = pd.DataFrame(res,columns=['dataset','nodes','links'])
n = df_stat['nodes']
l = df_stat['links']
density = l/(n*(n-1)/2)
df_stat['density'] = density
df_stat.to_csv('results/yeast_dataset_overview.txt',index=False)
df_stat

CPU times: user 970 ms, sys: 38.3 ms, total: 1.01 s
Wall time: 1.04 s


Unnamed: 0,dataset,nodes,links,density
0,YeRI,1346,1880,0.002077
1,Ito-core,766,738,0.002519
2,Uetz-screen,747,607,0.002179
3,CCSB-YI1,1206,1605,0.002209
4,Lit-BM-20,2666,5056,0.001423
5,Tarassov,1078,2534,0.004365
6,Sys-NB-06,3067,12968,0.002758
7,PRS,149,108,0.009795
8,RRS,373,198,0.002854
9,BioGRID,5992,130999,0.007298


## All by all - optimize pos

In [None]:
# Previously computed results used for resuming. The frame is created empty here,
# so nothing is skipped unless it is replaced by a loaded table.
df_preRes = pd.DataFrame()
# A pair counts as tested in either orientation.
testedPairs = [(row[0], row[1]) for row in df_preRes.values]
testedPairs += [(row[1], row[0]) for row in df_preRes.values]

allPairs = list(combinations(labels, 2))
results = []
s = time.time()  # start timestamp (not read inside this cell)

for idx in trange(len(allPairs)):
    net_a, net_b = allPairs[idx]
    if (net_a, net_b) in testedPairs:
        continue  # this pair has already been calculated

    # Run the optimisation in both directions: (A against B) and (B against A).
    pos1_mean, pos1_sigma, cur_iter1 = calibration.random_subnetwork.optimize_pos(
        dictElist[net_a], dictElist[net_b])
    pos2_mean, pos2_sigma, cur_iter2 = calibration.random_subnetwork.optimize_pos(
        dictElist[net_b], dictElist[net_a])

    results.append([
        net_a, net_b,
        pos1_mean, pos1_sigma, cur_iter1,
        pos2_mean, pos2_sigma, cur_iter2,
    ])

In [None]:
# Tabulate the positive-benchmark results and write them to disk.
# Floats are written with two decimals; the DataFrame index is written as well
# (pandas default), so the file starts with an unnamed index column.
pos_columns = [
    'A', 'B',
    'pos1_mean', 'pos1_sigma', 'cur_iter1',
    'pos2_mean', 'pos2_sigma', 'cur_iter2',
]
df_results = pd.DataFrame(results, columns=pos_columns)
df_results.to_csv('results/all_by_all_pos.txt', float_format='%.2f')

## All by all - optimize neg

In [None]:
# Previously computed results used for resuming. The frame is created empty here,
# so nothing is skipped unless it is replaced by a loaded table.
df_preRes = pd.DataFrame()
# A pair counts as tested in either orientation.
testedPairs = [(row[0], row[1]) for row in df_preRes.values]
testedPairs += [(row[1], row[0]) for row in df_preRes.values]

allPairs = list(combinations(labels, 2))
results = []
s = time.time()  # start timestamp (not read inside this cell)

for idx in trange(len(allPairs)):
    net_a, net_b = allPairs[idx]
    if (net_a, net_b) in testedPairs:
        continue  # this pair has already been calculated

    # Run the optimisation in both directions: (A against B) and (B against A).
    neg1_mean, neg1_sigma, cur_iter1 = calibration.random_network.optimize_neg(
        dictElist[net_a], dictElist[net_b])
    neg2_mean, neg2_sigma, cur_iter2 = calibration.random_network.optimize_neg(
        dictElist[net_b], dictElist[net_a])

    results.append([
        net_a, net_b,
        neg1_mean, neg1_sigma, cur_iter1,
        neg2_mean, neg2_sigma, cur_iter2,
    ])

In [None]:
# Tabulate the negative-benchmark results and write them to disk.
# Floats are written with two decimals; the DataFrame index is written as well
# (pandas default), so the file starts with an unnamed index column.
neg_columns = [
    'A', 'B',
    'neg1_mean', 'neg1_sigma', 'cur_iter1',
    'neg2_mean', 'neg2_sigma', 'cur_iter2',
]
df_results = pd.DataFrame(results, columns=neg_columns)
df_results.to_csv('results/all_by_all_neg.txt', float_format='%.2f')

## Combine results

In [None]:
import scipy.stats  # binds `scipy` and guarantees the `stats` submodule is loaded


def _read_benchmark(path):
    """Read a benchmark table, discarding any header-less index column.

    The result files may have been saved with the DataFrame index, which
    pandas reads back as an 'Unnamed: N' column; such columns carry no data
    and would otherwise be duplicated by the merge below.
    """
    table = pd.read_csv(path)
    return table.loc[:, ~table.columns.str.startswith('Unnamed')]


def _z_score(obs, mean, sigma):
    """Signed z-score of `obs` against (mean, sigma); NaN where sigma is 0."""
    return (obs - mean) / sigma.where(sigma != 0)


def _use_first(z_first, z_second):
    """Mask that is True where the first benchmark should be selected.

    The first benchmark wins when its |z| is strictly smaller, or when it is
    the only one with a defined z-score. All other cases (ties, first
    undefined, both undefined) fall back to the second benchmark.
    """
    closer = z_first.abs() < z_second.abs()
    only_first_defined = z_first.notna() & z_second.isna()
    return closer | only_first_defined


df_pos = _read_benchmark('results/all_by_all_pos.txt')
df_neg = _read_benchmark('results/all_by_all_neg.txt')
df = pd.merge(df_pos, df_neg, on=['A', 'B'])

# The selection below compares every benchmark against an observed value.
# NOTE(review): the cells that write the two result files do not produce an
# 'obs' column, so it has to be added to the tables before this cell is run.
if 'obs' not in df.columns:
    raise KeyError(
        "'obs' column not found after merging results/all_by_all_pos.txt and "
        "results/all_by_all_neg.txt; add the observed value for each (A, B) pair first"
    )

# select neg benchmark: the direction whose mean is closest to obs in sigma units
z_neg1 = _z_score(df['obs'], df['neg1_mean'], df['neg1_sigma'])
z_neg2 = _z_score(df['obs'], df['neg2_mean'], df['neg2_sigma'])
use_neg1 = _use_first(z_neg1, z_neg2)
df['selected_neg'] = df['neg1_mean'].where(use_neg1, df['neg2_mean'])
df['selected_neg_sigma'] = df['neg1_sigma'].where(use_neg1, df['neg2_sigma'])

# select pos benchmark: same rule
z_pos1 = _z_score(df['obs'], df['pos1_mean'], df['pos1_sigma'])
z_pos2 = _z_score(df['obs'], df['pos2_mean'], df['pos2_sigma'])
use_pos1 = _use_first(z_pos1, z_pos2)
df['selected_pos'] = df['pos1_mean'].where(use_pos1, df['pos2_mean'])
df['selected_pos_sigma'] = df['pos1_sigma'].where(use_pos1, df['pos2_sigma'])

score, score_sigma = calibration.cal_score(
    df['obs'],
    df['selected_neg'], df['selected_neg_sigma'],
    df['selected_pos'], df['selected_pos_sigma'],
)
df['score'] = score
df['score_sigma'] = score_sigma

# Upper-tail probability of obs under the selected negative benchmark.
neg_z = _z_score(df['obs'], df['selected_neg'], df['selected_neg_sigma'])
df['neg_p'] = scipy.stats.norm.sf(neg_z.to_numpy())

# Lower-tail probability of obs under the selected positive benchmark.
# cdf(z) is used directly instead of 1 - sf(z), which loses precision for small values.
pos_z = _z_score(df['obs'], df['selected_pos'], df['selected_pos_sigma'])
df['pos_p'] = scipy.stats.norm.cdf(pos_z.to_numpy())

# 1 when pos_p >= 0.05 and neg_p < 0.05, else 0.
# NaN p-values compare False and therefore yield 0.
df['pos_p_binary'] = ((df['pos_p'] >= 0.05) & (df['neg_p'] < 0.05)).astype(int)

df.round(3).to_csv("results/all_by_all_networks_no_instance.txt", index=False)