In [1]:
import pandas as pd
import numpy as np
import numpy.ma as ma
import pickle
from scipy.spatial import distance
from scipy import stats
import util
import visualization_util
import time

# Data Preprocessing

In [2]:
# Load the ProteomeHD table and pull out every column whose name contains 'Ratio'
# as a numeric matrix (one row per table row, one column per ratio column).
proteomeHD_df = pd.read_csv('./data_sources/ProteomeHD_v1_1.csv')
proteomeHD_feature_names = [name for name in proteomeHD_df.columns if 'Ratio' in name]
proteomeHD_feature_matrix = proteomeHD_df[proteomeHD_feature_names].to_numpy()
# Keep only proteins quantified (non-NaN) in at least 95 experiments
quantified_per_row = np.count_nonzero(~np.isnan(proteomeHD_feature_matrix), axis=1)
rows_to_keep = np.flatnonzero(quantified_per_row >= 95).tolist()
proteomeHD_df = proteomeHD_df.iloc[rows_to_keep]
# Rebuild the matrix and the gene-name vector from the filtered frame so both share its row order.
proteomeHD_feature_matrix = proteomeHD_df[proteomeHD_feature_names].to_numpy()
proteomeHD_gene_names = proteomeHD_df['Gene_names'].to_numpy()

# Pearson Correlation (Higher is better)

In [None]:
# Pairwise Pearson correlation between rows of the feature matrix: the matrix is
# transposed so each original row becomes a DataFrame column, and DataFrame.corr()
# correlates columns pairwise.
proteomeHD_pearson_corr = pd.DataFrame(proteomeHD_feature_matrix.T).corr().to_numpy()
# Cache the result; the context manager guarantees the file is flushed and closed.
with open("./pickle_files/proteomeHD_pearson_corr.p", "wb") as pickle_file:
    pickle.dump(proteomeHD_pearson_corr, pickle_file)

In [3]:
# Converts to csv for easier analysis in other software
# Reload the cached matrix through a context manager so the file handle is closed
# right after reading (only unpickle files produced by this notebook).
with open("./pickle_files/proteomeHD_pearson_corr.p", "rb") as pickle_file:
    proteomeHD_pearson_corr = pickle.load(pickle_file)
# Columns are labelled with gene names; rows are written without an index column.
proteomeHD_pearson_corr_mat = pd.DataFrame(columns=proteomeHD_gene_names,data=proteomeHD_pearson_corr)
proteomeHD_pearson_corr_mat.to_csv("./pairwise_csv/proteomeHD_pearson_corr.csv",index=False)

# Cosine Distance (scipy `distance.cosine` = 1 - cosine similarity; lower is better)

In [None]:
def calc_distance_matrix(feature_matrix,dist_func,shared_only=True):
    """Apply ``dist_func`` to every pair of rows and return the full square matrix.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        2-D array with one row per item. With ``shared_only=True`` it must be a
        float array because NaN is used to detect missing values.
    dist_func : callable
        ``dist_func(g1, g2) -> scalar``. It is evaluated once per unordered pair
        (``i <= j``) and the value is stored at both ``[i, j]`` and ``[j, i]``,
        so it is expected to be symmetric. Its ``__name__`` is printed.
    shared_only : bool, default True
        If True, both rows are restricted to the positions where neither row is
        NaN before calling ``dist_func``; otherwise the raw rows are passed.

    Returns
    -------
    numpy.ndarray
        ``(n_rows, n_rows)`` float matrix with every cell populated.
    """
    print(f"Calculating {dist_func.__name__}...")
    start_time = time.time()
    n_rows = feature_matrix.shape[0]
    dist_mat = np.empty((n_rows, n_rows))
    # The NaN mask only depends on the input, so compute it once instead of per pair.
    observed = ~np.isnan(feature_matrix) if shared_only else None
    for i in range(n_rows):
        g1 = feature_matrix[i]
        for j in range(i, n_rows):
            g2 = feature_matrix[j]
            if shared_only:
                shared = observed[i] & observed[j]
                value = dist_func(g1[shared], g2[shared])
            else:
                value = dist_func(g1, g2)
            dist_mat[i, j] = value
            # Mirror into the lower triangle: np.empty leaves it uninitialised,
            # and that garbage would otherwise end up in the exported matrices.
            dist_mat[j, i] = value
        # Periodic progress report via the project helper (skipped for the very first rows).
        if i%200 == 1 and i!=1:
            util.calc_eta(start_time,i,n_rows)
    print("Done")
    return dist_mat

In [None]:
# Pairwise scipy cosine *distance* over the experiments shared by each protein pair
# (the variable and file names say "sim", but the stored value is a distance).
proteomeHD_cosine_sim = calc_distance_matrix(proteomeHD_feature_matrix,distance.cosine)
# Cache the result; the context manager guarantees the file is flushed and closed.
with open("./pickle_files/proteomeHD_cosine_sim.p", "wb") as pickle_file:
    pickle.dump(proteomeHD_cosine_sim, pickle_file)

In [4]:
# Reload the cached cosine matrix; the context manager closes the file handle right after reading.
with open("./pickle_files/proteomeHD_cosine_sim.p", "rb") as pickle_file:
    proteomeHD_cosine_sim = pickle.load(pickle_file)
# Export as a gene-labelled square CSV (columns named by gene, no index column).
proteomeHD_cosine_sim_mat = pd.DataFrame(columns=proteomeHD_gene_names,data=proteomeHD_cosine_sim)
proteomeHD_cosine_sim_mat.to_csv("./pairwise_csv/proteomeHD_cosine_sim.csv",index=False)

# Euclidean Distance (Lower is better)

In [None]:
def euclidean_wrapper(g1,g2):
    """Return the Euclidean norm of ``g1 - g2``, or NaN when ``g1`` is empty."""
    if len(g1) > 0:
        return np.linalg.norm(g1-g2)
    # No values to compare: report the distance as undefined.
    return float('NaN')
# Pairwise Euclidean distance over the experiments shared by each protein pair.
proteomeHD_euclidean_dist = calc_distance_matrix(proteomeHD_feature_matrix,euclidean_wrapper)
# Cache the result; the context manager guarantees the file is flushed and closed.
with open("./pickle_files/proteomeHD_euclidean_dist.p", "wb") as pickle_file:
    pickle.dump(proteomeHD_euclidean_dist, pickle_file)

In [5]:
# Reload the cached Euclidean matrix; the context manager closes the file handle right after reading.
with open("./pickle_files/proteomeHD_euclidean_dist.p", "rb") as pickle_file:
    proteomeHD_euclidean_dist = pickle.load(pickle_file)
# Export as a gene-labelled square CSV (columns named by gene, no index column).
proteomeHD_euclidean_dist_mat = pd.DataFrame(columns=proteomeHD_gene_names,data=proteomeHD_euclidean_dist)
proteomeHD_euclidean_dist_mat.to_csv("./pairwise_csv/proteomeHD_euclidean_dist.csv",index=False)

# Co-observed (Higher is better)

In [None]:
def coobserved_wrapper(g1,g2):
    """Return the number of entries in ``g1``; ``g2`` is accepted but not used.

    When called through ``calc_distance_matrix`` with its default
    ``shared_only=True``, ``g1`` has already been restricted to the positions
    observed in both rows, so the length is the co-observation count.
    """
    shared_count = len(g1)
    return shared_count
# For every protein pair, count the experiments in which both proteins have a value.
proteomeHD_coobserved_mat = calc_distance_matrix(proteomeHD_feature_matrix,coobserved_wrapper)
# Cache the result; the context manager guarantees the file is flushed and closed.
with open("./pickle_files/proteomeHD_coobserved_mat.p", "wb") as pickle_file:
    pickle.dump(proteomeHD_coobserved_mat, pickle_file)

In [6]:
# Reload the cached co-observation matrix; the context manager closes the file handle right after reading.
with open("./pickle_files/proteomeHD_coobserved_mat.p", "rb") as pickle_file:
    proteomeHD_coobserved_mat = pickle.load(pickle_file)
# Export as a gene-labelled square CSV (columns named by gene, no index column).
proteomeHD_coobserved_mat_mat = pd.DataFrame(columns=proteomeHD_gene_names,data=proteomeHD_coobserved_mat)
proteomeHD_coobserved_mat_mat.to_csv("./pairwise_csv/proteomeHD_coobserved_mat.csv",index=False)

# Spearman Correlation (Higher is better)

In [None]:
def spearman_wrapper(g1,g2):
    """Return the first element of ``scipy.stats.spearmanr(g1, g2)`` (the correlation), dropping the rest."""
    spearman_result = stats.spearmanr(g1,g2)
    return spearman_result[0]
# Pairwise Spearman correlation over the experiments shared by each protein pair.
proteomeHD_spearman_mat = calc_distance_matrix(proteomeHD_feature_matrix,spearman_wrapper)
# Cache the result; the context manager guarantees the file is flushed and closed.
with open("./pickle_files/proteomeHD_spearman_corr.p", "wb") as pickle_file:
    pickle.dump(proteomeHD_spearman_mat, pickle_file)

In [7]:
# Reload the cached Spearman matrix; the context manager closes the file handle right after reading.
with open("./pickle_files/proteomeHD_spearman_corr.p", "rb") as pickle_file:
    proteomeHD_spearman_corr = pickle.load(pickle_file)
# Export as a gene-labelled square CSV (columns named by gene, no index column).
proteomeHD_spearman_corr_mat = pd.DataFrame(columns=proteomeHD_gene_names,data=proteomeHD_spearman_corr)
proteomeHD_spearman_corr_mat.to_csv("./pairwise_csv/proteomeHD_spearman_corr.csv",index=False)

# Write as pair list

In [None]:
def write_pair_list(names,matrix,col_names,file_path):
    """Write the strict upper triangle of ``matrix`` as ``(name_i, name_j, value)`` rows.

    Parameters
    ----------
    names : sequence
        One label per matrix row/column; must have the same length as ``matrix``.
    matrix : 2-D array supporting ``matrix[i, j]`` indexing
        Pairwise values; only entries with ``i < j`` are written.
    col_names : list
        Passed to ``util.append_to_csv`` on the first call with no rows
        (presumably to write the header line; confirm against ``util``).
    file_path : str
        Destination passed through to ``util.append_to_csv``.

    Rows are buffered and handed to ``util.append_to_csv`` in batches (roughly
    every 100 outer rows and at the last row) to limit memory use.
    """
    assert len(names) == len(matrix)
    print("Start writing pair list")
    start_time = time.time()
    util.append_to_csv(file_path,col_names,[])
    n_items = len(names)
    to_write = []
    for i in range(n_items):
        # Pairs (i, j) with j > i only: the diagonal and the lower triangle are skipped.
        to_write.extend((names[i],names[j],matrix[i,j]) for j in range(i+1,n_items))
        if (i%100==1 or i==n_items-1) and i!=1:
            util.append_to_csv(file_path,None,to_write)
            to_write.clear()
            util.calc_eta(start_time,i,n_items)
    # With exactly two names the last index is 1, which the i!=1 guard above excludes,
    # so the single pair would never be flushed. Write whatever is still buffered.
    if to_write:
        util.append_to_csv(file_path,None,to_write)
    print("Done writing pair list")

In [None]:
# Load and write proteomeHD pearson corr matrix
# The context manager closes the pickle file handle right after reading.
with open("./pickle_files/proteomeHD_pearson_corr.p", "rb") as pickle_file:
    proteomeHD_pearson_corr = pickle.load(pickle_file)
# Stored as a distance (1 - r), so the 'pearson' column in the pair list is lower-is-better.
proteomeHD_pearson_corr_dist = 1 - proteomeHD_pearson_corr
write_pair_list(proteomeHD_gene_names,proteomeHD_pearson_corr_dist,['gene_1','gene_2','pearson'],"./dist_csv/proteomeHD_pearson_corr_dist.csv")

In [None]:
# Load and write proteomeHD cosine sim matrix
# The context manager closes the pickle file handle right after reading.
with open("./pickle_files/proteomeHD_cosine_sim.p", "rb") as pickle_file:
    proteomeHD_cosine_sim = pickle.load(pickle_file)
write_pair_list(proteomeHD_gene_names,proteomeHD_cosine_sim,['gene_1','gene_2','cosine'],"./dist_csv/proteomeHD_cosine_sim.csv")

In [None]:
# Load and write proteomeHD euclidean dist matrix
# The context manager closes the pickle file handle right after reading.
with open("./pickle_files/proteomeHD_euclidean_dist.p", "rb") as pickle_file:
    proteomeHD_euclidean_dist = pickle.load(pickle_file)
write_pair_list(proteomeHD_gene_names,proteomeHD_euclidean_dist,['gene_1','gene_2','euclidean'],"./dist_csv/proteomeHD_euclidean_dist.csv")

In [None]:
# Load and write proteomeHD coobserved dist matrix
# The context manager closes the pickle file handle right after reading.
with open("./pickle_files/proteomeHD_coobserved_mat.p", "rb") as pickle_file:
    proteomeHD_coobserved_dist = pickle.load(pickle_file)
write_pair_list(proteomeHD_gene_names,proteomeHD_coobserved_dist,['gene_1','gene_2','coobserved'],"./dist_csv/proteomeHD_coobserved_dist.csv")

In [None]:
# Load and write proteomeHD spearman dist matrix
# The context manager closes the pickle file handle right after reading.
with open("./pickle_files/proteomeHD_spearman_corr.p", "rb") as pickle_file:
    proteomeHD_spearman_dist = pickle.load(pickle_file)
# NOTE: despite the "_dist" names, the values are written unchanged from the Spearman pickle
# (no 1 - r conversion is applied here).
write_pair_list(proteomeHD_gene_names,proteomeHD_spearman_dist,['gene_1','gene_2','spearman'],"./dist_csv/proteomeHD_spearman_corr_dist.csv")

# Create ProteomeHD-String Validation file

In [None]:
# Create ProteomeHD-STRING validation file
# The pair order is taken from the Pearson pair list; each pair is labelled 1 when
# gene_2 appears under gene_1 in the 'relations' mapping of the validation JSON, else 0.
start_time = time.time()
proteomeHD_pearson_corr_csv = pd.read_csv("./dist_csv/proteomeHD_pearson_corr_dist.csv")
proteomeHD_g1s = proteomeHD_pearson_corr_csv['gene_1']
proteomeHD_g2s = proteomeHD_pearson_corr_csv['gene_2']
string_validation_json = util.read_json_from("./data_sources/StringDB_combined_700_validation.json")
n_pairs = len(proteomeHD_g1s)
string_validation_vec = np.zeros(n_pairs)
for i, (g1, g2) in enumerate(zip(proteomeHD_g1s, proteomeHD_g2s)):
    relations = string_validation_json['relations']
    if g1 in relations:
        if g2 in relations[g1]:
            string_validation_vec[i] = 1
    # Periodic progress report via the project helper.
    if i != 1 and i % 100000 == 1:
        util.calc_eta(start_time, i, n_pairs)
proteome_string_validation_df = pd.DataFrame({
    'gene_1': proteomeHD_g1s,
    'gene_2': proteomeHD_g2s,
    'string': string_validation_vec,
})
proteome_string_validation_df.to_csv("./data_sources/ProteomeHD_StringDB_700_validation.csv",index=False)

# Check ROC Performance

In [None]:
# Create a df with all scores included for visualization
# Every score column is oriented so that a higher value means a stronger association.
# All columns are assumed to follow the same pair order as the pair-list CSVs
# (NOTE(review): the coregulation file comes from outside this notebook; confirm its ordering).

# The CSV stores Pearson *distance* (1 - r). Undefined pairs (NaN) are filled with the
# maximum distance 2 so they map to r = -1, the worst score. The previous fill of -1
# mapped them to r = 2, ranking undefined pairs above every real correlation.
proteomeHD_pearson_corr = 1 - np.nan_to_num(pd.read_csv("./dist_csv/proteomeHD_pearson_corr_dist.csv")['pearson'].to_numpy(),nan=2)
# Cosine distance -> similarity; undefined pairs get distance 1, i.e. similarity 0.
proteomeHD_cosine_dist = 1 - np.nan_to_num(pd.read_csv("./dist_csv/proteomeHD_cosine_sim.csv")['cosine'].to_numpy(),nan=1)
# Inverse Euclidean distance; the epsilon avoids division by zero and undefined pairs
# get a huge distance, i.e. a score close to 0.
proteomeHD_euclidean_dist_inverted = 1 / (np.nan_to_num(pd.read_csv("./dist_csv/proteomeHD_euclidean_dist.csv")['euclidean'].to_numpy(),nan=1e10) + 1e-10)
proteomeHD_coobserved = pd.read_csv("./dist_csv/proteomeHD_coobserved_dist.csv")['coobserved'].to_numpy()
# Spearman values are stored as correlations; undefined pairs get the worst value -1.
proteomeHD_spearman_corr = np.nan_to_num(pd.read_csv("./dist_csv/proteomeHD_spearman_corr_dist.csv")['spearman'].to_numpy(),nan=-1)
proteomeHD_coregulation = pd.read_csv('./dist_csv/coregulation_scores_genes_sorted.csv')['coregulation_score'].to_numpy()
proteomeHD_string_validation = pd.read_csv("./data_sources/ProteomeHD_StringDB_700_validation.csv")['string'].to_numpy()
# Column names are kept as-is because the plotting cells refer to them; note that
# 'pearson_dist' and 'cosine_dist' hold the converted (higher-is-better) scores.
big_df = pd.DataFrame({
    'pearson_dist': proteomeHD_pearson_corr,
    'cosine_dist': proteomeHD_cosine_dist,
    'euclidean': proteomeHD_euclidean_dist_inverted,
    'coobserved': proteomeHD_coobserved,
    'spearman': proteomeHD_spearman_corr,
    'coregulation': proteomeHD_coregulation,
    'validation': proteomeHD_string_validation
})

In [None]:
# Run the project's ROC helper over the six score columns, with 'validation' as the
# label column (argument roles presumed from their names; confirm in visualization_util).
visualization_util.df_roc_analysis(
    big_df,
    ['pearson_dist', 'cosine_dist', 'euclidean', 'coobserved', 'spearman', 'coregulation'],
    'validation',
    'ROC Curve, ProteomeHD',
)

In [None]:
# Run the project's precision-recall helper over the six score columns, with 'validation'
# as the label column (argument roles presumed from their names; confirm in visualization_util).
visualization_util.df_precision_recall_analysis(
    big_df,
    ['pearson_dist', 'cosine_dist', 'euclidean', 'coobserved', 'spearman', 'coregulation'],
    'validation',
    'PRC Curve, ProteomeHD',
)