In [2]:
import numpy as np
from numpy import nan
import pandas as pd
import json
from csv import writer
import os
import matplotlib.pyplot as plt
 
PATH_ROOT = os.getcwd().replace("\\","/").replace("/notebooks","")

In [3]:
# original vector values are both nonzero
def find_common_observations(vector1,vector2):
  vector1_bool = np.where(vector1 != 0, 1, 0)
  vector2_bool = np.where(vector2 != 0, 1, 0)
  take_indices = np.logical_and(vector1_bool,vector2_bool)
  take_indices = take_indices.nonzero()[0]
  x1 = np.take(vector1,take_indices)
  x2 = np.take(vector2,take_indices)
  return x1,x2

# Appends list of tuples to csv
def append_to_csv(file_name,column_names,cor_tuples):
  with open(file_name, 'a+', newline='') as write_obj:
    csv_writer = writer(write_obj)
    if column_names is not None:
      csv_writer.writerow(column_names)
    for cor_tuple in cor_tuples:
      csv_writer.writerow(cor_tuple)

def write_json_to(json_dict,path):
  json_to_write = json.dumps(json_dict)
  write_file = open(path,"w")
  write_file.write(json_to_write)
  write_file.close()

def read_json_from(path):
  with open(path, "r") as read_file:
    init_dict = json.load(read_file)
    if isinstance(init_dict,str):
        return eval(init_dict)
    return init_dict

In [4]:
proteomeHD_path = f"{PATH_ROOT}/data_sources/ProteomeHD/ProteomeHD_v1_1.csv"
proteomeHD_df = pd.read_csv(proteomeHD_path)
proteomeHD_simplified_protein_ids = proteomeHD_df["Simplified_protein_ID"].to_numpy()
proteomeHD_majority_protein_ids = proteomeHD_df["Majority_protein_IDs"].to_numpy()
proteomeHD_feature_matrix = proteomeHD_df.iloc[:,4:].fillna(0).to_numpy()
proteomeHD_feature_matrix_with_na = proteomeHD_df.iloc[:,4:].to_numpy()
major_simplified_idx_lookup_path = f"{PATH_ROOT}/data_sources/ProteomeHD/major_simplified_to_idx_lookup.json"
major_simplified_idx_lookup = read_json_from(major_simplified_idx_lookup_path)

pQTL_protein_path = f"{PATH_ROOT}/data_sources/pQTL/pQTL_protein_converted.csv"
pQTL_protein_df = pd.read_csv(pQTL_protein_path)
pQTL_protein_ids = pQTL_protein_df['uniprotswissprot'].to_numpy()
pQTL_protein_feature_matrix = pQTL_protein_df.iloc[:,2:].fillna(0).to_numpy()
pQTL_protein_idx_lookup_path = f"{PATH_ROOT}/data_sources/pQTL/pQTL_protein_converted_idx_lookup.json"
pQTL_protein_idx_lookup = read_json_from(pQTL_protein_idx_lookup_path)

nikolai_protein_path = f"{PATH_ROOT}/data_sources/Nikolai/Proteins-processed.csv"
nikolai_protein_df = pd.read_csv(nikolai_protein_path)
nikolai_protein_ids = nikolai_protein_df['uniprot_id']
nikolai_protein_feature_matrix = nikolai_protein_df.iloc[:,1:].fillna(0).to_numpy()
nikolai_protein_idx_lookup_path = f"{PATH_ROOT}/data_sources/Nikolai/protein_processed_lookup.json"
nikolai_protein_idx_lookup = read_json_from(nikolai_protein_idx_lookup_path)

coexpression_lookup_path = f"{PATH_ROOT}/data_sources/StringDB/human/medium_confidence_coexpression_relation_lookup.json"
cooccurence_lookup_path = f"{PATH_ROOT}/data_sources/StringDB/human/medium_confidence_cooccurence_relation_lookup.json"
experiments_lookup_path = f"{PATH_ROOT}/data_sources/StringDB/human/medium_confidence_experiments_relation_lookup.json"
fusion_lookup_path = f"{PATH_ROOT}/data_sources/StringDB/human/medium_confidence_fusion_relation_lookup.json"
homology_lookup_path = f"{PATH_ROOT}/data_sources/StringDB/human/medium_confidence_homology_relation_lookup.json"
cocomplex_lookup_path = f"{PATH_ROOT}/data_sources/Corum/all_corum_complex_pairs_size_only.json"
database_lookup_path = f"{PATH_ROOT}/data_sources/StringDB/human/medium_confidence_database_relation_lookup.json"

coexpression_lookup = read_json_from(coexpression_lookup_path)
cooccurence_lookup = read_json_from(cooccurence_lookup_path)
experiments_lookup = read_json_from(experiments_lookup_path)
fusion_lookup = read_json_from(fusion_lookup_path)
homology_lookup = read_json_from(homology_lookup_path)
cocomplex_lookup = read_json_from(cocomplex_lookup_path)
database_lookup = read_json_from(database_lookup_path)

lookup_list = [coexpression_lookup,cooccurence_lookup,experiments_lookup,fusion_lookup,homology_lookup,cocomplex_lookup,database_lookup]

In [5]:
# kNN
def get_top_k_nearest_neighbors(vector,k,candidates,dist_function):
    k += 1
    dist_to_all = np.array(list(map(lambda x: dist_function(vector,x),candidates)))
    neighbor_indices = np.argpartition(dist_to_all, k)[0:k]
    neighbor_indices = neighbor_indices[np.argsort(dist_to_all[neighbor_indices])]
    neighbor_indices = neighbor_indices[1:]
    top_k_neighbors_dist = dist_to_all[neighbor_indices]
    return neighbor_indices,top_k_neighbors_dist

# returns report json containing relations between two proteins
def eval_relation(protein1,protein2):
    report_json = {}
    for lookup in lookup_list:
        lookup_type = lookup['relation_type']
        relation_score = float('NaN')
        try:
            relation_score = lookup[protein1][protein2]
        except KeyError:
            pass
        report_json[lookup_type] = relation_score
    return report_json

# Only returns names of the top k nearest neighbors
def protein_query_simple(protein,k,dist_function,name_vector,feature_matrix,lookup):
    protein_idx = lookup[protein]
    protein_vec = feature_matrix[protein_idx]
    if dist_function.__name__ == 'euclidean_dist':
        diff_vec = feature_matrix - protein_vec
        norm_vec = np.linalg.norm(diff_vec, axis=1)
        top_indices = take_top_k(norm_vec,k+1)[1:]
        return name_vector[top_indices]
    else:
        neighbor_indices,_ = get_top_k_nearest_neighbors(protein_vec,k,feature_matrix,dist_function)
        return name_vector[neighbor_indices] 

def get_closest_neighbors_in_embeddings(protein,k,embeddings,lookup,dist_func):
    neighbor_lists = []
    for embedding in embeddings:
        protein_vec = embedding[lookup[protein]]
        neighbor_indices,_ = get_top_k_nearest_neighbors(protein_vec,k,embedding,dist_func)
        neighbor_lists.append(neighbor_indices)
    return neighbor_lists

def get_common_closest_neighbors_in_embeddings(protein,k,embeddings,lookup,dist_func):
    neighbor_lists = get_closest_neighbors_in_embeddings(protein,k,embeddings,lookup,dist_func)
    shared_proteins_indices = reduce(np.intersect1d, neighbor_lists)
    name_list = list(map(lambda x: shared_proteins[x], shared_proteins_indices))
    return name_list

def calc_interaction_count(protein,candidates):
    count = 0
    for candidate in candidates:
        relation_report = eval_relation(protein,candidate)
        if relation_report['cocomplex'] > 0 or relation_report['experiments'] > 0 or relation_report['database'] > 0:
            count += 1
    return count

def overall_interaction_counts(proteins,interation_candidates):
    assert len(proteins) == len(interation_candidates)
    count_list = []
    for i in range(len(proteins)):
        short_list = []
        # Number of interacting proteins
        short_list.append(calc_interaction_count(proteins[i],interation_candidates[i]))
        # Number of overlapping proteins
        short_list.append(len(interation_candidates[i]))
        count_list.append(short_list)
    return count_list

def calc_combined_counts(proteins,candidates_1,candidates_2):
    assert len(proteins) == len(candidates_1) and len(candidates_1) == len(candidates_2)
    count_list = []
    for i in range(len(proteins)):
        short_list = []
        intersection = np.intersect1d(candidates_1[i],candidates_2[i])
        jaccard = len(intersection) / (len(candidates_1[i]) + len(candidates_2[i]) - len(intersection))
        # Number of interacting proteins
        short_list.append(calc_interaction_count(proteins[i],intersection))
        # Number of overlapping proteins
        short_list.append(len(intersection))
        # Jaccard Similarity
        short_list.append(jaccard)
        count_list.append(short_list)
    return count_list

def euclidean_dist(vec1,vec2):
    return np.linalg.norm(vec1-vec2)

def combined_count_statistics(combined_count):
    combined_count = np.array(combined_count)
    # ALL
    combined_filtered_all = np.array(list(filter(lambda x: x[1] >= 1, combined_count)))
    combined_filtered_all_precision = np.sum(combined_filtered_all[:,0]) / np.sum(combined_filtered_all[:,1])
    # J >= 0.2
    combined_filtered_J = np.array(list(filter(lambda x: x[2] >= 0.2, combined_count)))
    combined_filtered_J_precision = 0
    if (len(combined_filtered_J) > 0):
        combined_filtered_J_precision = np.sum(combined_filtered_J[:,0]) / np.sum(combined_filtered_J[:,1])
    # Known >= 1
    combined_filtered_known = np.array(list(filter(lambda x: x[0] >= 1 and x[1] >= 2, combined_count)))
    combined_filtered_known_precision = 0
    if (len(combined_filtered_known) > 0):
        combined_filtered_known_precision = np.sum(combined_filtered_known[:,0]) / np.sum(combined_filtered_known[:,1])
    return combined_filtered_all_precision,combined_filtered_J_precision,combined_filtered_known_precision,np.sum(combined_filtered_all[:,1]),np.sum(combined_filtered_J[:,1]),np.sum(combined_filtered_known[:,1])

def draw_bar(xs,ys,neighbor_k,data_name,y_label,title,ylim=None,colors=None):
    x_pos = [i for i, _ in enumerate(xs)]
    plt.bar(x_pos, ys,color=colors)
    plt.xlabel(f"{data_name}, k={neighbor_k}")
    plt.ylabel(y_label)
    plt.title(title)
    plt.xticks(x_pos, xs)
    if ylim:
        plt.ylim(ylim)
    plt.show()
    
# Cut an embedding into chunks with size n
def chunk_features(embedding,n):
    columns_to_cut = [ i for i in range(n,embedding.shape[1],n)]
    return np.hsplit(embedding,columns_to_cut)


def calc_appearance_freq(neighbors):
    unique, counts = np.unique(neighbors, return_counts=True)
    return np.asarray((unique, counts)).T

# Indices of the top k element in the given array
def take_top_k(arr,k,desc=False):
    if desc: arr = -arr
    part = np.argpartition(arr,k)[:k]
    correct_order = np.argsort(arr[part])
    return part[correct_order]

# Keep chunk_size small but neighbor size big
def generate_link_candidates(protein,name_vector,embedding,lookup,candidate_size=5,neighbor_size=30,chunk_size=5,dist=euclidean_dist,return_all=False):
    protein_vec = embedding[lookup[protein]]
    diff_embeddeding = embedding - protein_vec
    chunked_embeddings = chunk_features(diff_embeddeding,chunk_size)
    candidates = np.zeros((len(chunked_embeddings),neighbor_size))
    for idx,chunk in enumerate(chunked_embeddings):
        norm_vec = np.linalg.norm(chunk, axis=1)
        top_indices = take_top_k(norm_vec,neighbor_size+1)[1:]
        candidates[idx] = top_indices
    interaction_freq_indices = calc_appearance_freq(candidates)
    top_interacting_indices = take_top_k(interaction_freq_indices[:,1],candidate_size,desc=True)
    top_interacting = name_vector[interaction_freq_indices[top_interacting_indices][:,0].astype(int)]
    return top_interacting
    
#proteomeHD_closest_neighbors = list(map(lambda x: protein_query_simple(x,k,euclidean_dist,shared_proteins,proteomeHD_shared_df_feature_matrix,shared_lookup),shared_proteins))
# def generate_rank_distance_matrix(embedding):
#     rows = embedding.shape[0]
#     final_matrix = np.zeros((rows,rows))
#     for i in range(rows):
#         diff_vec = embedding - embedding[i]
#         norm_vec = np.linalg.norm(diff_vec, axis=1)
#         sort_vec = np.argsort(norm_vec)
#         rank_vec = np.zeros(rows)
#         for j in range(rows):
#             rank_vec[sort_vec[j]] = j
#         final_matrix[i] = rank_vec
#     return final_matrix

In [193]:
%time generate_link_candidates('A0AVT1',proteomeHD_simplified_protein_ids,proteomeHD_feature_matrix,major_simplified_idx_lookup,CANDIDATE_SIZE,NEIGHBOR_SIZE,CHUNK_SIZE)

[[5.852e+03 6.000e+00]
 [2.905e+03 5.000e+00]
 [1.989e+03 5.000e+00]
 ...
 [6.479e+03 1.000e+00]
 [6.480e+03 1.000e+00]
 [6.000e+00 1.000e+00]]
[[5.852e+03 6.000e+00]
 [2.905e+03 5.000e+00]
 [1.989e+03 5.000e+00]
 [2.191e+03 5.000e+00]
 [1.881e+03 5.000e+00]]
Wall time: 25 ms


array(['Q7Z6Z7', 'P45974', 'P14868', 'P20073', 'P12081'], dtype=object)

## Comparisons

In [6]:
CANDIDATE_SIZE = 5
NEIGHBOR_SIZE = 30
CHUNK_SIZE = 5

## ProteomeHD

In [210]:
%time proteomeHD_knn_candidates = np.array(list(map(lambda x: protein_query_simple(x,CANDIDATE_SIZE,euclidean_dist,proteomeHD_simplified_protein_ids,proteomeHD_feature_matrix,major_simplified_idx_lookup),proteomeHD_simplified_protein_ids)))

In [211]:
%time proteomeHD_chunked_candidates = np.array(list(map(lambda x: generate_link_candidates(x,proteomeHD_simplified_protein_ids,proteomeHD_feature_matrix,major_simplified_idx_lookup,CANDIDATE_SIZE,NEIGHBOR_SIZE,CHUNK_SIZE),proteomeHD_simplified_protein_ids))) 

Wall time: 3min 56s


In [214]:
proteomeHD_knn_interaction_counts = np.array(overall_interaction_counts(proteomeHD_simplified_protein_ids,proteomeHD_knn_candidates))
proteomeHD_knn_precision = np.sum(proteomeHD_knn_interaction_counts[:,0]) / np.sum(proteomeHD_knn_interaction_counts[:,1])
proteomeHD_knn_precision

0.044134457037682845

In [215]:
proteomeHD_chunked_interaction_counts = np.array(overall_interaction_counts(proteomeHD_simplified_protein_ids,proteomeHD_chunked_candidates))
proteomeHD_chunked_precision = np.sum(proteomeHD_chunked_interaction_counts[:,0]) / np.sum(proteomeHD_chunked_interaction_counts[:,1])
proteomeHD_chunked_precision

0.04126707352513804

## pQTL

In [218]:
%time pQTL_knn_candidates = np.array(list(map(lambda x: protein_query_simple(x,CANDIDATE_SIZE,euclidean_dist,pQTL_protein_ids,pQTL_protein_feature_matrix,pQTL_protein_idx_lookup),pQTL_protein_ids)))

In [219]:
pQTL_knn_interaction_counts = np.array(overall_interaction_counts(pQTL_protein_ids,pQTL_knn_candidates))
pQTL_knn_precision = np.sum(pQTL_knn_interaction_counts[:,0]) / np.sum(pQTL_knn_interaction_counts[:,1])
pQTL_knn_precision

0.13648523985239852

In [234]:
%time pQTL_chunked_candidates = np.array(list(map(lambda x: generate_link_candidates(x,pQTL_protein_ids,pQTL_protein_feature_matrix,pQTL_protein_idx_lookup,CANDIDATE_SIZE,NEIGHBOR_SIZE,CHUNK_SIZE),pQTL_protein_ids))) 

Wall time: 8.54 s


In [235]:
pQTL_chunked_interaction_counts = np.array(overall_interaction_counts(pQTL_protein_ids,pQTL_chunked_candidates))
pQTL_chunked_precision = np.sum(pQTL_chunked_interaction_counts[:,0]) / np.sum(pQTL_chunked_interaction_counts[:,1])
pQTL_chunked_precision

0.11748154981549816

## nikolai

In [227]:
%time nikolai_knn_candidates = np.array(list(map(lambda x: protein_query_simple(x,CANDIDATE_SIZE,euclidean_dist,nikolai_protein_ids,nikolai_protein_feature_matrix,nikolai_protein_idx_lookup),nikolai_protein_ids)))

Wall time: 1min


In [230]:
nikolai_knn_interaction_counts = np.array(overall_interaction_counts(nikolai_protein_ids,nikolai_knn_candidates))
nikolai_knn_precision = np.sum(nikolai_knn_interaction_counts[:,0]) / np.sum(nikolai_knn_interaction_counts[:,1])
nikolai_knn_precision

0.004329004329004329

In [236]:
%time nikolai_chunked_candidates = np.array(list(map(lambda x: generate_link_candidates(x,nikolai_protein_ids,nikolai_protein_feature_matrix,nikolai_protein_idx_lookup,CANDIDATE_SIZE,NEIGHBOR_SIZE,CHUNK_SIZE),nikolai_protein_ids))) 

Wall time: 1min 3s


In [237]:
nikolai_chunked_interaction_counts = np.array(overall_interaction_counts(nikolai_protein_ids,nikolai_chunked_candidates))
nikolai_chunked_precision = np.sum(nikolai_chunked_interaction_counts[:,0]) / np.sum(nikolai_chunked_interaction_counts[:,1])
nikolai_chunked_precision

0.015584415584415584

# CNN Methods

In [245]:
import torch
import torch.nn as nn
import torch.nn.functional as F

m = nn.Conv1d(16, 33, 3, stride=2)
input = torch.randn(20, 16, 50)
output = m(input)

# class Conv1DNet(nn.module):
    
#     def __init__(self):
#         super(Conv1DNet,self).__init__()
#         self.conv1 = nn.Conv1D()
    
#     def forward(self,x):
#         pass
        

SyntaxError: invalid syntax (<ipython-input-245-4b684acb21a3>, line 6)

In [243]:
output.shape

torch.Size([20, 33, 24])

In [244]:
input.shape

torch.Size([20, 16, 50])

## Correlation Method

In [7]:
proteomeHD_corr_mat = np.corrcoef(proteomeHD_feature_matrix)

In [11]:
proteomeHD_corr_candidates = []
for i in range(len(proteomeHD_corr_mat)):
    top_candidates = take_top_k(proteomeHD_corr_mat[i],CANDIDATE_SIZE+1,desc=True)[1:]
    proteomeHD_corr_candidates.append(proteomeHD_simplified_protein_ids[top_candidates])

In [12]:
proteomeHD_corr_interaction_counts = np.array(overall_interaction_counts(proteomeHD_simplified_protein_ids,proteomeHD_corr_candidates))
proteomeHD_corr_precision = np.sum(proteomeHD_corr_interaction_counts[:,0]) / np.sum(proteomeHD_corr_interaction_counts[:,1])
proteomeHD_corr_precision

0.05756078659304466