# Setup

Shared libraries and util functions

In [10]:
import numpy as np
import pandas as pd
import json
from csv import writer

directory_path = "D:/Desktop/Northeastern_University/Research/Proteomics/ProteinProteinAssociation/Development"

# original vector values are both nonzero
def find_common_observations(vector1,vector2):
  """Return the entries of two vectors at the positions where both are nonzero.

  vector1, vector2: 1-D numpy arrays of the same length.
  Returns a tuple (x1, x2): the values of vector1 and of vector2 taken at the
  shared nonzero positions, in their original order.
  """
  # A position counts as a shared observation only when neither vector is 0 there
  both_observed = (vector1 != 0) & (vector2 != 0)
  shared_positions = np.nonzero(both_observed)[0]
  return np.take(vector1,shared_positions),np.take(vector2,shared_positions)

def find_correlation(protein1_row,protein2_row,feature_matrix):
  """Correlate two rows of feature_matrix over their shared nonzero positions.

  protein1_row, protein2_row: row indices into feature_matrix.
  Returns a dict with keys:
    "shared_obs": number of positions where both rows are nonzero
    "r": entry [0,1] of np.corrcoef over those positions (0 when there are fewer than 2)
    "r2": r squared (0 when there are fewer than 2 shared positions)
  """
  vec1_obs,vec2_obs = find_common_observations(feature_matrix[protein1_row],
                                               feature_matrix[protein2_row])
  shared_obs = len(vec1_obs)
  if shared_obs > 1:
    r = np.corrcoef(vec1_obs,vec2_obs)[0,1]
    r2 = r**2
  else:
    # r is not defined for a single shared observation (or none); report 0
    r = 0
    r2 = 0
  return {"shared_obs": shared_obs, 'r': r, 'r2': r2}

# Sample proteins from given data numpy array where single_num is the number of 
# proteins in one sample and total_num is the total amount of samples
def sample_proteins(data,single_num,total_num):
  """Draw total_num random samples of single_num entries each from data.

  Returns an array of shape (total_num, single_num) produced by
  np.random.choice with its default settings.
  """
  sample_shape = (total_num,single_num)
  return np.random.choice(data,size=sample_shape)

# Appends list of tuples to csv
def append_to_csv(file_name,column_names,cor_tuples):
  """Append rows to the csv at file_name, creating the file if needed.

  column_names: header row written before the data rows; pass None to skip it.
  cor_tuples: iterable of row tuples, written in order.
  """
  with open(file_name, 'a+', newline='') as csv_file:
    row_writer = writer(csv_file)
    if column_names is not None:
      row_writer.writerow(column_names)
    row_writer.writerows(cor_tuples)

def write_json_to(json_dict,path):
  """Serialize json_dict as JSON and write it to path, replacing any existing file.

  Serialization happens before the file is opened, so an object that cannot be
  serialized does not truncate an existing file.
  """
  json_to_write = json.dumps(json_dict)
  # The context manager closes the file even if the write raises
  with open(path,"w") as write_file:
    write_file.write(json_to_write)

def read_json_from(path):
  """Read the file at path and return its parsed JSON content."""
  with open(path, "r") as read_file:
    raw_text = read_file.read()
  return json.loads(raw_text)

class SourceData:
  """A delimited text file loaded as a dataframe plus its zero-filled feature matrix.

  Attributes:
    name: optional label supplied by the caller.
    dataframe: the full table as read by pd.read_csv.
    feature_matrix_start / feature_matrix_end: positional column range
      [start, end) holding the features; end defaults to the last column.
    feature_matrix: numpy array of those columns with missing values set to 0.
  """
  def __init__(self,path,feature_matrix_start,feature_matrix_end=None,delimiter=None,name=None):
    self.name = name
    self.dataframe = pd.read_csv(path,delimiter=delimiter)
    self.feature_matrix_start = feature_matrix_start
    if feature_matrix_end is None:
      feature_matrix_end = len(self.dataframe.columns)
    self.feature_matrix_end = feature_matrix_end
    self.feature_matrix = self._features_of(self.dataframe)

  # Slices the feature columns out of a dataframe and replaces missing values with 0
  def _features_of(self,dataframe):
    feature_columns = dataframe.iloc[:,self.feature_matrix_start:self.feature_matrix_end]
    return feature_columns.fillna(0).to_numpy()

  # Returns the rows of the original dataframe where the identifier is not na,
  # that identifier column, and the corresponding feature matrix
  def get_subset_by(self,identifier_column_name):
    has_identifier = self.dataframe[identifier_column_name].notna()
    cleaned_dataframe = self.dataframe[has_identifier]
    return (cleaned_dataframe,
            cleaned_dataframe[identifier_column_name],
            self._features_of(cleaned_dataframe))

# class ValidationData:
#   def __init__(self,name,dataframe)

# Load Data

Data prep (source data)

## ProteomeHD Related

In [2]:
# Loads ProteomeHD: feature columns start at column index 4, missing values become 0
proteomeHD_csv_path = f"{directory_path}/data_sources/ProteomeHD/ProteomeHD_v1_1.csv"
proteomeHD_dataframe = pd.read_csv(proteomeHD_csv_path)
proteomeHD_feature_matrix = proteomeHD_dataframe.iloc[:,4:].fillna(0).to_numpy()
# Column 0 is used as the "majority" protein name and column 1 as the "simplified" name;
# both stay row-aligned with proteomeHD_feature_matrix
proteomeHD_majority_protein_column = proteomeHD_dataframe.iloc[:,0].to_numpy()
proteomeHD_simplified_protein_column = proteomeHD_dataframe.iloc[:,1].to_numpy()
# Sorted, de-duplicated simplified names (np.unique); NOT row-aligned with the feature matrix
proteomeHD_unique_proteins = np.unique(proteomeHD_simplified_protein_column)

# The complete version is left joined with ensembl protein id
# (ensembl id in column 2, so the feature columns start at column index 5)
proteomeHD_complete_csv_path = f"{directory_path}/data_sources/ProteomeHD/proteomeHD_complete.csv"
proteomeHD_complete_dataframe = pd.read_csv(proteomeHD_complete_csv_path)
proteomeHD_complete_feature_matrix = proteomeHD_complete_dataframe.iloc[:,5:].fillna(0).to_numpy()
proteomeHD_complete_majority_protein_column = proteomeHD_complete_dataframe.iloc[:,0].to_numpy()
proteomeHD_complete_simplified_protein_column = proteomeHD_complete_dataframe.iloc[:,1].to_numpy()
# Missing ensembl ids become empty strings
proteomeHD_complete_ensembl_id_column = proteomeHD_complete_dataframe.iloc[:,2].fillna("").to_numpy()
proteomeHD_complete_unique_proteins = np.unique(proteomeHD_complete_simplified_protein_column)

## pQTL silac protein related

In [0]:
# Loads the pQTL silac protein table; feature columns start at column index 2
# NOTE(review): this path is rooted in Colab's Drive mount, not in directory_path - confirm which is intended
pQTL_silac_protein_csv_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/pQTL/silac_protein/silac_protein_complete.csv"
pQTL_silac_protein_source_data = SourceData(pQTL_silac_protein_csv_path,2)
pQTL_silac_protein_feature_matrix = pQTL_silac_protein_source_data.feature_matrix
# Rows that have a Uniprot_Id: (dataframe, Uniprot_Id column, zero-filled feature matrix)
pQTL_silac_protein_uniprot_dataframe,pQTL_silac_protein_uniprot_column,pQTL_silac_protein_uniprot_feature_matrix = pQTL_silac_protein_source_data.get_subset_by("Uniprot_Id")

## pQTL ribo related

In [0]:
# Loads the pQTL ribo table; feature columns start at column index 2
# NOTE(review): this path is rooted in Colab's Drive mount, not in directory_path - confirm which is intended
pQTL_ribo_csv_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/pQTL/ribo/ribo_complete.csv"
pQTL_ribo_source_data = SourceData(pQTL_ribo_csv_path,2)
pQTL_ribo_feature_matrix = pQTL_ribo_source_data.feature_matrix
# Rows that have a Uniprot_Id: (dataframe, Uniprot_Id column, zero-filled feature matrix)
pQTL_ribo_uniprot_dataframe,pQTL_ribo_uniprot_column,pQTL_ribo_uniprot_feature_matrix = pQTL_ribo_source_data.get_subset_by("Uniprot_Id")

## pQTL rna_seq related

In [0]:
# Loads the pQTL rna_seq table; feature columns start at column index 2
# NOTE(review): this path is rooted in Colab's Drive mount, not in directory_path - confirm which is intended
pQTL_rna_seq_csv_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/pQTL/rna_seq/rna_seq_complete.csv"
pQTL_rna_seq_source_data = SourceData(pQTL_rna_seq_csv_path,2)
pQTL_rna_seq_feature_matrix = pQTL_rna_seq_source_data.feature_matrix
# Rows that have a Uniprot_Id: (dataframe, Uniprot_Id column, zero-filled feature matrix)
pQTL_rna_seq_uniprot_dataframe,pQTL_rna_seq_uniprot_column,pQTL_rna_seq_uniprot_feature_matrix = pQTL_rna_seq_source_data.get_subset_by("Uniprot_Id")

## Corum related

In [11]:
# Builds the sorted array of unique UniProt IDs found in any Corum complex
# and loads the precomputed Corum pair lookup
path_to_corum_complexes = f"{directory_path}/data_sources/Corum/allComplexes.txt"
corum_complexes_dataframe = pd.read_csv(path_to_corum_complexes,sep='\t')
all_corum_subunits = corum_complexes_dataframe["subunits(UniProt IDs)"].to_numpy()
# One list of subunit IDs per complex (each cell holds them ';'-separated)
all_corum_subunits_list = [subunit_field.split(";") for subunit_field in all_corum_subunits]
all_corum_proteins = np.unique(np.array([protein_id
                                         for complex_subunits in all_corum_subunits_list
                                         for protein_id in complex_subunits]))
corum_pairs_lookup_path = f"{directory_path}/data_sources/Corum/all_corum_complex_pairs.json"
corum_pairs_lookup_json = read_json_from(corum_pairs_lookup_path)

In [0]:
# Prints the row index and name of every Corum complex that has exactly 80 subunits
for i,subunits in enumerate(all_corum_subunits_list):
  if len(subunits) != 80:
    continue
  print(i)
  print(f"Found one, complex name {corum_complexes_dataframe.iloc[i,1]}")

271
Found one, complex name Ribosome, cytoplasmic
1101
Found one, complex name C complex spliceosome


In [0]:
# Displays the entire Corum pair lookup loaded from all_corum_complex_pairs.json
# NOTE(review): this renders the whole object; consider showing only its size or a small sample
corum_pairs_lookup_json

## Huri Related

In [0]:
# Makes a numpy array of all unique proteins in huri
# NOTE(review): this path is rooted in Colab's Drive mount, not in directory_path - confirm which is intended
huri_union_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/Huri/HI-union-processed.csv"
huri_union_df = pd.read_csv(huri_union_path)
# Every id from both interaction columns (duplicates included)
all_ids_in_huri = np.array(huri_union_df['Uniprot_Id_1'].tolist() + huri_union_df['Uniprot_Id_2'].tolist())
# One "id1;id2" string per interaction row
huri_combined = (huri_union_df["Uniprot_Id_1"] + ";" + huri_union_df["Uniprot_Id_2"]).to_numpy()
unique_ids_in_huri = np.unique(all_ids_in_huri)

## String Related

In [0]:
# Loads the two validation dicts for string db (physical interactions, cutoff 400)
# with the notebook's own JSON helper, the same way the Corum lookup is loaded.
string_validation_json_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/StringDB/physical_interactions_cutoff_400.json"
string_validation_dict = read_json_from(string_validation_json_path)

# The "_uniprot_ver" file of the same lookup
string_validation_uniprot_json_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/StringDB/physical_interactions_cutoff_400_uniprot_ver.json"
string_validation_uniprot_dict = read_json_from(string_validation_uniprot_json_path)

## pQTL silac protein overlaps

In [0]:
# Makes a numpy array of all huri_interactions that can be verified by pQTL silac proteins
# (one "protein1;protein2" string per row of the csv)
pQTL_silac_protein_verifiable_huri_interactions_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/pQTL/silac_protein/huri_validated/all_pQTL_verifiable_interactions.csv"
pQTL_silac_protein_verifiable_huri_interactions_df = pd.read_csv(pQTL_silac_protein_verifiable_huri_interactions_path)
pQTL_silac_protein_verifiable_huri_interactions = (pQTL_silac_protein_verifiable_huri_interactions_df["protein1"] + ";" + pQTL_silac_protein_verifiable_huri_interactions_df["protein2"]).to_numpy()

# Makes a smaller pQTL feature matrix and protein column
# that only contain rows that are overlapping proteins in corum
# (the positional False,True are assume_unique=False, return_indices=True; the
# middle return value holds first-occurrence positions in the first argument)
_,pQTL_silac_protein_corum_overlap_indices,_ = np.intersect1d(pQTL_silac_protein_uniprot_dataframe["Uniprot_Id"],all_corum_proteins,False,True)
pQTL_silac_protein_uniprot_corum_overlap_df = pQTL_silac_protein_uniprot_dataframe.iloc[pQTL_silac_protein_corum_overlap_indices]
pQTL_silac_protein_uniprot_corum_overlap_feature_matrix = pQTL_silac_protein_uniprot_feature_matrix[pQTL_silac_protein_corum_overlap_indices]

# Makes a smaller pQTL feature matrix and protein column
# that only contain rows that are overlapping proteins in huri
_,pQTL_silac_protein_huri_overlap_indices,_ = np.intersect1d(pQTL_silac_protein_uniprot_dataframe["Uniprot_Id"],unique_ids_in_huri,False,True)
pQTL_silac_protein_uniprot_huri_overlap_df = pQTL_silac_protein_uniprot_dataframe.iloc[pQTL_silac_protein_huri_overlap_indices]
pQTL_silac_protein_uniprot_huri_overlap_feature_matrix = pQTL_silac_protein_uniprot_feature_matrix[pQTL_silac_protein_huri_overlap_indices]

In [0]:
# Number of pQTL silac protein rows whose Uniprot_Id also appears in Corum
len(pQTL_silac_protein_uniprot_corum_overlap_df)

1741

## pQTL ribo overlaps

In [0]:
# Makes a smaller pQTL feature matrix and protein column
# that only contain rows that are overlapping proteins in corum
# (the positional False,True are assume_unique=False, return_indices=True)
_,pQTL_ribo_corum_overlap_indices,_ = np.intersect1d(pQTL_ribo_uniprot_column,all_corum_proteins,False,True)
pQTL_ribo_uniprot_corum_overlap_df = pQTL_ribo_uniprot_dataframe.iloc[pQTL_ribo_corum_overlap_indices]
pQTL_ribo_uniprot_corum_overlap_feature_matrix = pQTL_ribo_uniprot_feature_matrix[pQTL_ribo_corum_overlap_indices]

In [0]:
# Number of pQTL ribo rows whose Uniprot_Id also appears in Corum
len(pQTL_ribo_uniprot_corum_overlap_df)

3167

## pQTL rna_seq overlaps

In [0]:
# Makes a smaller pQTL feature matrix and protein column
# that only contain rows that are overlapping proteins in corum
# (the positional False,True are assume_unique=False, return_indices=True)
_,pQTL_rna_seq_corum_overlap_indices,_ = np.intersect1d(pQTL_rna_seq_uniprot_column,all_corum_proteins,False,True)
pQTL_rna_seq_uniprot_corum_overlap_df = pQTL_rna_seq_uniprot_dataframe.iloc[pQTL_rna_seq_corum_overlap_indices]
pQTL_rna_seq_uniprot_corum_overlap_feature_matrix = pQTL_rna_seq_uniprot_feature_matrix[pQTL_rna_seq_corum_overlap_indices]

In [0]:
# Number of pQTL rna_seq rows whose Uniprot_Id also appears in Corum
len(pQTL_rna_seq_uniprot_corum_overlap_df)

3101

## proteomeHD overlaps

In [0]:
# Makes a numpy array of all huri_interactions that can be verified by proteomeHD
# (one "protein1;protein2" string per row of the csv)
proteomeHD_verifiable_huri_interactions_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/proteomeHD_huri/all_proteomeHD_verifiable_interactions.csv"
proteomeHD_verifiable_huri_interactions_df = pd.read_csv(proteomeHD_verifiable_huri_interactions_path)
proteomeHD_verifiable_huri_interactions = (proteomeHD_verifiable_huri_interactions_df["protein1"] + ";" + proteomeHD_verifiable_huri_interactions_df["protein2"]).to_numpy()

# Makes a smaller proteomeHD feature matrix and protein column
# that only contain rows that are overlapping proteins in corum.
# The intersection is taken against the row-aligned simplified-name column (not the
# sorted np.unique copy) so that the returned first-occurrence positions are valid
# row indices into the feature matrix and the name columns.
_,proteomeHD_corum_overlap_indices,_ = np.intersect1d(proteomeHD_simplified_protein_column,all_corum_proteins,assume_unique=False,return_indices=True)
proteomeHD_corum_overlap_feature_matrix = proteomeHD_feature_matrix[proteomeHD_corum_overlap_indices]
proteomeHD_corum_overlap_simplified_protein_column = proteomeHD_simplified_protein_column[proteomeHD_corum_overlap_indices]
proteomeHD_corum_overlap_majority_protein_column = proteomeHD_majority_protein_column[proteomeHD_corum_overlap_indices]

# Makes a smaller proteomeHD feature matrix and protein column
# that only contain rows that are overlapping proteins in huri
# (same row-aligned intersection as above)
_,proteomeHD_huri_overlap_indices,_ = np.intersect1d(proteomeHD_simplified_protein_column,unique_ids_in_huri,assume_unique=False,return_indices=True)
proteomeHD_huri_overlap_feature_matrix = proteomeHD_feature_matrix[proteomeHD_huri_overlap_indices]
proteomeHD_huri_overlap_simplified_protein_column = proteomeHD_simplified_protein_column[proteomeHD_huri_overlap_indices]
proteomeHD_huri_overlap_majority_protein_column = proteomeHD_majority_protein_column[proteomeHD_huri_overlap_indices]


# Makes a smaller proteomeHD feature matrix and protein column
# that only contain rows that are overlapping proteins in string
string_proteins =  np.array(list(string_validation_dict.keys()))
_,proteomeHD_complete_string_overlap_indices,_ = np.intersect1d(proteomeHD_complete_ensembl_id_column,string_proteins,assume_unique=False,return_indices=True)
proteomeHD_complete_string_overlap_feature_matrix = proteomeHD_complete_feature_matrix[proteomeHD_complete_string_overlap_indices]
proteomeHD_complete_string_overlap_protein_column = proteomeHD_complete_simplified_protein_column[proteomeHD_complete_string_overlap_indices]
proteomeHD_complete_string_overlap_majority_protein_column = proteomeHD_complete_majority_protein_column[proteomeHD_complete_string_overlap_indices]

# Pair Generation
Generates every pair of proteins in proteomeHD along with its r, r2, and number of shared observations. (Takes about 2 hours and about 4 GB.)

In [0]:
import itertools
# Materialises every unordered pair of row positions of proteomeHD_dataframe as a list of 2-tuples
all_proteomeHD_index_combinations = list(itertools.combinations(range(len(proteomeHD_dataframe)),2))

53277003

In [0]:
import time
# Writes r, r2 and shared-observation count for every proteomeHD row pair to one csv.
# NOTE(review): this path is rooted in Colab's Drive mount, not in directory_path - confirm which is intended
all_proteomeHD_pairs_write_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/ProteomeHD/all_proteomeHD_pairs.csv"
column_names = ['protein1_majority_name','protein2_majority_name','protein1_simplified_name','protein2_simplified_name','r','r2','observations']
# Writes the header row only; the file is opened in append mode, so re-running
# this cell appends another header to an existing file
append_to_csv(all_proteomeHD_pairs_write_path,column_names,[])
to_append = []
# i is not used by the loop below
i = 0
start_time = time.time()
for idx in range(len(all_proteomeHD_index_combinations)):
  indices = all_proteomeHD_index_combinations[idx]
  candidate = find_correlation(indices[0],indices[1],proteomeHD_feature_matrix)
  protein1_majority_name = proteomeHD_majority_protein_column[indices[0]]
  protein1_simplified_name = proteomeHD_simplified_protein_column[indices[0]]
  protein2_majority_name = proteomeHD_majority_protein_column[indices[1]]
  protein2_simplified_name = proteomeHD_simplified_protein_column[indices[1]]
  to_append.append((protein1_majority_name,protein2_majority_name,
                    protein1_simplified_name,protein2_simplified_name,
                    candidate['r'],candidate['r2'],candidate['shared_obs']))
  # Flush the buffered rows at idx 1, 10001, 20001, ... and on the final pair,
  # then report progress and a linear estimate of the remaining time
  if (idx % 10000 == 1 or idx == len(all_proteomeHD_index_combinations) - 1):
    append_to_csv(all_proteomeHD_pairs_write_path,None,to_append)
    to_append = []
    percent_done = (idx+1) / len(all_proteomeHD_index_combinations)
    print(f"Percent done: {percent_done}")
    time_since_start = time.time() - start_time
    print(f"Time elasped: {time_since_start}")
    print(f"Estimate finishing in: {time_since_start / (percent_done) - time_since_start}")

Percent done: 3.753964914280182e-08
Time elasped: 0.0061762332916259766
Estimate finishing in: 164525.59362709522


  c /= stddev[:, None]


Percent done: 0.0001877357853631519
Time elasped: 1.6456403732299805
Estimate finishing in: 8764.079924663241
Percent done: 0.000375434031077161
Time elasped: 2.688014268875122
Estimate finishing in: 7157.0632239524375
Percent done: 0.0005631322767911701
Time elasped: 3.565293788909912
Estimate finishing in: 6327.618223564425
Percent done: 0.0007508305225051792
Time elasped: 4.857633113861084
Estimate finishing in: 6464.822232394771
Percent done: 0.0009385287682191883
Time elasped: 6.194722652435303
Estimate finishing in: 6594.266405661703
Percent done: 0.0011262270139331973
Time elasped: 7.271886348724365
Estimate finishing in: 6449.584732041448
Percent done: 0.0013139252596472066
Time elasped: 8.50612998008728
Estimate finishing in: 6465.3247958148895
Percent done: 0.0015016235053612157
Time elasped: 9.699960231781006
Estimate finishing in: 6449.948678158225
Percent done: 0.0016893217510752247
Time elasped: 10.612086534500122
Estimate finishing in: 6271.250162469107
Percent done: 0.0

  c /= stddev[None, :]


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Estimate finishing in: 2072.4841278691647
Percent done: 0.6875387115900645
Time elasped: 4557.634750843048
Estimate finishing in: 2071.27890014636
Percent done: 0.6877264098357785
Time elasped: 4558.958923816681
Estimate finishing in: 2070.07096163633
Percent done: 0.6879141080814924
Time elasped: 4560.323807477951
Estimate finishing in: 2068.8814289085094
Percent done: 0.6881018063272065
Time elasped: 4561.16841673851
Estimate finishing in: 2067.4559740098275
Percent done: 0.6882895045729205
Time elasped: 4562.357758283615
Estimate finishing in: 2066.1869572347923
Percent done: 0.6884772028186346
Time elasped: 4563.675689697266
Estimate finishing in: 2064.9761683650177
Percent done: 0.6886649010643485
Time elasped: 4564.921510219574
Estimate finishing in: 2063.732720835872
Percent done: 0.6888525993100626
Time elasped: 4566.22100019455
Estimate finishing in: 2062.5135139351223
Percent done: 0.6890402975557766
Time elaspe

In [None]:
# Loads the all-pairs csv (all_proteomeHD_pairs.csv) from under directory_path into memory
full_pairs_df_path = f"{directory_path}/data_sources/ProteomeHD/all_proteomeHD_pairs.csv"
full_pairs_df = pd.read_csv(full_pairs_df_path)

In [16]:
# For every pair in proteomeHD, check if they are corum validated
# Since it uses is_validated (and the time module imported in that cell), run that block first
full_corum_validated_pairs_df_path = f"{directory_path}/data_sources/ProteomeHD/all_proteomeHD_pairs_with_corum_validation.csv"
column_names = ['protein1_majority_name','protein2_majority_name','corum_validated','part_of_complex']
# Writes the header row only; rows are appended in batches below
append_to_csv(full_corum_validated_pairs_df_path,column_names,[])
to_append = []
start_time = time.time()
total_pairs = len(full_pairs_df)
# Iterate the four name columns directly: this avoids building a Series per row
# (as iterrows does) and takes the simplified names from the CURRENT row instead
# of from variables left over from the pair-generation cell.
pair_names = zip(full_pairs_df['protein1_majority_name'],
                 full_pairs_df['protein2_majority_name'],
                 full_pairs_df['protein1_simplified_name'],
                 full_pairs_df['protein2_simplified_name'])
for idx,(protein1_majority_name,protein2_majority_name,protein1_simplified_name,protein2_simplified_name) in enumerate(pair_names):
    validation,part_of_complex = is_validated(protein1_simplified_name,protein2_simplified_name)
    to_append.append((protein1_majority_name,protein2_majority_name,validation,part_of_complex))
    # Flush the buffered rows at idx 1, 10001, 20001, ... and on the final pair
    if (idx % 10000 == 1 or idx == total_pairs - 1):
        append_to_csv(full_corum_validated_pairs_df_path,None,to_append)
        to_append = []
        percent_done = (idx+1) / total_pairs
        print(f"Percent done: {percent_done}")
        time_since_start = time.time() - start_time
        print(f"Time elasped: {time_since_start}")
        print(f"Estimate finishing in: {time_since_start / (percent_done) - time_since_start}")

MemoryError: 

# Sampling

Define Functions

In [12]:
import time
# Verify if the two given protein ids are validated in the validation source
def is_validated(protein1,protein2,data_source="proteomeHD",validation_source='corum',score_threshold=100,dict_ver="uniprot"):
  """Check whether a protein pair is supported by a validation source.

  protein1, protein2: identifiers used as keys into the validation lookups.
  data_source: accepted for call compatibility; no branch reads it.
  validation_source: 'corum', 'huri' or 'string'; anything else is never validated.
  score_threshold: minimum string db score for a pair to count as validated.
  dict_ver: "uniprot" selects string_validation_uniprot_dict, any other value
    selects string_validation_dict; the corum lookup only exists for "uniprot".

  Returns a 2-tuple (flag, detail):
    corum  -> (1, ';'-joined entries stored for the pair) or (0, "")
    huri   -> (1, the matching interaction string) or (0, "")
    string -> (1, score) or (0, -1)
    other  -> (0, "")
  """
  if validation_source == 'corum':
    # Without this guard any other dict_ver fell off the end of the function and
    # returned None, which breaks the callers' tuple unpacking.
    if dict_ver != "uniprot":
      return 0,""
    try:
      # corum_pairs_lookup_json is the lookup loaded in the Corum cell
      complexes = corum_pairs_lookup_json[protein1][protein2]
    except KeyError:
      return 0,""
    return 1,';'.join(map(str,complexes))
  elif validation_source == 'huri':
    # NOTE(review): huri_interactions is not assigned in this function; it has to
    # exist as a global. The previously commented-out selection was:
    # huri_interactions = proteomeHD_verifiable_huri_interactions if data_source=="proteomeHD" else pQTL_verifiable_huri_interactions
    for relation in huri_interactions:
      # Substring test against each interaction string
      if protein1 in relation and protein2 in relation:
        return 1,relation
    return 0,""
  elif validation_source == 'string':
    lookup = string_validation_uniprot_dict if dict_ver == "uniprot" else string_validation_dict
    try:
      score = lookup[protein1][protein2]
    except KeyError:
      # Pair not present in the lookup: treat as score 0
      score = 0
    if score >= score_threshold:
      return 1,score
    return 0,-1
  else:
    return 0,""

# runs unbiased sampling
def unbiased_sampling(file_name,name_vector,feature_matrix,sample_number,total_samples,data_source="proteomeHD",validation_source='corum',extra_name_vector=[]):
  """Sample random row pairs, score each one, and append all results to a csv.

  file_name: path of output csv (rows and a header row are appended once, at the end)
  name_vector: protein names, indexed by the same row numbers as feature_matrix
  feature_matrix: matrix containing only features (numbers)
  sample_number: how many distinct rows to draw per sample; only the first two are used
  total_samples: the number of rows to collect
  data_source: passed to is_validated; "proteomeHD" additionally prepends the
    two extra_name_vector entries (the majority names) to every row
  validation_source: source the pair is validated against, passed to is_validated
  extra_name_vector: second name vector, read only when data_source is "proteomeHD"

  Pairs whose r2 is exactly 1 are discarded and another pair is drawn.
  """
  start_time = time.time()
  is_proteomeHD = data_source == "proteomeHD"
  column_names = ['protein1_name','protein2_name','r','r2','observations','is_validated','belong_to']
  if is_proteomeHD:
    column_names = ['protein1_majority_name','protein2_majority_name'] + column_names
  collected_rows = []
  print("Sampling...")
  while len(collected_rows) < total_samples:
    picked_rows = np.random.choice(len(name_vector),sample_number,replace=False)
    protein1,protein2 = picked_rows[0],picked_rows[1]
    protein1_name,protein2_name = name_vector[protein1],name_vector[protein2]
    candidate = find_correlation(protein1,protein2,feature_matrix)
    validation,belong_to = is_validated(protein1_name,protein2_name,data_source,validation_source)
    if candidate["r2"] == 1:
      continue
    row = (protein1_name,protein2_name,
           candidate["r"],candidate["r2"],candidate["shared_obs"],
           validation,belong_to)
    if is_proteomeHD:
      row = (extra_name_vector[protein1],extra_name_vector[protein2]) + row
    collected_rows.append(row)
  append_to_csv(file_name,column_names,collected_rows)
  print("Done. Time spent:")
  print(time.time() - start_time)

# runs biased sampling: positives first, then negatives
def biased_sampling(file_name,name_vector,feature_matrix,sample_number,total_samples,min_pos_ratio,data_source="proteomeHD",validation_source='corum',extra_name_vector=[],eager=False):
  """Sample row pairs with a guaranteed share of validated (positive) pairs.

  Random pairs are drawn until total_samples*min_pos_ratio validated pairs have
  been collected; after that only unvalidated (negative) pairs are collected
  until total_samples rows exist in total.

  file_name: path of output csv
  name_vector: protein names, indexed by the same row numbers as feature_matrix
  feature_matrix: matrix containing only features (numbers)
  sample_number: how many distinct rows to draw per sample; only the first two are used
  total_samples: the total number of rows to collect
  min_pos_ratio: fraction of total_samples that must be positive before negatives are sampled
  data_source: passed to is_validated; "proteomeHD" additionally prepends the
    two extra_name_vector entries (the majority names) to every row
  validation_source: source the pair is validated against, passed to is_validated
  extra_name_vector: second name vector, read only when data_source is "proteomeHD"
  eager: True appends every accepted row to the csv immediately (no header row is
    written in that mode); False writes the header and all rows once at the end

  Pairs whose r2 is exactly 1 are never accepted. A positive pair is accepted only
  once, in either orientation; negatives may repeat.
  NOTE(review): the loop does not end if min_pos_ratio > 0 and the required number
  of distinct validated pairs cannot be drawn from name_vector.
  """
  start_time = time.time()
  positive_class = []
  negative_class = []
  # Both orientations of every accepted positive pair; a set keeps the duplicate check O(1)
  positive_seen_pairs = set()
  print("Sampling...")
  while len(positive_class) + len(negative_class) < total_samples:
    collecting_positives = len(positive_class) < total_samples*min_pos_ratio
    validation_target = 1 if collecting_positives else 0
    random_protein_row = np.random.choice(len(name_vector),sample_number,replace=False)
    protein1 = random_protein_row[0]
    protein2 = random_protein_row[1]
    protein1_name = name_vector[protein1]
    protein2_name = name_vector[protein2]
    candidate_pair_name = protein1_name + ";" + protein2_name
    # Duplicates are only rejected while positives are being collected
    if collecting_positives and candidate_pair_name in positive_seen_pairs:
      continue
    validation,belong_to = is_validated(protein1_name,protein2_name,data_source,validation_source)
    if validation != validation_target:
      continue
    # The correlation is only needed for pairs that passed the checks above, so
    # it is computed last; the accepted rows are the same as before.
    candidate = find_correlation(protein1,protein2,feature_matrix)
    if candidate["r2"] == 1:
      continue
    to_append = (protein1_name,protein2_name,
                candidate["r"],candidate["r2"],candidate["shared_obs"],
                validation,belong_to)
    if data_source == "proteomeHD":
      to_append = (extra_name_vector[protein1],extra_name_vector[protein2]) + to_append
    if collecting_positives:
      positive_class.append(to_append)
      positive_seen_pairs.add(candidate_pair_name)
      positive_seen_pairs.add(protein2_name + ";" + protein1_name)
      if (len(positive_class) % 100 == 0):
        print(f"Positive class number: {len(positive_class)}")
    else:
      negative_class.append(to_append)
    if (eager):
      with open(file_name, 'a+', newline='') as write_obj:
        csv_writer = writer(write_obj)
        csv_writer.writerow(to_append)
  column_names = ['protein1_name','protein2_name','r','r2','observations','is_validated','belong_to']
  if (data_source == "proteomeHD"):
    column_names = ['protein1_majority_name','protein2_majority_name'] + column_names
  if (not eager):
    append_to_csv(file_name,column_names,positive_class + negative_class)
  print("Done. Time spent:")
  print(time.time() - start_time)

# Execution (Unbiased Sampling)

Unbiased sampling of all proteins in proteomeHD dataset, validated against Corum (for 100k samples it takes about 68 seconds)

In [0]:
# 100k unbiased pairs drawn from every proteomeHD row, validated against Corum;
# the majority names are passed as extra_name_vector and are prepended to each output row
proteomeHD_corum_unbiased_write_path_name = '/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/proteomeHD_corum/unbiased_all_100k.csv'
unbiased_sampling(proteomeHD_corum_unbiased_write_path_name,
                  proteomeHD_simplified_protein_column,
                  proteomeHD_feature_matrix,
                  2,
                  100000,
                  data_source="proteomeHD",
                  validation_source='corum',
                  extra_name_vector=proteomeHD_majority_protein_column
                  )

Sampling...


  c /= stddev[:, None]
  c /= stddev[None, :]


Done. Time spent:
68.63069558143616


Unbiased sampling overlapping proteins between proteomeHD and Corum in proteomeHD dataset, validated against Corum (for 100k samples it takes about 55 seconds)

In [0]:
# 100k unbiased pairs drawn only from the proteomeHD rows selected by the Corum overlap cell,
# validated against Corum
proteomeHD_corum_overlapping_unbiased_write_path_name = '/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/proteomeHD_corum/unbiased_overlap_100k.csv'
unbiased_sampling(proteomeHD_corum_overlapping_unbiased_write_path_name,
                  proteomeHD_corum_overlap_simplified_protein_column,
                  proteomeHD_corum_overlap_feature_matrix,
                  2,
                  100000,
                  data_source="proteomeHD",
                  validation_source='corum',
                  extra_name_vector=proteomeHD_corum_overlap_majority_protein_column
                  )

Sampling...


  c /= stddev[:, None]


Done. Time spent:
55.14083647727966


In [0]:
# unbiased_sampling with 100000 samples takes about 300 seconds ~ 5 minutes using huri
# (timing recorded with an earlier version of unbiased_sampling; the call below uses
# the current signature and the row-aligned proteomeHD/huri overlap arrays)
huri_write_path_name = '/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/proteomeHD_huri_overlap/2_sample_100000_total_ver_1.csv'
unbiased_sampling(huri_write_path_name,
                  proteomeHD_huri_overlap_simplified_protein_column,
                  proteomeHD_huri_overlap_feature_matrix,
                  2,
                  100000,
                  data_source="proteomeHD",
                  validation_source='huri',
                  extra_name_vector=proteomeHD_huri_overlap_majority_protein_column
                  )

Sampling...


  c /= stddev[:, None]


Done. Time spent:
297.35521602630615


In [0]:
# unbiased_sampling with 100000 samples takes about 12 seconds using proteomeHD
# (timing recorded with an earlier version of unbiased_sampling). The call below
# uses the current signature: names must be row-aligned with the feature matrix,
# and validation_source=None makes is_validated report every pair as not validated.
proteomeHD_write_path_name = '/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/proteomeHD/2_sample_100000_total_ver_1.csv'
unbiased_sampling(proteomeHD_write_path_name,
                  proteomeHD_simplified_protein_column,
                  proteomeHD_feature_matrix,
                  2,
                  100000,
                  data_source="proteomeHD",
                  validation_source=None,
                  extra_name_vector=proteomeHD_majority_protein_column
                  )

Sampling...


  c /= stddev[:, None]
  c /= stddev[None, :]


Done. Time spent:
12.373075008392334


In [0]:
# unbiased_sampling with 100000 samples takes about 56 seconds using pQTL and corum
# NOTE(review): this call passes 8 positional arguments in an order that does not match
# the unbiased_sampling signature defined in this notebook (file_name, name_vector,
# feature_matrix, sample_number, total_samples, data_source, validation_source,
# extra_name_vector), and pQTL_corum_overlap / pQTL_protein_column / pQTL_feature_matrix
# are not defined by the data-prep cells above - update it to one of the pQTL datasets
# (silac_protein, ribo, rna_seq) before re-running.
pQTL_corum_write_path_name = '/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/pQTL_corum_overlap/2_sample_100000_total_ver_1.csv'
unbiased_sampling(pQTL_corum_write_path_name,pQTL_corum_overlap,pQTL_protein_column,pQTL_feature_matrix,2,100000,'pQTL','corum')

Sampling...
Done. Time spent:
56.42493438720703


In [0]:
# unbiased_sampling with 100000 samples takes about 75 seconds using proteomeHD and string
# NOTE(review): the names passed here are ensembl ids, while unbiased_sampling calls
# is_validated with its default dict_ver="uniprot", which reads
# string_validation_uniprot_dict - confirm that lookup is keyed by the ids used here
proteomeHD_string_write_path_name = '/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/proteomeHD_string/unbiased_100000_cutoff_400.csv'
unbiased_sampling(proteomeHD_string_write_path_name,
                  proteomeHD_complete_ensembl_id_column,
                  proteomeHD_complete_feature_matrix,
                  2,
                  100000,
                  data_source="proteomeHD",
                  validation_source='string',
                  extra_name_vector=proteomeHD_complete_majority_protein_column
                  )

Sampling...


  c /= stddev[:, None]
  c /= stddev[None, :]


Done. Time spent:
72.99455189704895


# Execution (Biased Sampling).
To save time, some samplings are done on only overlapped protein rows

Biased sampling proteins in proteomeHD that are NOT validated in Corum, with positive ratio 0 (all negative samples). Takes about 73 seconds

In [0]:
# biased_sampling with 100000 samples and min_pos_ratio 0 (negatives only) against corum;
# the recorded run of this cell took about 73 seconds
biased_proteomeHD_corum_no_validation_write_path_name = '/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/proteomeHD/corum_validated/has_no_validation.csv'
# Re-loads proteomeHD_complete and re-assigns the proteomeHD_complete_* names
# NOTE(review): the data-prep section loads the same file name from under directory_path; confirm which path is intended
proteomeHD_complete_csv_path = '/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/ProteomeHD/proteomeHD_complete.csv'
proteomeHD_complete_dataframe = pd.read_csv(proteomeHD_complete_csv_path)
proteomeHD_complete_feature_matrix = proteomeHD_complete_dataframe.iloc[:,5:].fillna(0).to_numpy()
proteomeHD_complete_majority_protein_column = proteomeHD_complete_dataframe.iloc[:,0].to_numpy()
proteomeHD_complete_simplified_protein_column = proteomeHD_complete_dataframe.iloc[:,1].to_numpy()
biased_sampling(biased_proteomeHD_corum_no_validation_write_path_name,
                proteomeHD_complete_simplified_protein_column,
                proteomeHD_complete_feature_matrix,
                2,
                100000,
                0,
                data_source="proteomeHD",
                validation_source='corum',
                extra_name_vector=proteomeHD_complete_majority_protein_column,
                eager=False
                )

Sampling...


  c /= stddev[:, None]
  c /= stddev[None, :]


Done. Time spent:
73.66319608688354


Biased sampling proteins in proteomeHD dataset that overlaps with Corum, with positive ratio 0.01, validated against Corum

In [0]:
# biased_sampling of the proteomeHD/corum overlap rows: 1% positives, then negatives
# (the recorded run of this cell took about 936 seconds).
# The feature matrix must be the proteomeHD overlap matrix so that its rows line up
# with the proteomeHD overlap name vectors passed alongside it.
biased_proteomeHD_corum_write_path_name = '/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/proteomeHD_corum/overlap_1000_pos_99000_neg.csv'
biased_sampling(biased_proteomeHD_corum_write_path_name,
                proteomeHD_corum_overlap_simplified_protein_column,
                proteomeHD_corum_overlap_feature_matrix,
                2,
                100000,
                0.01,
                data_source="proteomeHD",
                validation_source='corum',
                extra_name_vector=proteomeHD_corum_overlap_majority_protein_column,
                eager=False
                )

Sampling...


  c /= stddev[:, None]
  c /= stddev[None, :]


Done. Time spent:
936.6090219020844


Biased sampling of proteins in the proteomeHD dataset that overlap with Huri, with positive ratio 0.01, validated against Huri (with 100000 samples the recorded run took about 62 minutes)

In [0]:
# biased_sampling with 100000 samples takes about 62 minutes using huri
# Draws from the proteomeHD/huri overlap rows: 1% positives first, then negatives
# NOTE(review): is_validated's huri branch reads a global named huri_interactions; make sure it is defined before running
biased_proteomeHD_huri_write_path_name = '/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/proteomeHD_huri/overlap_1000_pos_99000_neg.csv'
biased_sampling(biased_proteomeHD_huri_write_path_name,
                proteomeHD_huri_overlap_simplified_protein_column,
                proteomeHD_huri_overlap_feature_matrix,
                2,
                100000,
                0.01,
                data_source="proteomeHD",
                validation_source='huri',
                extra_name_vector=proteomeHD_huri_overlap_majority_protein_column,
                eager=False
                )

Sampling...


  c /= stddev[:, None]


Positive class number: 100


  c /= stddev[None, :]


Positive class number: 200
Positive class number: 300
Positive class number: 400
Positive class number: 500
Positive class number: 600
Positive class number: 700
Positive class number: 800
Positive class number: 900
Positive class number: 1000
Done. Time spent:
3721.2460198402405


Biased sampling of proteins in pQTL dataset that overlaps with Corum, with positive ratio 0.01, validated against Corum (with 100000 samples takes about 316 seconds ~ 5 minutes)

In [0]:
# biased_sampling with 100000 samples takes about 316 seconds ~5 minutes using pQTL and corum
pQTL_corum_overlapping_biased_write_path_name = '/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/pQTL_corum/overlap_1000_pos_99000_neg.csv'
# This call passes eager=True, whereas the neighbouring sampling calls pass
# eager=False.
# NOTE(review): the positional 2 / 100000 / 0.01 are presumably proteins per
# sample, number of samples and positive ratio - confirm against biased_sampling.
biased_sampling(pQTL_corum_overlapping_biased_write_path_name,
                pQTL_corum_overlap_protein_column,
                pQTL_corum_overlap_feature_matrix,
                2,
                100000,
                0.01,
                data_source='pQTL',
                validation_source='corum',
                eager=True
                )

Sampling...
Done. Time spent:
316.7773790359497


Biased sampling of proteins in pQTL dataset that overlaps with Huri, with positive ratio 0.01, validated against Huri (with 100000 samples takes about 22 minutes)

In [0]:
# Output CSV for the pQTL/HuRI overlap run; the recorded run below took ~1313 seconds.
pQTL_huri_overlapping_biased_write_path_name = '/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/pQTL_huri/overlap_1000_pos_99000_neg.csv'
# NOTE(review): the positional 2 / 100000 / 0.01 are presumably proteins per
# sample, number of samples and positive ratio - confirm against biased_sampling.
biased_sampling(pQTL_huri_overlapping_biased_write_path_name,
                pQTL_huri_overlap_protein_column,
                pQTL_huri_overlap_feature_matrix,
                2,
                100000,
                0.01,
                data_source='pQTL',
                validation_source='huri',
                eager=False
                )

Sampling...
Positive class number: 100
Positive class number: 200
Positive class number: 300
Positive class number: 400
Positive class number: 500
Positive class number: 600
Positive class number: 700
Positive class number: 800
Positive class number: 900
Positive class number: 1000
Done. Time spent:
1313.2442202568054


Biased sampling of proteins in proteomeHD dataset that overlaps with string, with positive ratio 0 (negative samples only), validated against string (with 200000 samples takes about 65 seconds)

In [0]:
# Output CSV for the negatives-only proteomeHD/STRING run (200000 samples, ratio 0).
proteomeHD_complete_string_overlapping_biased_write_path_name = '/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/proteomeHD_string/overlap_200000_neg.csv'
# NOTE(review): the positional 2 / 200000 / 0 are presumably proteins per
# sample, number of samples and positive ratio - confirm against biased_sampling.
biased_sampling(proteomeHD_complete_string_overlapping_biased_write_path_name,
                proteomeHD_complete_string_overlap_protein_column,
                proteomeHD_complete_string_overlap_feature_matrix,
                2,
                200000,
                0,
                data_source='proteomeHD',
                validation_source='string',
                eager=False,
                extra_name_vector=proteomeHD_complete_string_overlap_majority_protein_column,
                )

Sampling...


  c /= stddev[:, None]
  c /= stddev[None, :]


Done. Time spent:
64.17428350448608


In [0]:
# Compare the number of entries with the number of distinct names; the recorded
# output (5703 vs 5672) shows that some names occur more than once.
print(len(proteomeHD_complete_string_overlap_protein_column))
# print(len(proteomeHD_complete_string_overlap_feature_matrix))
print(len(np.unique(proteomeHD_complete_string_overlap_protein_column)))

5703
5672


Biased sampling of proteins in pQTL silac protein dataset with positive ratio 0 (negative samples only), validated against string (with 100000 samples takes about 62 seconds)

In [0]:
pQTL_silac_protein_string_biased_write_path_name = '/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/pQTL/silac_protein/string_validated/has_no_validation.csv'
# Feature matrix: every column from position 2 onward, with missing values
# replaced by 0 (the setup cell's find_common_observations only keeps positions
# that are nonzero in both vectors).
pQTL_silac_protein_uniprot_dataframe_feature_matrix = pQTL_silac_protein_uniprot_dataframe.iloc[:,2:].fillna(0).to_numpy()
# Row names: the Uniprot_Id column.
pQTL_silac_protein_uniprot_dataframe_name_vector = pQTL_silac_protein_uniprot_dataframe['Uniprot_Id'].to_numpy()
# NOTE(review): data_source is 'pQTL' here but 'pQTL_silac_protein' in the
# corum-validated run on the same dataframe - confirm which value
# biased_sampling expects.
biased_sampling(pQTL_silac_protein_string_biased_write_path_name,
                pQTL_silac_protein_uniprot_dataframe_name_vector,
                pQTL_silac_protein_uniprot_dataframe_feature_matrix,
                2,
                100000,
                0,
                data_source='pQTL',
                validation_source='string',
                eager=False,
                )

Sampling...
Done. Time spent:
62.044405698776245


Biased sampling of proteins in pQTL silac protein dataset with positive ratio 0 (negative samples only), validated against corum (with 100000 samples takes about 63 seconds)

In [0]:
pQTL_silac_protein_corum_biased_write_path_name = '/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/pQTL/silac_protein/corum_validated/has_no_validation.csv'
# Feature matrix and name vector are rebuilt from the dataframe so this cell
# does not depend on the string-validated cell having run first.
# Feature matrix: columns from position 2 onward, missing values replaced by 0.
pQTL_silac_protein_uniprot_dataframe_feature_matrix = pQTL_silac_protein_uniprot_dataframe.iloc[:,2:].fillna(0).to_numpy()
pQTL_silac_protein_uniprot_dataframe_name_vector = pQTL_silac_protein_uniprot_dataframe['Uniprot_Id'].to_numpy()
# NOTE(review): data_source is 'pQTL_silac_protein' here but 'pQTL' in the
# string-validated run on the same dataframe - confirm which value
# biased_sampling expects.
biased_sampling(pQTL_silac_protein_corum_biased_write_path_name,
                pQTL_silac_protein_uniprot_dataframe_name_vector,
                pQTL_silac_protein_uniprot_dataframe_feature_matrix,
                2,
                100000,
                0,
                data_source='pQTL_silac_protein',
                validation_source='corum',
                eager=False,
                )

Sampling...
Done. Time spent:
63.3782639503479


Biased sampling of proteins in pQTL ribo dataset that overlaps with string, with positive ratio 0 (negative samples only), validated against string (with 100000 samples takes about 134 seconds)

In [0]:
pQTL_ribo_string_biased_write_path_name = '/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/pQTL/ribo/string_validated/has_no_validation.csv'
# Feature matrix: columns from position 2 onward, missing values replaced by 0.
pQTL_ribo_uniprot_dataframe_feature_matrix = pQTL_ribo_uniprot_dataframe.iloc[:,2:].fillna(0).to_numpy()
# Row names: the Uniprot_Id column.
pQTL_ribo_uniprot_dataframe_name_vector = pQTL_ribo_uniprot_dataframe['Uniprot_Id'].to_numpy()
# NOTE(review): data_source is 'pQTL' here but 'pQTL_ribo' in the
# corum-validated run on the same dataframe - confirm which value
# biased_sampling expects.
biased_sampling(pQTL_ribo_string_biased_write_path_name,
                pQTL_ribo_uniprot_dataframe_name_vector,
                pQTL_ribo_uniprot_dataframe_feature_matrix,
                2,
                100000,
                0,
                data_source='pQTL',
                validation_source='string',
                eager=False,
                )

Biased sampling of proteins in pQTL ribo dataset with positive ratio 0 (negative samples only), validated against corum (with 100000 samples takes about 128 seconds)

In [0]:
pQTL_ribo_corum_biased_write_path_name = '/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/pQTL/ribo/corum_validated/has_no_validation.csv'
# Feature matrix and name vector are rebuilt from the dataframe so this cell
# does not depend on the string-validated cell having run first.
# Feature matrix: columns from position 2 onward, missing values replaced by 0.
pQTL_ribo_uniprot_dataframe_feature_matrix = pQTL_ribo_uniprot_dataframe.iloc[:,2:].fillna(0).to_numpy()
pQTL_ribo_uniprot_dataframe_name_vector = pQTL_ribo_uniprot_dataframe['Uniprot_Id'].to_numpy()
# NOTE(review): data_source is 'pQTL_ribo' here but 'pQTL' in the
# string-validated run on the same dataframe - confirm which value
# biased_sampling expects.
biased_sampling(pQTL_ribo_corum_biased_write_path_name,
                pQTL_ribo_uniprot_dataframe_name_vector,
                pQTL_ribo_uniprot_dataframe_feature_matrix,
                2,
                100000,
                0,
                data_source='pQTL_ribo',
                validation_source='corum',
                eager=False,
                )

Sampling...
Done. Time spent:
128.59196138381958


Biased sampling of proteins in pQTL rna_seq dataset that overlaps with string, with positive ratio 0 (negative samples only), validated against string (with 100000 samples takes about 131 seconds)

In [0]:
pQTL_rna_seq_string_biased_write_path_name = '/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/pQTL/rna_seq/string_validated/has_no_validation.csv'
# Feature matrix: columns from position 2 onward, missing values replaced by 0.
pQTL_rna_seq_uniprot_dataframe_feature_matrix = pQTL_rna_seq_uniprot_dataframe.iloc[:,2:].fillna(0).to_numpy()
# Row names: the Uniprot_Id column.
pQTL_rna_seq_uniprot_dataframe_name_vector = pQTL_rna_seq_uniprot_dataframe['Uniprot_Id'].to_numpy()
# NOTE(review): data_source is 'pQTL' here but 'pQTL_rna_seq' in the
# corum-validated run on the same dataframe - confirm which value
# biased_sampling expects.
biased_sampling(pQTL_rna_seq_string_biased_write_path_name,
                pQTL_rna_seq_uniprot_dataframe_name_vector,
                pQTL_rna_seq_uniprot_dataframe_feature_matrix,
                2,
                100000,
                0,
                data_source='pQTL',
                validation_source='string',
                eager=False,
                )

Sampling...
Done. Time spent:
131.04134511947632


Biased sampling of proteins in pQTL rna_seq dataset with positive ratio 0 (negative samples only), validated against corum (with 100000 samples takes about 128 seconds)

In [0]:
pQTL_rna_seq_corum_biased_write_path_name = '/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/pQTL/rna_seq/corum_validated/has_no_validation.csv'
# Feature matrix and name vector are rebuilt from the dataframe so this cell
# does not depend on the string-validated cell having run first.
# Feature matrix: columns from position 2 onward, missing values replaced by 0.
pQTL_rna_seq_uniprot_dataframe_feature_matrix = pQTL_rna_seq_uniprot_dataframe.iloc[:,2:].fillna(0).to_numpy()
pQTL_rna_seq_uniprot_dataframe_name_vector = pQTL_rna_seq_uniprot_dataframe['Uniprot_Id'].to_numpy()
# NOTE(review): data_source is 'pQTL_rna_seq' here but 'pQTL' in the
# string-validated run on the same dataframe - confirm which value
# biased_sampling expects.
biased_sampling(pQTL_rna_seq_corum_biased_write_path_name,
                pQTL_rna_seq_uniprot_dataframe_name_vector,
                pQTL_rna_seq_uniprot_dataframe_feature_matrix,
                2,
                100000,
                0,
                data_source='pQTL_rna_seq',
                validation_source='corum',
                eager=False,
                )

Sampling...
Done. Time spent:
128.64095830917358


# Calculate Correlations

Data Prep

In [0]:
# Load the interaction pairs (protein1 / protein2 columns are read further down)
# for which correlation statistics are computed.
interactions_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/proteomeHD_huri_overlap/all_proteomeHD_verifiable_interactions.csv"
interactions_df = pd.read_csv(interactions_path)
#interactions_df.head()

Define Functions

In [0]:
# Convert a list of proteins to their corresponding indices in given vector
def convert_protein_to_indices(proteins,vector):
  """Return, for each protein, the position of its first occurrence in vector.

  proteins: iterable of protein identifiers to look up.
  vector: one-dimensional sequence (e.g. numpy array) of protein identifiers.

  Returns a list of integer positions, in the same order as proteins.
  Raises IndexError naming the offending identifier when a protein does not
  occur in vector.
  """
  # Index the vector once so each lookup is a dictionary access instead of a
  # full scan of the vector per protein.
  first_index = {}
  for position,name in enumerate(vector):
    # NaN-like entries never compare equal to anything, so they can never be
    # matched; leaving them out keeps them unmatchable.
    if name == name and name not in first_index:
      first_index[name] = position
  indices = []
  for protein in proteins:
    try:
      indices.append(first_index[protein])
    except (KeyError, TypeError):
      raise IndexError(f"protein {protein!r} not found in vector") from None
  return indices


def find_correlation_stat(protein1_index,protein2_index,feature_matrix=None):
  """Correlate two rows of a feature matrix over their shared observations.

  protein1_index, protein2_index: row positions in the feature matrix.
  feature_matrix: matrix to read the rows from; when omitted, the notebook-level
    proteomeHD_feature_matrix is used, as before.

  Returns a tuple (r, r2, shared_obs) where shared_obs is the number of
  positions kept by find_common_observations (nonzero in both rows). r and r2
  stay 0 when fewer than two observations are shared.
  """
  if feature_matrix is None:
    feature_matrix = proteomeHD_feature_matrix
  protein1_vec = feature_matrix[protein1_index]
  protein2_vec = feature_matrix[protein2_index]
  vec1_obs,vec2_obs = find_common_observations(protein1_vec,protein2_vec)
  shared_obs = len(vec1_obs)
  r = 0
  r2 = 0
  # r not defined on 1
  if shared_obs > 1:
    # np.corrcoef yields nan (with a RuntimeWarning) when one of the vectors is
    # constant; that value is passed through unchanged.
    r = np.corrcoef(vec1_obs,vec2_obs)[0,1]
    r2 = r**2
  return (r,r2,shared_obs)

def find_correlation_vectors(protein1_indices,protein2_indices):
  # Run find_correlation_stat on each position-wise pair of indices and split
  # the resulting (r, r2, shared_obs) tuples into three parallel lists.
  pair_stats = [
    find_correlation_stat(protein1_indices[pos],protein2_indices[pos])
    for pos in range(len(protein1_indices))
  ]
  r_vector = [stat[0] for stat in pair_stats]
  r2_vector = [stat[1] for stat in pair_stats]
  observation_vector = [stat[2] for stat in pair_stats]
  return r_vector,r2_vector,observation_vector

Execution

In [0]:
# Translate the protein1 / protein2 names of every interaction into positions
# within proteomeHD_simplified_protein_column.
protein1_indices = convert_protein_to_indices(interactions_df['protein1'].to_numpy(),proteomeHD_simplified_protein_column)
protein2_indices = convert_protein_to_indices(interactions_df['protein2'].to_numpy(),proteomeHD_simplified_protein_column)

In [0]:
interaction_to_csv_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/proteomeHD_huri_overlap/all_proteomeHD_verifiable_interactions_stats.csv"
# One (r, r2, shared observations) triple per interaction, in row order.
r_vector,r2_vector,observation_vector = find_correlation_vectors(protein1_indices,protein2_indices)
# Attach the statistics as new columns of interactions_df (modified in place).
interactions_df['r'] = r_vector
interactions_df['r2'] = r2_vector
interactions_df['observations'] = observation_vector
# Write the augmented table without the pandas index column.
interactions_df.to_csv(interaction_to_csv_path,index=False)

  c /= stddev[:, None]


# Generate Verifiable Complexes

In [0]:
# Decide whether enough members of to_verify can be found in verify_list.
# to_verify: the elements to look up
# verify_list: the collection the elements are looked up in
# min_verify_count: how many elements of to_verify must be present in
# verify_list for to_verify to count as verifiable
def is_verifiable(to_verify,verify_list,min_verify_count):
  hits = sum(1 for node in to_verify if node in verify_list)
  return hits >= min_verify_count

In [0]:
# Build the membership set once: `node in <numpy array>` rescans the whole array
# for every protein of every relation, a set lookup does not.
pQTL_protein_set = set(pQTL_protein_column)
pQTL_verifiable_huri_relations = []
i=0
for relation in huri_combined:
  # Each relation is a "protein1;protein2" string.
  to_verify = relation.split(";")
  # Keep the relation only when both proteins are present in the pQTL data.
  if is_verifiable(to_verify,pQTL_protein_set,2):
    pQTL_verifiable_huri_relations.append(to_verify)
  i += 1
  # Progress indicator: fraction of relations processed so far.
  if (i % 10000 == 0):
    print(i / len(huri_combined))

print(f"Number of verfiable huri relations wrt pQTL: {len(pQTL_verifiable_huri_relations)} ({len(pQTL_verifiable_huri_relations)/len(huri_combined)} of all huri relations)")

In [0]:
# Name the columns in the constructor: assigning .columns afterwards raises a
# length-mismatch ValueError when the relation list is empty, because the frame
# built from an empty list has no columns.
pQTL_verifiable_huri_relations_matrix = pd.DataFrame(pQTL_verifiable_huri_relations,columns=['protein1','protein2'])
print(pQTL_verifiable_huri_relations_matrix)

     protein1 protein2
0      Q8IZE3   P60520
1      Q8IZE3   Q99757
2      P28838   P45973
3      P28838   Q13185
4      P28838   Q15942
...       ...      ...
3388   Q01081   P61978
3389   Q01081   P04264
3390   Q01081   Q15287
3391   O15160   P0DPB5
3392   O15160   P0DPB6

[3393 rows x 2 columns]


In [0]:
# Save the pQTL-verifiable HuRI relations (protein1, protein2) without the index column.
pQTL_huri_verifiable_matrix_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/pQTL_huri/all_pQTL_verifiable_interactions.csv"
pQTL_verifiable_huri_relations_matrix.to_csv(pQTL_huri_verifiable_matrix_path,index=False)

ProteomeHD and huri, using ensp id

In [0]:
# Makes a numpy array of all unique proteins in huri
huri_union_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/HI-union-converted-fixed.csv"
huri_union_df = pd.read_csv(huri_union_path)
# Every id appearing on either side of an interaction, duplicates included.
all_ensembl_ids_in_huri = np.array(huri_union_df['Ensembl_Id_1'].tolist() + huri_union_df['Ensembl_Id_2'].tolist())
# One "id1;id2" string per interaction row.
huri_ensembl_ids_combined = (huri_union_df["Ensembl_Id_1"] + ";" + huri_union_df["Ensembl_Id_2"]).to_numpy()
unique_ensembl_ids_in_huri = np.unique(all_ensembl_ids_in_huri)
#print(unique_ensembl_ids_in_huri)
# NOTE(review): the recorded output shows ENSG... ids on the HuRI side and
# ENSP... ids in proteomeHD_complete_ensembl_id_column, i.e. two different
# identifier spaces - confirm before comparing them.
print(huri_ensembl_ids_combined)
print(proteomeHD_complete_ensembl_id_column)

['ENSG00000000005;ENSG00000061656' 'ENSG00000000005;ENSG00000099968'
 'ENSG00000000005;ENSG00000104765' ... 'ENSG00000276076;ENSG00000276076'
 'ENSG00000276644;ENSG00000276644' 'ENSG00000280987;ENSG00000280987']
['ENSP00000295971' 'ENSP00000371212' 'ENSP00000371214' ...
 'ENSP00000358071' 'ENSP00000284818' 'ENSP00000430533']


In [0]:
proteomeHD_complete_verifiable_huri_relations = []

# Build the membership set once: `node in <numpy array>` rescans the whole array
# for every protein of every relation, a set lookup does not.
proteomeHD_complete_ensembl_id_set = set(proteomeHD_complete_ensembl_id_column)

# NOTE(review): the ids printed by the previous cell are ENSG... (gene ids) for
# HuRI and ENSP... (protein ids) for proteomeHD, and the recorded run of this
# cell finds 0 relations. The two sides presumably need to be mapped to a common
# identifier before this check can match anything - confirm.
i=0
for relation in huri_ensembl_ids_combined:
  # Each relation is an "id1;id2" string.
  to_verify = relation.split(";")
  # Keep the relation only when both ids are present in proteomeHD.
  if is_verifiable(to_verify,proteomeHD_complete_ensembl_id_set,2):
    proteomeHD_complete_verifiable_huri_relations.append(to_verify)
  i += 1
  # Progress indicator: fraction of relations processed so far.
  if (i % 5000 == 0):
    print(i / len(huri_ensembl_ids_combined))

# The summary previously said "wrt pQTL" although this cell checks proteomeHD.
print(f"Number of verfiable huri relations wrt proteomeHD: {len(proteomeHD_complete_verifiable_huri_relations)} ({len(proteomeHD_complete_verifiable_huri_relations)/len(huri_ensembl_ids_combined)} of all huri relations)")

0.07811767646783115
0.1562353529356623
0.23435302940349342
0.3124707058713246
0.3905883823391557
0.46870605880698685
0.546823735274818
0.6249414117426492
0.7030590882104802
0.7811767646783114
0.8592944411461425
0.9374121176139737
Number of verfiable huri relations wrt pQTL: 0 (0.0 of all huri relations)


In [0]:
# Name the columns in the constructor: the recorded run above found 0 relations,
# and assigning .columns on the column-less frame built from an empty list
# raises a length-mismatch ValueError.
proteomeHD_complete_verifiable_huri_relations_matrix = pd.DataFrame(proteomeHD_complete_verifiable_huri_relations,columns=['protein1','protein2'])
print(proteomeHD_complete_verifiable_huri_relations_matrix)

In [0]:
# NOTE(review): this cell repeats the pQTL export (same path, same pQTL matrix)
# although it sits in the proteomeHD/HuRI section. It was presumably meant to
# save proteomeHD_complete_verifiable_huri_relations_matrix to a proteomeHD
# path - confirm the intended frame and destination before changing it.
pQTL_huri_verifiable_matrix_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/pQTL_huri/all_pQTL_verifiable_interactions.csv"
pQTL_verifiable_huri_relations_matrix.to_csv(pQTL_huri_verifiable_matrix_path,index=False)

proteomeHD and string

In [0]:
proteomeHD_complete_dataframe = pd.read_csv(proteomeHD_complete_dataframe_write_path)
# Missing ids become "" so the column holds only strings before de-duplication.
proteomeHD_all_protein_ensembl_id = proteomeHD_complete_dataframe["ENSEMBL_PRO_ID"].fillna("").to_numpy()
proteomeHD_all_protein_ensembl_id = np.unique(proteomeHD_all_protein_ensembl_id)
# Drop the "" placeholder by value. Slicing off the first sorted element
# ([1:]) would discard a real id whenever the column has no missing entries.
proteomeHD_all_protein_ensembl_id = proteomeHD_all_protein_ensembl_id[proteomeHD_all_protein_ensembl_id != ""]
proteomeHD_all_protein_ensembl_id

array(['ENSP00000000233', 'ENSP00000000412', 'ENSP00000000442', ...,
       'ENSP00000500943', 'ENSP00000500952', 'ENSP00000500953'],
      dtype=object)

In [0]:
# 11 million rows, takes 6 seconds to load
# The STRING links file is space-delimited.
string_db_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/9606.protein.links.v11.0.txt"
string_db_df = pd.read_csv(string_db_path,delimiter=" ")

11759454

In [0]:
import time

proteomeHD_string_verifiable_relations_file_name = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/proteomeHD_string/verifiable_interactions.csv"
start_time = time.time()

# The first two columns hold ids of the form "9606.<id>". Remove that tag as a
# prefix only: str.strip('9606.') treats its argument as a set of characters and
# trims them from BOTH ends, which also removed trailing 0/6/9 digits of the id.
string_protein1 = string_db_df.iloc[:,0].str.replace(r'^9606\.','',regex=True)
string_protein2 = string_db_df.iloc[:,1].str.replace(r'^9606\.','',regex=True)

# A relation is verifiable when both of its proteins are known to proteomeHD;
# this is the condition is_verifiable(to_verify, ..., 2) tested row by row,
# evaluated here for all rows at once instead of through iterrows().
both_in_proteomeHD = string_protein1.isin(proteomeHD_all_protein_ensembl_id) & string_protein2.isin(proteomeHD_all_protein_ensembl_id)

proteomeHD_verifiable_string_relations_df = pd.DataFrame({
  'protein1': string_protein1[both_in_proteomeHD],
  'protein2': string_protein2[both_in_proteomeHD],
  'combined_score': string_db_df.iloc[:,2][both_in_proteomeHD],
})[['protein1','protein2','combined_score']]

# Write the complete result in one go. The file is overwritten, so re-running
# the cell no longer appends a second header and duplicate rows, and no trailing
# partial batch is left unwritten.
proteomeHD_verifiable_string_relations_df.to_csv(proteomeHD_string_verifiable_relations_file_name,index=False)

# Keep the list of [protein1, protein2, combined_score] rows for later cells;
# it now holds every verifiable relation, so the count below is the true total.
proteomeHD_verifiable_string_relations = proteomeHD_verifiable_string_relations_df.values.tolist()

print(f"Time elasped: {time.time() - start_time}")
print(f"Number of verfiable string relations wrt proteomeHD: {len(proteomeHD_verifiable_string_relations)} ({len(proteomeHD_verifiable_string_relations)/len(string_db_df)} of all string relations)")

In [0]:
# Name the columns in the constructor: assigning .columns afterwards raises a
# length-mismatch ValueError when the relation list is empty, because the frame
# built from an empty list has no columns.
proteomeHD_verifiable_string_relations_matrix = pd.DataFrame(proteomeHD_verifiable_string_relations,columns=['protein1','protein2','combined_score'])
print(proteomeHD_verifiable_string_relations_matrix)

# Joining Dataframes

In [0]:
# Load the lookup table used further down to attach UniProt ids to Ensembl gene
# ids (the recorded preview shows Uniprot_Id and Ensembl_Id columns).
mapper_csv_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/pQTL/silac_protein/silac_protein_ensg_to_uniprot.csv"
mapper_df = pd.read_csv(mapper_csv_path)
mapper_df.head()

Unnamed: 0,Uniprot_Id,Ensembl_Id
0,H0Y368,ENSG00000000419
1,O60762,ENSG00000000419
2,Q5QPJ9,ENSG00000000419
3,Q5QPK2,ENSG00000000419
4,Q8IZE3,ENSG00000000457


In [0]:
# Rename the auto-generated "yourlist:..." column to "ENSP".
# NOTE(review): the mapper_df preview recorded in the previous cell lists
# Uniprot_Id / Ensembl_Id and no "yourlist:..." column, in which case this
# rename changes nothing; the output recorded under this cell (Entry / ENSP)
# looks like it came from a different mapping file - confirm.
mapper_df = mapper_df.rename(columns={"yourlist:M20200508A94466D2655679D1FD8953E075198DA8A82206X":"ENSP"})
mapper_df.head()

Unnamed: 0,Entry,ENSP
0,P84085,ENSP00000000233
1,P20645,ENSP00000000412
2,Q02790,ENSP00000001008
3,Q9NR63,ENSP00000001146
4,Q7L592,ENSP00000002125


In [0]:
# Load the original measurement table, keyed by Ensembl gene id (ENSG column in
# the recorded preview), that still lacks UniProt ids.
unmapped_csv_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/pQTL/silac_protein/silac_protein_original.csv"
unmapped_df = pd.read_csv(unmapped_csv_path)
unmapped_df.head()

Unnamed: 0,ENSG,GM18486,GM18498,GM18499,GM18501,GM18502,GM18504,GM18505,GM18507,GM18508,GM18510,GM18511,GM18516,GM18517,GM18519,GM18520,GM18522,GM18523,GM18852,GM18855,GM18858,GM18861,GM18862,GM18870,GM18871,GM18907,GM18909,GM18912,GM18913,GM18916,GM19092,GM19093,GM19098,GM19099,GM19101,GM19102,GM19108,GM19114,GM19116,GM19119,GM19127,GM19128,GM19130,GM19131,GM19137,GM19138,GM19140,GM19143,GM19144,GM19147,GM19152,GM19153,GM19160,GM19172,GM19192,GM19193,GM19200,GM19203,GM19204,GM19207,GM19209,GM19222,GM19257,txStart,txEnd,chr
0,ENSG00000000419,-0.152939,0.112534,-0.026651,0.242549,-0.187889,-0.34956,0.33476,0.057335,0.1466,-0.103322,-0.179562,0.085958,0.091181,-0.054969,-0.427172,-0.472674,-0.129662,0.080905,-0.247386,0.073625,-0.191584,0.211285,-0.006265,0.632453,0.005731,-0.34319,-0.370613,-0.132016,0.211113,0.182937,0.096074,-0.055273,-0.27246,-0.000566,0.008699,0.245791,0.154592,0.209305,0.022032,-0.023424,0.169723,0.120391,0.062228,-0.164579,-0.01054,-0.679252,-0.022398,0.348017,0.076796,0.07202,0.022578,-0.052259,0.173688,0.049722,0.120151,-0.126637,0.265958,0.403956,-0.058587,0.28179,-0.197876,-0.130815,48984810,49008499,chr20
1,ENSG00000000457,0.791652,,0.22337,0.519652,0.31527,0.183589,0.150964,0.448134,-0.512635,-0.422164,-0.255114,0.714715,0.480594,,0.488497,-0.313829,-0.463073,-0.523384,-0.51239,-0.272013,-0.493687,-0.222451,-0.357053,0.360712,,-0.180658,0.46825,-0.177729,-0.541095,0.052715,-0.010703,-0.484071,-0.21212,-0.183239,-0.296604,-0.289633,-0.182665,-0.186624,-0.218342,-0.17612,-0.924445,0.001114,-0.030255,,0.076401,0.409067,-0.057851,0.055437,-0.713579,-0.578332,-0.113628,0.117461,-0.15285,-0.440206,0.750732,0.116232,-0.107043,0.039066,-0.704787,,-0.172065,-0.597988,168088838,168129670,chr1
2,ENSG00000000938,-2.005159,-1.938036,-1.683624,-0.008059,-0.937846,-0.923066,-1.304436,0.243792,-0.970566,-1.224468,-0.479161,1.398202,-1.178612,,0.219817,0.040035,0.530783,0.443388,-0.575884,-0.176216,-1.001933,-0.031192,-0.848803,0.162731,-0.145061,-0.879867,0.052687,-0.597926,,-0.670345,-0.21426,0.312518,-0.375362,-0.444694,-0.283473,-0.132148,-0.637974,-0.212011,-0.70007,0.366213,0.113333,-2.757292,0.315559,-0.031424,-1.397275,-1.008311,-0.330614,-1.822429,0.398711,-0.567212,-0.088856,-1.325413,-0.228247,0.065184,0.320327,0.161375,-1.805762,-0.343803,-0.228933,-1.491936,-0.414706,-1.144538,27811389,27834314,chr1
3,ENSG00000001084,-2.120697,-1.606023,,-1.26109,,,-1.18044,-0.483953,,-1.200069,-1.353906,-0.187343,-1.228592,-1.546785,-0.838023,,-1.20107,,-1.680513,-1.357563,,,,,-1.652121,,-1.925472,-1.256585,,-1.455362,,,-1.379289,-1.144755,,-0.754509,-1.566459,-1.819803,,-0.916553,-1.328249,,-1.645913,-1.08222,-1.428809,-0.869807,,,-1.258309,-1.387518,,-1.331363,-1.616961,-1.052469,,,-1.323309,-1.332167,-0.879335,-1.250224,-1.360112,-0.817248,53470097,53517790,chr6
4,ENSG00000001630,0.374041,-0.734608,0.062591,-0.643505,0.039957,,-0.555022,,,,,-1.009774,,0.223405,-0.428647,,-0.144491,-0.75464,-0.321471,,0.201388,-0.991795,-0.626633,,-1.047564,-0.335947,,-0.482614,,-0.341852,-0.108933,-0.388592,-0.093644,-1.233221,,-0.68669,-0.462148,-0.75416,-0.537566,-1.011486,,-0.421702,-0.292803,-1.147531,-1.354128,-0.561644,,,-0.950487,-0.590991,-0.670369,-0.290589,-0.655566,-0.936743,-0.774535,,-0.502627,-0.565275,-1.214427,,,-0.604082,91579408,91601946,chr7


In [0]:
# Collect the distinct ids found in the item_id_a and item_id_b columns.
# NOTE(review): the unmapped_df preview recorded above (ENSG, GM*, txStart,
# txEnd, chr) shows no item_id_a / item_id_b columns, so this cell looks like a
# leftover from a different input table and would raise KeyError on that
# frame - confirm.
long_protein_vec = np.concatenate((unmapped_df['item_id_a'].to_numpy(),unmapped_df['item_id_b'].to_numpy()))
unique_long_vec = np.unique(long_protein_vec)

In [0]:
# Left-join the measurement rows with the id mapping on the Ensembl gene id:
# every row of unmapped_df is kept, and a gene that maps to several ids is
# repeated once per id.
complete_dataframe = pd.DataFrame.merge(unmapped_df,mapper_df,how='left',left_on='ENSG',right_on='Ensembl_Id')
cols = complete_dataframe.columns.tolist()
# New column order: the first column, then the second-to-last column, then the
# columns from position 1 up to (not including) the last five.
# NOTE(review): presumably this yields ENSG, Uniprot_Id and the GM* sample
# columns while dropping txStart / txEnd / chr and the join key; it depends on
# the exact column order of both inputs - confirm.
cols = cols[:1] + cols[-2:-1] + cols[1:-5]
complete_dataframe = complete_dataframe[cols]
# NOTE(review): renames "Entry" only if such a column exists; with the
# Uniprot_Id column shown in the recorded mapper preview this is a no-op - confirm.
complete_dataframe = complete_dataframe.rename(columns={"Entry":"Uniprot_Id"})

In [0]:
complete_dataframe

Unnamed: 0,ENSG,Uniprot_Id,GM18486,GM18498,GM18499,GM18501,GM18502,GM18504,GM18505,GM18507,GM18508,GM18510,GM18511,GM18516,GM18517,GM18519,GM18520,GM18522,GM18523,GM18852,GM18855,GM18858,GM18861,GM18862,GM18870,GM18871,GM18907,GM18909,GM18912,GM18913,GM18916,GM19092,GM19093,GM19098,GM19099,GM19101,GM19102,GM19108,GM19114,GM19116,GM19119,GM19127,GM19128,GM19130,GM19131,GM19137,GM19138,GM19140,GM19143,GM19144,GM19147,GM19152,GM19153,GM19160,GM19172,GM19192,GM19193,GM19200,GM19203,GM19204,GM19207,GM19209,GM19222,GM19257
0,ENSG00000000419,H0Y368,-0.152939,0.112534,-0.026651,0.242549,-0.187889,-0.349560,0.334760,0.057335,0.146600,-0.103322,-0.179562,0.085958,0.091181,-0.054969,-0.427172,-0.472674,-0.129662,0.080905,-0.247386,0.073625,-0.191584,0.211285,-0.006265,0.632453,0.005731,-0.343190,-0.370613,-0.132016,0.211113,0.182937,0.096074,-0.055273,-0.272460,-0.000566,0.008699,0.245791,0.154592,0.209305,0.022032,-0.023424,0.169723,0.120391,0.062228,-0.164579,-0.010540,-0.679252,-0.022398,0.348017,0.076796,0.072020,0.022578,-0.052259,0.173688,0.049722,0.120151,-0.126637,0.265958,0.403956,-0.058587,0.281790,-0.197876,-0.130815
1,ENSG00000000419,O60762,-0.152939,0.112534,-0.026651,0.242549,-0.187889,-0.349560,0.334760,0.057335,0.146600,-0.103322,-0.179562,0.085958,0.091181,-0.054969,-0.427172,-0.472674,-0.129662,0.080905,-0.247386,0.073625,-0.191584,0.211285,-0.006265,0.632453,0.005731,-0.343190,-0.370613,-0.132016,0.211113,0.182937,0.096074,-0.055273,-0.272460,-0.000566,0.008699,0.245791,0.154592,0.209305,0.022032,-0.023424,0.169723,0.120391,0.062228,-0.164579,-0.010540,-0.679252,-0.022398,0.348017,0.076796,0.072020,0.022578,-0.052259,0.173688,0.049722,0.120151,-0.126637,0.265958,0.403956,-0.058587,0.281790,-0.197876,-0.130815
2,ENSG00000000419,Q5QPJ9,-0.152939,0.112534,-0.026651,0.242549,-0.187889,-0.349560,0.334760,0.057335,0.146600,-0.103322,-0.179562,0.085958,0.091181,-0.054969,-0.427172,-0.472674,-0.129662,0.080905,-0.247386,0.073625,-0.191584,0.211285,-0.006265,0.632453,0.005731,-0.343190,-0.370613,-0.132016,0.211113,0.182937,0.096074,-0.055273,-0.272460,-0.000566,0.008699,0.245791,0.154592,0.209305,0.022032,-0.023424,0.169723,0.120391,0.062228,-0.164579,-0.010540,-0.679252,-0.022398,0.348017,0.076796,0.072020,0.022578,-0.052259,0.173688,0.049722,0.120151,-0.126637,0.265958,0.403956,-0.058587,0.281790,-0.197876,-0.130815
3,ENSG00000000419,Q5QPK2,-0.152939,0.112534,-0.026651,0.242549,-0.187889,-0.349560,0.334760,0.057335,0.146600,-0.103322,-0.179562,0.085958,0.091181,-0.054969,-0.427172,-0.472674,-0.129662,0.080905,-0.247386,0.073625,-0.191584,0.211285,-0.006265,0.632453,0.005731,-0.343190,-0.370613,-0.132016,0.211113,0.182937,0.096074,-0.055273,-0.272460,-0.000566,0.008699,0.245791,0.154592,0.209305,0.022032,-0.023424,0.169723,0.120391,0.062228,-0.164579,-0.010540,-0.679252,-0.022398,0.348017,0.076796,0.072020,0.022578,-0.052259,0.173688,0.049722,0.120151,-0.126637,0.265958,0.403956,-0.058587,0.281790,-0.197876,-0.130815
4,ENSG00000000457,Q8IZE3,0.791652,,0.223370,0.519652,0.315270,0.183589,0.150964,0.448134,-0.512635,-0.422164,-0.255114,0.714715,0.480594,,0.488497,-0.313829,-0.463073,-0.523384,-0.512390,-0.272013,-0.493687,-0.222451,-0.357053,0.360712,,-0.180658,0.468250,-0.177729,-0.541095,0.052715,-0.010703,-0.484071,-0.212120,-0.183239,-0.296604,-0.289633,-0.182665,-0.186624,-0.218342,-0.176120,-0.924445,0.001114,-0.030255,,0.076401,0.409067,-0.057851,0.055437,-0.713579,-0.578332,-0.113628,0.117461,-0.152850,-0.440206,0.750732,0.116232,-0.107043,0.039066,-0.704787,,-0.172065,-0.597988
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21504,ENSG00000221914,P63151,0.138373,0.021224,-0.141203,-0.158765,-0.067018,-0.110952,-0.118415,-0.324637,-0.411995,0.075296,-0.057755,-0.013635,-0.111669,-0.370439,-0.083003,-0.264441,-0.237288,-0.266821,0.265624,0.051860,-0.137592,-0.211058,-0.205046,-0.347853,0.076561,-0.077785,0.155261,-0.016886,-0.302185,-0.014438,-0.045450,-0.074223,-0.128438,-0.074871,-0.309361,-0.275936,-0.290389,-0.148865,-0.148273,-0.031519,-0.459239,-0.130024,0.088041,-0.385428,-0.104713,-0.169678,-0.488990,0.060367,-0.056371,-0.135260,-0.073178,-0.321482,-0.483736,-0.132180,-0.458237,-0.264977,0.106831,-0.061422,-0.399286,0.212417,-0.190635,-0.241396
21505,ENSG00000221983,M0R1M6,0.154847,-0.147347,0.138279,-0.160248,0.109442,0.227438,0.131967,-0.165104,0.038383,0.047489,-0.031260,0.051644,-0.190447,-0.135020,-0.100832,-0.263877,-0.318371,-0.168591,-0.007305,0.268299,-0.193320,-0.046982,-0.239139,-0.826140,-0.089409,-0.147145,-0.110391,-0.234799,0.061384,-0.159622,-0.155461,-0.076989,-0.173556,-0.205958,0.081031,-0.186946,-0.044344,-0.109061,,-0.259402,-0.156380,-0.037507,-0.050859,-0.259811,-0.162596,0.084434,0.234234,-0.090957,-0.244134,-0.109852,-0.155394,-0.051672,,-0.115700,-0.021778,0.115323,0.142888,-0.001553,-0.042591,,-0.138550,-0.100753
21506,ENSG00000221983,M0R1V7,0.154847,-0.147347,0.138279,-0.160248,0.109442,0.227438,0.131967,-0.165104,0.038383,0.047489,-0.031260,0.051644,-0.190447,-0.135020,-0.100832,-0.263877,-0.318371,-0.168591,-0.007305,0.268299,-0.193320,-0.046982,-0.239139,-0.826140,-0.089409,-0.147145,-0.110391,-0.234799,0.061384,-0.159622,-0.155461,-0.076989,-0.173556,-0.205958,0.081031,-0.186946,-0.044344,-0.109061,,-0.259402,-0.156380,-0.037507,-0.050859,-0.259811,-0.162596,0.084434,0.234234,-0.090957,-0.244134,-0.109852,-0.155394,-0.051672,,-0.115700,-0.021778,0.115323,0.142888,-0.001553,-0.042591,,-0.138550,-0.100753
21507,ENSG00000221983,M0R2S1,0.154847,-0.147347,0.138279,-0.160248,0.109442,0.227438,0.131967,-0.165104,0.038383,0.047489,-0.031260,0.051644,-0.190447,-0.135020,-0.100832,-0.263877,-0.318371,-0.168591,-0.007305,0.268299,-0.193320,-0.046982,-0.239139,-0.826140,-0.089409,-0.147145,-0.110391,-0.234799,0.061384,-0.159622,-0.155461,-0.076989,-0.173556,-0.205958,0.081031,-0.186946,-0.044344,-0.109061,,-0.259402,-0.156380,-0.037507,-0.050859,-0.259811,-0.162596,0.084434,0.234234,-0.090957,-0.244134,-0.109852,-0.155394,-0.051672,,-0.115700,-0.021778,0.115323,0.142888,-0.001553,-0.042591,,-0.138550,-0.100753


In [0]:
# Save the joined table without the pandas index column.
complete_dataframe_write_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/pQTL/silac_protein/silac_protein_complete.csv"
complete_dataframe.to_csv(complete_dataframe_write_path,index=False)

# Making a big json file

## Making string validation json

In [0]:
import time
import json

# Load the STRING physical-interaction table (score cutoff 400 according to the
# file name) that the validation JSON is built from.
string_physical_interactions_file_name = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/StringDB/physical_interactions_cutoff_400_complete.csv"
string_physical_interactions_df = pd.read_csv(string_physical_interactions_file_name)

In [0]:
# Show the row count, then the frame itself as the cell's output.
print(len(string_physical_interactions_df))
string_physical_interactions_df

370530


Unnamed: 0,item_id_a,item_id_b,item_id_a_uniprot,item_id_b_uniprot,score
0,ENSP00000000233,ENSP00000222547,P84085,O15155,913
1,ENSP00000000233,ENSP00000223369,P84085,O15498,913
2,ENSP00000000233,ENSP00000249923,P84085,P53618,923
3,ENSP00000000233,ENSP00000258739,P84085,P33947,913
4,ENSP00000000233,ENSP00000262225,P84085,Q15363,909
...,...,...,...,...,...
370525,ENSP00000485663,ENSP00000470972,Q9Y262,P39019,954
370526,ENSP00000485663,ENSP00000472469,Q9Y262,P62857,955
370527,ENSP00000485663,ENSP00000472985,Q9Y262,P46782,961
370528,ENSP00000485663,ENSP00000475027,Q9Y262,S4R435,437


In [0]:
# Nested lookup: mapping[protein_a][protein_b] = score, stored in both
# directions so either protein of a pair can be used as the outer key.
string_physical_interactions_mapping_json = {}
i=0
start_time = time.time()
# Read the three needed columns by position through .iloc instead of indexing
# each iterrows() row with row[2] / row[3] / row[4]: integer keys on a
# label-indexed row rely on a positional fallback that newer pandas versions
# deprecate, and iterrows() builds a Series per row. Positions 2, 3 and 4 are
# presumably item_id_a_uniprot, item_id_b_uniprot and score, per the preview
# above - confirm. .tolist() yields plain Python values, which json.dumps needs.
protein1_column = string_physical_interactions_df.iloc[:,2].tolist()
protein2_column = string_physical_interactions_df.iloc[:,3].tolist()
score_column = string_physical_interactions_df.iloc[:,4].tolist()
for protein1,protein2,score in zip(protein1_column,protein2_column,score_column):
  if protein1 not in string_physical_interactions_mapping_json:
    string_physical_interactions_mapping_json[protein1] = {}
  if protein2 not in string_physical_interactions_mapping_json:
    string_physical_interactions_mapping_json[protein2] = {}
  string_physical_interactions_mapping_json[protein1][protein2] = score
  string_physical_interactions_mapping_json[protein2][protein1] = score
  i += 1
  # Progress report with a linear estimate of the remaining time.
  if (i % 10000 == 0):
    percent_done = i / len(string_physical_interactions_df)
    print(f"Percent done: {percent_done}")
    time_since_start = time.time() - start_time
    print(f"Time elasped: {time_since_start}")
    print(f"Estimate finishing in: {time_since_start / (percent_done) - time_since_start}")

In [0]:
# Serialize the nested protein -> protein -> score mapping and save it.
big_json = json.dumps(string_physical_interactions_mapping_json)
big_json_save_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/StringDB/physical_interactions_cutoff_400_uniprot_ver.json"
# The context manager closes the file even if the write fails part-way.
with open(big_json_save_path,"w") as big_json_file:
  big_json_file.write(big_json)

In [0]:
# Sanity check: reload the file that was just written and look up one known protein.
with open(big_json_save_path, "r") as read_file:
    check_json = json.load(read_file)
print(check_json["P84085"])

{'O15155': 913, 'O15498': 913, 'P53618': 923, 'P33947': 913, 'Q15363': 909, 'O75154': 522, 'O14579': 912, 'Q9NP61': 919, 'Q9H254': 902, 'P48444': 923, 'P16157': 912, 'Q12955': 912, 'P62330': 468, 'Q9Y3Q3': 901, 'P49755': 912, 'Q9H0U4': 918, 'Q8N6T3': 928, 'Q9NRC6': 902, 'Q9Y678': 922, 'P35606': 923, 'P24390': 913, 'Q9BVK6': 902, 'Q01082': 902, 'Q01484': 912, 'B1AP13': 900, 'P53621': 918, 'Q92538': 929, 'O15260': 905, 'Q13813': 903, 'P15328': 901, 'P13987': 900, 'P01308': 900, 'O43731': 913, 'P62820': 918, 'Q9UBF2': 922, 'Q9Y3B3': 902, 'Q9ULH1': 452, 'O15020': 905, 'Q8N6H7': 919, 'O60763': 918, 'P61923': 915, 'O75396': 919, 'Q9P299': 915}


## Making proteomeHD lookup json

In [0]:
proteomeHD_ensembl_id_idx_lookup_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/ProteomeHD/ensembl_id_idx_lookup.json"
# Map every Ensembl id to its position in the column; if an id repeats, the
# last position wins.
ensembl_to_complete_index_lookup_json = {
  ensembl_id: position
  for position, ensembl_id in enumerate(proteomeHD_complete_ensembl_id_column)
}
write_json_to(ensembl_to_complete_index_lookup_json,proteomeHD_ensembl_id_idx_lookup_path)

In [0]:
proteomeHD_uniprot_id_idx_lookup_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/ProteomeHD/uniprot_id_idx_lookup.json"
# Map every simplified protein id to its position in the column; if an id
# repeats, the last position wins.
uniprot_id_complete_index_lookup_json = {
  uniprot_id: position
  for position, uniprot_id in enumerate(proteomeHD_complete_simplified_protein_column)
}
write_json_to(uniprot_id_complete_index_lookup_json,proteomeHD_uniprot_id_idx_lookup_path)

In [6]:
proteomeHD_majority_to_simplified_path = f"{directory_path}/data_sources/ProteomeHD/majority_to_simplfied_lookup.json"
# Pair the two id columns row by row; a repeated majority id keeps the value
# from its last row.
proteomeHD_majority_to_simplified_lookup_json = {
  majority_ids: simplified_id
  for majority_ids, simplified_id in zip(
    proteomeHD_dataframe["Majority_protein_IDs"],
    proteomeHD_dataframe["Simplified_protein_ID"],
  )
}
write_json_to(proteomeHD_majority_to_simplified_lookup_json,proteomeHD_majority_to_simplified_path)

## Making a pQTL silac protein lookup json

In [0]:
import pandas as pd

# Disabled template for building a pQTL uniprot-id -> row-index lookup JSON
# (paths currently point at the "ribo" dataset).
# NOTE(review): the disabled code is not runnable as written — it reads
# `pQTL_silac_protein_complete_df_path` and iterates
# `pQTL_silac_protein_complete_uniprot_col`, but only defines
# `pQTL_complete_df_path` and `pQTL_complete_uniprot_col`. Reconcile the names
# before re-enabling.
# pQTL_complete_df_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/pQTL/ribo/ribo_complete.csv"
# pQTL_complete_df = pd.read_csv(pQTL_silac_protein_complete_df_path)
# pQTL_complete_uniprot_col = pQTL_complete_df['Uniprot_Id'].dropna()
# pQTL_uniprot_idx_lookup_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/pQTL/ribo/ribo_uniprot_idx_lookup.json"
# pQTL_uniprot_idx_lookup_json = {}
# for idx,row in enumerate(pQTL_silac_protein_complete_uniprot_col):
#   pQTL_uniprot_idx_lookup_json[row] = idx
# write_json_to(pQTL_uniprot_idx_lookup_json,pQTL_uniprot_idx_lookup_path)

## Making Corum look up json

JSON of all CORUM interactions (each protein mapped to its interaction partners)

In [0]:
corum_flattened_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/Corum/all_complexes_relations_flattened.csv"
corum_flattened_df = pd.read_csv(corum_flattened_path)
# protein -> set of partner proteins, stored symmetrically.
corum_lookup_json = {}
for idx,row in corum_flattened_df.iterrows():
  # Explicit positional access: row[0] on a label-indexed Series relies on a
  # deprecated integer-key fallback.
  protein1 = row.iloc[0]
  protein2 = row.iloc[1]
  corum_lookup_json.setdefault(protein1, set()).add(protein2)
  corum_lookup_json.setdefault(protein2, set()).add(protein1)


In [0]:
# JSON has no set type: swap each partner set for a list, keeping the same dict object.
corum_lookup_json.update(
  {protein: list(partners) for protein, partners in corum_lookup_json.items()}
)

In [0]:
big_json_save_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/Corum/all_corum_complex_pairs.json"
# write_json_to() serializes its argument itself, so hand it the dict directly.
# Passing an already-dumped string would encode it a second time, and the file
# would then load back as one big str instead of a dict.
write_json_to(corum_lookup_json,big_json_save_path)

In [4]:
# Display the CORUM complexes table.
# NOTE: `corum_complexes_dataframe` is loaded in the "Flattens Corum" section
# further down, so that section must have been run first.
corum_complexes_dataframe

Unnamed: 0,ComplexID,ComplexName,Organism,Synonyms,Cell line,subunits(UniProt IDs),subunits(Entrez IDs),Protein complex purification method,GO ID,GO description,FunCat ID,FunCat description,subunits(Gene name),Subunits comment,PubMed ID,Complex comment,Disease comment,SWISSPROT organism,subunits(Gene name syn),subunits(Protein name)
0,1,BCL6-HDAC4 complex,Human,,,P41182;P56524,604;9759,MI:0007-anti tag coimmunoprecipitation,GO:0006265;GO:0045892;GO:0051276;GO:0030183;GO...,DNA topological change;negative regulation of ...,10.01.09.05;11.02.03.04.03;42.10.03;43.03.07.0...,DNA conformation modification (e.g. chromatin)...,BCL6;HDAC4,,11929873,Transcriptional repression by BCL6 is thought ...,,Homo sapiens (Human);Homo sapiens (Human),BCL5 LAZ3 ZBTB27 ZNF51;KIAA0288,B-cell lymphoma 6 protein;Histone deacetylase 4
1,2,BCL6-HDAC5 complex,Human,,,P41182;Q9UQL6,604;10014,MI:0007-anti tag coimmunoprecipitation,GO:0006265;GO:0045892;GO:0051276;GO:0030183;GO...,DNA topological change;negative regulation of ...,10.01.09.05;11.02.03.04.03;42.10.03;43.03.07.0...,DNA conformation modification (e.g. chromatin)...,BCL6;HDAC5,,11929873,Transcriptional repression by BCL6 is thought ...,,Homo sapiens (Human);Homo sapiens (Human),BCL5 LAZ3 ZBTB27 ZNF51;KIAA0600,B-cell lymphoma 6 protein;Histone deacetylase 5
2,3,BCL6-HDAC7 complex,Human,,,P41182;Q8WUI4,604;51564,MI:0007-anti tag coimmunoprecipitation,GO:0006265;GO:0045892;GO:0051276;GO:0030183;GO...,DNA topological change;negative regulation of ...,10.01.09.05;11.02.03.04.03;42.10.03;43.03.07.0...,DNA conformation modification (e.g. chromatin)...,BCL6;HDAC7,,11929873,Transcriptional repression by BCL6 is thought ...,,Homo sapiens (Human);Homo sapiens (Human),BCL5 LAZ3 ZBTB27 ZNF51;HDAC7A,B-cell lymphoma 6 protein;Histone deacetylase 7
3,4,Multisubunit ACTR coactivator complex,Human,,,Q09472;Q92793;Q92831;Q9Y6Q9,2033;1387;8850;8202,MI:0004-affinity chromatography technologies;M...,GO:0045893;GO:0023052;GO:0005634,"positive regulation of transcription, DNA-temp...",11.02.03.04.01;30.01;70.10,transcription activation;cellular signalling;n...,EP300;CREBBP;KAT2B;NCOA3,,9267036,Cofactor ACTR binds directly nuclear receptors...,,Homo sapiens (Human);Homo sapiens (Human);Homo...,"P300;CBP;PCAF;AIB1, BHLHE42, RAC3, TRAM1, ACTR",Histone acetyltransferase p300;CREB-binding pr...
4,9,6S-nuclear aryl hydrocarbon (Ah) receptor liga...,Mouse,,,P30561;P53762,11622;11863,MI:0019- coimmunoprecipitation,GO:2001141;GO:0006355;GO:0003677;GO:0023052;GO...,regulation of RNA biosynthetic process;regulat...,11.02.03.04;16.03.01;30.01;34.11.03;70.10,transcriptional control;DNA binding;cellular s...,Ahr;Arnt,,1317062,"Arnt contains a basic helix-loop-helix motif, ...",,Mus musculus (Mouse);Mus musculus (Mouse),;None,Aryl hydrocarbon receptor ;Aryl hydrocarbon re...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4269,7588,APP(AICD)-FOXO1 complex,Human,,HEK 293T cells; SY-SH5Y cells,P05067;Q12778,351;2308,MI:0019-coimmunoprecipitation,GO:0008219;GO:0005737;GO:0006915,cell death;cytoplasm;apoptotic process,40.10;70.03;40.10.02,cell death;cytoplasm;apoptosis (type I program...,APP;FOXO1,AICD is a small APP intracellular domain.,24832605,"In human cells, AICD physically interacts with...",,Homo sapiens (Human);Homo sapiens (Human),"Amyloid beta precursor protein;FKHR, FOXO1A",Amyloid beta A4 protein;Forkhead box protein O1
4270,7589,APP(AICD)-FOXO4 complex,Human,,HEK 293T cells; SY-SH5Y cells,P05067;P98177,351;4303,MI:0019-coimmunoprecipitation,GO:0008219;GO:0005737;GO:0006915,cell death;cytoplasm;apoptotic process,40.10;70.03;40.10.02,cell death;cytoplasm;apoptosis (type I program...,APP;FOXO4,AICD is a small APP intracellular domain.,24832605,"In human cells, AICD physically interacts with...",,Homo sapiens (Human);Homo sapiens (Human),Amyloid beta precursor protein;AFX AFX1 MLLT7,Amyloid beta A4 protein;Forkhead box protein O4
4271,7592,Meiob-Rpa2-Spata22 complex,Mouse,,P18 testes,Q5SV06;Q62193;Q9D513,380709;19891;75178,MI:0019-coimmunoprecipitation,GO:0031490,chromatin DNA binding,,,Spata22;Rpa2;Meiob,,24240703,"Based on experimental results, the authors sug...",,Mus musculus (Mouse);Mus musculus (Mouse);Mus ...,"Gm882,Repro42;Rpa34;None",Spermatogenesis-associated protein 22;Replicat...
4272,7593,AKAP13-MAP2K3-MAP3K20-MAPK14-PKN1 complex,Human,,HEK293 cells,P46734;Q12802;Q16512;Q16539;Q9NYL2,5606;11214;5585;1432;51776,MI:0019-coimmunoprecipitation,GO:0000165,MAPK cascade,30.01.05.01.03,MAPKKK cascade,MAP2K3;AKAP13;PKN1;MAPK14;MAP3K20,,21224381,Silencing of AKAP13 expression specifically r...,,Homo sapiens (Human);Homo sapiens (Human);Homo...,"MEK3,MKK3,PRKMK3,SKK2;BRX,HT31,LBC;PAK1,PKN,PR...",Dual specificity mitogen-activated protein kin...


JSON mapping each protein to the IDs of the CORUM complexes that contain it

In [0]:
# protein -> list of ComplexIDs of every complex listing it as a subunit.
protein_to_subunit_json = {}
for idx,row in corum_complexes_dataframe.iterrows():
  subunit_ids = row['subunits(UniProt IDs)'].split(";")
  complex_id = row["ComplexID"]
  for protein in subunit_ids:
    protein_to_subunit_json.setdefault(protein, []).append(complex_id)

In [0]:
json_write_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/Corum/protein_to_included_complex_id.json"
# Persist the protein -> complex-id mapping. The previous form called
# write_json_to() with no arguments and then called the dict itself, both of
# which raise TypeError.
write_json_to(protein_to_subunit_json,json_write_path)

Number of CORUM complexes with at least n subunits present in ProteomeHD ("ProteomeHD-verifiable" complexes), for each n

In [7]:
# Result container: minimum subunit count n -> number of qualifying complexes.
size_n_verifiable_to_amount_json = {}
# ProteomeHD lookup keyed by uniprot id; only its keys are used for membership tests below.
proteomeHD_uniprot_to_idx_lookup_json_path = f"{directory_path}/data_sources/ProteomeHD/uniprot_id_idx_lookup.json"
proteomeHD_uniprot_to_idx_lookup_json = read_json_from(proteomeHD_uniprot_to_idx_lookup_json_path)
def is_verifiable(to_verify,verify_list,min_verify_count):
  """Return True when at least `min_verify_count` items of `to_verify` are in `verify_list`.

  Repeated items in `to_verify` are counted once per occurrence.
  """
  matches = sum(1 for node in to_verify if node in verify_list)
  return matches >= min_verify_count
def is_proteomeHD_verifiable(subunits,min_count=2):
  """Check whether a ';'-separated subunit string has at least `min_count` ids in the ProteomeHD lookup."""
  subunit_ids = subunits.split(";")
  known_proteins = proteomeHD_uniprot_to_idx_lookup_json.keys()
  return is_verifiable(subunit_ids,known_proteins,min_count)
# For every threshold 0..200, count the complexes that have at least that many
# subunits present in ProteomeHD.
for i in range(0,201):
    all_proteomeHD_verifiable_corum_subunits = [
        subunits for subunits in all_corum_subunits
        if is_proteomeHD_verifiable(subunits,i)
    ]
    size_n_verifiable_to_amount_json[i] = len(all_proteomeHD_verifiable_corum_subunits)

In [9]:
# Persist the threshold -> count table. The dict keys are ints; JSON object
# keys are always strings, so they come back as "0", "1", ... when reloaded.
size_n_verifiable_to_amount_json_save_path = f"{directory_path}/data_sources/Corum/proteomeHD_verifiable_complexes_amount.json"
write_json_to(size_n_verifiable_to_amount_json,size_n_verifiable_to_amount_json_save_path)

# White Box Generating
Generating r values for given pairs

In [0]:
# Load the STRING physical-interaction table and display it as the cell output.
string_physical_interaction_cutoff_400_csv_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/StringDB/physical_interactions_cutoff_400.csv"
string_physical_interaction_cutoff_400_df = pd.read_csv(string_physical_interaction_cutoff_400_csv_path)
string_physical_interaction_cutoff_400_df

In [0]:
import time
from csv import writer

def white_box_generate(all_interactions_df,interactions_write_path,lookup_dict_path,feature_matrix,column_names=['protein1','protein2','score','r','r2','observations'],data_source="string"):
  """Compute correlation statistics for every protein pair and append them to a CSV.

  Parameters:
    all_interactions_df: DataFrame whose first two columns are the two protein
      ids of a pair; when data_source == "string" the third column is the score.
    interactions_write_path: CSV file to append to. The header row is appended
      on every call, so re-running against an existing file duplicates it.
    lookup_dict_path: JSON file mapping protein id -> row index in feature_matrix.
    feature_matrix: matrix handed to find_correlation together with the two row indices.
    column_names: header row written before the data rows.
    data_source: "string" adds the score column to each output row; any other
      value (e.g. "corum") leaves it out.

  Pairs where either protein is missing from the lookup are skipped silently.
  Progress is printed, and the file flushed, every 10000 rows and on the last row.
  """
  with open(lookup_dict_path, "r") as read_file:
    lookup_dict = json.load(read_file)
  include_score = data_source == "string"
  total_rows = len(all_interactions_df)
  start_time = time.time()
  with open(interactions_write_path, 'a+', newline='') as write_obj:
    csv_writer = writer(write_obj)
    csv_writer.writerow(column_names)
    # Track progress by row position, not by index label: a filtered frame keeps
    # its original labels, so a label-based "last row" check may never fire and
    # the tail of the results would be lost.
    for position,(_,row) in enumerate(all_interactions_df.iterrows()):
      # .iloc: integer keys on a label-indexed Series are deprecated positional access.
      protein1 = row.iloc[0]
      protein2 = row.iloc[1]
      try:
        protein1_idx = lookup_dict[protein1]
        protein2_idx = lookup_dict[protein2]
      except KeyError:
        # At least one protein is absent from the feature matrix: nothing to compute.
        pass
      else:
        corr = find_correlation(protein1_idx,protein2_idx,feature_matrix)
        to_write = (protein1,protein2)
        if include_score:
          # Wrap the score in a tuple; tuple + scalar raises TypeError.
          to_write = to_write + (row.iloc[2],)
        to_write = to_write + (corr['r'],corr['r2'],corr['shared_obs'])
        csv_writer.writerow(to_write)
      if (position % 10000 == 0 or position == total_rows - 1):
        # Push finished rows to disk so partial results survive an interruption.
        write_obj.flush()
        percent_done = (position+1) / total_rows
        print(f"Percent done: {percent_done}")
        time_since_start = time.time() - start_time
        print(f"Time elasped: {time_since_start}")
        print(f"Estimate finishing in: {time_since_start / (percent_done) - time_since_start}")



## Generating r values for all corum pairs that are verifiable by proteomeHD

In [0]:
# Input pairs: the CORUM pair CSV (protein1, protein2, part_of_complex_id).
all_interactions_df_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/Corum/all_corum_complex_pairs.csv"
all_interactions_df = pd.read_csv(all_interactions_df_path)

# Output CSV that white_box_generate appends to.
interactions_write_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/proteomeHD/corum_validated/has_validation.csv"

# uniprot id -> row index lookup for the ProteomeHD matrix.
lookup_dict_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/ProteomeHD/uniprot_id_idx_lookup.json"

feature_matrix_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/ProteomeHD/proteomeHD_complete.csv"
feature_matrix_df = pd.read_csv(feature_matrix_path)
# Keep columns from position 5 onward (presumably the measurement columns — TODO confirm).
# NaN becomes 0 because find_common_observations treats 0 as "not observed".
feature_matrix = feature_matrix_df.iloc[:,5:].fillna(0).to_numpy()

In [0]:
# CORUM pairs carry no score, so use data_source="corum" and a header without the 'score' column.
white_box_generate(all_interactions_df,interactions_write_path,lookup_dict_path,feature_matrix,data_source="corum",column_names=['protein1','protein2','r','r2','observations'])

Percent done: 1.950420315578007e-05
Time elasped: 0.0016429424285888672
Estimate finishing in: 84.23365831375122
Percent done: 0.19506153576095647
Time elasped: 3.1092679500579834
Estimate finishing in: 12.830665763312968
Percent done: 0.39010356731875717
Time elasped: 6.590628385543823
Estimate finishing in: 10.30393228418356
Percent done: 0.5851455988765579
Time elasped: 10.302476167678833
Estimate finishing in: 7.304212129146652
Percent done: 0.7801876304343586
Time elasped: 12.574220418930054
Estimate finishing in: 3.542698035582653
Percent done: 0.9752296619921593
Time elasped: 15.12166428565979
Estimate finishing in: 0.38408259120393495
Percent done: 1.0
Time elasped: 15.408273696899414
Estimate finishing in: 0.0


## Generating r values for pairs in cytoplasmic ribosome (case study)

In [0]:
# Creates the CSV of all subunit pairs of the cytoplasmic ribosome complex,
# which is the input for white box generation.
import itertools
# Row position of the complex inside corum_complexes_dataframe (used by .iloc below).
# NOTE(review): presumably the row whose ComplexID is 306 — confirm.
cytoplasmic_ribosome_idx = 271
cytoplasmic_ribosome_complex_id = 306
# Column position 5 holds the ';'-separated subunit string that is split below.
cytoplasmic_ribosome_subunits = corum_complexes_dataframe.iloc[cytoplasmic_ribosome_idx,5]
# Every unordered pair of subunits within the complex.
all_cytoplasmic_ribosome_pairs = list(itertools.combinations(cytoplasmic_ribosome_subunits.split(";"),2))
cytoplasmic_ribosome_pairs_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/Corum/cytoplasmic_ribosome_pairs.csv"
# append_to_csv opens the file in append mode, so re-running this cell adds a
# second header and a second copy of the rows.
append_to_csv(cytoplasmic_ribosome_pairs_path,["protein1","protein2"],all_cytoplasmic_ribosome_pairs) 

In [0]:
# Input pairs: the cytoplasmic ribosome pair CSV (protein1, protein2).
all_interactions_df_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/Corum/cytoplasmic_ribosome_pairs.csv"
all_interactions_df = pd.read_csv(all_interactions_df_path)

# Output CSV that white_box_generate appends to.
interactions_write_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/proteomeHD/corum_validated/cytoplasmic_ribosome.csv"

# uniprot id -> row index lookup for the ProteomeHD matrix.
lookup_dict_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/ProteomeHD/uniprot_id_idx_lookup.json"

feature_matrix_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/ProteomeHD/proteomeHD_complete.csv"
feature_matrix_df = pd.read_csv(feature_matrix_path)
# Keep columns from position 5 onward (presumably the measurement columns — TODO confirm).
# NaN becomes 0 because find_common_observations treats 0 as "not observed".
feature_matrix = feature_matrix_df.iloc[:,5:].fillna(0).to_numpy()

In [0]:
# Ribosome pairs carry no score, so use data_source="corum" and a header without the 'score' column.
white_box_generate(all_interactions_df,interactions_write_path,lookup_dict_path,feature_matrix,data_source="corum",column_names=['protein1','protein2','r','r2','observations'])

Percent done: 0.00031645569620253165
Time elasped: 0.0021407604217529297
Estimate finishing in: 6.762662172317505
Percent done: 1.0
Time elasped: 1.609212875366211
Estimate finishing in: 0.0


Get all corum interactions that are not part of ribosome


In [0]:
all_corum_complex_pairs_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/Corum/all_corum_complex_pairs.csv"
all_corum_complex_pairs_df = pd.read_csv(all_corum_complex_pairs_path)
# part_of_complex_id is a ';'-joined list of complex ids. Compare whole ids:
# a substring test on "306" would also drop pairs from complexes such as 1306 or 3060.
ribosome_complex_id_str = str(cytoplasmic_ribosome_complex_id)
pair_in_ribosome = (
  all_corum_complex_pairs_df["part_of_complex_id"]
  .astype(str)
  .str.split(";")
  .apply(lambda complex_ids: ribosome_complex_id_str in complex_ids)
  .astype(bool)
)
# Reset to a positional index so row labels match row positions in the filtered frame.
all_corum_complex_pairs_df_no_ribo = all_corum_complex_pairs_df[~pair_in_ribosome].reset_index(drop=True)

# Output CSV that white_box_generate appends to.
interactions_write_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/proteomeHD/corum_validated/non_cytoplasmic_ribosome.csv"

# uniprot id -> row index lookup for the ProteomeHD matrix.
lookup_dict_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/ProteomeHD/uniprot_id_idx_lookup.json"

feature_matrix_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/ProteomeHD/proteomeHD_complete.csv"
feature_matrix_df = pd.read_csv(feature_matrix_path)
# NaN becomes 0 because find_common_observations treats 0 as "not observed".
feature_matrix = feature_matrix_df.iloc[:,5:].fillna(0).to_numpy()

In [0]:
# Only the first two columns (the protein ids) are read when data_source="corum".
white_box_generate(all_corum_complex_pairs_df_no_ribo,interactions_write_path,lookup_dict_path,feature_matrix,data_source="corum",column_names=['protein1','protein2','r','r2','observations'])

## Generating all verified string interactions in pQTL silac protein

In [0]:
all_interactions_df_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/StringDB/physical_interactions_cutoff_400_complete.csv"
all_interactions_df = pd.read_csv(all_interactions_df_path)
# Reorder to (protein a, protein b, score): white_box_generate reads columns by position.
all_interactions_df = all_interactions_df[['item_id_a_uniprot','item_id_b_uniprot','score']]

# Output CSV that white_box_generate appends to.
interactions_write_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/pQTL/silac_protein/string_validated/has_validation.csv"

# uniprot id -> row index lookup for the silac protein matrix.
lookup_dict_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/pQTL/silac_protein/silac_protein_uniprot_idx_lookup.json"

feature_matrix_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/pQTL/silac_protein/silac_protein_complete.csv"
feature_matrix_df = pd.read_csv(feature_matrix_path)
# Keep columns from position 2 onward (presumably the measurement columns — TODO confirm).
# NaN becomes 0 because find_common_observations treats 0 as "not observed".
feature_matrix = feature_matrix_df.iloc[:,2:].fillna(0).to_numpy()

In [0]:
# Uses the defaults: data_source="string" and a header that includes 'score'.
white_box_generate(all_interactions_df,interactions_write_path,lookup_dict_path,feature_matrix)

## Generating all verified corum interactions in pQTL silac protein

In [0]:
# Input pairs: the flattened CORUM relations CSV (first two columns are the protein ids).
all_interactions_df_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/Corum/all_complexes_relations_flattened.csv"
all_interactions_df = pd.read_csv(all_interactions_df_path)

# Output CSV that white_box_generate appends to.
interactions_write_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/pQTL/silac_protein/corum_validated/has_validation.csv"

# uniprot id -> row index lookup for the silac protein matrix.
lookup_dict_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/pQTL/silac_protein/silac_protein_uniprot_idx_lookup.json"

feature_matrix_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/pQTL/silac_protein/silac_protein_complete.csv"
feature_matrix_df = pd.read_csv(feature_matrix_path)
# Keep columns from position 2 onward (presumably the measurement columns — TODO confirm).
# NaN becomes 0 because find_common_observations treats 0 as "not observed".
feature_matrix = feature_matrix_df.iloc[:,2:].fillna(0).to_numpy()

In [0]:
# CORUM pairs carry no score, so use data_source="corum" and a header without the 'score' column.
white_box_generate(all_interactions_df,interactions_write_path,lookup_dict_path,feature_matrix,data_source="corum",column_names=['protein1','protein2','r','r2','observations'])

Percent done: 1.950420315578007e-05
Time elasped: 0.0012352466583251953
Estimate finishing in: 63.33109617233277
Percent done: 0.19506153576095647
Time elasped: 2.6835031509399414
Estimate finishing in: 11.073710132915847
Percent done: 0.39010356731875717
Time elasped: 5.526334285736084
Estimate finishing in: 8.63999165616556
Percent done: 0.5851455988765579
Time elasped: 7.221728324890137
Estimate finishing in: 5.120034714523289
Percent done: 0.7801876304343586
Time elasped: 9.288692951202393
Estimate finishing in: 2.6170238134059396
Percent done: 0.9752296619921593
Time elasped: 11.315356254577637
Estimate finishing in: 0.28740430078025625
Percent done: 1.0
Time elasped: 11.543065547943115
Estimate finishing in: 0.0


## Generating all verified string interactions in pQTL ribo

In [0]:
all_interactions_df_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/StringDB/physical_interactions_cutoff_400_complete.csv"
all_interactions_df = pd.read_csv(all_interactions_df_path)
# Reorder to (protein a, protein b, score): white_box_generate reads columns by position.
all_interactions_df = all_interactions_df[['item_id_a_uniprot','item_id_b_uniprot','score']]

# Output CSV that white_box_generate appends to.
interactions_write_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/pQTL/ribo/string_validated/has_validation.csv"

# uniprot id -> row index lookup for the ribo matrix.
lookup_dict_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/pQTL/ribo/ribo_uniprot_idx_lookup.json"

feature_matrix_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/pQTL/ribo/ribo_complete.csv"
feature_matrix_df = pd.read_csv(feature_matrix_path)
# Keep columns from position 2 onward (presumably the measurement columns — TODO confirm).
# NaN becomes 0 because find_common_observations treats 0 as "not observed".
feature_matrix = feature_matrix_df.iloc[:,2:].fillna(0).to_numpy()

In [0]:
# Uses the defaults: data_source="string" and a header that includes 'score'.
white_box_generate(all_interactions_df,interactions_write_path,lookup_dict_path,feature_matrix)

## Generating all verified corum interactions in pQTL ribo

In [0]:
# Input pairs: the flattened CORUM relations CSV (first two columns are the protein ids).
all_interactions_df_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/Corum/all_complexes_relations_flattened.csv"
all_interactions_df = pd.read_csv(all_interactions_df_path)

# Output CSV that white_box_generate appends to.
interactions_write_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/pQTL/ribo/corum_validated/has_validation.csv"

# uniprot id -> row index lookup for the ribo matrix.
lookup_dict_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/pQTL/ribo/ribo_uniprot_idx_lookup.json"

feature_matrix_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/pQTL/ribo/ribo_complete.csv"
feature_matrix_df = pd.read_csv(feature_matrix_path)
# Keep columns from position 2 onward (presumably the measurement columns — TODO confirm).
# NaN becomes 0 because find_common_observations treats 0 as "not observed".
feature_matrix = feature_matrix_df.iloc[:,2:].fillna(0).to_numpy()

In [0]:
# CORUM pairs carry no score, so use data_source="corum" and a header without the 'score' column.
white_box_generate(all_interactions_df,interactions_write_path,lookup_dict_path,feature_matrix,data_source="corum",column_names=['protein1','protein2','r','r2','observations'])

## Generating all verified string interactions in pQTL rna seq

In [0]:
all_interactions_df_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/StringDB/physical_interactions_cutoff_400_complete.csv"
all_interactions_df = pd.read_csv(all_interactions_df_path)
# Reorder to (protein a, protein b, score): white_box_generate reads columns by position.
all_interactions_df = all_interactions_df[['item_id_a_uniprot','item_id_b_uniprot','score']]

# Output CSV that white_box_generate appends to.
interactions_write_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/pQTL/rna_seq/string_validated/has_validation.csv"

# uniprot id -> row index lookup for the rna seq matrix.
lookup_dict_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/pQTL/rna_seq/rna_seq_uniprot_idx_lookup.json"

feature_matrix_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/pQTL/rna_seq/rna_seq_complete.csv"
feature_matrix_df = pd.read_csv(feature_matrix_path)
# Keep columns from position 2 onward (presumably the measurement columns — TODO confirm).
# NaN becomes 0 because find_common_observations treats 0 as "not observed".
feature_matrix = feature_matrix_df.iloc[:,2:].fillna(0).to_numpy()

In [0]:
# Uses the defaults: data_source="string" and a header that includes 'score'.
white_box_generate(all_interactions_df,interactions_write_path,lookup_dict_path,feature_matrix)

## Generating all verified corum interactions in pQTL rna seq

In [0]:
# Input pairs: the flattened CORUM relations CSV (first two columns are the protein ids).
all_interactions_df_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/Corum/all_complexes_relations_flattened.csv"
all_interactions_df = pd.read_csv(all_interactions_df_path)

# Output CSV that white_box_generate appends to.
interactions_write_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/csv_outputs/pQTL/rna_seq/corum_validated/has_validation.csv"

# uniprot id -> row index lookup for the rna seq matrix.
lookup_dict_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/pQTL/rna_seq/rna_seq_uniprot_idx_lookup.json"

feature_matrix_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/pQTL/rna_seq/rna_seq_complete.csv"
feature_matrix_df = pd.read_csv(feature_matrix_path)
# Keep columns from position 2 onward (presumably the measurement columns — TODO confirm).
# NaN becomes 0 because find_common_observations treats 0 as "not observed".
feature_matrix = feature_matrix_df.iloc[:,2:].fillna(0).to_numpy()

In [0]:
# CORUM pairs carry no score, so use data_source="corum" and a header without the 'score' column.
white_box_generate(all_interactions_df,interactions_write_path,lookup_dict_path,feature_matrix,data_source="corum",column_names=['protein1','protein2','r','r2','observations'])

In [None]:
## Generating

# Flattens Corum

In [0]:
# Load the tab-separated CORUM complex table.
path_to_corum_complexes = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/Corum/allComplexes.txt"
corum_complexes_dataframe = pd.read_csv(path_to_corum_complexes,sep='\t')
# One ';'-separated subunit string per complex ...
all_corum_subunits = corum_complexes_dataframe["subunits(UniProt IDs)"].to_numpy()
# ... split into one list of uniprot ids per complex.
all_corum_subunits_list = [subunit_string.split(";") for subunit_string in all_corum_subunits]

In [0]:
import itertools
# For every complex, list all unordered pairs of its subunits.
all_corum_subunits_pairs_list = [
  list(itertools.combinations(subunit_ids,2))
  for subunit_ids in all_corum_subunits_list
]

In [0]:
# protein -> partner -> list of ids of the complexes the two share (stored symmetrically).
corum_lookup_json = {}
for complex_position, complex_pairs in enumerate(all_corum_subunits_pairs_list):
  # Column 0 of the complexes table is used as the complex id.
  raw_complex_id = corum_complexes_dataframe.iloc[complex_position,0]
  for first_protein, second_protein in complex_pairs:
    # int(): convert the id to a built-in int before storing it.
    corum_lookup_json.setdefault(first_protein, {}).setdefault(second_protein, []).append(int(raw_complex_id))
    corum_lookup_json.setdefault(second_protein, {}).setdefault(first_protein, []).append(int(raw_complex_id))

In [0]:
# Persist the nested protein -> partner -> [complex ids] lookup.
# NOTE(review): the "Making Corum look up json" section writes a differently
# shaped lookup to this same file name; whichever cell runs last wins — confirm
# that is intended.
all_corum_complex_pairs_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/Corum/all_corum_complex_pairs.json"
write_json_to(corum_lookup_json,all_corum_complex_pairs_path)

In [0]:
# Reload the lookup from disk, so the following cells also work without rebuilding it.
corum_lookup_json = read_json_from(all_corum_complex_pairs_path)

In [0]:
# Flatten the nested lookup into (protein, partner, "id1;id2;...") rows.
rows_to_write = [
  (protein, partner, ';'.join(str(complex_id) for complex_id in complex_ids))
  for protein, partners in corum_lookup_json.items()
  for partner, complex_ids in partners.items()
]

In [0]:
def get_all_unique_pairs(pair_lists):
  """Drop pairs that repeat an earlier pair in either order.

  Only the first two elements of each item identify the pair. The first
  occurrence is kept unchanged, including any extra elements, and input
  order is preserved.
  """
  seen_pair_names = set()
  unique_pairs = []
  for candidate in pair_lists:
    forward_name = candidate[0] + ";" + candidate[1]
    backward_name = candidate[1] + ";" + candidate[0]
    if forward_name in seen_pair_names or backward_name in seen_pair_names:
      continue
    # Record both orientations so the reversed pair is recognised later.
    seen_pair_names.update((forward_name, backward_name))
    unique_pairs.append(candidate)
  return unique_pairs

In [0]:
# The lookup stores every pair in both directions; keep one row per unordered pair.
rows_to_write_deduped = get_all_unique_pairs(rows_to_write)

In [0]:
all_corum_complex_pairs_csv_path = "/content/drive/My Drive/Colab Notebooks/Research/ProteinProteinAssociation/data_sources/Corum/all_corum_complex_pairs.csv"
column_names = ['protein1','protein2','part_of_complex_id']
# append_to_csv opens the file in append mode, so re-running this cell adds a
# second header and a second copy of the rows.
append_to_csv(all_corum_complex_pairs_csv_path,column_names,rows_to_write_deduped)