In [4]:
import numpy as np
import pandas as pd
import json
import csv
# Root folder from which the input/output paths used in this notebook are built.
# NOTE(review): machine-specific absolute path - adjust before running on another machine.
PATH_ROOT = "D:/Desktop/Northeastern_University/Research/Proteomics/ProteinProteinAssociation/Development"

## Load Data

In [5]:
def append_to_csv(file_name,column_names,cor_tuples):
    """Append rows to a CSV file, creating the file if it does not exist.

    Args:
        file_name: Path of the CSV file to append to.
        column_names: Header row to write before the data rows, or ``None``
            to write no header. The header is written on every call that
            passes one, so pass ``None`` when appending to an existing file.
        cor_tuples: Iterable of row tuples/lists to write, one CSV row each.
    """
    # newline='' lets the csv module control line endings itself.
    with open(file_name, 'a+', newline='') as write_obj:
        # Use the module-qualified name: only `csv` is imported in this file,
        # so a bare `writer(...)` raises NameError.
        csv_writer = csv.writer(write_obj)
        if column_names is not None:
            csv_writer.writerow(column_names)
        csv_writer.writerows(cor_tuples)

def write_json_to(json_dict,path):
    """Serialise ``json_dict`` to JSON and write it to ``path``, overwriting the file.

    Args:
        json_dict: JSON-serialisable object to store.
        path: Destination file path.
    """
    # Serialise before opening the file so a serialisation error does not
    # truncate an existing file.
    json_to_write = json.dumps(json_dict)
    # The context manager closes the handle even if the write raises.
    with open(path, "w") as write_file:
        write_file.write(json_to_write)

def read_json_from(path):
    """Return the object decoded from the JSON file at ``path``."""
    with open(path, "r") as json_file:
        raw_text = json_file.read()
    return json.loads(raw_text)

def progress_report(cur_idx,total,start_time):
    """Print the fraction done, elapsed time and a linear estimate of time remaining.

    Args:
        cur_idx: Zero-based index of the item just processed.
        total: Total number of items to process.
        start_time: ``time.time()`` value captured when processing started.
    """
    # Imported locally because the notebook's import cell does not import
    # `time`; without this the function raises NameError.
    import time

    percent_done = (cur_idx+1) / total
    print(f"Percent done: {percent_done}")
    time_since_start = time.time() - start_time
    print(f"Time elasped: {time_since_start}")
    # Linear extrapolation: projected total duration minus time already spent.
    print(f"Estimate finishing in: {time_since_start / (percent_done) - time_since_start}")

# Load the CORUM pair lookup: a nested mapping protein -> partner protein -> list of
# complex ids. limit_complex_pairs reads it to find the complexes a pair belongs to.
corum_complex_pairs_lookup_path = f"{PATH_ROOT}/data_sources/Corum/all_corum_complex_pairs.json" 
corum_complex_pairs_lookup_json = read_json_from(corum_complex_pairs_lookup_path)

In [6]:
# Show the loaded lookup to check its nested structure (the full dump is large).
corum_complex_pairs_lookup_json

{'P41182': {'P56524': [1],
  'Q9UQL6': [2],
  'Q8WUI4': [3],
  'O94776': [41],
  'O95983': [41, 585],
  'Q13547': [41, 585],
  'Q9BTC8': [41, 585],
  'Q14839': [585],
  'Q13105': [1508]},
 'P56524': {'P41182': [1],
  'P27361': [55],
  'P28482': [57],
  'P59768': [1620],
  'P62873': [1620]},
 'Q9UQL6': {'P41182': [2], 'P59768': [1619], 'P62873': [1619]},
 'Q8WUI4': {'P41182': [3]},
 'Q09472': {'Q92793': [4, 570, 571, 2638],
  'Q92831': [4, 6653],
  'Q9Y6Q9': [4],
  'P04637': [98],
  'Q00987': [98],
  'O14497': [570, 571],
  'P51532': [570],
  'Q12824': [570],
  'Q8TAQ2': [570],
  'Q92922': [570],
  'Q9UK53': [1158, 1160],
  'P12004': [1160],
  'O43463': [1471, 5118],
  'Q08999': [1471, 5118],
  'Q13547': [1471, 5118],
  'Q15329': [1471, 5118],
  'P40763': [1521],
  'Q15797': [1521, 2642],
  'P84022': [1831],
  'Q9Y6X2': [1831],
  'P24928': [2638, 2639],
  'P50613': [2638, 2639],
  'Q06330': [2638, 2639],
  'Q7KZ85': [2638, 2639],
  'O60563': [2639],
  'P46531': [2639],
  'P49336': [2639

In [15]:
def limit_complex_pairs(df_to_clean,complex_cap,column_names,lookup=None):
    """Drop rows so that at most ``complex_cap`` kept pairs come from any one complex.

    Rows are visited in frame order. For each complex id the lookup lists for
    the row's (protein1, protein2) pair, that complex's counter is incremented;
    the row is marked for removal as soon as one of its complexes has been
    counted more than ``complex_cap`` times. Pairs that are absent from the
    lookup are never dropped.

    NOTE: counters are incremented for every row that references a complex,
    including rows that end up dropped. The returned counts are therefore
    totals seen (and may exceed ``complex_cap``), and a row dropped because of
    one complex still counts towards its other complexes.

    Args:
        df_to_clean: DataFrame with one protein pair per row.
        complex_cap: Number of rows per complex after which further rows
            referencing that complex are dropped.
        column_names: Sequence whose first two items are the names of the
            two protein columns.
        lookup: Optional nested mapping
            ``{protein1: {protein2: [complex_id, ...]}}``. Defaults to the
            notebook-level ``corum_complex_pairs_lookup_json``.

    Returns:
        Tuple ``(cleaned_df, complex_counter)`` where ``cleaned_df`` is a new
        DataFrame without the dropped rows and ``complex_counter`` maps each
        complex id to the number of rows that referenced it.
    """
    if lookup is None:
        lookup = corum_complex_pairs_lookup_json
    p1_col_name = column_names[0]
    p2_col_name = column_names[1]
    drop_indices = set()
    complex_counter = {}
    # Iterate over the two columns directly; much cheaper than iterrows(),
    # which builds a Series for every row.
    pairs = zip(df_to_clean.index, df_to_clean[p1_col_name], df_to_clean[p2_col_name])
    for idx, p1, p2 in pairs:
        # .get() on both levels so a protein (or pair) that is missing from
        # the lookup is skipped instead of raising KeyError.
        for complex_id in lookup.get(p1, {}).get(p2, ()):
            complex_counter[complex_id] = complex_counter.get(complex_id, 0) + 1
            if complex_counter[complex_id] > complex_cap:
                drop_indices.add(idx)
    return df_to_clean.drop(list(drop_indices)),complex_counter

In [19]:
# Pairs stored in has_validation.csv (later labelled is_validated = 1).
validated_corum_complex_pairs_df_path = f"{PATH_ROOT}/data_sources/ProteomeHD/corum_validated/has_validation.csv"
validated_corum_complex_pairs_df = pd.read_csv(validated_corum_complex_pairs_df_path)
# Pairs stored in has_no_validation.csv; this file already carries an is_validated column.
non_validated_corum_complex_pairs_df_path = f"{PATH_ROOT}/data_sources/ProteomeHD/corum_validated/has_no_validation.csv"
non_validated_corum_complex_pairs_df = pd.read_csv(non_validated_corum_complex_pairs_df_path)

In [12]:
# Preview the validated pairs as loaded, before any capping.
validated_corum_complex_pairs_df

Unnamed: 0.1,Unnamed: 0,protein1,protein2,r,r2,observations,normalized_manhattan_distance
0,0,P56524,P27361,-0.049762,0.002476,39,0.943735
1,1,P56524,P28482,-0.052761,0.002784,43,0.964669
2,2,P56524,P59768,0.250236,0.062618,9,0.927765
3,3,P56524,P62873,0.045789,0.002097,42,0.917572
4,4,Q09472,Q92793,0.139837,0.019554,94,0.902074
...,...,...,...,...,...,...,...
36236,36236,Q12802,Q16539,0.156922,0.024624,158,0.892161
36237,36237,Q12802,Q9NYL2,0.116803,0.013643,57,0.986548
36238,36238,Q16512,Q16539,0.330290,0.109092,191,0.837066
36239,36239,Q16512,Q9NYL2,-0.108800,0.011837,65,0.948991


In [16]:
# Cap the validated pairs at 10 per complex; the same 10 appears in the output
# file name (combined_max_10_pairs_per_complex.csv), so keep the two in sync.
validated_corum_complex_pairs_df_cleaned,complex_counter = limit_complex_pairs(validated_corum_complex_pairs_df,10,("protein1","protein2"))

In [18]:
# Summary statistics of the capped frame; the `count` row shows how many pairs survived.
validated_corum_complex_pairs_df_cleaned.describe()

Unnamed: 0.1,Unnamed: 0,r,r2,observations,normalized_manhattan_distance
count,4557.0,4557.0,4557.0,4557.0,4504.0
mean,20994.676542,0.194696,0.174376,102.441519,0.876632
std,14300.994239,0.369458,0.219462,86.250256,0.109201
min,0.0,-1.0,0.0,0.0,0.046154
25%,4314.0,-0.024398,0.017846,26.0,0.826633
50%,29738.0,0.205013,0.082351,78.0,0.90331
75%,33723.0,0.454673,0.254021,170.0,0.953748
max,36240.0,1.0,1.0,294.0,1.0


In [22]:
# Preview the non-validated pairs to see which columns differ from the validated frame.
non_validated_corum_complex_pairs_df

Unnamed: 0.1,Unnamed: 0,protein1_majority_name,protein2_majority_name,protein1_name,protein2_name,r,r2,observations,is_validated,belong_to,normalized_manhattan_distance
0,0,P09488;P09488-2,O75410;O75410-2;O75410-7;O75410-3;O75410-6;O75...,P09488,O75410,0.006984,0.000049,114,0,,0.952787
1,1,Q9Y6K9-2;Q9Y6K9,Q69YQ0-2;Q69YQ0,Q9Y6K9,Q69YQ0,-0.396372,0.157111,50,0,,0.927501
2,2,Q6NUS8,Q13442,Q6NUS8,Q13442,0.549295,0.301725,7,0,,0.850621
3,3,O14924-4;O14924;O14924-3,P09429,O14924,P09429,-0.679201,0.461315,12,0,,0.944358
4,4,Q04323,P55212,Q04323,P55212,-0.471113,0.221948,178,0,,0.959469
...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,Q86SQ0;Q86SQ0-2;Q86SQ0-3,Q5VTL8;Q5VTL8-2,Q86SQ0,Q5VTL8,-0.056373,0.003178,42,0,,0.950999
99996,99996,Q16853;Q16853-2,Q96N67-4;Q96N67-2;Q96N67-3;Q96N67-5;Q96N67-6;Q...,Q16853,Q96N67,0.182728,0.033389,14,0,,1.000000
99997,99997,Q9ULT0;Q9ULT0-4,Q14289-2,Q9ULT0,Q14289,-0.425692,0.181214,50,0,,0.996100
99998,99998,Q01518,O95866;O95866-5;O95866-3,Q01518,O95866,0.047692,0.002275,17,0,,1.000000


In [37]:
# Give the non-validated frame the same protein column names as the validated one:
# drop the majority-name and belong_to columns, then rename protein*_name -> protein*.
non_validated_corum_complex_pairs_df_no_belong_to = (
    non_validated_corum_complex_pairs_df
    .drop(columns=['protein1_majority_name','protein2_majority_name','belong_to'])
    .rename(columns={"protein1_name":"protein1","protein2_name":"protein2"})
)
# Label every capped validated pair with is_validated = 1.
belong_to_col = [1] * len(validated_corum_complex_pairs_df_cleaned)
# DataFrame.insert mutates in place and raises ValueError if the column already
# exists, so guard it to keep this cell safe to re-run.
if "is_validated" not in validated_corum_complex_pairs_df_cleaned.columns:
    validated_corum_complex_pairs_df_cleaned.insert(4,"is_validated",belong_to_col)

In [39]:
# Check that the is_validated column was added to the capped validated frame.
validated_corum_complex_pairs_df_cleaned

Unnamed: 0.1,Unnamed: 0,protein1,protein2,r,is_validated,r2,observations,normalized_manhattan_distance
0,0,P56524,P27361,-0.049762,1,0.002476,39,0.943735
1,1,P56524,P28482,-0.052761,1,0.002784,43,0.964669
2,2,P56524,P59768,0.250236,1,0.062618,9,0.927765
3,3,P56524,P62873,0.045789,1,0.002097,42,0.917572
4,4,Q09472,Q92793,0.139837,1,0.019554,94,0.902074
...,...,...,...,...,...,...,...,...
36236,36236,Q12802,Q16539,0.156922,1,0.024624,158,0.892161
36237,36237,Q12802,Q9NYL2,0.116803,1,0.013643,57,0.986548
36238,36238,Q16512,Q16539,0.330290,1,0.109092,191,0.837066
36239,36239,Q16512,Q9NYL2,-0.108800,1,0.011837,65,0.948991


In [40]:
# Stack non-validated rows first, then the capped validated rows; ignore_index=True
# discards both original indexes and renumbers the combined rows from 0.
new_df = pd.concat([non_validated_corum_complex_pairs_df_no_belong_to, validated_corum_complex_pairs_df_cleaned], ignore_index=True)

In [42]:
# Remove the "Unnamed: 0" column that came in with the CSVs.
# errors="ignore" keeps this self-referential cell safe to re-run: without it a
# second execution raises KeyError because the column is already gone.
new_df = new_df.drop(columns=["Unnamed: 0"], errors="ignore")

In [43]:
# Preview the combined frame (non-validated rows first, validated rows last).
new_df

Unnamed: 0,protein1,protein2,r,r2,observations,is_validated,normalized_manhattan_distance
0,P09488,O75410,0.006984,0.000049,114,0,0.952787
1,Q9Y6K9,Q69YQ0,-0.396372,0.157111,50,0,0.927501
2,Q6NUS8,Q13442,0.549295,0.301725,7,0,0.850621
3,O14924,P09429,-0.679201,0.461315,12,0,0.944358
4,Q04323,P55212,-0.471113,0.221948,178,0,0.959469
...,...,...,...,...,...,...,...
104552,Q12802,Q16539,0.156922,0.024624,158,1,0.892161
104553,Q12802,Q9NYL2,0.116803,0.013643,57,1,0.986548
104554,Q16512,Q16539,0.330290,0.109092,191,1,0.837066
104555,Q16512,Q9NYL2,-0.108800,0.011837,65,1,0.948991


In [45]:
# Save the combined table; index=False keeps the row index out of the CSV.
# The "max_10" in the file name mirrors the cap passed to limit_complex_pairs.
new_df_write_path = f"{PATH_ROOT}/data_sources/ProteomeHD/corum_validated/combined_max_10_pairs_per_complex.csv"
new_df.to_csv(new_df_write_path,index=False)

In [17]:
# Per-complex pair counts returned by limit_complex_pairs. The counter is incremented
# for dropped rows too, so values above the cap are totals seen, not rows kept.
complex_counter

{55: 1,
 57: 1,
 1620: 3,
 4: 3,
 570: 21,
 571: 3,
 2638: 15,
 98: 1,
 1158: 1,
 1160: 3,
 1471: 6,
 5118: 10,
 1521: 3,
 2642: 1,
 1831: 3,
 2639: 78,
 7581: 153,
 7582: 136,
 5260: 10,
 5261: 3,
 6354: 3,
 5375: 1,
 6502: 3,
 6653: 1,
 7328: 1,
 2727: 21,
 2829: 45,
 3066: 66,
 3061: 10,
 3062: 3,
 3137: 28,
 3142: 21,
 2728: 6,
 3750: 1,
 3754: 3,
 3753: 3,
 3749: 1,
 2958: 1,
 5198: 3,
 5264: 3,
 5573: 1,
 6590: 1,
 1054: 3,
 2124: 3,
 6451: 1,
 10: 10,
 157: 10,
 159: 21,
 167: 10,
 49: 21,
 160: 10,
 353: 3,
 5239: 1,
 12: 3,
 654: 55,
 13: 1,
 15: 15,
 58: 10,
 741: 10,
 752: 3,
 1188: 3,
 2183: 45,
 1413: 45,
 1505: 21,
 7480: 6,
 5614: 190,
 5609: 36,
 5426: 1,
 6244: 3,
 7406: 3,
 7487: 1,
 1255: 3,
 3044: 3,
 3204: 1,
 3205: 1,
 3206: 1,
 23: 28,
 652: 91,
 7545: 28,
 6416: 3,
 6415: 3,
 6689: 1,
 7217: 6,
 6692: 1,
 27: 21,
 30: 1,
 192: 120,
 193: 630,
 32: 190,
 181: 231,
 443: 10,
 704: 3,
 705: 3,
 33: 15,
 112: 15,
 145: 15,
 3074: 15,
 781: 36,
 5199: 120,
 36: 28,
 