# Evaluate

In [1]:
import pandas as pd
import sys
sys.path += ["../../.."]
from utils import is_valid_aa_change_in_experiment_id, is_valid_aa_change_mutation
from src.benchmark.benchmark_utils import get_mutation_corrected
import numpy as np

# Reflect changes in the modules immediately.
%load_ext autoreload
%autoreload 2

In [2]:
# Load the prediction table from CSV and preview its first rows.
predictions = pd.read_csv("kim_prediction_2021-11-19/3fc26bef/predictions_soft_2021-11-19.csv")
predictions.head()

Unnamed: 0,UniProt_ID,Mutation,Interactor_UniProt_ID,Prediction,Median_Probability
0,O96017,K373E,O43293,1,0.52331
1,O96017,K373E,O15297,1,0.70992
2,P01112,G12D,Q12967,1,0.58402
3,P01112,G12D,O95267,1,0.59477
4,P01112,G12D,Q8TDF6,1,0.60715


# Supplementary Table S3 (Supp 03)

In [3]:
# Read the Table S3 workbook once and pull the three sheets out of the result,
# instead of re-opening and re-parsing the same file for every sheet.
# Integer sheet names are zero-based positions in pandas, so 1, 2 and 3 select
# the workbook's second, third and fourth sheets; position 0 is not loaded here.
SUPP3_XLSX = "science.abf3066_tables_s2_to_s12/science.abf3066_Table_S3.xlsx"
supp3_sheets = pd.read_excel(SUPP3_XLSX, sheet_name=[1, 2, 3])
supp3_sheet1 = supp3_sheets[1]
supp3_sheet2 = supp3_sheets[2]
supp3_sheet3 = supp3_sheets[3]

In [4]:
def parse_supp3_sheet_1(data):
    """Reduce a raw Table S3 sheet to PROTEIN / MUTATION / INTERACTOR / MUTATION_EFFECT.

    A row is kept only when its ``Experiment.ID`` is present and accepted by
    ``is_valid_aa_change_in_experiment_id`` and its ``FoldChange`` is not
    missing. The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw sheet containing the columns ``Experiment.ID``, ``FoldChange``,
        ``Bait ID`` and ``Prey ID``.

    Returns
    -------
    pandas.DataFrame
        New frame that keeps the index labels of the surviving rows and has
        exactly four columns:

        * ``PROTEIN`` - copy of ``Bait ID``.
        * ``MUTATION`` - second ``_``-separated token of ``Experiment.ID``
          passed through ``get_mutation_corrected``.
        * ``INTERACTOR`` - copy of ``Prey ID``.
        * ``MUTATION_EFFECT`` - 1 when ``FoldChange`` is greater than 1
          (i.e. its log2 is positive), otherwise 0.
    """
    # Rows without an experiment id can never be valid. Dropping them first
    # means the validator and the ``split`` below only ever see real values.
    rows = data[data["Experiment.ID"].notna()]

    # ``astype(bool)`` keeps the mask boolean even when no rows are left;
    # otherwise an empty object-dtype result would not act as a row filter.
    is_valid = rows["Experiment.ID"].apply(is_valid_aa_change_in_experiment_id).astype(bool)
    rows = rows[is_valid]
    rows = rows[rows["FoldChange"].notna()]

    data_ready = rows.assign(
        PROTEIN=rows["Bait ID"],
        MUTATION=rows["Experiment.ID"].apply(
            lambda experiment_id: get_mutation_corrected(experiment_id.split("_")[1])
        ),
        INTERACTOR=rows["Prey ID"],
        # log2(fold change) > 0 is the same test as fold change > 1; comparing
        # directly is vectorised and avoids NumPy runtime warnings for zero or
        # negative fold changes.
        MUTATION_EFFECT=(rows["FoldChange"] > 1).astype(int),
    )
    return data_ready[["PROTEIN", "MUTATION", "INTERACTOR", "MUTATION_EFFECT"]].copy()

def parse_supp3_sheet_2(data):
    """Parse sheet 2 of Table S3; it is handled exactly like sheet 1.

    Delegates to ``parse_supp3_sheet_1`` and returns its result unchanged.
    """
    return parse_supp3_sheet_1(data)

def parse_supp3_sheet_3(data):
    """Parse sheet 3 of Table S3.

    Rows whose ``Experiment.ID`` is missing are discarded first; what remains
    is handed to ``parse_supp3_sheet_1`` and its result is returned as is.
    """
    rows_with_id = data.dropna(subset=["Experiment.ID"])
    return parse_supp3_sheet_1(rows_with_id)

In [5]:
# Parse Table S3 sheet 1 into PROTEIN / MUTATION / INTERACTOR / MUTATION_EFFECT rows.
supp3_sheet1_ready = parse_supp3_sheet_1(supp3_sheet1)

In [6]:
# Parse Table S3 sheet 2 into PROTEIN / MUTATION / INTERACTOR / MUTATION_EFFECT rows.
supp3_sheet2_ready = parse_supp3_sheet_2(supp3_sheet2)

In [7]:
# Parse Table S3 sheet 3; rows without an Experiment.ID are dropped by the parser first.
supp3_sheet3_ready = parse_supp3_sheet_3(supp3_sheet3)

In [8]:
# Stack the three parsed Table S3 sheets, keep a single copy of every repeated
# row, and renumber the index from 0.
supp3_merged = (
    pd.concat([supp3_sheet1_ready, supp3_sheet2_ready, supp3_sheet3_ready])
    .drop_duplicates()
    .reset_index(drop=True)
)
supp3_merged.head()

Unnamed: 0,PROTEIN,MUTATION,INTERACTOR,MUTATION_EFFECT
0,P31749,E17K,Q14532,1
1,P31749,E17K,Q16543,1
2,P31749,E17K,P33764,1
3,P31749,E17K,Q8IUC1,1
4,P31749,E17K,Q8WTT2,1


# Supplementary Table S4 (Supp 04)

In [9]:
# Read the Table S4 workbook once and pull the four sheets out of the result,
# instead of re-opening and re-parsing the same file for every sheet.
# Integer sheet names are zero-based positions in pandas, so 1-4 select the
# workbook's second to fifth sheets; position 0 is not loaded here.
SUPP4_XLSX = "science.abf3066_tables_s2_to_s12/science.abf3066_Table_S4.xlsx"
supp4_sheets = pd.read_excel(SUPP4_XLSX, sheet_name=[1, 2, 3, 4])
supp4_sheet1 = supp4_sheets[1]
supp4_sheet2 = supp4_sheets[2]
supp4_sheet3 = supp4_sheets[3]
supp4_sheet4 = supp4_sheets[4]

In [10]:
def parse_supp4_sheet_1(data):
    """Parse sheet 1 of Table S4.

    This sheet goes through exactly the same steps as Table S3 sheet 1, so the
    work is delegated to ``parse_supp3_sheet_1`` rather than kept as a second,
    line-for-line copy of that logic that would have to be edited in lockstep.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw sheet containing the columns ``Experiment.ID``, ``FoldChange``,
        ``Bait ID`` and ``Prey ID``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``parse_supp3_sheet_1`` returns for ``data``: a frame with
        the columns ``PROTEIN``, ``MUTATION``, ``INTERACTOR`` and
        ``MUTATION_EFFECT``.
    """
    return parse_supp3_sheet_1(data)

def parse_supp4_sheet_2(data):
    """Reduce Table S4 sheet 2 to PROTEIN / MUTATION / INTERACTOR / MUTATION_EFFECT.

    Unlike the other sheets, ``Experiment.ID`` is used directly as the
    mutation here (no splitting or correction step). A row is kept only when
    its ``Experiment.ID`` is present and accepted by
    ``is_valid_aa_change_mutation`` and its ``FoldChange`` is not missing.
    The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw sheet containing the columns ``Experiment.ID``, ``FoldChange``,
        ``Bait ID`` and ``Prey ID``.

    Returns
    -------
    pandas.DataFrame
        New frame that keeps the index labels of the surviving rows and has
        exactly four columns: ``PROTEIN`` (copy of ``Bait ID``), ``MUTATION``
        (copy of ``Experiment.ID``), ``INTERACTOR`` (copy of ``Prey ID``) and
        ``MUTATION_EFFECT`` (1 when ``FoldChange`` is greater than 1, i.e. its
        log2 is positive, otherwise 0).
    """
    # Drop rows without an experiment id before validating so the validator
    # only ever receives real values.
    rows = data[data["Experiment.ID"].notna()]

    # ``astype(bool)`` keeps the mask boolean even when no rows are left.
    is_valid = rows["Experiment.ID"].apply(is_valid_aa_change_mutation).astype(bool)
    rows = rows[is_valid]
    rows = rows[rows["FoldChange"].notna()]

    data_ready = rows.assign(
        PROTEIN=rows["Bait ID"],
        MUTATION=rows["Experiment.ID"],
        INTERACTOR=rows["Prey ID"],
        # log2(fold change) > 0 is the same test as fold change > 1; comparing
        # directly is vectorised and avoids NumPy runtime warnings for zero or
        # negative fold changes.
        MUTATION_EFFECT=(rows["FoldChange"] > 1).astype(int),
    )
    return data_ready[["PROTEIN", "MUTATION", "INTERACTOR", "MUTATION_EFFECT"]].copy()

def parse_supp4_sheet_3(data):
    """Parse sheet 3 of Table S4; it is handled exactly like sheet 1.

    Delegates to ``parse_supp4_sheet_1`` and returns its result unchanged.
    """
    return parse_supp4_sheet_1(data)

def parse_supp4_sheet_4(data):
    """Parse sheet 4 of Table S4; it is handled exactly like sheet 1.

    Delegates to ``parse_supp4_sheet_1`` and returns its result unchanged.
    """
    return parse_supp4_sheet_1(data)

In [11]:
# Parse Table S4 sheet 1 into PROTEIN / MUTATION / INTERACTOR / MUTATION_EFFECT rows.
supp4_sheet1_ready = parse_supp4_sheet_1(supp4_sheet1)

In [12]:
# Parse Table S4 sheet 2; this parser takes the mutation straight from Experiment.ID.
supp4_sheet2_ready = parse_supp4_sheet_2(supp4_sheet2)

In [13]:
# Parse Table S4 sheet 3 (same handling as sheet 1).
supp4_sheet3_ready = parse_supp4_sheet_3(supp4_sheet3)

In [14]:
# Parse Table S4 sheet 4 (same handling as sheet 1).
supp4_sheet4_ready = parse_supp4_sheet_4(supp4_sheet4)

In [15]:
# Preview the first three parsed rows of Table S4 sheet 1.
supp4_sheet1_ready.head(3)

Unnamed: 0,PROTEIN,MUTATION,INTERACTOR,MUTATION_EFFECT
0,P31749,E17K,P33764,1
1,P31749,E17K,Q14532,1
2,P31749,E17K,Q16543,1


In [16]:
# Preview the first three parsed rows of Table S4 sheet 2.
supp4_sheet2_ready.head(3)

Unnamed: 0,PROTEIN,MUTATION,INTERACTOR,MUTATION_EFFECT
39,P38398,C61G,Q9BX63,1
40,P38398,C61G,P40692,1
41,P38398,C61G,Q96RL1,1


In [17]:
# Preview the first three parsed rows of Table S4 sheet 3.
supp4_sheet3_ready.head(3)

Unnamed: 0,PROTEIN,MUTATION,INTERACTOR,MUTATION_EFFECT
19,P12830,E243K,P35222,1
20,P12830,E243K,O60716,1
57,P01112,G12D,P13995,1


In [18]:
# Preview the first three parsed rows of Table S4 sheet 4.
supp4_sheet4_ready.head(3)

Unnamed: 0,PROTEIN,MUTATION,INTERACTOR,MUTATION_EFFECT
0,P31749,E17K,P11717,1
21,P12830,E243K,P35222,1
22,P12830,E243K,P35221,1


In [21]:
# Stack the four parsed Table S4 sheets, keep a single copy of every repeated
# row, and renumber the index from 0.
supp4_merged = (
    pd.concat(
        [supp4_sheet1_ready, supp4_sheet2_ready, supp4_sheet3_ready, supp4_sheet4_ready]
    )
    .drop_duplicates()
    .reset_index(drop=True)
)
supp4_merged.head()

Unnamed: 0,PROTEIN,MUTATION,INTERACTOR,MUTATION_EFFECT
0,P31749,E17K,P33764,1
1,P31749,E17K,Q14532,1
2,P31749,E17K,Q16543,1
3,P38398,C61G,Q9BX63,1
4,P38398,C61G,P40692,1


# Merge All Excel Files

In [25]:
# Combine the Table S3 and Table S4 results; rows present in both tables are
# kept once and the index is renumbered from 0.
kim_et_al_data = (
    pd.concat([supp3_merged, supp4_merged])
    .drop_duplicates()
    .reset_index(drop=True)
)
kim_et_al_data

Unnamed: 0,PROTEIN,MUTATION,INTERACTOR,MUTATION_EFFECT
0,P31749,E17K,Q14532,1
1,P31749,E17K,Q16543,1
2,P31749,E17K,P33764,1
3,P31749,E17K,Q8IUC1,1
4,P31749,E17K,Q8WTT2,1
...,...,...,...,...
62247,P04637,R273H,Q9H2K0,0
62248,P04637,R273H,P10412,0
62249,P04637,R273H,P50750,0
62250,P04637,R273H,Q7Z7K0,0


# Extract Protein and Mutation Pairs

In [28]:
# Collect every distinct (protein, mutation) combination in the merged data.
protein_mutation_pairs = set(
    zip(kim_et_al_data["PROTEIN"], kim_et_al_data["MUTATION"])
)
# A notebook cell only echoes its final expression, so the pair count has to be
# printed explicitly to show up in the output.
print(f"{len(protein_mutation_pairs)} unique (protein, mutation) pairs")
protein_mutation_pairs

{('O96017', 'K373E'),
 ('P01112', 'G12D'),
 ('P04637', 'R175H'),
 ('P04637', 'R248W'),
 ('P04637', 'R273H'),
 ('P12830', 'E243K'),
 ('P31749', 'E17K'),
 ('P38398', 'C61G'),
 ('P38398', 'I26A'),
 ('P38398', 'M1775R'),
 ('P38398', 'R71G'),
 ('P38398', 'S1655F'),
 ('P42336', 'E545K'),
 ('P42336', 'H1047R'),
 ('P42336', 'M1043V'),
 ('P60484', 'R130Q'),
 ('Q86UE4', 'A78S'),
 ('Q86YC2', 'E837K'),
 ('Q9BX63', 'A745T'),
 ('Q9Y243', 'E17K')}

In [29]:
# Write one "<protein>.<mutation>" line per pair.
# Iteration order of a set of strings can differ between Python processes, so
# the lines are sorted to keep the generated file identical from run to run.
pair_lines = sorted(
    f"{protein}.{mutation}\n" for protein, mutation in protein_mutation_pairs
)
with open("ELASPIC_Input/input_pairs_20.txt", "w") as file:
    file.writelines(pair_lines)