# Comparing Predator with other studies

February 16th, 2022

Studies:
1. Predator
2. SAAMBE_3D
3. PANDA
4. mCSM-PPI2

In [1]:
import pandas as pd

In [2]:
PREDATOR_PREDICTIONS_DATA_PATH = "../../../dev/leave_one_complex_out/Leave_one_complex_out_predictions_2022-02-14.csv"
SAAMBE_3D_PREDICTIONS_DATA_PATH = "../../SAAMBE_3D_Study/SAAMBE-3D-master/PredatorBenchmark/SAAMBE_3D_predictions_2022-02-12.csv"
PANDA_PREDICTIONS_DATA_PATH = "../../PANDA/panda-master/PredatorBenchmark/train_data_with_PANDA_prediction_2022-02-14.csv"
MCSM_PPI2_PREDICTION_DATA_PATH = "../mCSM_PPI2/src/Record/mcsm_ppi2_predictions_2022-02-16.csv"

In [3]:
def introduce_triplet_column(data):
    data.insert(
        0, "Triplet", data.apply(
            lambda row: (row["UniProt_ID"], row["Mutation"], row["Interactor_UniProt_ID"]), axis=1
        )
    )

    data.drop(columns=["UniProt_ID", "Mutation", "Interactor_UniProt_ID"], inplace=True)

    data = data.sort_values("Triplet", ignore_index=True)

    return data

#### 1. Predator Predictions

In [4]:
predator_predictions_data = pd.read_csv(PREDATOR_PREDICTIONS_DATA_PATH)

predator_predictions_data = introduce_triplet_column(predator_predictions_data)

predator_predictions_data

Unnamed: 0,Triplet,Mutation_Effect_Label,Predator_default,Predator_feature_selected,Predator_feature_selected_tuned
0,"(A5PKW4, E621K, P62330)",0,0,0,0
1,"(O00267, I194V, P63272)",1,1,1,1
2,"(O00311, I99L, Q9UBU7)",1,1,1,1
3,"(O00311, I99T, Q9UBU7)",1,1,1,1
4,"(O00311, K416T, Q9UBU7)",1,0,0,0
...,...,...,...,...,...
735,"(Q9Y570, H349A, P67775)",0,0,0,0
736,"(Q9Y570, S156A, P67775)",0,0,0,0
737,"(Q9Y570, S156A, P67775-2)",0,0,0,0
738,"(Q9Y570, S156T, P67775)",0,0,0,0


#### 2. SAAMBE-3D Predictions

In [5]:
saambe_3d_predictions_data = pd.read_csv(SAAMBE_3D_PREDICTIONS_DATA_PATH)

saambe_3d_predictions_data.rename(
    columns={"Prediction": "SAAMBE_3D"}, inplace=True
)

saambe_3d_predictions_data["SAAMBE_3D"] = saambe_3d_predictions_data["SAAMBE_3D"].fillna("not_available")

saambe_3d_predictions_data = introduce_triplet_column(saambe_3d_predictions_data)
saambe_3d_predictions_data

Unnamed: 0,Triplet,Mutation_Effect_Label,SAAMBE_3D
0,"(A5PKW4, E621K, P62330)",0,not_available
1,"(O00267, I194V, P63272)",1,1.0
2,"(O00311, I99L, Q9UBU7)",1,1.0
3,"(O00311, I99T, Q9UBU7)",1,0.0
4,"(O00311, K416T, Q9UBU7)",1,1.0
...,...,...,...
735,"(Q9Y570, H349A, P67775)",0,1.0
736,"(Q9Y570, S156A, P67775)",0,not_available
737,"(Q9Y570, S156A, P67775-2)",0,not_available
738,"(Q9Y570, S156T, P67775)",0,not_available


#### 3. PANDA Predictions

In [6]:
panda_predictions_data = pd.read_csv(PANDA_PREDICTIONS_DATA_PATH)
panda_predictions_data.rename(columns={"PANDA_Predictions": "PANDA"}, inplace=True)
panda_predictions_data = introduce_triplet_column(panda_predictions_data)
panda_predictions_data

Unnamed: 0,Triplet,Mutation_Effect_Label,PANDA
0,"(A5PKW4, E621K, P62330)",0,0
1,"(O00267, I194V, P63272)",1,0
2,"(O00311, I99L, Q9UBU7)",1,0
3,"(O00311, I99T, Q9UBU7)",1,0
4,"(O00311, K416T, Q9UBU7)",1,0
...,...,...,...
735,"(Q9Y570, H349A, P67775)",0,0
736,"(Q9Y570, S156A, P67775)",0,0
737,"(Q9Y570, S156A, P67775-2)",0,0
738,"(Q9Y570, S156T, P67775)",0,0


#### 4. mCSM_PPI2 Predictions

In [8]:
mcsm_ppi2_predictions_data = pd.read_csv(MCSM_PPI2_PREDICTION_DATA_PATH)

mcsm_ppi2_predictions_data.rename(
    columns={"mCSM_PP2_Prediction": "mCSM_PP2"}, inplace=True
)

mcsm_ppi2_predictions_data["mCSM_PP2"] = mcsm_ppi2_predictions_data["mCSM_PP2"].fillna("not_available")

mcsm_ppi2_predictions_data = introduce_triplet_column(mcsm_ppi2_predictions_data)
mcsm_ppi2_predictions_data

Unnamed: 0,Triplet,Mutation_Effect_Label,mCSM_PP2
0,"(A5PKW4, E621K, P62330)",0,not_available
1,"(O00267, I194V, P63272)",1,0.0
2,"(O00311, I99L, Q9UBU7)",1,0.0
3,"(O00311, I99T, Q9UBU7)",1,0.0
4,"(O00311, K416T, Q9UBU7)",1,0.0
...,...,...,...
735,"(Q9Y570, H349A, P67775)",0,0.0
736,"(Q9Y570, S156A, P67775)",0,not_available
737,"(Q9Y570, S156A, P67775-2)",0,not_available
738,"(Q9Y570, S156T, P67775)",0,not_available


## Benchmark data

In [9]:
# Add PREDATOR
benchmark_data = predator_predictions_data.copy()

# Add SAMBE_3D
benchmark_data["SAAMBE_3D"] = saambe_3d_predictions_data["SAAMBE_3D"]

# Add PANDA
benchmark_data["PANDA"] = panda_predictions_data["PANDA"]

# Add mCSM_PPI2
benchmark_data["mCSM_PP2"] = mcsm_ppi2_predictions_data["mCSM_PP2"]

benchmark_data

Unnamed: 0,Triplet,Mutation_Effect_Label,Predator_default,Predator_feature_selected,Predator_feature_selected_tuned,SAAMBE_3D,PANDA,mCSM_PP2
0,"(A5PKW4, E621K, P62330)",0,0,0,0,not_available,0,not_available
1,"(O00267, I194V, P63272)",1,1,1,1,1.0,0,0.0
2,"(O00311, I99L, Q9UBU7)",1,1,1,1,1.0,0,0.0
3,"(O00311, I99T, Q9UBU7)",1,1,1,1,0.0,0,0.0
4,"(O00311, K416T, Q9UBU7)",1,0,0,0,1.0,0,0.0
...,...,...,...,...,...,...,...,...
735,"(Q9Y570, H349A, P67775)",0,0,0,0,1.0,0,0.0
736,"(Q9Y570, S156A, P67775)",0,0,0,0,not_available,0,not_available
737,"(Q9Y570, S156A, P67775-2)",0,0,0,0,not_available,0,not_available
738,"(Q9Y570, S156T, P67775)",0,0,0,0,not_available,0,not_available


Drop NAN entries

In [11]:
benchmark_data_drop_nan = benchmark_data[benchmark_data["SAAMBE_3D"] != "not_available"].copy()
benchmark_data_drop_nan["SAAMBE_3D"] = benchmark_data_drop_nan["SAAMBE_3D"].astype(int)
benchmark_data_drop_nan["mCSM_PP2"] = benchmark_data_drop_nan["mCSM_PP2"].astype(int)
benchmark_data_drop_nan

Unnamed: 0,Triplet,Mutation_Effect_Label,Predator_default,Predator_feature_selected,Predator_feature_selected_tuned,SAAMBE_3D,PANDA,mCSM_PP2
1,"(O00267, I194V, P63272)",1,1,1,1,1,0,0
2,"(O00311, I99L, Q9UBU7)",1,1,1,1,1,0,0
3,"(O00311, I99T, Q9UBU7)",1,1,1,1,0,0,0
4,"(O00311, K416T, Q9UBU7)",1,0,0,0,1,0,0
5,"(O00311, Q106R, Q9UBU7)",1,0,1,1,1,1,0
...,...,...,...,...,...,...,...,...
731,"(Q9Y4K3, L74E, P61088)",0,0,0,0,1,0,0
732,"(Q9Y4K3, L74H, P61088)",0,0,0,0,1,0,0
733,"(Q9Y4K3, L74K, P61088)",0,0,0,0,1,0,0
734,"(Q9Y4K3, L74R, P61088)",0,0,0,0,1,0,0


# Evaluation Metric

In [12]:
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    matthews_corrcoef,
    precision_score,
    recall_score,
)

In [13]:
def get_scoring_metrics(y_true, y_pred):
    metrics_data = pd.DataFrame(
        [
            accuracy_score(y_true, y_pred),
            balanced_accuracy_score(y_true, y_pred),
            f1_score(y_true, y_pred),
            matthews_corrcoef(y_true, y_pred),
            precision_score(y_true, y_pred),
            recall_score(y_true, y_pred),
        ],
        columns=["SCORE"],
        index=["ACCURACY", "BALANCED_ACCURACY", "F1", "MATTEWS_COR", "PRECISION", "RECALL"])

    return metrics_data

In [16]:
import pandas as pd

metrics_comparison_data = pd.concat(
    [
        get_scoring_metrics(benchmark_data_drop_nan["Mutation_Effect_Label"], benchmark_data_drop_nan["Predator_default"]),
        get_scoring_metrics(benchmark_data_drop_nan["Mutation_Effect_Label"], benchmark_data_drop_nan["Predator_feature_selected"]),
        get_scoring_metrics(benchmark_data_drop_nan["Mutation_Effect_Label"], benchmark_data_drop_nan["Predator_feature_selected_tuned"]),
        get_scoring_metrics(benchmark_data_drop_nan["Mutation_Effect_Label"], benchmark_data_drop_nan["SAAMBE_3D"]),
        get_scoring_metrics(benchmark_data_drop_nan["Mutation_Effect_Label"], benchmark_data_drop_nan["PANDA"]),
        get_scoring_metrics(benchmark_data_drop_nan["Mutation_Effect_Label"], benchmark_data_drop_nan["mCSM_PP2"]),

    ], axis="columns"
)
metrics_comparison_data.columns = ["Predator_default", "Predator_Shap_10", "Predator_Shap_10_Tuned", "SAAMBE_3D", "PANDA", "mCSM_PP2"]
print(f"Prediction results in {len(benchmark_data_drop_nan)} entries.")
metrics_comparison_data

Prediction results in 365 entries.


Unnamed: 0,Predator_default,Predator_Shap_10,Predator_Shap_10_Tuned,SAAMBE_3D,PANDA,mCSM_PP2
ACCURACY,0.857534,0.827397,0.8,0.263014,0.684932,0.8
BALANCED_ACCURACY,0.717573,0.667823,0.658817,0.481427,0.467681,0.61984
F1,0.59375,0.496,0.47482,0.348668,0.094488,0.396694
MATTEWS_COR,0.547219,0.43453,0.365185,-0.05334,-0.082056,0.32381
PRECISION,0.826087,0.72093,0.578947,0.217523,0.133333,0.615385
RECALL,0.463415,0.378049,0.402439,0.878049,0.073171,0.292683


In [None]:
# the end

### ---------------------------

In [51]:
saambe_3d_predictions_data["SAAMBE_3D"]
saambe_3d_predictions_data["Mutation_Effect_Label"]

0      0
1      1
2      1
3      1
4      1
      ..
735    0
736    0
737    0
738    0
739    0
Name: Mutation_Effect_Label, Length: 740, dtype: int64

In [52]:
predator_predictions_data["Triplet"]

0        (A5PKW4, E621K, P62330)
1        (O00267, I194V, P63272)
2         (O00311, I99L, Q9UBU7)
3         (O00311, I99T, Q9UBU7)
4        (O00311, K416T, Q9UBU7)
                 ...            
735      (Q9Y570, H349A, P67775)
736      (Q9Y570, S156A, P67775)
737    (Q9Y570, S156A, P67775-2)
738      (Q9Y570, S156T, P67775)
739    (Q9Y570, S156T, P67775-2)
Name: Triplet, Length: 740, dtype: object

In [53]:
predator_predictions_data["Triplet"].tolist() == saambe_3d_predictions_data["Triplet"].tolist() == panda_predictions_data["Triplet"].tolist()

True

In [54]:
predator_predictions_data

Unnamed: 0,Triplet,Mutation_Effect_Label,Predator_default,Predator_feature_selected,Predator_feature_selected_tuned
0,"(A5PKW4, E621K, P62330)",0,0,0,0
1,"(O00267, I194V, P63272)",1,1,1,1
2,"(O00311, I99L, Q9UBU7)",1,1,1,1
3,"(O00311, I99T, Q9UBU7)",1,1,1,1
4,"(O00311, K416T, Q9UBU7)",1,0,0,0
...,...,...,...,...,...
735,"(Q9Y570, H349A, P67775)",0,0,0,0
736,"(Q9Y570, S156A, P67775)",0,0,0,0
737,"(Q9Y570, S156A, P67775-2)",0,0,0,0
738,"(Q9Y570, S156T, P67775)",0,0,0,0


In [55]:
pd.merge(
    predator_predictions_data,
    saambe_3d_predictions_data,
    on=["Mutation_Effect_Label", "UniProt_ID", "Mutation", "Interactor_UniProt_ID"],
    how="inner"
)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\ibrah\anaconda3\envs\MyVenvML\lib\site-packages\IPython\core\interactiveshell.py", line 3441, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-55-39796ea77a87>", line 1, in <module>
    pd.merge(
  File "C:\Users\ibrah\anaconda3\envs\MyVenvML\lib\site-packages\pandas\core\reshape\merge.py", line 74, in merge
    op = _MergeOperation(
  File "C:\Users\ibrah\anaconda3\envs\MyVenvML\lib\site-packages\pandas\core\reshape\merge.py", line 668, in __init__
    ) = self._get_merge_keys()
  File "C:\Users\ibrah\anaconda3\envs\MyVenvML\lib\site-packages\pandas\core\reshape\merge.py", line 1033, in _get_merge_keys
    right_keys.append(right._get_label_or_level_values(rk))
  File "C:\Users\ibrah\anaconda3\envs\MyVenvML\lib\site-packages\pandas\core\generic.py", line 1684, in _get_label_or_level_values
    raise KeyError(key)
KeyError: 'UniProt_ID'

During handling of the above exception, another exception 

TypeError: object of type 'NoneType' has no len()

In [None]:
pd.merge(

)

In [None]:
# Add PREDATOR
benchmark_data = predator_predictions_data.copy()

benchmark_data

In [None]:
saambe_3d_predictions_data

In [None]:
# Add SAAMBE-3D
benchmark_data["SAAMBE_3D"] = saambe_3d_predictions_data["Prediction"]

In [None]:
# Add PANDA
benchmark_data["PANDA"] = panda_predictions_data["PANDA_Predictions"]

In [None]:
benchmark_data