# Leave One Complex Out

February 14th, 2022


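Here we use a single random forest classifier in a leave-one-complex-out scheme: in each fold, the entries of one protein are held out for testing and the model is trained on all remaining entries.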

In [1]:
from datetime import datetime

import pandas as pd
from tqdm.notebook import tqdm

from utils import (
    load_protein_dataframes,
    get_top_n_features,
    get_randomized_search,
    get_train_and_test_data,
    get_predictions,
    get_scoring_metrics
)

# Folder passed to load_protein_dataframes when the input dataframes are loaded.
HIDDEN_ENTRIES_FOLDER_PATH = "hidden-entries_2022-02-11"

# Number of features requested from get_top_n_features in every fold.
TOP_N = 10

# Reflect changes in the modules immediately.
%load_ext autoreload
%autoreload 2

In [2]:
# Load the collection of dataframes from the configured folder; each element
# is later held out once as the test set of a fold.
protein_dataframes = load_protein_dataframes(HIDDEN_ENTRIES_FOLDER_PATH)
# Cell output: how many dataframes (and therefore folds) there are.
len(protein_dataframes)

164

In [3]:
# Display one of the loaded dataframes as a sample of the column layout.
# NOTE(review): index 56 looks like an arbitrary pick — confirm.
protein_dataframes[56]

Unnamed: 0,Mutation_Effect_Label,UniProt_ID,Mutation,Interactor_UniProt_ID,Template_sequence_identity,Alignment_score,Interactor_template_sequence_identity,Interactor_alignment_score,Final_ddG,ProtBert_score,...,number_of_residues_mut,IntraclashesEnergy1_wt,IntraclashesEnergy1_mut,IntraclashesEnergy2_wt,IntraclashesEnergy2_mut,Interface_hydrophobic_area,Interface_hydrophilic_area,Interface_total_area,Interface_contact_distance_wt,Interface_contact_distance_mut
0,1,P40692,V213M,P54278-2,0.3333,0.288192,0.9505,0.943632,1.08167,0.045422,...,408.0,284.603,285.042,39.5219,39.5219,263.325,218.09,481.41,4.63532,4.63532
1,1,P40692,F80V,P54278,0.3333,0.288192,0.9507,0.943866,1.10465,0.881422,...,409.0,400.704,400.864,50.5247,50.5247,0.0,0.0,0.0,3.29591,4.01617
2,1,P40692,F80V,P54278-3,0.3333,0.288192,0.9507,0.943866,1.10465,0.881422,...,409.0,400.704,400.864,50.5247,50.5247,0.0,0.0,0.0,3.29591,4.01617
3,1,P40692,C77R,P54278,0.3333,0.288192,0.9507,0.943866,1.76213,0.982242,...,409.0,399.477,399.615,49.6587,49.6587,0.0,0.0,0.0,3.39677,3.57761
4,1,P40692,C77R,P54278-3,0.3333,0.288192,0.9507,0.943866,1.76213,0.982242,...,409.0,399.477,399.615,49.6587,49.6587,0.0,0.0,0.0,3.39677,3.57761


In [4]:
# Feature columns are everything after the first four columns of the sample
# dataframe (in the sample shown above those four are the label and the three
# identifier columns).
# NOTE(review): assumes every loaded dataframe shares this column layout — confirm.
features = protein_dataframes[56].columns[4:].tolist()
len(features)

58

In [5]:
# Write the feature names to disk, one name per line.
with open("features.txt", "w") as features_file:
    features_file.writelines(f"{feature}\n" for feature in features)

In [5]:
def run_leave_one_complex_out(dataframes=None, feature_names=None, top_n=None):
    """Run the leave-one-complex-out evaluation, one fold per dataframe.

    For every index ``i`` the data is split with
    ``get_train_and_test_data(dataframes, index=i)`` and predictions for the
    held-out part are collected from three setups, in this order:

    1. all features, ``tuning=False``;
    2. the ``top_n`` features returned by ``get_top_n_features`` (computed on
       the training split only), ``tuning=False``;
    3. the same selected features, ``tuning=True``.

    Parameters
    ----------
    dataframes : sequence of DataFrame, optional
        Dataframes to iterate over. Defaults to the notebook-level
        ``protein_dataframes``.
    feature_names : list of str, optional
        Columns used as model inputs. Defaults to the notebook-level
        ``features``.
    top_n : int, optional
        Number of features to keep during feature selection. Defaults to the
        notebook-level ``TOP_N``.

    Returns
    -------
    dict
        Per-fold lists, all in fold order, under the keys
        ``"prediction_dataframes"`` (held-out dataframes), ``"actuals"``
        (true labels as lists), ``"predictions_default"``,
        ``"predictions_fs"``, ``"predictions_fs_hypertuned"`` and
        ``"selected_features_list"``.
    """
    # Fall back to the notebook-level objects so the original zero-argument
    # call keeps working unchanged.
    if dataframes is None:
        dataframes = protein_dataframes
    if feature_names is None:
        feature_names = features
    if top_n is None:
        top_n = TOP_N

    results = {
        "prediction_dataframes": [],
        "actuals": [],
        "predictions_default": [],  # all features and no feature selection
        "predictions_fs": [],
        "predictions_fs_hypertuned": [],
        "selected_features_list": [],
    }

    for i in tqdm(range(len(dataframes))):
        # Hold out dataframe i; train on the rest.
        train_data, test_data = get_train_and_test_data(dataframes, index=i)

        # Keep the held-out rows so predictions can be matched to them later
        # (they follow the fold order, not the order of the full dataset).
        results["prediction_dataframes"].append(test_data)

        X_train = train_data[feature_names]
        y_train = train_data["Mutation_Effect_Label"]

        X_test = test_data[feature_names]
        y_test = test_data["Mutation_Effect_Label"]

        # True labels
        results["actuals"].append(list(y_test))

        n_train = len(X_train)
        n_test = len(X_test)
        print(f"Training RF model with {n_train} entries.")
        print(f"Predicting {n_test} entries. The protein: {test_data['UniProt_ID'].unique()}")

        # Predictions with all features and no hyper tuning
        results["predictions_default"].append(
            get_predictions(X_train, y_train, X_test, tuning=False)
        )

        # Feature selection uses the training split only, so the held-out
        # rows never influence which features are kept.
        selected_features = get_top_n_features(X_train, y_train, top_n=top_n)
        results["selected_features_list"].append(selected_features)

        # Restrict both splits to the selected features
        X_train_fs = X_train[selected_features]
        X_test_fs = X_test[selected_features]

        # Predictions with selected features and no hyper tuning
        results["predictions_fs"].append(
            get_predictions(X_train_fs, y_train, X_test_fs, tuning=False)
        )

        # Predictions with selected features and hyper tuning
        results["predictions_fs_hypertuned"].append(
            get_predictions(X_train_fs, y_train, X_test_fs, tuning=True)
        )

        print("=======================================")

    return results


In [6]:
# Run every fold; the function prints the train/test sizes and the held-out
# protein for each fold and returns the per-fold results as a dictionary.
output = run_leave_one_complex_out()

  0%|          | 0/164 [00:00<?, ?it/s]

Training RF model with 739 entries.
Predicting 1 entries. The protein: ['A5PKW4']
selected_features=['EL2_score', 'Provean_score', 'Model/DOPE_score', 'Final_ddG', 'entropy_mainchain_wt', 'Matrix_score', 'Solvent_accessibility_mut', 'entropy_mainchain_mut', 'ProtBert_score', 'ProteinSolver_score']
Training RF model with 739 entries.
Predicting 1 entries. The protein: ['O00267']
selected_features=['EL2_score', 'Provean_score', 'Model/DOPE_score', 'Final_ddG', 'entropy_mainchain_wt', 'Matrix_score', 'Solvent_accessibility_mut', 'ProtBert_score', 'entropy_mainchain_mut', 'sidechain_hbond_mut']
Training RF model with 736 entries.
Predicting 4 entries. The protein: ['O00311']
selected_features=['Provean_score', 'EL2_score', 'Model/DOPE_score', 'Matrix_score', 'Final_ddG', 'Solvent_accessibility_mut', 'entropy_mainchain_wt', 'entropy_mainchain_mut', 'Interactor_template_sequence_identity', 'ProtBert_score']
Training RF model with 738 entries.
Predicting 2 entries. The protein: ['O14641']
sel

In [7]:
# Pull the per-fold label and prediction lists out of the result dictionary.
actuals, predictions_default, predictions_fs, predictions_fs_hypertuned = (
    output[key]
    for key in (
        "actuals",
        "predictions_default",
        "predictions_fs",
        "predictions_fs_hypertuned",
    )
)

In [14]:
# Flatten the per-fold lists into one flat list each, preserving fold order.
# A nested comprehension is linear in the number of entries, whereas
# sum(list_of_lists, []) re-copies the accumulated list on every addition.
y_true = [label for fold in actuals for label in fold]
y_pred_default = [pred for fold in predictions_default for pred in fold]
y_pred_fs = [pred for fold in predictions_fs for pred in fold]
y_pred_fs_hypertuned = [pred for fold in predictions_fs_hypertuned for pred in fold]

In [32]:
# Column label -> flattened predictions. Keeping both in one mapping guarantees
# that each metrics column is labelled with the setup that produced it.
prediction_sets = {
    "Default": y_pred_default,
    "Feature selected": y_pred_fs,
    "Feature selected + hypertuned": y_pred_fs_hypertuned,
}

# One metrics column per setup, scored against the same true labels.
metrics_comparison_data = pd.concat(
    [get_scoring_metrics(y_true, y_pred) for y_pred in prediction_sets.values()],
    axis="columns",
)
metrics_comparison_data.columns = list(prediction_sets)
metrics_comparison_data

Unnamed: 0,Default,Feature selected,Feature selected + hypertuned
ACCURACY,0.778378,0.766216,0.741892
BALANCED_ACCURACY,0.651389,0.64463,0.621667
F1,0.477707,0.467692,0.429851
MATTEWS_COR,0.372474,0.342852,0.279814
PRECISION,0.657895,0.608,0.533333
RECALL,0.375,0.38,0.36


In [16]:
# Inspect fold 11: the three prediction lists, then the true labels,
# and finally how many entries that fold holds.
for fold_lists in (predictions_default, predictions_fs, predictions_fs_hypertuned, actuals):
    print(fold_lists[11])
print(len(actuals[11]))

[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
[1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
26


In [26]:
# Stack the held-out rows in fold order, keep only the label and identifier
# columns, and attach the three flattened prediction lists alongside them.
leave_one_complex_out_prediction_data = (
    pd.concat(output["prediction_dataframes"], ignore_index=True)
    .loc[:, ["Mutation_Effect_Label", "UniProt_ID", "Mutation", "Interactor_UniProt_ID"]]
    .assign(
        Predator_default=y_pred_default,
        Predator_feature_selected=y_pred_fs,
        Predator_feature_selected_tuned=y_pred_fs_hypertuned,
    )
)
# Sanity check: the stacked rows line up with the flattened true labels,
# so the prediction columns are aligned row by row.
assert leave_one_complex_out_prediction_data["Mutation_Effect_Label"].tolist() == y_true

In [28]:
leave_one_complex_out_prediction_data

Unnamed: 0,Mutation_Effect_Label,UniProt_ID,Mutation,Interactor_UniProt_ID,Predator_default,Predator_feature_selected,Predator_feature_selected_tuned
0,0,A5PKW4,E621K,P62330,0,0,0
1,1,O00267,I194V,P63272,1,1,1
2,1,O00311,I99T,Q9UBU7,1,1,1
3,1,O00311,I99L,Q9UBU7,1,1,1
4,1,O00311,K416T,Q9UBU7,0,0,0
...,...,...,...,...,...,...,...
735,0,Q9Y570,S156A,P67775,0,0,0
736,0,Q9Y570,S156A,P67775-2,0,0,0
737,0,Q9Y570,H349A,P67775,0,0,0
738,0,Q9Y570,S156T,P67775,0,0,0


In [29]:
# Save the per-entry predictions to a CSV stamped with today's date
# (YYYY-MM-DD), written to the current working directory without the index.
file_date = datetime.today().strftime('%Y-%m-%d')
file_name = f"Leave_one_complex_out_predictions_{file_date}.csv"
leave_one_complex_out_prediction_data.to_csv(file_name, index=False)

# ---------------------------------------------------------------

In [None]:
# The end.