# Leave One Complex Out

February 11th, 2022


Now, we are using only one random forest classifier in leave one out scheme.

In [1]:
from utils import (
    load_protein_dataframes,
    get_top_n_features,
    get_randomized_search,
    get_train_and_test_data,
    get_predictions,
    get_scoring_metrics
)

from tqdm.notebook import tqdm

HIDDEN_ENTRIES_FOLDER_PATH = "hidden-entries_2022-02-11"

TOP_N = 10

# Reflect changes in the modules immediately.
%load_ext autoreload
%autoreload 2

In [2]:
protein_dataframes = load_protein_dataframes(HIDDEN_ENTRIES_FOLDER_PATH)
len(protein_dataframes)

164

In [3]:
protein_dataframes[56]

Unnamed: 0,Mutation_Effect_Label,UniProt_ID,Mutation,Interactor_UniProt_ID,Template_sequence_identity,Alignment_score,Interactor_template_sequence_identity,Interactor_alignment_score,Final_ddG,ProtBert_score,...,number_of_residues_mut,IntraclashesEnergy1_wt,IntraclashesEnergy1_mut,IntraclashesEnergy2_wt,IntraclashesEnergy2_mut,Interface_hydrophobic_area,Interface_hydrophilic_area,Interface_total_area,Interface_contact_distance_wt,Interface_contact_distance_mut
0,1,P40692,V213M,P54278-2,0.3333,0.288192,0.9505,0.943632,1.08167,0.045422,...,408.0,284.603,285.042,39.5219,39.5219,263.325,218.09,481.41,4.63532,4.63532
1,1,P40692,F80V,P54278,0.3333,0.288192,0.9507,0.943866,1.10465,0.881422,...,409.0,400.704,400.864,50.5247,50.5247,0.0,0.0,0.0,3.29591,4.01617
2,1,P40692,F80V,P54278-3,0.3333,0.288192,0.9507,0.943866,1.10465,0.881422,...,409.0,400.704,400.864,50.5247,50.5247,0.0,0.0,0.0,3.29591,4.01617
3,1,P40692,C77R,P54278,0.3333,0.288192,0.9507,0.943866,1.76213,0.982242,...,409.0,399.477,399.615,49.6587,49.6587,0.0,0.0,0.0,3.39677,3.57761
4,1,P40692,C77R,P54278-3,0.3333,0.288192,0.9507,0.943866,1.76213,0.982242,...,409.0,399.477,399.615,49.6587,49.6587,0.0,0.0,0.0,3.39677,3.57761


In [4]:
features = list(protein_dataframes[56].columns[4:])
len(features)

58

In [5]:
actuals = []
predictions_default = []  # all features and no feature selection
predictions_fs = []
predictions_fs_hypertuned = []

selected_features_list = []

for i in tqdm(range(len(protein_dataframes))):

    # excluding test data
    train_data, test_data = get_train_and_test_data(protein_dataframes, index=i)

    X_train = train_data[features]
    y_train = train_data["Mutation_Effect_Label"]

    X_test = test_data[features]
    y_test = test_data["Mutation_Effect_Label"]

    # True labels
    actuals.append(list(y_test))

    n_train = len(X_train)
    n_test = len(X_test)
    print(f"Training RF model with {n_train} entries.")
    print(f"Predicting {n_test} entries.")

    # Predictions with all features and no hyper tuning
    predictions_default.append(
        get_predictions(X_train, y_train, X_test, tuning=False)
    )

    # Feature selection
    selected_features = get_top_n_features(X_train, y_train, top_n=TOP_N)
    selected_features_list.append(selected_features)

    # Update X_train and X_test with selected features
    X_train_fs = X_train[selected_features]
    X_test_fs = X_test[selected_features]

    # Predictions with selected features and no hyper tuning
    predictions_fs.append(
        get_predictions(X_train_fs, y_train, X_test_fs, tuning=False)
    )

    # Predictions with selected features and hyper tuning
    predictions_fs_hypertuned.append(
        get_predictions(X_train_fs, y_train, X_test_fs, tuning=True)
    )

    print("=======================================")

  0%|          | 0/164 [00:00<?, ?it/s]

Training RF model with 739 entries.
Predicting 1 entries.
selected_features=['EL2_score', 'Provean_score', 'Model/DOPE_score', 'Final_ddG', 'entropy_mainchain_wt', 'Matrix_score', 'Solvent_accessibility_mut', 'entropy_mainchain_mut', 'ProtBert_score', 'ProteinSolver_score']
Training RF model with 739 entries.
Predicting 1 entries.
selected_features=['EL2_score', 'Provean_score', 'Model/DOPE_score', 'Final_ddG', 'entropy_mainchain_wt', 'Matrix_score', 'Solvent_accessibility_mut', 'ProtBert_score', 'entropy_mainchain_mut', 'sidechain_hbond_mut']
Training RF model with 736 entries.
Predicting 4 entries.
selected_features=['Provean_score', 'EL2_score', 'Model/DOPE_score', 'Matrix_score', 'Final_ddG', 'Solvent_accessibility_mut', 'entropy_mainchain_wt', 'entropy_mainchain_mut', 'Interactor_template_sequence_identity', 'ProtBert_score']
Training RF model with 738 entries.
Predicting 2 entries.
selected_features=['Provean_score', 'EL2_score', 'Model/DOPE_score', 'Matrix_score', 'Final_ddG', '

In [8]:
y_true = sum(actuals, [])
y_pred_default = sum(predictions_default, [])
y_pred_fs = sum(predictions_fs, [])
y_pred_fs_hypertuned = sum(predictions_fs_hypertuned, [])

In [9]:
import pandas as pd

metrics_comparison_data = pd.concat(
    [
        get_scoring_metrics(y_true, y_pred_default),
        get_scoring_metrics(y_true, y_pred_fs),
        get_scoring_metrics(y_true, y_pred_fs_hypertuned)
    ], axis="columns"
)
metrics_comparison_data.columns = ["Default", "Feature selected", "Feature selected + hypertuned"]
metrics_comparison_data

Unnamed: 0,Default,Feature selected,Feature selected + hypertuned
ACCURACY,0.778378,0.766216,0.762162
BALANCED_ACCURACY,0.651389,0.64463,0.643426
F1,0.477707,0.467692,0.466667
MATTEWS_COR,0.372474,0.342852,0.33476
PRECISION,0.657895,0.608,0.592308
RECALL,0.375,0.38,0.385


In [None]:
# waiting for above cells .....

In [None]:
# todo: obtain our predictions and export it as a dataframe

In [23]:
print(predictions_default[11])
print(predictions_fs[11])
print(predictions_fs_hypertuned[11])
print(actuals[11])
print(len(actuals[11]))

[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
[1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
[1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
26


In [27]:
train_data_740 = pd.read_csv("../../processed_data_740.csv")
train_data_740_predictions_added = train_data_740.copy()

In [29]:
train_data_740_predictions_added.insert(0, "PREDICTIONS_DEFAULT", y_pred_default)
train_data_740_predictions_added.insert(1, "PREDICTIONS_FS", y_pred_fs)
train_data_740_predictions_added.insert(2, "PREDICTIONS_FS_HYPERTUNED", y_pred_fs_hypertuned)

In [30]:
train_data_740_predictions_added

Unnamed: 0,PREDICTIONS_DEFAULT,PREDICTIONS_FS,PREDICTIONS_FS_HYPERTUNED,Mutation_Effect_Label,UniProt_ID,Mutation,Interactor_UniProt_ID,Template_sequence_identity,Alignment_score,Interactor_template_sequence_identity,...,number_of_residues_mut,IntraclashesEnergy1_wt,IntraclashesEnergy1_mut,IntraclashesEnergy2_wt,IntraclashesEnergy2_mut,Interface_hydrophobic_area,Interface_hydrophilic_area,Interface_total_area,Interface_contact_distance_wt,Interface_contact_distance_mut
0,0,0,0,0,Q9BPZ3,F118A,P11940,1.0000,1.000000,1.0000,...,118.0,5.30636,3.79101,52.4675,52.4675,427.775,194.715,622.495,3.35293,3.35293
1,1,1,1,0,P01116,Y40C,P50749,0.9277,0.931315,0.3553,...,329.0,47.61770,47.21160,70.2902,70.2461,464.470,400.030,864.505,3.07827,3.59123
2,1,1,1,0,Q96QK1,F534D,Q9UBQ0,1.0000,0.360743,1.0000,...,453.0,68.21080,68.19710,59.2321,59.1774,510.690,298.820,809.510,3.03938,3.24435
3,1,1,1,0,Q96QK1,F534D,Q9UBQ0-2,1.0000,0.360743,1.0000,...,453.0,68.21080,68.19710,59.2321,59.1774,510.690,298.820,809.510,3.03938,3.24435
4,0,0,0,0,O43521-3,G66A,Q07812,1.0000,0.714286,0.3007,...,202.0,3.74828,3.76142,41.5937,41.5937,405.510,273.740,679.245,3.37301,3.22505
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
735,0,0,0,0,P84243,Q94A,Q9UER7-3,1.0000,0.948980,1.0000,...,309.0,21.27130,20.85060,32.4886,32.4886,1159.700,952.745,2112.450,3.00752,4.27950
736,0,0,0,0,Q96QK1,L589D,Q9UBQ0,1.0000,0.360743,1.0000,...,453.0,68.13080,68.12330,59.2629,59.2629,510.690,298.820,809.510,3.45879,3.26826
737,0,0,0,0,Q96QK1,L589D,Q9UBQ0-2,1.0000,0.360743,1.0000,...,453.0,68.13080,68.12330,59.2629,59.2629,510.690,298.820,809.510,3.45879,3.26826
738,0,0,0,0,P23297,F72L,P25815,0.6778,0.664382,0.5222,...,188.0,26.74010,27.29430,17.1488,16.8699,667.920,288.795,956.720,3.26388,3.30643


In [31]:
from datetime import datetime

In [32]:
file_date = datetime.today().strftime('%Y-%m-%d')
file_name = "train_data_740_predictions_added"
file_name = f"{file_name}_{file_date}.csv"
train_data_740_predictions_added.to_csv(file_name, index=False)

# ---------------------------------------------------------------

In [None]:
# The end.

In [16]:
print("Default")
get_scoring_metrics(y_true, y_pred_default)

Default


Unnamed: 0,SCORE
ACCURACY,0.778378
BALANCED_ACCURACY,0.651389
F1,0.477707
MATTEWS_COR,0.372474
PRECISION,0.657895
RECALL,0.375


In [17]:
print("Feature selected")
get_scoring_metrics(y_true, y_pred_fs)

Feature selected


Unnamed: 0,SCORE
ACCURACY,0.766216
BALANCED_ACCURACY,0.64463
F1,0.467692
MATTEWS_COR,0.342852
PRECISION,0.608
RECALL,0.38


In [18]:
print("Feature selected + hypertuned")
get_scoring_metrics(y_true, y_pred_fs_hypertuned)

Feature selected + hypertuned


Unnamed: 0,SCORE
ACCURACY,0.755405
BALANCED_ACCURACY,0.645093
F1,0.472303
MATTEWS_COR,0.326386
PRECISION,0.566434
RECALL,0.405


In [20]:
# print(f"{actuals=}")
# print(f"{predictions_default=}")
# print(f"{predictions_fs=}")
# print(f"{predictions_fs_hypertuned=}")

actuals=[[0]]
predictions_default=[array([0], dtype=int64)]
predictions_fs=[array([0], dtype=int64)]
predictions_fs_hypertuned=[array([0], dtype=int64)]


NameError: name 'predictions_fs_hypertuned' is not defined

In [None]:
## run the cells above.

# ------------------------------

In [11]:
# randomized_search = get_randomized_search()

In [12]:
# from timeit import default_timer as timer

In [13]:
# start_time = timer()
# randomized_search.fit(X_train, y_train)
# print(timer() - start_time)

45.3238145


In [40]:
print("Feature selected (Top 10)")
get_scoring_metrics(y_true, y_pred)

Feature selected (Top 10)


Unnamed: 0,SCORE
ACCURACY,0.766216
BALANCED_ACCURACY,0.64463
F1,0.467692
MATTEWS_COR,0.342852
PRECISION,0.608
RECALL,0.38


In [39]:
print("Feature selected (Top 10) + HyperTuned")
get_scoring_metrics(y_true, y_pred_hypertuned)

Feature selected (Top 10) + HyperTuned


Unnamed: 0,SCORE
ACCURACY,0.751351
BALANCED_ACCURACY,0.626574
F1,0.435583
MATTEWS_COR,0.299101
PRECISION,0.563492
RECALL,0.355


# ------------

In [50]:
print("accuracy_score: {:.4}\n".format( accuracy_score(y_true, y_pred)) )
print("balanced_accuracy_score: {:.4}\n".format( balanced_accuracy_score(y_true, y_pred)) )
print("f1_score: {:.4}\n".format( f1_score(y_true, y_pred)) )
print("matthews_corrcoef: {:.4}\n".format( matthews_corrcoef(y_true, y_pred)) )
print("precision_score: {:.4}\n".format( precision_score(y_true, y_pred)) )
print("recall_score: {:.4}\n".format( recall_score(y_true, y_pred)) )

accuracy_score: 0.77

balanced_accuracy_score: 0.64

f1_score: 0.47

matthews_corrcoef: 0.34

precision_score: 0.61

recall_score: 0.38



In [55]:
print("accuracy_score: {:.2}\n".format( accuracy_score(y_true, y_pred)) )
print("balanced_accuracy_score: {:.2}\n".format( balanced_accuracy_score(y_true, y_pred)) )
print("f1_score: {:.2}\n".format( f1_score(y_true, y_pred)) )
print("matthews_corrcoef: {:.2}\n".format( matthews_corrcoef(y_true, y_pred)) )
print("precision_score: {:.2}\n".format( precision_score(y_true, y_pred)) )
print("recall_score: {:.2}\n".format( recall_score(y_true, y_pred)) )

accuracy_score: 0.62

balanced_accuracy_score: 0.52

f1_score: 0.31

matthews_corrcoef: 0.049

precision_score: 0.31

recall_score: 0.31



---

In [None]:
# garbage

In [None]:
randomized_search = get_randomized_search()
randomized_search.fit(X_train, y_train)
clf_tuned = randomized_search.best_estimator_
# get_tuned_model(X_train, y_train)

print(f"Training RF model with {len(X_train)} entries.")
clf.fit(X_train, y_train)
print(f"Predicting {len(X_test)} entries.")
current_preds_fs = clf.predict(X_test)
current_preds_fs_hypertuned = clf_tuned.predict(X_test)

actuals.append(list(y_test))
predictions_fs.append(list(current_preds_fs))
predictions_fs_hypertuned.append(list(current_preds_fs_hypertuned))