## Supervised Learning on Simulations of the PDZ Domain

In [1]:
import sys # note temporary... 
sys.path.append("..") # note temporary...

import re 
import pandas as pd
from key_interactions_finder import pycontact_processing
from key_interactions_finder import data_preperation
from key_interactions_finder import model_building
from key_interactions_finder import post_proccessing
from key_interactions_finder import pymol_projections

### First, handle the experimental dataset

In [None]:
import scipy.io as sio
matlab_file = r"C:\Users\Rory Crean\Dropbox (lkgroup)\Backup_HardDrive\Postdoc\MachLearnConformationalFeatures\Workup\PDZ_ML\Raw_Experi_Datasets\Salinas_Ranganathan_data.mat"
exp_file = sio.loadmat(matlab_file)
exp_file.keys()

In [None]:
## TODO: figure out the rest of the experimental-dataset handling later.

### PDZ Domain: Impact of Peptide Binding (Bound vs Unbound) on the Domain

Probably easiest to load both datasets in separately first, then merge them afterwards.

In [None]:
# all input files 
in_dir = r"C:\Users\Rory Crean\Dropbox (lkgroup)\Backup_HardDrive\Postdoc\MachLearnConformationalFeatures\Workup\PDZ_ML\Raw_Datasets\PDZ_Combined"
out_dir = r"C:\Users\Rory Crean\Dropbox (lkgroup)\Backup_HardDrive\Postdoc\MachLearnConformationalFeatures\Workup\PDZ_ML\Raw_Results"

pycontact_files_horizontal = ["PyContact_Per_Frame_Interactions_Block1.csv", "PyContact_Per_Frame_Interactions_Block2.csv",
                              "PyContact_Per_Frame_Interactions_Block3.csv", "PyContact_Per_Frame_Interactions_Block4.csv",
                              "PyContact_Per_Frame_Interactions_Block5.csv", "PyContact_Per_Frame_Interactions_Block6.csv",
                              "PyContact_Per_Frame_Interactions_Block7.csv", "PyContact_Per_Frame_Interactions_Block8.csv",
                              "PyContact_Per_Frame_Interactions_Block9.csv", "PyContact_Per_Frame_Interactions_Block10.csv",
                              "PyContact_Per_Frame_Interactions_Block11.csv", "PyContact_Per_Frame_Interactions_Block12.csv",
                              ]


In [None]:
# dataset prep parts. 
pycontact_dataset = pycontact_processing.PyContactInitializer(
    pycontact_files=pycontact_files_horizontal,
    multiple_files=True,
    merge_files_method="horizontal",  
    remove_false_interactions=True,
    in_dir=in_dir,
)

In [None]:
pycontact_dataset.prepared_df

In [None]:
# Create and save a target file (to load back in next step)
# 10,000 frames for both states  
classification_col = ["States"] + ["Bound"] * 10000 + ["Unbound"] * 10000

classifications_file = r"C:\Users\Rory Crean\Dropbox (lkgroup)\Backup_HardDrive\Postdoc\MachLearnConformationalFeatures\Workup\PDZ_ML\Raw_Datasets\PDZ_Classifications_Combined.dat"

with open(classifications_file, "w") as file_out:
    for item in classification_col:
        file_out.write(item + "\n")

In [None]:
# Now generate an instance of the SupervisedFeatureData class and load datasets in.
supervised_dataset = data_preperation.SupervisedFeatureData(
    input_df=pycontact_dataset.prepared_df,
    target_file=classifications_file,
    is_classification=True,
    header_present=True 
)

supervised_dataset.df_processed

In [None]:
supervised_dataset.df_processed.Target.value_counts()

In [None]:
supervised_dataset.df_processed

In [None]:
# Filtering 
supervised_dataset.reset_filtering()
supervised_dataset.filter_by_occupancy_by_class(min_occupancy=25)
supervised_dataset.filter_by_main_or_side_chain(
    main_side_chain_types_included=["sc-sc", "bb-sc", "sc-bb", "bb-bb"] 
)
supervised_dataset.filter_by_avg_strength(average_strength_cut_off=0.5)
print(f"Number of features after filtering by average interaction scores: {len(supervised_dataset.df_filtered.columns)}")

In [None]:
supervised_dataset.df_filtered

In [None]:
# Compare one interaction's per-frame scores between the two states.
#
# Rows are selected by position. The classification file lists 10,000 "Bound"
# frames followed by 10,000 "Unbound" frames, so positions 0-9999 are Bound
# and 10000-19999 are Unbound (assumes df_filtered keeps that row order and
# all 20,000 rows; TODO confirm).
#
# Both slices are wrapped in display(): a notebook only echoes the LAST bare
# expression of a cell, so an undisplayed first slice is silently discarded.
# The slices also start at the first frame of each state; starting one row
# later would drop a frame from each.
feature_scores = supervised_dataset.df_filtered["101Phe 97Glu Hbond sc-bb"]
display(feature_scores.iloc[10000:20000])  # Unbound frames
display(feature_scores.iloc[:10000])  # Bound frames

In [None]:
# Instantiate the model.
ml_model = model_building.ClassificationModel(
    dataset=supervised_dataset.df_filtered,
    evaluation_split_ratio=0.15,
    classes_to_use=["Bound", "Unbound"], 
    models_to_use=["CatBoost"], # "XGBoost", "Random_Forest"
    scaling_method="min_max",
    out_dir=out_dir, 
    cross_validation_splits=5, 
    cross_validation_repeats=3,
    search_approach="none",
)

In [None]:
ml_model.build_models(save_models=True)

In [None]:
reports = ml_model.evaluate_models()

In [None]:
reports["XGBoost"]

In [None]:
reports["CatBoost"]

In [None]:
reports["Random_Forest"]

In [None]:
post_proc = post_proccessing.SupervisedPostProcessor(
    out_dir=out_dir,
)
post_proc.load_models_from_instance(supervised_model=ml_model)

post_proc.get_feature_importance()
post_proc.get_per_res_importance()

# Pymol Results
pymol_projections.project_multiple_per_res_scores(
    all_per_res_scores=post_proc.all_per_residue_scores,
    out_dir=out_dir
)

pymol_projections.project_multiple_per_feature_scores(
    all_feature_scores=post_proc.all_feature_importances,
    numb_features=200,   #"all",
    out_dir=out_dir
)

In [None]:
post_proc.all_feature_importances

In [2]:
# TODO - (full implementation in a tutorial)
# Make more fancy selection options. 
# Allow user to customisize selection to multi-atom?
from key_interactions_finder import utils 

In [3]:
pdb_file = r"C:\Users\Rory Crean\Dropbox (lkgroup)\Backup_HardDrive\Postdoc\MachLearnConformationalFeatures\Workup\PDZ_ML\PDZ_pep_Bound_postleap.pdb"
out_file = r"C:\Users\Rory Crean\Dropbox (lkgroup)\Backup_HardDrive\Postdoc\MachLearnConformationalFeatures\Workup\PDZ_ML\Raw_Results\PDZ_Dist_to_Binding_Site.csv"
distance_array = utils.per_residue_distance_to_site(
    pdb_file=pdb_file,
    out_file=out_file,
    site_defintion="resid 118 and name CA",   #"not name H* and resid 118 to 123",
    first_residue=1,
    last_residue=117,
)

[4.2983402  3.96260469 4.608412   3.99198438 4.27552392 3.86867127
 3.6261414  3.97854258 4.73128174 5.3288185  3.42584846 3.12535418
 3.69037011 3.35443625 4.02431262 3.19395393 2.99943135 2.92196937
 3.19039658 3.7105367  4.38959762 3.17684994 4.02182399 3.38869786
 3.9120616  2.59417409 4.13510285 2.99791741 3.91193551 3.27932208
 3.20278319 4.39464079 3.66932651 3.4907868  2.78507996 3.66522991
 3.94904867 4.19612053 3.16876409 3.33485142 2.59980908 2.80275387]
min_dists




In [4]:
len(distance_array)

42