In [2]:
import pandas as pd
import numpy as np
import pytraj as pyt
import matplotlib.pyplot as plt
import glob, sys, os
sys.path.append(r'..')

In [3]:
# Short identifier of the protein handled by this notebook
prot_name = 'cdk2'

## 1) Protein Metadata

In [4]:
### Protein data table

In [33]:
# Protein metadata table, read from JSON into a DataFrame.
# Its index is reused later as the row index of the volume and
# dimensionality-reduction tables.
path_to_file = '../data/TABLA_MTDATA_CDK2_402_crys_LIGS_INFO_LABELS.json'
df_prot = pd.read_json(path_to_file)

In [8]:
## Load all protein structures

In [6]:
# Load the PDB file with pytraj; the result is handled as a trajectory
# (it exposes n_frames / n_atoms).
file_pdb_traj_all = '../../ARCHIVOS/CRISTALES/TRAJ_CRISTALS_PDB/PDB_402_cdk2_PISANI_ALL.pdb'
traj_crystals = pyt.load(file_pdb_traj_all)

# Report how many frames and atoms were loaded
print(F'Número de frames: {traj_crystals.n_frames}.\nNúmero de átomos: {traj_crystals.n_atoms}.')

Número de frames: 402.
Número de átomos: 4848.


In [20]:
# Used residues 

In [7]:
from modules.subsecuencias_cdk2 import *
# Residue selections provided by the project helpers; presumably comma-separated
# residue lists (sep=",") -- confirm against modules.subsecuencias_cdk2.
pisiani_residues = get_pisani_residues(sep=",") # secondary-structure subsequence
pocket_residues = get_pocket_residues(sep=",")

In [8]:
# Mask to select CA atoms from secondary structure residues
sec_str_mask = f'(:{pisiani_residues})&(@CA)'

# Mask to select CA atoms from pocket residues
pkt_str_mask = f'(:{pocket_residues})&(@CA)'

In [9]:
# Volume Information

In [10]:
# Volume using povme
def get_volume_results(files, col_names, index_col):
    """Collect the 'volume' column of several tab-separated files into one table.

    Parameters
    ----------
    files : iterable of paths; each file is read as two headerless,
        tab-separated columns named 'conf_num' and 'volume'.
    col_names : column labels for the result, one per file, in file order.
    index_col : labels assigned as the index of the result.

    Returns
    -------
    pandas.DataFrame with one volume column per input file.
    """
    # Keep only the volume column of every file, as a one-column frame
    volume_frames = [
        pd.read_csv(path, sep='\t', header=None,
                    names=['conf_num', 'volume']).loc[:, ['volume']]
        for path in files
    ]
    # Put the columns side by side, then label columns and rows
    merged = pd.concat(volume_frames, axis=1)
    merged.columns = col_names
    merged.index = index_col
    return merged

In [13]:
# Volumes computed after aligning on the pocket residues (alignment done using vmd)
pocket_file = '../../ARCHIVOS/CRISTALES/VOLUMEN/CDK2_VOL_RICCI_402/res_volumes.tabbed.txt'

# Volumes computed after aligning on the secondary structure
sec_file =  '../../ARCHIVOS/CRISTALES/VOLUMEN/CDK2_VOL_PISANI_402/res_volumes.tabbed.txt'

# One column per alignment ('pocket', 'sec-struct'), indexed like the metadata table
df_volumen = get_volume_results([pocket_file, sec_file], ['pocket', 'sec-struct'], index_col=df_prot.index)

In [None]:
# Format table for dash app

In [34]:
# Add volume info to the main table and expose the index as a regular column
df_dash_app = pd.concat([df_prot, df_volumen], axis=1).reset_index()

# Columns to keep, as {source name: display name}, in output order.
# 'sec-struct' holds the volumes from the secondary-structure alignment and
# 'pocket' those from the pocket-residue alignment, so they map to the
# '(Sec)' and '(Pkt)' labels respectively (the labels used to be swapped).
cols2keep = {'index': 'PDB-id', 'Date': 'Date', 'Resolution': 'Resolution',
             'Coverage': 'Coverage', 'Inhib': 'Ligand', 'Inhib_mass': 'LigMass',
             'sec-struct': 'Pocket Volume (Sec)', 'pocket': 'Pocket Volume (Pkt)',
             'Labels_conf': 'Conformation'}

# Drop every other column and apply the display names
df_dash_app = df_dash_app[list(cols2keep)].rename(columns=cols2keep)

# Fill NA values. fillna returns a new frame, so the result has to be
# assigned; the bare call used previously left the table unchanged.
df_dash_app = df_dash_app.fillna(0)

# Format conformation column: underscores to spaces, first letter capitalized
df_dash_app['Conformation'] = df_dash_app['Conformation'].apply(
    lambda x: x.replace('_', ' ').capitalize())

# Cast data types
df_dash_app = df_dash_app.infer_objects()

# Save the table to be added to the APP
df_dash_app.to_pickle('./CDK2_PDB_metadata_TABLE_dashApp.obj')

## 2) Dimensionality Reduction Table

### Classical Multidimensional Scaling

In [35]:
from modules.MDS import cMDS
from modules.plotting_functions import plot_points, plot_anotation_labels

In [36]:
# Pairwise RMSD between all frames, once per CA selection:
# secondary-structure residues first, pocket residues second.
pair_rmsd_sec, pair_rmsd_pkt = (
    pyt.pairwise_rmsd(traj=traj_crystals, mask=ca_mask, metric='rms')
    for ca_mask in (sec_str_mask, pkt_str_mask)
)

In [37]:
# Compute the cMDS embeddings

In [38]:
# Classical MDS on each pairwise RMSD matrix. Only element [0] of the cMDS
# result is kept -- presumably the embedded coordinates; confirm against modules.MDS.
mds_sec = cMDS(pair_rmsd_sec)[0]
mds_pkt = cMDS(pair_rmsd_pkt)[0]

### t-SNE

In [39]:
from copy import copy

def get_new_dimentions(dmr_obj, traj, frames = None, atom_mask = "@CA"):
    '''Reduce the selected atom coordinates of a trajectory with an estimator.

    Parameters
    ----------
    dmr_obj : dimensionality-reduction estimator exposing ``fit_transform``.
    traj : trajectory supporting ``traj[frames, atom_mask]`` indexing and
        exposing ``n_frames``.
    frames : frame indices to use; every frame of ``traj`` when None.
    atom_mask : atom selection passed to the trajectory indexing.

    Returns
    -------
    The output of ``fit_transform`` applied to the coordinates flattened to
    shape (n_frames, n_atoms * 3).
    '''
    if frames is None:
        frames = range(0, traj.n_frames)
    # Subset the trajectory received as argument. The previous version read
    # the global `traj_crystals` here, silently ignoring `traj`.
    traj_subset = traj[frames, atom_mask]
    # One row per frame, with the x, y, z of every selected atom flattened
    xyz_2d_array = traj_subset.xyz.reshape(traj_subset.n_frames, traj_subset.n_atoms * 3)
    # Fit a (shallow) copy so the estimator object passed in is not the one being fitted
    dmr_obj_copy = copy(dmr_obj)
    reduced = dmr_obj_copy.fit_transform( xyz_2d_array )
    return reduced

In [40]:
# tSNE

In [41]:
from sklearn.manifold import TSNE
# t-SNE is stochastic: fix the seed so that re-running the notebook
# reproduces the same embeddings.
TSNE_SEED = 42
tsne_obj = TSNE(n_components=2, learning_rate=50, random_state=TSNE_SEED)

# Transposed so that [0] and [1] index the two embedding components
tsne_sec = get_new_dimentions(tsne_obj, traj_crystals,
                              frames=None, atom_mask=sec_str_mask).T

tsne_pkt = get_new_dimentions(tsne_obj, traj_crystals,
                              frames=None, atom_mask=pkt_str_mask).T

In [44]:
# Pocket shape information
# python3 binding_site_overlap_RICCI.py -f *frame_*npy  -c --csv

# The three Tanimoto matrices read from each results directory (headerless CSV):
# plain, colored and hydrophobic, in this order.
tanimoto_files = ('POVME_Tanimoto_matrix.csv',
                  'POVME_Tanimoto_matrix_colored.csv',
                  'POVME_Tanimoto_matrix_hydrophobic.csv')

# --- Secondary structure ---
dir_volumen_results = '../../ARCHIVOS/CRISTALES/VOLUMEN/CDK2_VOL_PISANI_402/res_frameInfo/'
vol_tan_mtx, vol_tan_col_mtx, vol_tan_hyfb_mtx = (
    pd.read_csv(dir_volumen_results + csv_name, header=None)
    for csv_name in tanimoto_files
)
# cMDS on (1 - plain Tanimoto matrix); the colored and hydrophobic ones are not used
mds_vol_sec = cMDS(1 - vol_tan_mtx)[0]

# --- Pocket residues ---
# The matrix variables are rebound here, so after this cell they hold the pocket values.
dir_volumen_results = '../../ARCHIVOS/CRISTALES/VOLUMEN/CDK2_VOL_RICCI_402/res_frameInfo/'
vol_tan_mtx, vol_tan_col_mtx, vol_tan_hyfb_mtx = (
    pd.read_csv(dir_volumen_results + csv_name, header=None)
    for csv_name in tanimoto_files
)
# cMDS on (1 - plain Tanimoto matrix), as above
mds_vol_pkt = cMDS(1 - vol_tan_mtx)[0]

### Merge all values: Dimensionality Reduction

In [45]:
# Embeddings to merge, keyed by the prefix of their column names.
# Elements [0] and [1] of each one become the '<prefix>_x' and '<prefix>_y' columns.
embeddings = {
    'mds_sec': mds_sec,
    'mds_pkt': mds_pkt,
    'tsne_sec': tsne_sec,
    'tsne_pkt': tsne_pkt,
    'mds_vol_sec': mds_vol_sec,
    'mds_vol_pkt': mds_vol_pkt,
}

# Column names, in the same order as the rows stacked below
colnames = [f'{prefix}_{axis}' for prefix in embeddings for axis in ('x', 'y')]

# Stack every component as a row, then transpose so each one becomes a column
df_dims = pd.DataFrame(
    [embedding[component]
     for embedding in embeddings.values()
     for component in (0, 1)]
).T
df_dims.columns = colnames
# Rows are labelled like the protein metadata table
df_dims.index = df_prot.index

# Save the results
df_dims.to_pickle('./CDK2_DIMS_reduced_TABLE_dashApp.obj')

## 3) Table: Merged dataframes with Scores Results

In [57]:
file_name = '../6_Machine_Learning_Models/df_DkSc_results_COCRYS_CSAR_DEKOIS_DUD.pkl'
# Pickled table holding the scores together with an 'activity' column
scores_with_activity = pd.read_pickle(file_name)
# Split it into the activity labels and the remaining columns
y_true_merged = scores_with_activity['activity']
X_merged_dksc = scores_with_activity.drop(columns='activity')
X_merged_dksc.shape

(3466, 402)

## 4) Preselected Conformations

In [65]:
import joblib
import warnings
# Silence every warning from this point on
warnings.filterwarnings("ignore")

In [79]:
# Function to get the dataframe with the selected k conformations using RFE
def selected_confs_from_RFE(rfe_selector, X):
    '''Build the nested lists of conformations selected for every k.

    Parameters
    ----------
    rfe_selector : object exposing a ``ranking_`` attribute with one rank per
        column of X.
    X : DataFrame whose columns are the ranked features.

    Returns
    -------
    DataFrame with a single column, 'confs_idx_per_k', and one row per feature:
    row k - 1 holds the list with the positions of the k best-ranked features,
    ordered by rank.
    '''
    # Pair every feature with its rank; the default integer index is the feature position
    ranking_table = pd.DataFrame({'pdb_id': X.columns,
                                  'rfe_ranking': rfe_selector.ranking_})
    # Feature positions ordered from best to worst rank
    ordered_positions = ranking_table.sort_values('rfe_ranking').index
    # Growing prefixes: the top 1, the top 2, ... up to all the features
    nested_selection = []
    for k in range(1, len(ordered_positions) + 1):
        nested_selection.append(ordered_positions[:k].tolist())
    return pd.DataFrame({'confs_idx_per_k': nested_selection})


# Auxiliary function to extract the RFE preselected conformations
def get_preselected_conf_sorted_RFE(filename, X):
    '''Load a saved selector and return all feature positions sorted by rank.

    Parameters
    ----------
    filename : path of the joblib file holding the selector.
    X : DataFrame whose columns are the features ranked by the selector.

    Returns
    -------
    numpy array with the position of every feature, ordered by RFE rank.
    '''
    selector = joblib.load(filename)

    # Nested selections for k = 1 .. number of features
    selections_per_k = selected_confs_from_RFE(selector, X)

    # Every row extends the previous one, so the last row is the complete ranking
    complete_ranking = selections_per_k['confs_idx_per_k'].iloc[-1]
    return np.array(complete_ranking)

In [80]:
# Short alias (same object, not a copy) for the table without the activity column
X = X_merged_dksc

In [81]:
# Empty dictionary, filled in the cells below with one entry per
# estimator/split pair, keyed as 'RFE_<model_name>_<split>'
preselect_confs = {}

### Random Splitting

In [90]:
# Saved RFECV selectors for the random split, one per estimator:
# Logistic Regression, Random Forest and XGB tree
dataset = 'MERGED'
split = 'random'
for model_name in ('LogReg', 'RandomForest', 'XGB_tree'):
    filename = f'../6_Machine_Learning_Models/ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
    # Conformations sorted by the ranking of this estimator's selector
    preselect_confs[f'RFE_{model_name}_{split}'] = get_preselected_conf_sorted_RFE(filename, X)

### Scaffold Splitting

In [92]:
# Saved RFECV selectors for the scaffold split, one per estimator:
# Logistic Regression, Random Forest and XGB tree
dataset = 'MERGED'
split = 'scaffold'
for model_name in ('LogReg', 'RandomForest', 'XGB_tree'):
    filename = f'../6_Machine_Learning_Models/ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
    # Conformations sorted by the ranking of this estimator's selector
    preselect_confs[f'RFE_{model_name}_{split}'] = get_preselected_conf_sorted_RFE(filename, X)

In [93]:
preselect_confs

{'RFE_LogReg_random': array([  0, 270, 268, ...,  37, 286, 371]),
 'RFE_RandomForest_random': array([321, 293, 334, ..., 199, 401, 200]),
 'RFE_XGB_tree_random': array([  0, 182, 183, ..., 143, 204, 221]),
 'RFE_LogReg_scaffold': array([  0, 272, 271, ...,  41, 242, 247]),
 'RFE_RandomForest_scaffold': array([248, 333, 332, ...,   2,   1,   0]),
 'RFE_XGB_tree_scaffold': array([  0, 272, 271, ..., 137, 401, 352])}