In [2]:
import pandas as pd
import numpy as np
import pytraj as pyt
import matplotlib.pyplot as plt
import glob, sys, os
sys.path.append(r'..')

In [3]:
prot_name = 'cdk2'

## 1) Protein Metadata

In [4]:
### Protein data table

In [33]:
path_to_file = '../data/TABLA_MTDATA_CDK2_402_crys_LIGS_INFO_LABELS.json'
df_prot = pd.read_json(path_to_file)
# df_prot

In [8]:
## Load all protein structures

In [6]:
file_pdb_traj_all = '../../ARCHIVOS/CRISTALES/TRAJ_CRISTALS_PDB/PDB_402_cdk2_PISANI_ALL.pdb'
traj_crystals = pyt.load(file_pdb_traj_all)

print(F'Número de frames: {traj_crystals.n_frames}.\nNúmero de átomos: {traj_crystals.n_atoms}.')

Número de frames: 402.
Número de átomos: 4848.


In [20]:
# Used residues 

In [7]:
from modules.subsecuencias_cdk2 import *
pisiani_residues = get_pisani_residues(sep=",") # subsecuencia de la estructura secuendaria
pocket_residues = get_pocket_residues(sep=",")

In [8]:
# Mask to select CA atoms from secondary structure residues
sec_str_mask = f'(:{pisiani_residues})&(@CA)'

# Mask to select CA form pocket residues
pkt_str_mask = f'(:{pocket_residues})&(@CA)'

In [9]:
# Volume Information

In [10]:
# Volume using povme
def get_volume_results(files, col_names, index_col):
    df_columns = []
    for file in files:
        df = pd.read_csv(file, sep='\t', header=None, names=['conf_num', 'volume'])[['volume']]
        df_columns.append(df)
    # Concatenate dataframes
    res_df = pd.concat(df_columns, axis=1)
    res_df.columns = col_names
    # Set the index
    res_df.index = index_col
    return res_df

In [13]:
# Pocket residues alignment (done using vmd)
pocket_file = '../../ARCHIVOS/CRISTALES/VOLUMEN/CDK2_VOL_RICCI_402/res_volumes.tabbed.txt'

# Append results from secondary structure alignment
sec_file =  '../../ARCHIVOS/CRISTALES/VOLUMEN/CDK2_VOL_PISANI_402/res_volumes.tabbed.txt'

df_volumen = get_volume_results([pocket_file, sec_file], ['pocket', 'sec-struct'], index_col=df_prot.index)

In [None]:
# Format table for dash app

In [346]:
# Add volume info to the main table
df_dash_app = pd.concat([df_prot, df_volumen], axis=1)
# Drop unuseful columns
df_dash_app.reset_index(inplace=True)
cols2keep = {'index': 'PDB-id', 'Date': 'Date', 'Resolution': 'Resolution', 
                'Coverage': 'Coverage', 'Inhib': 'Ligand',  'Inhib_mass': 'LigMass', 
                'pocket': 'Pocket Volume (Sec)','sec-struct': 'Pocket Volume (Pkt)',
                'Labels_conf': 'Conformation'}

df_dash_app = df_dash_app[cols2keep.keys()]
df_dash_app = df_dash_app.rename(columns=cols2keep)

# Fill na values
df_dash_app.fillna(0)
df_dash_app.LigMass = [0 if i == '' else i for i in df_dash_app.LigMass ]

# Format conformation column 
df_dash_app.Conformation = df_dash_app['Conformation'].apply(lambda x: x.replace('_', ' ').capitalize())

# Cas data types
df_dash_app = df_dash_app.infer_objects()
df_dash_app
# Save the table to be added to the APP
# df_dash_app.to_pickle('./CDK2_PDB_metadata_TABLE_dashApp.obj')

df_dash_app.LigMass.astype(np.float)

0      440.0
1      398.0
2      398.0
3        0.0
4      214.0
       ...  
397    258.0
398    274.0
399    234.0
400    218.0
401      0.0
Name: LigMass, Length: 402, dtype: float64

## 2) Dimensionality Reduction Table

### Classical Multidimensional Scaling

In [35]:
from modules.MDS import cMDS
from modules.plotting_functions import plot_points, plot_anotation_labels

In [36]:
pair_rmsd_sec = pyt.pairwise_rmsd(traj = traj_crystals,
                                 mask = sec_str_mask, metric='rms')

pair_rmsd_pkt = pyt.pairwise_rmsd(traj = traj_crystals,
                                 mask = pkt_str_mask, metric='rms')

In [37]:
# Generación del cMDS

In [38]:
mds_sec = cMDS(pair_rmsd_sec)[0]
mds_pkt = cMDS(pair_rmsd_pkt)[0]

### t-SNE

In [39]:
from copy import copy

def get_new_dimentions(dmr_obj, traj, frames = None, atom_mask = "@CA"):
    ''' Retorna un objeto pca dada una trayectoria de pytraj y una mascara de selección'''
    if frames is None:
        frames = range(0, traj.n_frames)
    traj_subset = traj_crystals[frames, atom_mask]
    xyz_2d_array = traj_subset.xyz.reshape(traj_subset.n_frames, traj_subset.n_atoms * 3)
    # Crea una copia del estimador para evitar reentrenar el mismo objeto
    dmr_obj_copy = copy(dmr_obj)
    reduced = dmr_obj_copy.fit_transform( xyz_2d_array )
    return reduced

In [40]:
# tSNE

In [41]:
from sklearn.manifold import TSNE
tsne_obj = TSNE(n_components=2, learning_rate=50)

tsne_sec = get_new_dimentions(tsne_obj, traj_crystals, 
                              frames=None, atom_mask=sec_str_mask).T


tsne_pkt = get_new_dimentions(tsne_obj, traj_crystals, 
                              frames=None, atom_mask=pkt_str_mask).T

In [44]:
# Pocket shape information
# python3 binding_site_overlap_RICCI.py -f *frame_*npy  -c --csv

# *******************
# Secondary Structure
# *******************
dir_volumen_results = '../../ARCHIVOS/CRISTALES/VOLUMEN/CDK2_VOL_PISANI_402/res_frameInfo/'

vol_tan_mtx = pd.read_csv(dir_volumen_results + 'POVME_Tanimoto_matrix.csv', header=None)
vol_tan_col_mtx = pd.read_csv(dir_volumen_results + 'POVME_Tanimoto_matrix_colored.csv', header=None)
vol_tan_hyfb_mtx = pd.read_csv(dir_volumen_results +'POVME_Tanimoto_matrix_hydrophobic.csv', header=None)

# MDS using only col Tanimoto
mds_vol_sec = cMDS(1 - vol_tan_mtx)[0]

# *******************
# Pocket Residues 
# *******************
dir_volumen_results = '../../ARCHIVOS/CRISTALES/VOLUMEN/CDK2_VOL_RICCI_402/res_frameInfo/'

vol_tan_mtx = pd.read_csv(dir_volumen_results + 'POVME_Tanimoto_matrix.csv', header=None)
vol_tan_col_mtx = pd.read_csv(dir_volumen_results + 'POVME_Tanimoto_matrix_colored.csv', header=None)
vol_tan_hyfb_mtx = pd.read_csv(dir_volumen_results +'POVME_Tanimoto_matrix_hydrophobic.csv', header=None)

# MDS using only col Tanimoto
mds_vol_pkt = cMDS(1 - vol_tan_mtx)[0]

### Merge all values: Dimensionality Reduction

In [96]:
df_dims = pd.DataFrame([mds_sec[0], mds_sec[1],
                        mds_pkt[0], mds_pkt[1],
                        tsne_sec[0], tsne_sec[1],
                        tsne_pkt[0], tsne_pkt[1],
                        mds_vol_sec[0], mds_vol_sec[1],
                        mds_vol_pkt[0], mds_vol_pkt[1]]).T

# Set names
colnames = ['mds_sec_x', 'mds_sec_y',
            'mds_pkt_x', 'mds_pkt_y',
            'tsne_sec_x', 'tsne_sec_y',
            'tsne_pkt_x', 'tsne_pkt_y',
            'mds_vol_sec_x', 'mds_vol_sec_y',
            'mds_vol_pkt_x', 'mds_vol_pkt_y',
           ]

# Set the names
df_dims.columns = colnames
# Set index
df_dims.index = df_prot.index

# Save the results
# df_dims.to_pickle('./CDK2_DIMS_reduced_TABLE_dashApp.obj')

## 3) Table: Merged dataframes with Scores Results

In [111]:
%run ../modules/plotting_metrics.py

In [57]:
file_name = '../6_Machine_Learning_Models/df_DkSc_results_COCRYS_CSAR_DEKOIS_DUD.pkl'
X_merged_dksc = pd.read_pickle(file_name)
# Extract activity column
y_true_merged = X_merged_dksc['activity']
# Drop column from merged_dkksc
X_merged_dksc = X_merged_dksc.drop('activity', axis=1)
X_merged_dksc.shape

(3466, 402)

### Compute Metrics on Raw docking Scores

In [234]:
import sys
sys.path.append('../2_Docking_analysis/')
from scaffold_splitter import train_test_scaffold_split

# Compute or load the dataframe containing the Generic Murcko Scaffolds
file = '../2_Docking_analysis/df_COCRYS_CSAR_DUD_DEKOIS_Murcko_Scaffolds_SMILES.obj'

X = X_merged_dksc
y = y_true_merged

# Scaffold splitted subsets
df_scff_murcko = pd.read_pickle(file)
df_scff_murcko.shape
scaffold_series = df_scff_murcko['scff_generic']

# Create the scaffold splitting
X_scff_train, X_scff_test, y_scff_train, y_scff_test = train_test_scaffold_split(X, y, 
                                    scaffold_series = scaffold_series,
                                    test_size=0.25, stratify=y)
# The rest of subsections
lib = 'CSAR'
X_CSAR, y_CSAR = X.loc[lib], y.loc[lib]

lib = 'DEKOIS'
X_DEKOIS, y_DEKOIS = X.loc[lib], y.loc[lib]

lib = 'DUD'
X_DUD, y_DUD = X.loc[lib], y.loc[lib]

# ******************************
# Create a dictionary of subsets
# ******************************

raw_dksc_subsets = {
    'csar': (X_CSAR, y_CSAR),
    'dekois': (X_DEKOIS, y_DEKOIS),
    'dud': (X_DUD, y_DUD),
    'merged': (X, y),
    'scff-train': (X_scff_train, y_scff_train),
    'scff-test': (X_scff_test, y_scff_test)
}

In [235]:
# List of parameters to compute
roc_params = {'metric_name': 'roc_auc'}
nef_params = {'metric_name': 'nef_auc'}
pr_params = {'metric_name': 'pr_auc'}

# The Ra value for the testing set in FXa is 75/1559 = 0.05
# Therefore the maximum value of alpha for bedroc could be a=20
bedroc_20 = {'metric_name': 'bedroc', 'alpha': 20}
bedroc_10 = {'metric_name': 'bedroc', 'alpha': 10}
bedroc_2 = {'metric_name': 'bedroc', 'alpha': 2}
bedroc_05 = {'metric_name': 'bedroc', 'alpha': 0.5}

# ef values 0.001, 0.005, 0.02, 0.1, 0.2
ef_0001 = {'metric_name': 'ef', 'fraction': 0.001}
ef_0005 = {'metric_name': 'ef', 'fraction': 0.005}
ef_002 = {'metric_name': 'ef', 'fraction': 0.02}
ef_02 = {'metric_name': 'ef', 'fraction': 0.2}

# List of metrics
metrics = [roc_params, nef_params, pr_params,
           bedroc_20, bedroc_10, bedroc_2, bedroc_05,
           ef_0001, ef_0005, ef_002, ef_02]

In [236]:
def get_dksc_metric_results(X, y, metric_params):
    dict_y_preds = X.to_dict('list')
    metric_results = PlotMetric(y_true=y, y_pred_dict=dict_y_preds, 
                                decreasing=True).format_metric_results(
                                **metric_params)
    return metric_results

In [237]:
dict_metrics_dksc = {}

for subset, (X, y) in raw_dksc_subsets.items():
    for m_p in metrics:
        metric_name = '-'.join([str(i) for i in list(m_p.values())])
        colname = subset + '_' + metric_name
        
        values = get_dksc_metric_results(X, y, m_p).T.values[0]
        
        dict_metrics_dksc[colname] = values

#*******************
# Create a dataframe
#*******************

df_raw_dksc_metrics = pd.DataFrame(dict_metrics_dksc, index=df_prot.index)

## 4) Preselected Conformations

In [65]:
import joblib
import warnings
warnings.filterwarnings("ignore")

In [79]:
# Funtion to get the dataframe with selected k conformations using RFE
def selected_confs_from_RFE(rfe_selector, X):
    '''Returns a Data Frame with n rows (n = num of protein conformations) and one
       column where each cell contains a list of k indices indicating the selected conformations.
       Selected conformatios are obtained from an rfe_selector estimator and the X matrix.'''
    # Get the features ranking
    df_ranks = pd.DataFrame({'pdb_id': X.columns, 'rfe_ranking': rfe_selector.ranking_})
    # Sort features by ranking
    df_ranks.sort_values('rfe_ranking', inplace = True)
    # Get the dataframe with the list of conf indices per k conformations
    confs_per_k = [ df_ranks.index[:i + 1].tolist() for i in range(len(df_ranks))]
    df_sel_confs = pd.DataFrame({'confs_idx_per_k': confs_per_k})
    return df_sel_confs


# Auxiliar Function to extract RFE preselected conformations
def get_preselected_conf_sorted_RFE(filename, X):
    
    # RFE selector LogReg Random split
    rfe_selector = joblib.load(filename)

    #**************************
    # Get the features ranking
    df_sel_confs = selected_confs_from_RFE(rfe_selector, X)

    # Get the array of preselected conformations
    # Due to RFE is a sequential selector, we only need the last row of the dataframe
    return np.array(df_sel_confs.iloc[-1].values[0])

In [81]:
#***************************
# Create an empty dictionary
#***************************
preselect_confs = {}

### Random Splitting

In [90]:
# Logistic Regression Estimator 
dataset = 'MERGED'
model_name = 'LogReg'
split = 'random'
filename = f'../6_Machine_Learning_Models/ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# RFE selector LogReg Random split
preselect_confs[f'RFE_{model_name}_{split}'] = get_preselected_conf_sorted_RFE(filename, X)


# Random Forest Estimator 
dataset = 'MERGED'
model_name = 'RandomForest'
split = 'random'
filename = f'../6_Machine_Learning_Models/ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# RFE selector LogReg Random split
preselect_confs[f'RFE_{model_name}_{split}'] = get_preselected_conf_sorted_RFE(filename, X)


# XGB tree Estimator 
dataset = 'MERGED'
model_name = 'XGB_tree'
split = 'random'
filename = f'../6_Machine_Learning_Models/ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# RFE selector LogReg Random split
preselect_confs[f'RFE_{model_name}_{split}'] = get_preselected_conf_sorted_RFE(filename, X)

### Scaffold Splitting

In [92]:
# Logistic Regression Estimator 
dataset = 'MERGED'
model_name = 'LogReg'
split = 'scaffold'
filename = f'../6_Machine_Learning_Models/ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# RFE selector LogReg Random split
preselect_confs[f'RFE_{model_name}_{split}'] = get_preselected_conf_sorted_RFE(filename, X)


# Random Forest Estimator 
dataset = 'MERGED'
model_name = 'RandomForest'
split = 'scaffold'
filename = f'../6_Machine_Learning_Models/ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# RFE selector LogReg Random split
preselect_confs[f'RFE_{model_name}_{split}'] = get_preselected_conf_sorted_RFE(filename, X)


# XGB tree Estimator 
dataset = 'MERGED'
model_name = 'XGB_tree'
split = 'scaffold'
filename = f'../6_Machine_Learning_Models/ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# RFE selector LogReg Random split
preselect_confs[f'RFE_{model_name}_{split}'] = get_preselected_conf_sorted_RFE(filename, X)

In [93]:
preselect_confs

{'RFE_LogReg_random': array([  0, 270, 268, ...,  37, 286, 371]),
 'RFE_RandomForest_random': array([321, 293, 334, ..., 199, 401, 200]),
 'RFE_XGB_tree_random': array([  0, 182, 183, ..., 143, 204, 221]),
 'RFE_LogReg_scaffold': array([  0, 272, 271, ...,  41, 242, 247]),
 'RFE_RandomForest_scaffold': array([248, 333, 332, ...,   2,   1,   0]),
 'RFE_XGB_tree_scaffold': array([  0, 272, 271, ..., 137, 401, 352])}

## 5) Conformational Selection Results

In [100]:
import pickle

In [208]:
filename = '../6_Machine_Learning_Models/CDK2_ML_results_conformational_selection.obj'
with open(filename, 'rb') as f:
    dic_ml_results = pickle.load(f)

# FINAL OBJECT

In [238]:
FINAL_DIC = dict(
    df_PROT_METADATA = df_dash_app,
    df_DIM_REDUCT = df_dims,
    df_DKSC_METRICS = df_raw_dksc_metrics,
    df_SELECTED_CONFS = preselect_confs,
    dict_ML_RESULTS = dic_ml_results
)

In [439]:
preselect_confs

{'RFE_LogReg_random': array([  0, 270, 268, ...,  37, 286, 371]),
 'RFE_RandomForest_random': array([321, 293, 334, ..., 199, 401, 200]),
 'RFE_XGB_tree_random': array([  0, 182, 183, ..., 143, 204, 221]),
 'RFE_LogReg_scaffold': array([  0, 272, 271, ...,  41, 242, 247]),
 'RFE_RandomForest_scaffold': array([248, 333, 332, ...,   2,   1,   0]),
 'RFE_XGB_tree_scaffold': array([  0, 272, 271, ..., 137, 401, 352])}

In [314]:
# Save the object
data_file = './CDK2_dash_app_results.obj'
with open(data_file, 'wb') as f:
    pickle.dump(FINAL_DIC, f)

# Plot Testing

In [240]:
import plotly.graph_objects as go

In [325]:
metric = 'bedroc-20'

In [326]:
W = FINAL_DIC['df_DKSC_METRICS'].filter(regex=metric)


selected_points = preselect_confs['RFE_LogReg_random'][0:34]
W.filter(regex='scff|merged')
W

Unnamed: 0,csar_bedroc-20,dekois_bedroc-20,dud_bedroc-20,merged_bedroc-20,scff-train_bedroc-20,scff-test_bedroc-20
1aq1,0.539,0.201,0.216,0.297,0.314,0.306
1b38,0.127,0.089,0.272,0.245,0.303,0.181
1b39,0.088,0.124,0.327,0.311,0.338,0.274
1buh,0.496,0.119,0.086,0.193,0.172,0.211
1ckp,0.077,0.163,0.306,0.372,0.420,0.330
...,...,...,...,...,...,...
6q4g,0.039,0.097,0.101,0.241,0.269,0.205
6q4h,0.054,0.121,0.158,0.274,0.324,0.212
6q4i,0.135,0.173,0.107,0.304,0.328,0.258
6q4j,0.180,0.126,0.122,0.282,0.345,0.170


In [280]:
# Violin plots
fig = go.Figure()

for column in W:
    fig.add_trace(
        go.Violin(
        y = W[column],
       name=column.split('_')[0].upper(),
       jitter=1, points='all', side='positive',
       box_visible=True,
       selectedpoints=selected_points,
           marker= dict(
               size = 5
        )
        )
    )
fig

In [327]:
## Scatter Plot
df_DIM_REDUCT

Unnamed: 0,mds_sec_x,mds_sec_y,mds_pkt_x,mds_pkt_y,tsne_sec_x,tsne_sec_y,tsne_pkt_x,tsne_pkt_y,mds_vol_sec_x,mds_vol_sec_y,mds_vol_pkt_x,mds_vol_pkt_y
1aq1,0.447117,0.128934,-0.005042,0.078324,-0.333573,3.693628,-23.480253,1.799659,0.029833,-0.063703,-0.037282,0.068246
1b38,0.498714,-0.190018,-0.147474,-0.133852,7.647439,6.533896,-3.304227,-0.772078,0.115577,0.159151,0.057118,-0.151794
1b39,0.591065,-0.188184,-0.090459,-0.120443,6.201058,6.832195,-3.581550,-0.632519,0.137015,0.137407,0.058764,-0.139097
1buh,0.469182,-0.332170,-0.462317,-0.329182,18.949434,1.859857,6.321007,-5.739815,0.080822,0.198052,0.014419,-0.216752
1ckp,0.498730,-0.211238,-0.416193,-0.165841,8.654486,6.208575,0.762444,-14.850564,0.082367,-0.034609,0.035167,0.038303
...,...,...,...,...,...,...,...,...,...,...,...,...
6q4g,0.662916,-0.376188,-0.560737,-0.367091,21.615829,2.994778,5.327835,-18.505230,0.172777,0.086522,0.114973,-0.079964
6q4h,0.654137,-0.363412,-0.545670,-0.334475,21.609739,3.425165,5.288434,-18.361229,0.138966,0.069850,0.081968,-0.060474
6q4i,0.659985,-0.316495,-0.471819,-0.253619,19.054459,11.644764,6.035275,-16.082022,0.101212,0.180100,0.082982,-0.185039
6q4j,0.708471,-0.320230,-0.423223,-0.175359,21.787132,3.671082,4.832711,-19.523121,0.136786,0.039892,0.120044,-0.031240


In [412]:
df_DIM_REDUCT = FINAL_DIC['df_DIM_REDUCT']
method = 'mds'
prot_section = 'sec'

name = f'{method}_{prot_section}_'
Z = df_DIM_REDUCT[[name + 'x', name + 'y']]
Z.columns = ['x', 'y']

Unnamed: 0,PDB-id,Date,Resolution,Coverage,Ligand,LigMass,Pocket Volume (Sec),Pocket Volume (Pkt),Conformation,x,y
0,1aq1,1997-11-12,2.00,92.953020,STU,440.0,887.0,859.0,Inact b,,
1,1b38,1998-12-23,2.00,97.315436,ATP,398.0,698.0,670.0,Inact a,,
2,1b39,1998-12-23,2.10,97.315436,ATP,398.0,672.0,654.0,Inact a,,
3,1buh,1998-09-09,2.60,96.308725,,0,657.0,643.0,Inact a,,
4,1ckp,1999-01-13,2.05,93.624161,PVB,214.0,747.0,732.0,Inact a,,
...,...,...,...,...,...,...,...,...,...,...,...
6q4g,,NaT,,,,,,,,0.662916,-0.376188
6q4h,,NaT,,,,,,,,0.654137,-0.363412
6q4i,,NaT,,,,,,,,0.659985,-0.316495
6q4j,,NaT,,,,,,,,0.708471,-0.320230


In [437]:
# [(i, j) for i, j in df_dash_app[['Conformation', 'LigMass']].list()]

df_dash_app[['Conformation', 'LigMass']].to_dict('list')

df_dash_app[['PDB-id', 'Ligand', 'LigMass']].to_dict('list')

{'PDB-id': ['1aq1',
  '1b38',
  '1b39',
  '1buh',
  '1ckp',
  '1di8',
  '1dm2',
  '1e1v',
  '1e1x',
  '1e9h',
  '1f5q',
  '1fin',
  '1fq1',
  '1fvt',
  '1fvv',
  '1g5s',
  '1gih',
  '1gii',
  '1gij',
  '1gy3',
  '1gz8',
  '1h00',
  '1h01',
  '1h07',
  '1h08',
  '1h0v',
  '1h0w',
  '1h1p',
  '1h1q',
  '1h1r',
  '1h1s',
  '1h24',
  '1h25',
  '1h26',
  '1h27',
  '1h28',
  '1hck',
  '1hcl',
  '1jst',
  '1jsv',
  '1jvp',
  '1ke5',
  '1ke6',
  '1ke7',
  '1ke8',
  '1ke9',
  '1ogu',
  '1oi9',
  '1oiq',
  '1oir',
  '1oit',
  '1oiu',
  '1oiy',
  '1okv',
  '1okw',
  '1ol1',
  '1ol2',
  '1p2a',
  '1p5e',
  '1pf8',
  '1pkd',
  '1pw2',
  '1pxi',
  '1pxj',
  '1pxk',
  '1pxl',
  '1pxm',
  '1pxn',
  '1pxo',
  '1pxp',
  '1pye',
  '1qmz',
  '1r78',
  '1urc',
  '1urw',
  '1v1k',
  '1vyw',
  '1vyz',
  '1w0x',
  '1w8c',
  '1w98',
  '1wcc',
  '1y8y',
  '1y91',
  '1ykr',
  '2a0c',
  '2a4l',
  '2b52',
  '2b53',
  '2b54',
  '2b55',
  '2bhe',
  '2bhh',
  '2bkz',
  '2bpm',
  '2btr',
  '2bts',
  '2c4g',
  '2c5n',


In [417]:
fig = go.Figure()

color_by = 'Conformation'
labels_col = df_dash_app[color_by]


if color_col.dtype == 'object':
    discrete_colors = ['#2a7885', '#5a8b59', '#f64a3b', 'grey', '#fecc6a', '#69d7c4']
    labels = labels_col.unique()
    
    # Select the number of colors
    n_labels = len(col_labels)
    color_mapper = {i:j for i, j in zip(labels, discrete_colors[:n_labels])}
    
    color_col = color_col.map(color_mapper)

for label in labels:
    Zs = Z.query(f'{color_by} == "{label}"')
    fig.add_trace(go.Scatter(
        x = Zs.x, y = Zs.y,
        mode='markers',
        name=labels_col,
        marker = dict(
            color=color_col.to_list(),
            size=df_dash_app['LigMass'].astype(np.float),
            sizemode='diameter',
            sizeref=30,
            line_width=0
        )

    ))

UndefinedVariableError: name 'Conformation' is not defined

In [352]:
fig = go.Figure()

import plotly.express as px



fig = px.scatter(
    x = Z.x, y = Z.y,
    color= df_dash_app['Conformation'],
    size=df_dash_app['LigMass'].astype(np.float),
    line=dict(width=None),
    color_discrete_sequence = discrete_colors,
    opacity=0.6
)

selected = Z.iloc[[1, 2, 45]]
fig.add_trace(go.Scatter(
    x = selected.x, y = selected.y,
    mode='markers',
    marker_color='black'
))

fig.show()

TypeError: scatter() got an unexpected keyword argument 'line'