# Notebook for loading and processing molecules using RDKit

In [3]:
import pandas as pd
import numpy as np
from glob import glob
import os, pickle

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import RDLogger 
RDLogger.DisableLog('rdApp.*')

In [7]:
from analyse_db_molecules import *

### Cocrystallized Molecules

In [8]:
# Folder holding the SDF files of the cocrystallized ligands.
sdf_input_path = '../../FILES/CRYSTALS/LIGS_FXA/POCKET_LIGS_PREP_SDF/'


def sort_function(x):
    """Return the part of the file name that precedes the first underscore.

    Passed to `get_files_list` as `sort_func`.
    """
    return x.split('/')[-1].split('_')[0]


list_sdf_files = get_files_list(sdf_input_path, actives_name='LIG', sufix='from_pdb', sort_func=sort_function)
df_pdi_lig = load_cocrys_molecules_from_dir(list_sdf_files)

# Just for cocrystallized molecules:
# transform the previous dataframe into a Lig / Activity / mol_rdk / sanitized dataframe.
# `.assign` creates a new frame instead of writing columns into a slice of
# `df_pdi_lig`, which is what triggered pandas' SettingWithCopyWarning.
df_COCRYS = (
    df_pdi_lig[['Lig', 'mol_rdk']]
    .assign(
        Activity='active',
        # Every validation label other than 'v3' is flagged as sanitized.
        # `.to_numpy()` keeps the assignment positional, like the original list.
        sanitized=(df_pdi_lig['validation'] != 'v3').to_numpy(),
    )
    [['Lig', 'Activity', 'mol_rdk', 'sanitized']]
    .drop_duplicates('Lig')
    .set_index('Lig')
)
df_COCRYS.sanitized.value_counts()

True     108
False     20
Name: sanitized, dtype: int64

### DEKOIS Molecules

In [9]:
# Folder holding the DEKOIS2 SDF files.
sdf_input_path = '../../FILES/CRYSTALS/LIGS_FXA/DEKOIS2/sdf/'

# Collect the SDF files, load them, then build the molecules dataframe.
list_sdf_files = get_files_list(sdf_input_path, actives_name='ligand')
dekois_mols = load_molecules_from_dir(list_sdf_files)
df_DEKOIS = get_mol_dataframe(dekois_mols)
df_DEKOIS.head(3)

Unnamed: 0,Activity,mol_rdk,sanitized
ligand_1,active,<rdkit.Chem.rdchem.Mol object at 0x7f52f28041c0>,True
ligand_2,active,<rdkit.Chem.rdchem.Mol object at 0x7f52f28036c0>,True
ligand_3,active,<rdkit.Chem.rdchem.Mol object at 0x7f52f28038a0>,True


### DUD Molecules

In [10]:
# Folder holding the DUD2006 SDF files.
sdf_input_path = '../../FILES/CRYSTALS/LIGS_FXA/DUD2006/sdf/'

# Collect the SDF files, load them, then build the molecules dataframe.
list_sdf_files = get_files_list(sdf_input_path, actives_name='ligand')
dud_mols = load_molecules_from_dir(list_sdf_files)
df_DUD = get_mol_dataframe(dud_mols)
df_DUD.head(3)

Unnamed: 0,Activity,mol_rdk,sanitized
ligand_1,active,<rdkit.Chem.rdchem.Mol object at 0x7f52f28ef800>,True
ligand_2,active,<rdkit.Chem.rdchem.Mol object at 0x7f52f28ef080>,True
ligand_3,active,<rdkit.Chem.rdchem.Mol object at 0x7f52f28efbc0>,True


### DUD-E Dataset 

In [6]:
# Folder holding the DUD-E SDF files.
sdf_input_path = '../../FILES/CRYSTALS/LIGS_FXA/DUD_E/sdf/'

# Collect the SDF files, load them, then build the molecules dataframe.
list_sdf_files = get_files_list(sdf_input_path, actives_name='ligand')
dude_mols = load_molecules_from_dir(list_sdf_files)
df_DUDE = get_mol_dataframe(dude_mols)
df_DUDE.shape

(21209, 3)

## Save Molecules Dictionary

In [15]:
# Dataset name -> molecules dataframe, without the DUD-E set.
lig_datasets = dict(COCRYS=df_COCRYS, DUD=df_DUD, DEKOIS=df_DEKOIS)

# Same mapping with the DUD-E set included (key order kept: COCRYS, DUD, DUDE, DEKOIS).
lig_datasets_with_DUDE = dict(
    COCRYS=df_COCRYS,
    DUD=df_DUD,
    DUDE=df_DUDE,
    DEKOIS=df_DEKOIS,
)

In [18]:
# `pickle` and `os` are already imported in the first code cell of the notebook.
file_rd_mols = './fxa_rdkit_db_molecules.obj'

if os.path.isfile(file_rd_mols):
    # Cache hit: the dictionary built above is REPLACED by the pickled one.
    # Delete the file to force regeneration from the SDF inputs.
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # files that were produced by this notebook.
    with open(file_rd_mols, 'rb') as f:
        lig_datasets = pickle.load(f)
else:
    # Cache miss: persist the freshly built dictionary for later runs.
    with open(file_rd_mols, 'wb') as f:
        pickle.dump(lig_datasets, f)

In [19]:
# `pickle` and `os` are already imported in the first code cell of the notebook.
file_rd_mols = './fxa_rdkit_db_molecules_with_DUDE.obj'

if os.path.isfile(file_rd_mols):
    # Cache hit: the dictionary built above is REPLACED by the pickled one.
    # Delete the file to force regeneration from the SDF inputs.
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # files that were produced by this notebook.
    with open(file_rd_mols, 'rb') as f:
        lig_datasets_with_DUDE = pickle.load(f)
else:
    # Cache miss: persist the freshly built dictionary for later runs.
    with open(file_rd_mols, 'wb') as f:
        pickle.dump(lig_datasets_with_DUDE, f)

In [21]:
# Shell escape: list every file matching *obj in the working directory, with sizes.
!ls -alh *obj

-rw-rw-r-- 1 ricci ricci 6.1M jun 26 22:16 df_COCRYS_DUD_DEKOIS_Murcko_Scaffolds_SMILES.obj
-rw-rw-r-- 1 ricci ricci 6.3M jun 26 19:52 df_COCRYS_DUD_DEKOIS_with_Fingerprints_MDS.obj
-rw-rw-r-- 1 ricci ricci 6.0M jun 26 18:44 df_COCRYS_DUD_DEKOIS_with_Fingerprints_TSNE.obj
-rw-rw-r-- 1 ricci ricci 184K jun 26 18:33 df_repeated_mols_among_DUD_DEKIOS_COCRYS.obj
-rw-rw-r-- 1 ricci ricci 849K ago 18 10:26 FXA_dash_app_Consensus_results.obj
-rw-rw-r-- 1 ricci ricci 769K jul 15 23:28 FXA_ML_results_conformational_selection.obj
-rw-rw-r-- 1 ricci ricci 6.6M sep 14 13:54 fxa_rdkit_db_molecules.obj
-rw-rw-r-- 1 ricci ricci  25M sep 14 13:55 fxa_rdkit_db_molecules_with_DUDE.obj
