# Get Molecular Libraries

- `obabel` is assumed to be installed: https://anaconda.org/openbabel/openbabel
- Molecules will be exported into `sdf` and `mol2` formats for loading with `rdkit` and docking with [SMINA](https://sourceforge.net/projects/smina/), respectively.

In [1]:
import wget
import gzip
import shutil
import tarfile
import pandas as pd
from glob import glob
import subprocess as sp
from pathlib import Path

In [2]:
# Root directory under which each library's output folder is created
LIB_DIR = './datasets/'

<h4 style='color: black; background-color: #F9E5AB; padding: 5px;'>
    Important!
</h4>

- The `prot_name` is used to download the specific protein target's dataset.
- To work with a protein other than CDK2, FXa, EGFR, or HSP90, double-check the protein's name exactly as it appears in each dataset.

In [3]:
# Target protein name; interpolated into the DEKOIS (upper-cased) and DUD file names
prot_name = 'cdk2'

## DEKOIS 2.0

Downloaded from: http://www.pharmchem.uni-tuebingen.de/dekois/

In [4]:
# Datasets url
DEKOIS_URL = 'http://www.pharmchem.uni-tuebingen.de/dekois/data/'
# DEKOIS_URL already ends with '/', so strip it before joining;
# otherwise the request path contains an accidental '//'
dekois_base = DEKOIS_URL.rstrip('/')
dekois_actives_url = f'{dekois_base}/DEKOIS2_actives/{prot_name.upper()}.sdf.gz'
dekois_decoys_url  = f'{dekois_base}/DEKOIS2_decoys/{prot_name.upper()}_Celling-v1.12_decoyset.sdf.gz'

# Output directory
DEKOIS_DIR = f'{LIB_DIR}/DEKOIS2/'
Path(DEKOIS_DIR).mkdir(exist_ok = True, parents = True)

# Download and extract each file
for lib_set in [dekois_actives_url, dekois_decoys_url]:
    # Download it (skipped when the archive is already on disk).
    # The full target path is passed to wget so the file always lands
    # exactly where the existence check looks for it.
    set_path = Path(DEKOIS_DIR, lib_set.split('/')[-1])
    if not set_path.exists():
        wget.download(lib_set, out = str(set_path))

    # Decompress next to the archive; with_suffix('') drops only the
    # trailing '.gz' (e.g. CDK2.sdf.gz -> CDK2.sdf), regardless of what
    # the parent directories are called
    sdf_path = set_path.with_suffix('')
    with gzip.open(set_path, 'rb') as f_in, \
          open(sdf_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

# Create the output directories
Path(f'{DEKOIS_DIR}/mol2').mkdir(exist_ok = True)
Path(f'{DEKOIS_DIR}/sdf').mkdir(exist_ok = True)

In [4]:
%%bash -s $DEKOIS_DIR $prot_name
# Split the DEKOIS multi-molecule SDF files into one file per molecule.
#   $1: DEKOIS output directory
#   $2: protein name (the DEKOIS file names use it in upper case)

# Derive the file names from the protein name instead of hard-coding CDK2
target=$(echo "$2" | tr '[:lower:]' '[:upper:]')
actives="$1/${target}.sdf"
decoys="$1/${target}_Celling-v1.12_decoyset.sdf"

# MOL2 Files
# Actives
obabel "$actives" -O "$1/mol2/ligand_.mol2" -m
# Decoys
obabel "$decoys" -O "$1/mol2/decoy_.mol2" -m

# SDF Files
# Actives
obabel "$actives" -O "$1/sdf/ligand_.sdf" -m
# Decoys
obabel "$decoys" -O "$1/sdf/decoy_.sdf" -m

40 molecules converted
40 files output. The first is ./datasets//DEKOIS2//mol2/ligand_1.mol2
1200 molecules converted
1200 files output. The first is ./datasets//DEKOIS2//mol2/decoy_1.mol2


## DUD-2006

Downloaded from: http://dud.docking.org/inhibox.html

In [5]:
# Datasets url
DUD_URL = 'http://dud.docking.org/inhibox/allDUDfiles_Gasteiger.tar.gz'

# Output directory (parents = True so this cell also works when
# LIB_DIR has not been created by an earlier cell)
DUD_DIR = f'{LIB_DIR}/DUD/'
Path(DUD_DIR).mkdir(exist_ok = True, parents = True)

# Download the file (skipped when the archive is already on disk).
# The full target path is passed to wget so the file always lands
# exactly where the existence check looks for it.
set_path = Path(DUD_DIR, DUD_URL.split('/')[-1])
if not set_path.exists():
    wget.download(DUD_URL, out = str(set_path))

# Names of the target's molecule files inside the archive
ligands_file = f'{prot_name}_ligands_Gasteiger.mol2'
decoys_file  = f'{prot_name}_decoys_Gasteiger.mol2'

# Extract only those two members; extract() returns None,
# so there is nothing to keep from the call
with tarfile.open(str(set_path)) as t:
    for mol_file in (ligands_file, decoys_file):
        t.extract(f'allDUDfiles_Gasteiger/{mol_file}', path = DUD_DIR)

# Create the output directories
Path(f'{DUD_DIR}/mol2').mkdir(exist_ok = True)
Path(f'{DUD_DIR}/sdf').mkdir(exist_ok = True)

In [6]:
%%bash -s $DUD_DIR/allDUDfiles_Gasteiger $ligands_file $decoys_file
# Write every DUD molecule to its own file, once per output format.
#   $1: directory holding the extracted DUD files
#   $2: actives (ligands) file name
#   $3: decoys file name

# MOL2 first, then SDF; within each format actives precede decoys
for fmt in mol2 sdf
do
    obabel $1/$2 -O $1/../$fmt/ligand_.$fmt -m --gen3d
    obabel $1/$3 -O $1/../$fmt/decoy_.$fmt -m --gen3d
done

## CSAR Library

Downloaded from: https://drugdesigndata.org/about/datasets/220

```python
# Install the openpyxl package to read the xlsx file 
!pip install openpyxl 
```

In [4]:
# Download the CSAR dataset
CSAR_URL = 'https://drugdesigndata.org/php/file-download.php?type=extended&id=99'

# Output directory (parents = True so this cell also works when
# LIB_DIR has not been created by an earlier cell)
CSAR_DIR = f'{LIB_DIR}/CSAR/'
Path(CSAR_DIR).mkdir(exist_ok = True, parents = True)

# Download the file (skipped when it is already on disk).
# The URL carries no file name, so the full target path is passed to
# wget; the file then always lands where the existence check and
# read_excel look for it.
set_path = Path(CSAR_DIR,
                'CDK2_Binding_Data_Corrected_2016AUG18.xlsx.xlsx')
if not set_path.exists():
    wget.download(CSAR_URL, out = str(set_path))

# Read the xlsx file and extract the SMILES
csar_data = pd.read_excel(set_path, engine='openpyxl')
# Rows without an explicit label are treated as actives.
# Assign the result back: a chained `df[col].fillna(..., inplace=True)`
# does not reliably update the parent frame in recent pandas versions.
csar_data["ActiveInactive"] = csar_data["ActiveInactive"].fillna("Active")
# Binary label: 1 for 'Active', 0 for anything else
csar_data['activity'] = (csar_data["ActiveInactive"] == 'Active').astype(int)
# Drop null values
csar_data.dropna(inplace = True, subset = ['Compound_ID'])
# Save the dataset (written before the SMILES column is cleaned below)
csar_data.to_csv(f'{CSAR_DIR}/csar_dataset.csv')
# Clean SMILES column: keep only the text before the first space.
# The .str accessor leaves missing values untouched instead of raising.
csar_data['SMILES'] = csar_data['SMILES'].str.split(' ').str[0]

# Save the smiles to a text file ("<SMILES> <Compound_ID>" per line)
csar_data[['SMILES', 'Compound_ID']]\
                .to_csv(f'{CSAR_DIR}/CSAR_SMILES.smi',
                        header = False, index = False, sep = ' ')

# Create the output directories
Path(f'{CSAR_DIR}/mol2').mkdir(exist_ok = True)
Path(f'{CSAR_DIR}/sdf').mkdir(exist_ok = True)

  warn(msg)


In [5]:
%%bash -s $CSAR_DIR
# Convert the CSAR SMILES into one MOL2 file per molecule
# (hydrogens added for pH 7.0, Gasteiger charges, generated 3D coordinates).
#   $1: CSAR output directory
# MOL2 Files
obabel \
    -ismi "$1/CSAR_SMILES.smi" \
    -omol2 -O "$1/mol2/temp_.mol2" -m \
    -p 7.0 --partialcharge gasteiger \
    --gen3d

# Rename the molecules using the original name.
# Only the freshly written temp_* files are visited, so re-running the
# cell does not try to rename files that already carry their final name.
for lig in "$1"/mol2/temp_*.mol2
do
    # Unmatched glob stays literal: nothing to rename
    [ -e "$lig" ] || continue
    # The molecule name is the second line of a MOL2 file
    name=$(head -n 2 "$lig" | tail -n 1)
    # Keep the temp_ name rather than creating a nameless ".mol2" file
    [ -n "$name" ] || continue
    mv "$lig" "$(dirname "$lig")/$name.mol2"
done

*** Open Babel Error  in Do
  3D coordinate generation failed
*** Open Babel Error  in Do
  3D coordinate generation failed
*** Open Babel Error  in Do
  3D coordinate generation failed
*** Open Babel Error  in Do
  3D coordinate generation failed
111 molecules converted
111 files output. The first is ./datasets//CSAR//mol2/temp_1.mol2


In [8]:
%%bash -s $CSAR_DIR
# Convert the CSAR SMILES into one SDF file per molecule
# (hydrogens added for pH 7.0, Gasteiger charges, generated 3D coordinates).
#   $1: CSAR output directory
# SDF Files
obabel \
    -ismi "$1/CSAR_SMILES.smi" \
    -osdf -O "$1/sdf/temp_.sdf" -m \
    -p 7.0 --partialcharge gasteiger \
    --gen3d

# Rename the molecules using the original name.
# Only the freshly written temp_* files are visited, so re-running the
# cell does not try to rename files that already carry their final name.
for lig in "$1"/sdf/temp_*.sdf
do
    # Unmatched glob stays literal: nothing to rename
    [ -e "$lig" ] || continue
    # The molecule name is the first line of an SDF file
    name=$(head -n 1 "$lig")
    # Keep the temp_ name rather than creating a nameless ".sdf" file
    [ -n "$name" ] || continue
    mv "$lig" "$(dirname "$lig")/$name.sdf"
done

*** Open Babel Error  in Do
  3D coordinate generation failed
  No 2D or 3D coordinates exist. Stereochemical information will be stored using an Open Babel extension. To generate 2D or 3D coordinates instead use --gen2D or --gen3D.
*** Open Babel Error  in Do
  3D coordinate generation failed
  No 2D or 3D coordinates exist. Stereochemical information will be stored using an Open Babel extension. To generate 2D or 3D coordinates instead use --gen2D or --gen3D.
*** Open Babel Error  in Do
  3D coordinate generation failed
  No 2D or 3D coordinates exist. Stereochemical information will be stored using an Open Babel extension. To generate 2D or 3D coordinates instead use --gen2D or --gen3D.
*** Open Babel Error  in Do
  3D coordinate generation failed
  No 2D or 3D coordinates exist. Stereochemical information will be stored using an Open Babel extension. To generate 2D or 3D coordinates instead use --gen2D or --gen3D.
111 molecules converted
111 files output. The first is ./datasets//C

## Cocrystalized molecules
Molecules obtained with the notebook `../1_Download_and_prepare_protein_ensembles/5_Get_cocrystalized_molecules_from_PDB`.

In [9]:
# Cocrystalized ligands directory
DIR_MAIN      = '../1_Download_and_prepare_protein_ensembles/pdb_structures'
DIR_PREP_LIGS = f'{DIR_MAIN}/pocket_ligands'


# Output directory (parents = True so this cell also works when
# LIB_DIR has not been created by an earlier cell)
COCRYS_DIR = f'{LIB_DIR}/COCRYS/'
Path(COCRYS_DIR).mkdir(exist_ok = True, parents = True)

# List all available files and create a
# dictionary with the molecule name as key
# Duplicates will be removed in further notebooks
list_of_files = sorted(glob(f'{DIR_PREP_LIGS}/*pdb'))
cocrys_mols = {file.split('/')[-1].split('.pdb')[0]: file
                 for file in list_of_files
              }
print(f'{len(cocrys_mols)} unique compounds of ' +\
      f'{len(list_of_files)} cocrystalized molecules')

# Convert every molecule with obabel: all MOL2 files first, then all SDF files.
# The command is passed as an argument list (no shell), so file names
# containing spaces or shell metacharacters are handed to obabel verbatim.
for fmt in ('mol2', 'sdf'):
    Path(f'{COCRYS_DIR}/{fmt}').mkdir(exist_ok = True)
    for name, file in cocrys_mols.items():
        # Failures are not raised: a molecule obabel cannot convert
        # does not stop the remaining conversions
        sp.run([
            'obabel', '-ipdb', file,
            f'-o{fmt}', '-O', f'{COCRYS_DIR}/{fmt}/{name}.{fmt}',
            '-p', '7.0', '--gen3d', '--partialcharge', 'gasteiger',
        ])

140 unique compounds of 140 cocrystalized molecules


Finished!