# Improving Ranking From Docking Scores

This notebook applies different methodologies to improve the ranking of ligands beyond the raw scores obtained directly from docking.

The methods applied here are reviewed by Arciniega and Lange (2014).

- Ligand Efficiency

In [172]:
import pandas as pd
import numpy as np
import glob, os, sys
sys.path.append(r'..')

These results are referred to as 'DockScore' and correspond to the best ligand score given by the tool employed.

In [173]:
data_dir = '../data'
# CSAR results (one table per docking tool / variant)
df_ad4_results_LE = pd.read_csv(F'{data_dir}/vs_docking_crys_ensemble_AD4_LE.csv', index_col=0)
df_ad4_results_LC = pd.read_csv(F'{data_dir}/vs_docking_crys_ensemble_AD4_LC.csv', index_col=0)
df_vina_results   = pd.read_csv(F'{data_dir}/vs_docking_crys_ensemble_VINA.csv', index_col=0)
df_vinardo_results = pd.read_csv(F'{data_dir}/vs_docking_crys_ensemble_VINARDO.csv', index_col=0)
# DUD results
# The file name is fully specified (no wildcards), so the path is joined directly
# instead of going through glob: a missing file now raises FileNotFoundError from
# read_csv rather than an IndexError from indexing an empty glob result.
# NOTE: despite its name, this variable holds the path to a CSV file; the name is
# kept unchanged in case other cells refer to it.
patho_to_json_vrd8_file = os.path.join(data_dir,
                       'vs_docking_DUD2006_vs_402_crys_vinardo_8x.csv')
df_vinardo_DUD = pd.read_csv(patho_to_json_vrd8_file, index_col=0)

## Ligand Efficiency

Here, Ligand Efficiency is the quotient between the ligand's best docking score and the number of heavy atoms in the ligand.

Because we only need the number of heavy atoms, we can directly load the molecule using `rdkit`.

In [174]:
def get_ligand_efficiency(scores_df, n_atoms_df, column_name = 'ActiveInactive'):
    """Divide every docking score by the ligand's number of heavy atoms.

    Parameters
    ----------
    scores_df : pandas.DataFrame
        One row per ligand. Must contain the label column `column_name`;
        every other column is treated as a score and is divided.
    n_atoms_df : pandas.DataFrame
        Must have the same index as `scores_df` (same labels, same order)
        and a 'Num_heavy_atoms' column.
    column_name : str
        Name of the label column that is excluded from the division.

    Returns
    -------
    pandas.DataFrame
        `column_name` as the first column (values unchanged), followed by
        the score columns divided row-wise by 'Num_heavy_atoms'.

    Raises
    ------
    AssertionError
        If the indexes differ or `column_name` is not in `scores_df`.
    """
    # Compare lengths first: an element-wise comparison of two Index objects of
    # different length raises ValueError inside pandas, which would hide the
    # assertion message below.
    same_index = (len(scores_df.index) == len(n_atoms_df.index)
                  and all(scores_df.index == n_atoms_df.index))
    assert same_index, 'Index are not the same'
    # Report the column that was actually requested, not the default name
    assert column_name in scores_df.columns, F'Make sure "{column_name}" column exists in scores_df'
    # Row-wise division of the score columns by the heavy-atom counts
    lef_df = scores_df.drop([column_name], axis = 1).div(n_atoms_df['Num_heavy_atoms'], axis = 0)
    # Finally the label column is reinserted at the beginning
    lef_df.insert(0, column_name, scores_df[column_name])
    return lef_df

def write_ligand_efficiency_table(scores_df, n_atoms_df, file_name, column_name = 'ActiveInactive'):
    """Compute the ligand-efficiency table and write it as CSV to `file_name`.

    The table is produced by `get_ligand_efficiency` with the same
    `scores_df`, `n_atoms_df` and `column_name`; nothing is returned.
    """
    get_ligand_efficiency(
        scores_df, n_atoms_df, column_name = column_name
    ).to_csv(file_name)

In [175]:
from rdkit import Chem, RDLogger
from rdkit.Chem import rdchem
# Necessary to silence the warnings related to molecule kekulization
RDLogger.DisableLog('rdApp.*')

### Ligand Efficiency of CSAR Results

First, we need to get the number of heavy atoms per ligand.

In [176]:
files_csar_ligs_path = os.path.join(*'../../ARCHIVOS/CRISTALES/LIGS_CDK2/CSAR/sdf/*'.split('/'))
files_csar_ligs = glob.glob(files_csar_ligs_path)
# Numeric ordering by the number that follows the 'CS' prefix in the file name.
# os.path.basename is used instead of splitting on '/', because the glob pattern
# is built with os.path.join and therefore uses the platform's path separator.
files_csar_ligs.sort(key = lambda x: int(os.path.basename(x).split('.')[0].replace('CS', '')))

# Map each ligand name (file name up to the first '.') to its heavy-atom count
csar_natm_ligs_dic = {}
for sdf_file in files_csar_ligs:
    # Only the first record of each SDF file is read; sanitize=False skips
    # RDKit's sanitization step.
    mol = Chem.SDMolSupplier(sdf_file, sanitize = False)[0]
    if mol is None:
        # RDKit yields None for a record it cannot parse; fail with the file name
        # instead of an AttributeError on None.
        raise ValueError(F'RDKit could not read a molecule from {sdf_file}')
    csar_natm_ligs_dic[os.path.basename(sdf_file).split('.')[0]] = mol.GetNumHeavyAtoms()

# Build a one-row frame (ligands as columns) and transpose it, so that ligands
# become the index and 'Num_heavy_atoms' the single column
csar_natm_ligs = pd.DataFrame(csar_natm_ligs_dic, index = ['Num_heavy_atoms']).T

#### Saving the score tables

In [177]:
# Labels used to compose the output file names
database, ensemble = 'CSAR', 'CRYS_402'
# Docking tool label -> table of scores obtained with that tool
docking_tools_dic = {
    'AD4_LE': df_ad4_results_LE,
    'AD4_LC': df_ad4_results_LC,
    'VINA': df_vina_results,
    'VINARDO': df_vinardo_results,
}

# One ligand-efficiency CSV is written per docking tool
for tool, scores in docking_tools_dic.items():
    filename = f'{data_dir}/docking_scores/vs_dk_{ensemble}_{database}_{tool}_ligand_Efficiency.csv'
    write_ligand_efficiency_table(scores, csar_natm_ligs, filename)

### Ligand Efficiency of DUD Results

First, we need to get the number of heavy atoms per ligand.

In [178]:
files_dud_ligs_path = os.path.join(*'../../ARCHIVOS/CRISTALES/LIGS_CDK2/DUDE_DECOYS/DUD_2006/sdf/*'.split('/'))
files_dud_ligs = glob.glob(files_dud_ligs_path)

def dud_file_number(path):
    """Return the integer found between the last '_' and the next '.' of a file name.

    Only the base name of `path` is inspected, so underscores in directory
    names and the platform's path separator do not affect the result.
    For example, 'some_dir/ligand_12.sdf' gives 12.
    Raises ValueError if that part of the name is not an integer.
    """
    return int(os.path.basename(path).split('_')[-1].split('.')[0])

# Numeric ordering.
# First the actives are separated from the inactives, then each set is ordered.
# A file is an active when the part of its name before the first '_' contains
# 'ligand', and a decoy when it contains 'decoy'.
actives_dud = [f for f in files_dud_ligs
               if 'ligand' in os.path.basename(f).split('_')[0]]
actives_dud_ord = sorted(actives_dud, key = dud_file_number)

decoys_dud = [f for f in files_dud_ligs
              if 'decoy' in os.path.basename(f).split('_')[0]]
decoys_dud_ord = sorted(decoys_dud, key = dud_file_number)

# Now both lists are merged: actives first, then decoys
files_dud_ligs = actives_dud_ord + decoys_dud_ord

In [179]:
# Map each ligand name (file name up to the first '.') to its heavy-atom count.
# os.path.basename is used instead of splitting on '/', so the name is extracted
# correctly whatever path separator the platform uses.
dud_natm_ligs_dic = {}
for sdf_file in files_dud_ligs:
    # Only the first record of each SDF file is read; sanitize=False skips
    # RDKit's sanitization step.
    mol = Chem.SDMolSupplier(sdf_file, sanitize = False)[0]
    if mol is None:
        # RDKit yields None for a record it cannot parse; fail with the file name
        # instead of an AttributeError on None.
        raise ValueError(F'RDKit could not read a molecule from {sdf_file}')
    dud_natm_ligs_dic[os.path.basename(sdf_file).split('.')[0]] = mol.GetNumHeavyAtoms()

# Build a one-row frame (ligands as columns) and transpose it, so that ligands
# become the index and 'Num_heavy_atoms' the single column
dud_natm_ligs = pd.DataFrame(dud_natm_ligs_dic, index = ['Num_heavy_atoms']).T

#### Saving the score tables

In [180]:
# Labels used to compose the output file name
database, ensemble = 'DUD2006', 'CRYS_402'
# Docking tool label -> table of scores obtained with that tool
docking_tools_dic = {
    'VINARDO': df_vinardo_DUD,
}

# One ligand-efficiency CSV is written per docking tool; in this table the
# label column is named 'Actividad'
for tool, scores in docking_tools_dic.items():
    filename = f'{data_dir}/docking_scores/vs_dk_{ensemble}_{database}_{tool}_ligand_Efficiency.csv'
    write_ligand_efficiency_table(scores, dud_natm_ligs, filename,
                                  column_name = 'Actividad')