## Ligand Efficiency

In [1]:
import pandas as pd
import numpy as np
from glob import glob

In [2]:
from rdkit import Chem, RDLogger
from rdkit.Chem import rdchem
# Necessary to silence RDKit warnings related to molecule kekulization
RDLogger.DisableLog('rdApp.*')

In [3]:
def get_ligand_efficiency(scores_df, n_atoms_df, column_name='ActiveInactive'):
    """Divide every score column by the ligand's number of heavy atoms.

    Parameters
    ----------
    scores_df : pandas.DataFrame
        Table containing `column_name` plus the columns to be divided.
    n_atoms_df : pandas.DataFrame
        Table with a ``Num_heavy_atoms`` column whose index is identical
        (same labels, same order) to the index of `scores_df`.
    column_name : str, optional
        Column excluded from the division and re-inserted unchanged as the
        first column of the result.

    Returns
    -------
    pandas.DataFrame
        `column_name` followed by the remaining columns of `scores_df`,
        each divided row-wise by ``Num_heavy_atoms`` and rounded to 4
        decimals.

    Raises
    ------
    AssertionError
        If the two indexes differ or `column_name` is not in `scores_df`.
    """
    # Index.equals also copes with indexes of different length, where an
    # element-wise `==` would raise an unrelated ValueError.
    assert scores_df.index.equals(n_atoms_df.index), 'Index are not the same'
    assert column_name in scores_df.columns, \
        f'Make sure "{column_name}" column exists in scores_df'
    # Row-wise division of the score columns by the heavy-atom count
    lef_df = (
        scores_df.drop([column_name], axis=1)
        .div(n_atoms_df['Num_heavy_atoms'], axis=0)
        .round(4)
    )
    # Put the excluded column back at the beginning
    lef_df.insert(0, column_name, scores_df[column_name])
    return lef_df

In [4]:
def get_number_atoms(list_mol_files):
    """Count the heavy atoms of the first molecule stored in each SDF file.

    Parameters
    ----------
    list_mol_files : iterable of str
        Paths to the molecule files. The ligand name is the file's base
        name up to (not including) its first dot.

    Returns
    -------
    pandas.Series
        Heavy-atom counts indexed by ligand name, in the order the files
        were given. An empty input yields an empty Series.

    Raises
    ------
    ValueError
        If the first record of a file cannot be parsed into a molecule.
    """
    # Local import keeps this cell independent of the notebook's import cell.
    import os

    names, counts = [], []
    for mol_file in list_mol_files:
        # basename handles the platform's path separator, unlike split('/')
        mol_name = os.path.basename(mol_file).split('.')[0]
        mol = Chem.SDMolSupplier(mol_file, sanitize=False)[0]
        if mol is None:
            # The supplier yields None for records it cannot parse
            raise ValueError(f'Could not read a molecule from {mol_file!r}')
        names.append(mol_name)
        counts.append(mol.GetNumHeavyAtoms())
    # Explicit dtype so an empty input still gives an integer Series
    return pd.Series(counts, index=names, dtype='int64')

### Read docking (Dk) results from the previous notebook

In [5]:
def get_ligand_efficiency(scores_df, n_atoms_series, activity_col_name='activity'):
    """Divide every score column by the ligand's number of heavy atoms.

    Parameters
    ----------
    scores_df : pandas.DataFrame
        Table containing `activity_col_name` plus the columns to be divided.
    n_atoms_series : pandas.Series
        Heavy-atom count per ligand. It must hold the same index labels as
        `scores_df`; the order may differ.
    activity_col_name : str, optional
        Column excluded from the division and re-inserted unchanged as the
        first column of the result.

    Returns
    -------
    pandas.DataFrame
        `activity_col_name` followed by the remaining columns of
        `scores_df`, each divided row-wise by the heavy-atom count and
        rounded to 3 decimals. Rows keep the order of `scores_df`.

    Raises
    ------
    AssertionError
        If the two indexes do not hold the same labels or
        `activity_col_name` is not a column of `scores_df`.
    """
    # Compare the sorted labels; Index.equals also copes with indexes of
    # different length, where an element-wise `==` would raise ValueError.
    same_labels = scores_df.index.sort_values().equals(
        n_atoms_series.index.sort_values())
    assert same_labels, 'Index are not the same'
    assert activity_col_name in scores_df.columns, \
        f'Make sure "{activity_col_name}" column exists in scores_df'
    # Align the counts to the row order of the scores. Dividing by the
    # aligned Series avoids adding a temporary column that could overwrite
    # an existing one in the input.
    n_heavy_atoms = n_atoms_series.reindex(scores_df.index)
    df_leff = (
        scores_df.drop([activity_col_name], axis=1)
        .div(n_heavy_atoms, axis=0)
        .round(3)
    )
    # Put the activity column back at the beginning
    df_leff.insert(0, activity_col_name, scores_df[activity_col_name])
    return df_leff

First get the number of atoms per ligand.

### CSAR 

In [12]:
# CSAR inputs: the ligand SDF files and the csv table read below.
list_mol_files = glob('../../ARCHIVOS/CRISTALES/LIGS_CDK2/CSAR/sdf/*')
dksc_filename = 'CSAR_VINARDO_403_prots_111_mols.csv'

# First get the number of heavy atoms per ligand.
n_atoms_series = get_number_atoms(list_mol_files)
# Then load the table, indexed by its 'ligand' column.
# The results are kept under CSAR-specific names so this cell does not
# overwrite the DEKOIS dataframes held in the notebook namespace.
df_csar_vrd = pd.read_csv(dksc_filename, index_col='ligand')
# Ligand efficiency values
df_leff_csar_vrd = get_ligand_efficiency(df_csar_vrd, n_atoms_series)
# Save as csv, using the input file name with a "_LigEff" suffix
df_leff_csar_vrd.to_csv(dksc_filename.replace('.csv', '') + '_' + 'LigEff.csv')

### DEKOIS

In [8]:
# DEKOIS inputs: the ligand SDF files and the csv table read below.
list_mol_files = glob('../../ARCHIVOS/CRISTALES/LIGS_CDK2/DEKOIS_2/sdf/*')
dksc_filename = 'DEKOIS2_VINARDO_403_prots_1240_mols.csv'

# Number of heavy atoms of each ligand
n_atoms_series = get_number_atoms(list_mol_files)
# Table loaded with its 'ligand' column as the index
df_dekois_vrd = pd.read_csv(dksc_filename, index_col='ligand')
# Ligand efficiency values computed from the table and the atom counts
df_leff_dekois_vrd = get_ligand_efficiency(df_dekois_vrd, n_atoms_series)
# Written next to the input, named after it with a "_LigEff" suffix
df_leff_dekois_vrd.to_csv(f"{dksc_filename.replace('.csv', '')}_LigEff.csv")

### DUD

In [11]:
# DUD inputs: the ligand SDF files and the csv table read below.
list_mol_files = glob('../../ARCHIVOS/CRISTALES/LIGS_CDK2/DUD_2006/sdf/*')
dksc_filename = 'DUD2006_VINARDO_403_prots_2146_mols.csv'

# Number of heavy atoms of each ligand
n_atoms_series = get_number_atoms(list_mol_files)
# Table loaded with its 'ligand' column as the index
df_dud_vrd = pd.read_csv(dksc_filename, index_col='ligand')
# Ligand efficiency values computed from the table and the atom counts
df_leff_dud_vrd = get_ligand_efficiency(df_dud_vrd, n_atoms_series)
# Written next to the input, named after it with a "_LigEff" suffix
df_leff_dud_vrd.to_csv(f"{dksc_filename.replace('.csv', '')}_LigEff.csv")