In [1]:
import pandas as pd
import numpy as np
from glob import glob
import sys
import os

In [13]:
def _get_the_dataframe(dir_path):
    '''
    Read every file matched by ``dir_path + '*'`` as csv and join them column-wise.

    Parameters
    ----------
    dir_path : str
        Prefix handed to ``glob`` with a ``*`` appended. To read the whole
        content of a directory it has to end with a path separator.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation of all the files, indexed by their ``Ligando``
        column (index renamed to ``ligand``). Column names are stripped of
        surrounding white space and of the literal ``DkScore_`` text.

    Raises
    ------
    AssertionError
        If no file matches the pattern.
    '''
    list_files = glob(dir_path + '*')
    assert list_files, f'No files found matching {dir_path}*'
    # Order the files by the part of the base name that precedes the first '_'.
    # Note this is a plain string (lexicographic) sort, not a numeric one.
    # os.path.basename is used instead of splitting on '/' so the key is still
    # the file name when glob returns paths with the OS separator (Windows).
    list_files.sort(key=lambda x: os.path.basename(x).split('_')[0])

    list_of_dfs = [pd.read_csv(i, index_col='Ligando') for i in list_files]
    df = pd.concat(list_of_dfs, axis=1)
    df.index.names = ['ligand'] # Rename the index
    # Rename columns: strip first to remove surrounding white space, then drop
    # the prefix as a literal string (regex=False) whatever the pandas default is
    df.columns = df.columns.str.strip().str.replace('DkScore_', '', regex=False)
    return df

def process_smina_docking_results(dir_path, mol_library, docking_tool, save = True,
                                  cocrys_molecules=False, cocrys_pattern=True):
    '''
    Merge the Smina docking csv files found under `dir_path` into one table and
    add an `activity` column to it.

    dir_path: prefix passed to `_get_the_dataframe` to locate the csv files.
    mol_library, docking_tool: only used to build the name of the output csv.
    save: when True the table is written to the current working directory as
        `<mol_library>_<docking_tool>_<n_cols>_prots_<m_mols>_mols.csv`.
    cocrys_molecules: keep it False when molecules are named "ligand_xxx" /
        "decoy_xxx"; names containing 'ligand' are labelled 1, the rest 0, and
        rows are ordered actives first, then by the integer after the last '_'.
        When True every molecule is labelled 1 and rows are ordered by name.
    cocrys_pattern: only read when cocrys_molecules is True. If True, names are
        expected to look like 'pdbid_LIG_XXX' and are replaced by their second
        '_'-separated field (LIG, the three letter molecule name).

    Prints the shape of the resulting table and returns it.
    '''
    scores = _get_the_dataframe(dir_path)

    if not cocrys_molecules:
        mol_names = list(scores.index)
        # Active/decoy label is read from the molecule name
        scores['activity'] = [int('ligand' in name) for name in mol_names]
        # Temporary sort key: the number that follows the last underscore
        scores['order'] = [int(name.rsplit('_', 1)[-1]) for name in mol_names]
        flat = scores.reset_index()
        flat = flat.sort_values(['activity', 'order'], ascending=[False, True])
        scores = flat.set_index('ligand').drop('order', axis=1)
    else:
        # Cocrystallized molecules are all assumed to be active
        scores['activity'] = 1
        # Alphabetical order by ligand name, no numeric key needed here
        flat = scores.reset_index().sort_values('ligand', ascending = True)
        scores = flat.set_index('ligand')
        if cocrys_pattern:
            scores.index = scores.index.map(lambda name: name.split('_')[1])

    # n_cols also counts the 'activity' column added above
    m_mols, n_cols = scores.shape
    print('Shape of the final df:', scores.shape)
    if save:
        scores.to_csv(f'./{mol_library}_{docking_tool}_{n_cols}_prots_{m_mols}_mols.csv')
    return scores


### COCRYSTALIZED MOLECULES
#### VINARDO

In [18]:
# Identifiers used both in the input path and in the name of the saved csv
mol_library = 'COCRYS'
docking_tool = 'VINARDO'
# Location of the csv files with the docking results
COCRYS_PATH = f'../../ARCHIVOS/CRISTALES/DOCKINGS/DOCK_402_crys_{mol_library}/{docking_tool}/CSV/'

# All molecules are labelled active and their names are kept unchanged
df_cocrys_vrd = process_smina_docking_results(
    dir_path=COCRYS_PATH,
    mol_library=mol_library,
    docking_tool=docking_tool,
    cocrys_molecules=True,
    cocrys_pattern=False,
)
# df_cocrys_vrd

Shape of the final df: (261, 403)


### CSAR MOLECULES
Originally these results were split into two files (the first with 95 molecules and the second with 16); those files were later merged using bash.
#### VINARDO

In [59]:
# DIR OF FILES
mol_library = 'CSAR'
docking_tool = 'VINARDO'
# NOTE: the variable keeps the name COCRYS_PATH but points to the CSAR results
COCRYS_PATH = f'../../ARCHIVOS/CRISTALES/DOCKINGS/DOCK_402_crys_{mol_library}/{docking_tool}/CSV/'

# cocrys_molecules=True is used only to get the rows sorted by molecule name;
# the constant activity column it adds is dropped and rebuilt below.
df_csar_vrd = process_smina_docking_results(COCRYS_PATH, mol_library, docking_tool, cocrys_molecules=True,
                                             cocrys_pattern=False)
df_csar_vrd = df_csar_vrd.drop('activity', axis=1)

#*******
# Activity has to be taken from the CSAR binding table because
# it is not implicit in the molecule names
#*******
csar_data = pd.read_excel('../../ARCHIVOS/CDK2_Binding_Data_Corrected_2016AUG18.xlsx', sheet_name="Binding", index_col=0)
# Missing labels are treated as "Active". The result is assigned back instead
# of calling fillna(inplace=True) on the selected column: that form is a chained
# assignment which leaves csar_data untouched under pandas Copy-on-Write, and
# the missing labels would then be encoded as 0 on the next line.
csar_data["ActiveInactive"] = csar_data["ActiveInactive"].fillna("Active")
csar_data['activity'] = csar_data["ActiveInactive"].apply(lambda x: 1 if x == 'Active' else 0)
# One-column DataFrame (not a Series) so it can be concatenated column-wise
activity_series = csar_data[['activity']]

# update the dataframe of results, add first activity to preserve the rows order
df_csar_vrd = pd.concat([activity_series, df_csar_vrd], axis=1)
# Move activity column to the end to match other tables
df_csar_vrd = df_csar_vrd[df_csar_vrd.columns[1:].to_list() + ['activity']]

#*************
# Update the dataframe saved by process_smina_docking_results
#*************
n_cols = df_csar_vrd.shape[1]
m_mols = df_csar_vrd.shape[0]
file = f'./{mol_library}_{docking_tool}_{n_cols}_prots_{m_mols}_mols.csv'
df_csar_vrd.to_csv(file)
# Read the file back so the displayed table is exactly what was saved
df_csar_vrd = pd.read_csv(file, index_col=0)
df_csar_vrd


Shape of the final df: (111, 403)


Unnamed: 0,1aq1,1b38,1b39,1buh,1ckp,1di8,1dm2,1e1v,1e1x,1e9h,...,6q4c,6q4d,6q4e,6q4f,6q4g,6q4h,6q4i,6q4j,6q4k,activity
CS1,-8.1,-6.9,-7.1,-7.6,-6.9,-7.5,-8.5,-7.4,-6.9,-7.2,...,-6.6,-6.1,-6.3,-6.3,-6.4,-6.9,-6.3,-7.1,-6.5,1
CS2,-7.1,-6.0,-6.1,-6.5,-6.1,-6.2,-7.3,-6.6,-6.2,-6.6,...,-5.7,-5.1,-5.7,-5.7,-5.4,-6.2,-5.5,-5.7,-5.4,1
CS3,-7.9,-7.9,-7.6,-6.8,-7.8,-7.7,-8.7,-7.6,-7.6,-8.0,...,-6.8,-6.4,-7.0,-6.8,-7.2,-7.3,-6.5,-6.9,-6.5,1
CS4,-8.7,-8.4,-8.1,-7.6,-8.1,-8.0,-8.3,-8.3,-8.0,-8.6,...,-7.4,-6.6,-7.0,-6.8,-7.2,-7.3,-6.8,-7.0,-6.8,1
CS5,-7.9,-7.8,-7.2,-6.7,-8.2,-7.7,-7.2,-7.9,-7.4,-7.3,...,-7.2,-7.2,-6.9,-6.7,-7.1,-7.9,-6.5,-7.6,-7.1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CS232,-9.4,-9.7,-9.8,-8.3,-10.0,-8.6,-8.8,-9.3,-8.7,-9.4,...,-9.6,-9.2,-8.7,-8.8,-9.3,-9.1,-8.4,-8.1,-9.0,0
CS234,-8.7,-9.0,-7.1,-7.3,-7.8,-9.5,-8.4,-9.2,-8.8,-8.6,...,-7.6,-6.7,-8.6,-8.6,-6.8,-8.6,-6.9,-8.9,-7.4,0
CS236,-10.1,-8.5,-8.9,-8.1,-8.1,-9.1,-9.0,-8.7,-9.4,-9.1,...,-8.0,-7.5,-8.2,-8.3,-8.6,-8.9,-8.0,-8.7,-7.9,0
CS237,-7.9,-7.9,-7.8,-7.8,-7.8,-8.2,-7.7,-8.4,-8.2,-8.3,...,-7.2,-7.1,-7.5,-7.4,-7.2,-7.9,-7.3,-7.6,-7.2,0


### COCRYSTALIZED MOLECULES
#### VINARDO

In [18]:
# NOTE(review): this cell repeats the COCRYS / VINARDO cell found earlier in the
# notebook (same execution count); running it again produces the same table.
# Identifiers used both in the input path and in the name of the saved csv
mol_library = 'COCRYS'
docking_tool = 'VINARDO'
# Location of the csv files with the docking results
COCRYS_PATH = f'../../ARCHIVOS/CRISTALES/DOCKINGS/DOCK_402_crys_{mol_library}/{docking_tool}/CSV/'

# All molecules are labelled active and their names are kept unchanged
df_cocrys_vrd = process_smina_docking_results(
    dir_path=COCRYS_PATH,
    mol_library=mol_library,
    docking_tool=docking_tool,
    cocrys_molecules=True,
    cocrys_pattern=False,
)
# df_cocrys_vrd

Shape of the final df: (261, 403)


### DEKOIS 
#### VINARDO

In [20]:
# Identifiers used both in the input path and in the name of the saved csv
mol_library = 'DEKOIS2'
docking_tool = 'VINARDO'
# NOTE: the variable keeps the name COCRYS_PATH but points to the DEKOIS2 results
COCRYS_PATH = f'../../ARCHIVOS/CRISTALES/DOCKINGS/DOCK_402_crys_{mol_library}/{docking_tool}/CSV/'

# Molecules follow the "ligand_xxx" / "decoy_xxx" naming, so activity is read from the name
df_dekois_vrd = process_smina_docking_results(
    dir_path=COCRYS_PATH,
    mol_library=mol_library,
    docking_tool=docking_tool,
    cocrys_molecules=False,
)
# df_dekois_vrd

Shape of the final df: (1240, 403)


### DUD 2006
#### VINARDO

In [17]:
# Identifiers used both in the input path and in the name of the saved csv
mol_library = 'DUD2006'
docking_tool = 'VINARDO'
# NOTE: the variable keeps the name COCRYS_PATH but points to the DUD2006 results
COCRYS_PATH = f'../../ARCHIVOS/CRISTALES/DOCKINGS/DOCK_402_crys_{mol_library}/{docking_tool}/CSV/'

# Molecules follow the "ligand_xxx" / "decoy_xxx" naming, so activity is read from the name
df_dud_vrd = process_smina_docking_results(
    dir_path=COCRYS_PATH,
    mol_library=mol_library,
    docking_tool=docking_tool,
    cocrys_molecules=False,
)
# df_dud_vrd

Shape of the final df: (2146, 403)


Unnamed: 0_level_0,1aq1,1b38,1b39,1buh,1ckp,1di8,1dm2,1e1v,1e1x,1e9h,...,6q4c,6q4d,6q4e,6q4f,6q4g,6q4h,6q4i,6q4j,6q4k,activity
ligand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ligand_1,-9.1,-8.4,-8.5,-7.4,-8.6,-8.9,-8.4,-8.5,-8.5,-8.8,...,-7.4,-6.6,-7.5,-7.1,-6.8,-7.9,-7.1,-8.7,-6.6,1
ligand_2,-8.2,-6.9,-6.8,-6.5,-6.7,-8.1,-8.0,-7.7,-8.0,-7.5,...,-6.6,-6.6,-6.7,-6.6,-6.3,-7.3,-6.3,-6.8,-6.6,1
ligand_3,-9.0,-7.4,-8.3,-7.8,-7.1,-8.6,-8.0,-8.9,-7.5,-7.7,...,-6.6,-6.7,-7.6,-6.1,-6.1,-8.3,-6.7,-9.3,-6.5,1
ligand_4,-9.6,-8.2,-7.6,-7.0,-7.3,-8.5,-8.0,-9.1,-9.0,-7.7,...,-7.9,-7.1,-7.6,-7.8,-6.7,-8.9,-6.1,-9.6,-6.6,1
ligand_5,-9.0,-7.4,-8.3,-7.8,-7.1,-8.7,-7.9,-8.9,-8.8,-7.7,...,-6.6,-6.7,-7.6,-6.1,-6.1,-8.3,-6.7,-9.3,-6.5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
decoy_2070,-9.2,-7.5,-7.4,-7.0,-7.0,-8.7,-7.6,-7.5,-8.3,-6.9,...,-7.9,-8.4,-7.0,-7.3,-7.6,-8.2,-7.7,-8.7,-7.8,0
decoy_2071,-10.0,-7.8,-7.5,-7.5,-8.2,-8.3,-9.5,-8.6,-8.1,-8.2,...,-7.6,-7.4,-8.0,-7.5,-8.0,-8.0,-7.6,-8.3,-7.4,0
decoy_2072,-8.4,-6.0,-6.4,-7.0,-7.2,-8.0,-7.1,-7.0,-7.2,-7.1,...,-7.3,-7.1,-7.4,-6.9,-6.6,-7.2,-7.0,-8.0,-7.3,0
decoy_2073,-10.3,-8.8,-8.9,-8.1,-8.9,-8.6,-9.0,-9.4,-9.0,-9.4,...,-8.6,-7.4,-8.4,-7.2,-7.6,-8.9,-7.5,-8.5,-7.5,0
