In [1]:
import pandas as pd
import numpy as np
from glob import glob
import sys
import os

We will use the following helper functions to load and combine the Smina docking results:

In [16]:
def _get_the_dataframe(dir_path):
    list_files = glob(dir_path + '*')
    assert list_files
    # Sort the values in natural order
    list_files.sort(key = lambda x: x.split('/')[-1].split('_')[0])

    list_of_dfs = [pd.read_csv(i, index_col='Ligando') for i in list_files]
    df = pd.concat(list_of_dfs, axis=1)
    df.index.names = ['ligand'] # Rename the index
    # Rename columns
    df.columns = df.columns.str.strip().str.replace('DkScore_', '') # First strip to remove all white spaces
    return df

def process_smina_docking_results(dir_path, mol_library, docking_tool, save = True, 
                                  cocrys_molecules=False, cocrys_pattern=True):
    '''
    Merge a set of csv files with Smina docking results into one table, add an
    'activity' column, sort the rows and optionally write the table to disk.

    Parameters
    ----------
    dir_path : str
        Location of the csv files; forwarded to `_get_the_dataframe`.
    mol_library, docking_tool : str
        Only used to build the name of the saved csv file.
    save : bool
        When true, the table is written to the current directory as
        '{mol_library}_{docking_tool}_{n_cols}_prots_{m_mols}_mols.csv'.
        `n_cols` is the full column count, so it includes 'activity'.
    cocrys_molecules : bool
        False (default): molecule names are expected to look like "ligand_xxx" /
        "decoy_xxx". Names containing 'ligand' get activity 1, the rest 0; rows
        are ordered actives first, then by the integer after the last underscore.
        True: every molecule is assumed active (activity 1) and rows are sorted
        by name.
    cocrys_pattern : bool
        Only read when `cocrys_molecules` is true. If set, names are expected to
        follow the pattern 'pdbid_LIG_XXX' (LIG being the three-letter molecule
        name) and the index is replaced by the second underscore-separated field.

    Returns
    -------
    pandas.DataFrame
        The final table. Its shape is also printed.
    '''
    scores = _get_the_dataframe(dir_path)

    if not cocrys_molecules:
        # Activity is read from the molecule name
        scores['activity'] = [int('ligand' in name) for name in scores.index]
        # Temporary numeric key so that e.g. *_10 does not sort before *_2
        scores['order'] = [int(name.rsplit('_', 1)[-1]) for name in scores.index]
        flat = scores.reset_index()
        flat = flat.sort_values(['activity', 'order'], ascending=[False, True])
        scores = flat.set_index('ligand').drop('order', axis=1)
    else:
        # Co-crystallized molecules are all taken as actives
        scores['activity'] = 1
        # Rows are ordered by their (full) name rather than by a numeric suffix
        flat = scores.reset_index().sort_values('ligand', ascending = True)
        scores = flat.set_index('ligand')
        if cocrys_pattern:
            scores.index = scores.index.map(lambda name: name.split('_')[1])

    # Report the final size and, if requested, store the table as csv
    m_mols, n_cols = scores.shape
    print('Shape of the final df:', scores.shape)
    if save:
        scores.to_csv(f'./{mol_library}_{docking_tool}_{n_cols}_prots_{m_mols}_mols.csv')
    return scores


#### COCRYSTALLIZED MOLECULES

In [19]:
# Library / docking tool handled by this cell, and the directory with its csv files
mol_library = 'COCRYS'
docking_tool = 'VINARDO'
COCRYS_PATH = f'../../FILES/CRYSTALS/DOKINGS/CDK2_LIGANDS_CROSS/{mol_library}/{docking_tool}/'

# cocrys_molecules=True: every molecule is labelled active and rows are sorted by name.
# cocrys_pattern=False: the molecule names are kept as they come in the files.
df_cocrys_vrd = process_smina_docking_results(
    dir_path=COCRYS_PATH,
    mol_library=mol_library,
    docking_tool=docking_tool,
    cocrys_molecules=True,
    cocrys_pattern=False,
)

df_cocrys_vrd.head()

Shape of the final df: (261, 137)


Unnamed: 0_level_0,1c5m,1ezq,1f0r,1f0s,1fax,1fjs,1g2l,1g2m,1hcg,1ioe,...,4y76,4y79,4y7a,4y7b,4zh8,4zha,5k0h,5voe,5vof,activity
ligand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
02Z,-7.4,-8.4,-7.7,-7.8,-7.5,-8.2,-8.3,-7.7,-7.7,-7.8,...,-7.3,-7.8,-7.7,-7.3,-7.6,-8.9,-8.6,-7.1,-7.7,1
03K,-8.1,-8.4,-8.6,-7.9,-8.0,-8.4,-8.8,-8.7,-8.6,-8.3,...,-8.0,-8.2,-8.1,-8.0,-7.9,-8.8,-8.9,-8.3,-8.4,1
03Z,-9.4,-9.2,-9.7,-9.6,-9.3,-9.0,-10.2,-9.7,-9.2,-10.0,...,-9.1,-9.1,-9.7,-9.2,-8.7,-9.5,-9.4,-9.1,-9.2,1
04Z,-9.6,-9.6,-10.6,-9.8,-9.5,-9.8,-10.7,-10.7,-10.3,-10.9,...,-9.7,-10.2,-10.6,-9.8,-10.5,-10.8,-10.3,-9.3,-9.9,1
06Z,-9.0,-9.2,-8.5,-9.2,-8.3,-8.9,-8.3,-8.6,-8.6,-9.6,...,-8.8,-8.6,-8.6,-8.6,-9.3,-8.7,-8.4,-8.4,-8.7,1


### DEKOIS 
#### VINARDO

In [13]:
# Library / docking tool handled by this cell, and the directory with its csv files
mol_library = 'DEKOIS'
docking_tool = 'VINARDO'
DEKOIS_PATH = f'../../FILES/CRYSTALS/DOKINGS/CDK2_LIGANDS_CROSS/{mol_library}/{docking_tool}/'

# Default mode: activity is taken from the "ligand"/"decoy" molecule names
df_dekois_vrd = process_smina_docking_results(
    dir_path=DEKOIS_PATH,
    mol_library=mol_library,
    docking_tool=docking_tool,
)

Shape of the final df: (1240, 137)


### DUD 
#### VINARDO

In [14]:
# Library / docking tool handled by this cell, and the directory with its csv files
mol_library = 'DUD'
docking_tool = 'VINARDO'
DUD_PATH = f'../../FILES/CRYSTALS/DOKINGS/CDK2_LIGANDS_CROSS/{mol_library}/{docking_tool}/'

# Default mode: activity is taken from the "ligand"/"decoy" molecule names
df_dud_vrd = process_smina_docking_results(
    dir_path=DUD_PATH,
    mol_library=mol_library,
    docking_tool=docking_tool,
)

Shape of the final df: (2146, 137)


### CSAR 
#### VINARDO

In [23]:
# DIR OF FILES
mol_library = 'CSAR'
docking_tool = 'VINARDO'
# CSAR-specific name, so COCRYS_PATH from the co-crystal cell is not overwritten
CSAR_PATH = f'../../FILES/CRYSTALS/DOKINGS/CDK2_LIGANDS_CROSS/{mol_library}/{docking_tool}/'

# Loaded in "cocrys" mode only to merge and sort the files; the all-ones
# activity column it adds is a placeholder and is dropped right away.
df_csar_vrd = process_smina_docking_results(CSAR_PATH, mol_library, docking_tool, cocrys_molecules=True,
                                             cocrys_pattern=False)
df_csar_vrd = df_csar_vrd.drop('activity', axis=1)

#*******
# We need to update activity for this set because activity
# is not implicit in the names
#*******
csar_data = pd.read_excel('../../../CDK2/ARCHIVOS/CDK2_Binding_Data_Corrected_2016AUG18.xlsx', sheet_name="Binding", index_col=0)
# Missing labels are treated as "Active". The result is assigned back instead of
# calling fillna(inplace=True) on the selected column, which is not guaranteed to
# reach the parent frame (and never does under pandas Copy-on-Write).
csar_data["ActiveInactive"] = csar_data["ActiveInactive"].fillna("Active")
csar_data['activity'] = csar_data["ActiveInactive"].eq('Active').astype(int)
activity_series = csar_data[['activity']]

# update the dataframe of results, add first activity to preserve the rows order
df_csar_vrd = pd.concat([activity_series, df_csar_vrd], axis=1)
# Move activity column to the end to match other tables
df_csar_vrd = df_csar_vrd[df_csar_vrd.columns[1:].to_list() + ['activity']]
df_csar_vrd.index.name = 'ligand'
# Sort index by the integer that follows the 'CS' prefix (CS2 before CS10)
index_natsort = sorted(df_csar_vrd.index.to_list(), key= lambda x: int(x.split('CS')[1]))
df_csar_vrd = df_csar_vrd.reindex(index_natsort)

#*************
# Update the dataframe saved by process_smina_docking function
#*************
n_cols = df_csar_vrd.shape[1]
m_mols = df_csar_vrd.shape[0]
file = f'./{mol_library}_{docking_tool}_{n_cols}_prots_{m_mols}_mols.csv'
df_csar_vrd.to_csv(file)
# Be sure that we are saving the correct df
df_csar_vrd = pd.read_csv(file, index_col=0)
df_csar_vrd

Shape of the final df: (111, 137)


Unnamed: 0_level_0,1c5m,1ezq,1f0r,1f0s,1fax,1fjs,1g2l,1g2m,1hcg,1ioe,...,4y76,4y79,4y7a,4y7b,4zh8,4zha,5k0h,5voe,5vof,activity
ligand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CS1,-8.4,-7.9,-7.0,-6.9,-7.0,-8.1,-8.4,-8.4,-8.5,-8.0,...,-7.8,-7.4,-6.9,-7.5,-6.7,-8.5,-7.8,-8.1,-7.7,1
CS2,-6.5,-7.7,-7.3,-7.0,-6.3,-7.5,-7.8,-7.1,-7.2,-8.2,...,-6.2,-6.9,-6.4,-6.7,-7.0,-7.6,-7.6,-6.2,-7.3,1
CS3,-8.0,-7.9,-7.9,-8.1,-7.8,-8.0,-8.0,-7.8,-8.2,-7.7,...,-7.6,-7.6,-7.7,-7.5,-7.5,-8.2,-8.0,-7.7,-8.2,1
CS4,-8.6,-8.9,-8.0,-7.6,-7.8,-8.4,-9.4,-8.9,-8.9,-9.0,...,-7.9,-8.3,-8.5,-8.1,-8.1,-8.7,-8.8,-8.1,-8.5,1
CS5,-8.2,-8.5,-8.0,-8.2,-7.5,-8.2,-8.7,-8.4,-8.7,-7.9,...,-7.7,-7.8,-8.1,-8.1,-7.8,-8.3,-8.4,-8.1,-7.9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CS247,-10.5,-10.4,-9.9,-10.4,-11.3,-10.9,-10.7,-10.6,-10.1,-11.5,...,-11.4,-10.2,-11.8,-10.1,-10.9,-10.6,-10.5,-10.2,-10.8,1
CS248,-9.1,-9.8,-9.2,-8.8,-9.1,-9.1,-9.9,-10.7,-9.4,-11.1,...,-8.7,-8.9,-9.5,-9.1,-8.1,-10.4,-10.2,-8.7,-9.0,1
CS260,-9.3,-10.0,-9.1,-9.1,-9.8,-9.8,-9.4,-10.2,-9.0,-10.2,...,-8.7,-10.0,-9.4,-9.3,-9.4,-10.0,-9.1,-9.1,-8.9,1
CS261,-9.1,-11.0,-10.4,-10.1,-9.7,-10.5,-10.3,-11.7,-10.0,-11.2,...,-9.0,-11.3,-10.2,-9.9,-10.4,-10.9,-11.1,-9.6,-9.4,1


In [22]:
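# xlrd is presumably what pd.read_excel above relies on to read the .xlsx file (confirm for your pandas version).
# Uncomment to install the version used here into the kernel's environment:
# %pip install xlrd==1.2.0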

Collecting xlrd
  Downloading xlrd-1.2.0-py2.py3-none-any.whl (103 kB)
[K     |████████████████████████████████| 103 kB 108 kB/s eta 0:00:01
[?25hInstalling collected packages: xlrd
Successfully installed xlrd-1.2.0
