In [1]:
import pandas as pd
import numpy as np
from glob import glob
import sys
import os

We will use the following functions to load the docking-result CSV files and merge them into a single table.

In [24]:
def _get_the_dataframe(dir_path):
    '''
    Read every file found in `dir_path` as a csv and join them column-wise into one DataFrame.

    Each file must contain a 'Ligando' column, which is used as the row index. The files are
    concatenated side by side following the lexicographic order of the part of their file name
    that precedes the first underscore.

    Parameters
    ----------
    dir_path : str
        Directory that holds the csv files, with or without a trailing path separator.
        For backward compatibility, a value that is not an existing directory is used
        as a plain prefix and globbed as `dir_path + '*'`.

    Returns
    -------
    pandas.DataFrame
        Index named 'ligand'; the column names are the file headers stripped of surrounding
        whitespace and with the 'DkScore_' text removed.

    Raises
    ------
    AssertionError
        If no file matches the glob pattern.
    '''
    # Going through os.path.join makes a missing trailing separator harmless: 'some/dir' + '*'
    # would match the directory itself (and its siblings) instead of its content.
    if os.path.isdir(dir_path):
        pattern = os.path.join(dir_path, '*')
    else:
        pattern = dir_path + '*'
    list_files = glob(pattern)
    assert list_files, f'No files found matching {pattern!r}'
    # Plain (lexicographic) sort on the file-name prefix that precedes the first underscore
    list_files.sort(key=lambda x: os.path.basename(x).split('_')[0])

    list_of_dfs = [pd.read_csv(i, index_col='Ligando') for i in list_files]
    df = pd.concat(list_of_dfs, axis=1)
    df.index.name = 'ligand'  # Rename the index
    # Rename columns: strip first to remove the surrounding white spaces, then drop the literal prefix
    df.columns = df.columns.str.strip().str.replace('DkScore_', '', regex=False)
    return df

def process_smina_docking_results(dir_path, mol_library, docking_tool, save=True,
                                  cocrys_molecules=False):
    '''
    Merge the Smina docking csv files found in `dir_path` into one table, add an 'activity'
    column and sort the rows.

    Parameters
    ----------
    dir_path : str
        Location of the csv files; handed over as is to `_get_the_dataframe`.
    mol_library, docking_tool : str
        Only used to build the name of the output csv file.
    save : bool
        If True, the table is written to the current working directory as
        `{mol_library}_{docking_tool}_{n_cols}_prots_{m_mols}_mols.csv`. Note that `n_cols` is
        the total number of columns, so it also counts the 'activity' column.
    cocrys_molecules : bool
        False (default): the molecule names are expected to look like "ligand_xxx" / "decoy_xxx",
        with `xxx` an integer. A name containing 'ligand' is flagged as active (1), any other
        name as inactive (0). Rows are sorted with the actives first and then by `xxx`.
        True: the names are expected to follow the pattern 'pdbid_LIG_XXX', where LIG is the
        three-letter molecule name. Every molecule is flagged as active, the rows are sorted by
        the full name and the index is then reduced to the second '_'-separated field.

    Returns
    -------
    pandas.DataFrame
        The final table, indexed by 'ligand' and with 'activity' as its last column.
        Its shape is also printed.
    '''
    scores = _get_the_dataframe(dir_path)

    if not cocrys_molecules:
        mol_names = list(scores.index)
        # Active (1) when the name contains 'ligand', inactive (0) otherwise
        scores['activity'] = [int('ligand' in name) for name in mol_names]
        # Temporary column with the numeric suffix, so the sort is numerical and not alphabetical
        scores['order'] = [int(name.split('_')[-1]) for name in mol_names]
        scores = (
            scores.reset_index()
                  .sort_values(by=['activity', 'order'], ascending=[False, True])
                  .set_index('ligand')
                  .drop(columns='order')
        )
    else:
        # Cocrystallized molecules are all assumed to be active
        scores['activity'] = 1
        # The sort uses the full name; the molecule name is extracted only afterwards
        scores = scores.reset_index().sort_values(by='ligand').set_index('ligand')
        scores.index = scores.index.map(lambda full_name: full_name.split('_')[1])

    m_mols, n_cols = scores.shape
    print('Shape of the final df:', scores.shape)
    if save:
        # Save the dataframe as csv
        out_file = f'./{mol_library}_{docking_tool}_{n_cols}_prots_{m_mols}_mols.csv'
        scores.to_csv(out_file)
    return scores


#### COCRYSTALLIZED MOLECULES

In [26]:
# Directory with the csv files of the cocrystallized molecules docked with Vinardo
mol_library = 'COCRYS'
docking_tool = 'VINARDO'
COCRYS_PATH = f'../../FILES/CRYSTALS/DOKINGS/DOCK_136_crys_{mol_library}/{docking_tool}/CSV/'

# Merge the csv files (the result is also saved as csv); every molecule is flagged as active
df_cocrys_vrd = process_smina_docking_results(
    dir_path=COCRYS_PATH,
    mol_library=mol_library,
    docking_tool=docking_tool,
    cocrys_molecules=True,
)

df_cocrys_vrd.head()

Shape of the final df: (130, 137)


Unnamed: 0_level_0,1c5m,1ezq,1f0r,1f0s,1fax,1fjs,1g2l,1g2m,1hcg,1ioe,...,4y76,4y79,4y7a,4y7b,4zh8,4zha,5k0h,5voe,5vof,activity
ligand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RPR,-11.8,-14.3,-12.8,-12.0,-11.5,-12.6,-12.8,-12.9,-12.6,-12.4,...,-11.5,-10.9,-11.7,-11.8,-9.8,-13.0,-13.0,-11.7,-11.4,1
815,-11.5,-11.5,-11.5,-11.4,-10.7,-11.3,-11.8,-11.8,-11.1,-12.4,...,-11.3,-11.5,-11.5,-11.2,-11.6,-11.5,-10.8,-10.7,-11.7,1
PR2,-10.6,-11.4,-10.9,-11.4,-10.7,-10.3,-10.5,-10.5,-10.0,-11.7,...,-10.6,-10.8,-10.7,-10.3,-10.7,-10.3,-10.4,-10.1,-11.3,1
DX9,-10.7,-11.6,-11.1,-10.5,-11.5,-10.8,-10.8,-11.4,-10.6,-11.8,...,-10.8,-11.0,-10.4,-10.1,-9.5,-11.0,-11.6,-9.6,-11.3,1
Z34,-11.2,-13.2,-12.7,-13.2,-12.4,-13.2,-12.8,-13.5,-11.1,-12.7,...,-11.8,-12.2,-12.6,-11.3,-11.8,-11.5,-13.4,-11.7,-11.7,1


### DEKOIS 
#### VINARDO

In [135]:
# Directory with the csv files of the DEKOIS2 library docked with Vinardo
mol_library = 'DEKOIS2'
docking_tool = 'VINARDO'
DEKOIS_PATH = f'../../FILES/CRYSTALS/DOKINGS/DOCK_136_crys_{mol_library}/{docking_tool}/CSV/'

# Merge the csv files (the result is also saved as csv); molecules are named ligand_xxx / decoy_xxx
df_dekois_vrd = process_smina_docking_results(
    dir_path=DEKOIS_PATH,
    mol_library=mol_library,
    docking_tool=docking_tool,
)

Shape of the final df: (1240, 137)


### DUD 
#### VINARDO

In [132]:
# Directory with the csv files of the DUD2006 library docked with Vinardo
mol_library = 'DUD2006'
docking_tool = 'VINARDO'
DUD_PATH = f'../../FILES/CRYSTALS/DOKINGS/DOCK_136_crys_{mol_library}/{docking_tool}/CSV/'

# Merge the csv files (the result is also saved as csv); molecules are named ligand_xxx / decoy_xxx
df_dud_vrd = process_smina_docking_results(
    dir_path=DUD_PATH,
    mol_library=mol_library,
    docking_tool=docking_tool,
)

Shape of the final df: (5891, 137)


### NaN values in DUD: temporary fix
For now, I will fill the NaN values of the DUD dataset with the row means, so that the analysis can continue until the missing ligands are completed.



In [150]:
# Total number of cells in the DUD table (rows x columns)
len(df_dud_vrd.index) * len(df_dud_vrd.columns)

807067

In [149]:
# Total number of missing values: NaNs are counted per column first, then added up
# (1538 of the 807067 values were missing when this cell was run)
df_dud_vrd.isna().sum(axis=0).sum()

1538

In [151]:
# Fraction of missing values, computed from the table itself instead of from numbers copied by hand,
# so it stays correct once the missing ligands are added (1538/807067 when this cell was first run)
float(df_dud_vrd.isna().sum().sum()) / (df_dud_vrd.shape[0] * df_dud_vrd.shape[1])

0.0019056658245226233

In [148]:
# Number of missing docking scores per ligand
x = df_dud_vrd.isna().sum(axis=1)
# Show only the ligands with at least one missing value (the former loop did nothing: its body was `pass`)
x[x > 0]

In [155]:
# `DataFrame.fillna(Series)` matches the Series index against the COLUMN labels, so passing the row
# means directly (they are indexed by ligand name) fills nothing unless a label happens to coincide.
# `where(..., axis=0)` aligns the means with the rows instead.
# The 'activity' label is left out of the mean because it is not a docking score.
row_means = df_dud_vrd.drop(columns='activity', errors='ignore').mean(axis=1)
df_dud_vrd = df_dud_vrd.where(df_dud_vrd.notna(), row_means, axis=0)

In [157]:
# Write the DUD table to csv again, now as it stands after the NaN-filling cell
df_dud_vrd.to_csv(path_or_buf='DUD2006_VINARDO_137_prots_5891_mols.csv')

In [156]:
# List the working directory to check which csv files are present
!ls

0_Preparing_ligands.ipynb
1_Preprocessing_Docking_results.ipynb
2_Improving_Ranking_From_Docking_Scores.ipynb
3_Calculating_Metrics_to_Evaluate_VS_Performance.ipynb
chim_addh.py
chim_addh.pyc
DEKOIS2_VINARDO_137_prots_1240_mols.csv
DEKOIS2_VINARDO_137_prots_1240_mols_LigEff.csv
DUD2006_VINARDO_137_prots_5891_mols.csv
DUD2006_VINARDO_137_prots_5891_mols_LigEff.csv
