In [1]:
import pandas as pd
import numpy as np
from glob import glob
import sys
import os

In [2]:
def _get_the_dataframe(dir_path):
    '''
    Read every file matching `dir_path + '*'` as a csv and join them column-wise.

    Parameters
    ----------
    dir_path : str
        Prefix used to build the glob pattern `dir_path + '*'`. Pass a directory
        path ending with a path separator to read every file inside it.

    Returns
    -------
    pandas.DataFrame
        A single frame indexed by `ligand` (the `Ligando` column of the input files).
        Its columns are the columns of all the files, in file order, with surrounding
        whitespace and the `DkScore_` substring removed from their names.

    Raises
    ------
    AssertionError
        If no file matches the pattern.
    '''
    list_files = glob(dir_path + '*')
    # Raise explicitly instead of using `assert`: assertions are stripped under
    # `python -O`, and the message tells the user which pattern matched nothing.
    if not list_files:
        raise AssertionError(f'No files found matching: {dir_path}*')

    # Files are ordered by the part of their name that precedes the first underscore.
    # When every prefix is a plain number the comparison is numeric (2 before 10);
    # otherwise the prefixes are compared as strings.
    numeric_prefixes = all(
        os.path.basename(path).split('_')[0].isdecimal() for path in list_files
    )

    def _sort_key(path):
        name = os.path.basename(path)
        prefix = name.split('_')[0]
        # The full name breaks ties so the result does not depend on the
        # arbitrary order in which glob returns the files.
        return (int(prefix) if numeric_prefixes else prefix, name)

    list_files.sort(key=_sort_key)

    list_of_dfs = [pd.read_csv(path, index_col='Ligando') for path in list_files]
    df = pd.concat(list_of_dfs, axis=1)
    df.index.names = ['ligand'] # Rename the index
    # Rename columns: strip first to remove surrounding white spaces, then drop
    # the literal 'DkScore_' text (regex=False keeps it a plain-text replacement
    # regardless of the pandas version default).
    df.columns = df.columns.str.strip().str.replace('DkScore_', '', regex=False)
    return df

def process_smina_docking_results(dir_path, mol_library, docking_tool, save = True,
                                  cocrys_molecules=False, cocrys_pattern=True):
    '''
    Concatenate a set of csv files with Smina docking results into one labelled DataFrame.

    Parameters
    ----------
    dir_path : str
        Location of the csv files, forwarded to `_get_the_dataframe`.
    mol_library, docking_tool : str
        Labels that are only used to build the name of the saved csv file.
    save : bool
        When True the final frame is written as a csv into the current working directory.
    cocrys_molecules : bool
        Keep it False when the molecules are named "ligand_xxx" / "decoy_xxx": names
        containing 'ligand' get activity 1, the rest 0, and rows are sorted actives
        first and then by the integer that follows the last underscore.
        When True, every molecule is labelled as active and rows are sorted by name.
    cocrys_pattern : bool
        Only used for cocrystallized molecules. When True the names are expected to
        follow the pattern 'pdbi_LIG_XXX' and the index is replaced by the second
        underscore-separated token (the three letter molecule name).

    Returns
    -------
    pandas.DataFrame
        The concatenated results, indexed by `ligand`, with an extra `activity` column.
    '''
    scores = _get_the_dataframe(dir_path)

    if not cocrys_molecules:
        # Molecules named "ligand_xxx" are the actives, everything else is a decoy
        scores['activity'] = [int('ligand' in name) for name in scores.index]
        # Temporary numeric key so that e.g. ligand_2 is placed before ligand_10
        scores['order'] = [int(name.split('_')[-1]) for name in scores.index]
        flat = scores.reset_index()
        flat = flat.sort_values(['activity', 'order'], ascending=[False, True])
        scores = flat.set_index('ligand').drop('order', axis=1)
    else:
        # Cocrystallized molecules are all assumed to be active
        scores['activity'] = 1
        # Here the ordering is alphabetical on the molecule name
        flat = scores.reset_index()
        flat = flat.sort_values('ligand', ascending = True)
        scores = flat.set_index('ligand')
        if cocrys_pattern:
            # Keep only the molecule name out of 'pdbi_LIG_XXX'
            scores.index = scores.index.map(lambda name: name.split('_')[1])

    print('Shape of the final df:', scores.shape)
    if save:
        # NOTE: n_cols counts every column of the frame, the added 'activity' one included
        m_mols, n_cols = scores.shape[0], scores.shape[1]
        scores.to_csv(f'./{mol_library}_{docking_tool}_{n_cols}_prots_{m_mols}_mols.csv')
    return scores


#### COCRYSTALLIZED MOLECULES

In [3]:
# Labels of the cocrystallized-molecules run (molecule library, docking tool)
mol_library, docking_tool = 'COCRYS', 'VINARDO'
# The run itself is disabled; the original invocation is kept below for reference.
# COCRYS_PATH = f'../../ARCHIVOS/CRISTALES/DOKINGS//{mol_library}/{docking_tool}/'

# df_cocrys_vrd = process_smina_docking_results(COCRYS_PATH, mol_library, docking_tool, cocrys_molecules=True,
#                                              cocrys_pattern=False)

# df_cocrys_vrd.head()

### DEKOIS 
#### VINARDO

In [4]:
# Labels and input directory of the DEKOIS / VINARDO docking results
mol_library, docking_tool = 'DEKOIS', 'VINARDO'
DB_PATH = f'../../ARCHIVOS/CRISTALES/DOCKINGS/FXA_LIGANDS_CROSS/{mol_library}/{docking_tool}/'

# Merge the per-file results; with the default `save=True` the merged table is
# also written as a csv into the current working directory.
df_dekois_vrd = process_smina_docking_results(dir_path=DB_PATH,
                                              mol_library=mol_library,
                                              docking_tool=docking_tool)

Shape of the final df: (1240, 403)


### DUD 
#### VINARDO

In [5]:
# Labels of the DUD / VINARDO run (molecule library, docking tool)
mol_library, docking_tool = 'DUD', 'VINARDO'
# The run itself is disabled; the original invocation is kept below for reference.
# DUD_PATH = f'../../FILES/CRYSTALS/DOKINGS/CDK2_LIGANDS_CROSS/{mol_library}/{docking_tool}/'

# df_dud_vrd = process_smina_docking_results(DUD_PATH, mol_library, docking_tool)