In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

##    Description    Functions to manage SDFiles, pandas Dataframes ...
##                   Applicability Domain analysis
##                   
##    Authors:       Kevin Pinto Gil (kevin.pinto@upf.edu)
##                   Manuel Pastor (manuel.pastor@upf.edu)
##
##    Copyright 2018 Manuel Pastor
##
##    This file is part of PhiTools
##
##    PhiTools is free software: you can redistribute it and/or modify
##    it under the terms of the GNU General Public License as published by
##    the Free Software Foundation version 3.
##
##    PhiTools is distributed in the hope that it will be useful,
##    but WITHOUT ANY WARRANTY; without even the implied warranty of
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##    GNU General Public License for more details.
##
##    You should have received a copy of the GNU General Public License
##    along with PhiTools.  If not, see <http://www.gnu.org/licenses/>

# 1. Importing libraries

In [1]:
### General libraries

import pandas as pd
import numpy as np
from math import * #math commands will be available every time you start an interactive session

## Dataframe visualization part

# Show up to 4000 rows and 500 columns when a DataFrame is displayed.
# (max_rows used to be set to 500 and then immediately overwritten with 4000;
# only the effective value is kept.)
pd.set_option('display.max_rows', 4000)
pd.set_option('display.max_columns', 500)

## Ignore Warnings 

import warnings
# NOTE: called without a category or module filter, this silences every
# warning raised after this cell runs (deprecation warnings included).
warnings.filterwarnings('ignore')


*** Could not find EPA module. Will use only the CACTVS web service to resolve CAS number structures. ***



# 2. Checking Duplicates

In [None]:
def getDuplicatesDF(df, colname):

    '''

    Info
    ----
    Reports the rows of a dataframe whose value in colname (e.g. an InChI key)
    is repeated. A short summary (number of rows, number of duplicates and the
    duplicated values) is printed.
    It is highly recommended to run this check on both the standard and the
    non-standard InChI key columns.

    Parameters
    ----------

    df: inditexDF
        ## Pandas Dataframe
    colname: 'inchikey'
        ## Column used to detect duplicates

    Return
    ------

    duplist: pandas Series with the colname value of every repeated row,
             i.e. every occurrence after the first one (a value present
             three times is listed twice). It keeps the index of df.
    dupDF:   Dataframe with all the rows of df whose colname value is in
             duplist (first occurrences included).

    Example
    -------

    duplist, dupDF = getDuplicatesDF(inditexDF, 'inchikey')

    Raises
    ------
    ValueError: if the dataframe is empty.
    KeyError: if colname is not a column of df.

    '''

    if df.empty:
        raise ValueError('The Dataframe is empty')

    # duplicated() flags every occurrence except the first one
    duplist = df.loc[df[colname].duplicated(), colname]

    print('\nOriginal DF molecules = ' + str(len(df)))
    # count() ignores missing values, so NaN duplicates are not counted here
    print('\nThe number of {} duplicates is = {}'.format(colname, duplist.count()))
    print('\nThe {} duplicates list is: \n{}'.format(colname, np.array(duplist)))

    # All rows sharing a repeated value, including the first occurrence
    dupDF = df[df[colname].isin(duplist)]
    return (duplist, dupDF)

# 3. Dropping duplicates (keeping the differing information as a list) and curating columns

In [None]:
def dropDupMols(df, colname, molcol= None):

    '''

    Info
    ----

    After standardising molecules, duplicates should be checked with the
    getDuplicatesDF function. This function collapses every group of rows
    sharing the same colname value into a single row, without losing the
    information of the other duplicated rows: for every other column the
    distinct values of the group are converted to text and joined with ', '.

    Distinct values are joined in order of first appearance, so the result is
    the same on every run and the first item of a joined cell always comes
    from the first row of the group.

    Parameters
    ----------

    df:  stdDF
        ## obtained after standardising, but also after checking Excluded molecules
    colname: 'parent_inkey'
        ## colname to groupby
    molcol: 'MolProt'
        ## By default is None. If a molecule column is given, it is not joined:
        ## the molecule of the first row of each group is kept as it is.

    Returns
    -------

    Dataframe with NO duplicates in colname, one row per distinct value,
    sorted by colname and with a fresh 0..n-1 index. Rows whose colname is
    missing belong to no group and are left out.
    Every column other than colname (and molcol) holds text: cells where more
    than one distinct value was found are joined with ', ',
    such as "ROCHE_PC_RO0052807, AZ_GGA_200002374".
    When molcol is given it is placed as the last column.
    If df is empty, a message is printed and an empty dataframe is returned.

    Example
    -------

    etoxNoDupDF = dropDupMols(stdDF,'parent_inkey')

    '''

    if df.empty:
        print('Dataframe is empty')
        return pd.DataFrame([])

    def joinDistinct(values):
        # dict.fromkeys de-duplicates like set() but keeps first-appearance order
        return ", ".join(map(str, dict.fromkeys(values)))

    if molcol is not None:
        ### Molecule part: keep the molecule of the first row of every group
        firstMols = (df.dropna(subset=[colname])
                       .drop_duplicates(subset=[colname], keep='first')[[colname, molcol]])

        ### Rest of dataframe part
        cols = df.columns.drop(molcol)
        joined = df[cols].groupby(colname).aggregate(joinDistinct)
        joined.reset_index(level=0, inplace=True)

        ### Joining molecule and rest of dataframe
        return pd.merge(joined, firstMols, on=colname, how='inner')

    joined = df.groupby(colname).aggregate(joinDistinct)
    joined.reset_index(level=0, inplace=True)
    return joined

In [None]:
def _curateJoinedValue(value, t):

    '''
    Curate a single cell: a text holding several items joined with ', ' is
    reduced to one value according to t (float, int or str). Anything else
    (a text without ', ' or a non-text value) is returned untouched.
    '''

    if not isinstance(value, str) or ', ' not in value:
        return value

    items = value.split(', ')
    if t == str:
        return items[0]

    # Parse as float first so that 'nan' items can be recognised and dropped,
    # also when the requested type is int.
    numbers = np.array(items).astype(float)
    numbers = numbers[~np.isnan(numbers)]
    if numbers.size == 0:
        return np.nan
    if t == int:
        numbers = numbers.astype(int)
    return np.mean(numbers)


def curateCol(dfname, colname, t):

    '''

    Info
    ----

    This function allows one to curate a column of a dataframe after using
    the dropDupMols function, i.e. a column where several values may have
    been joined in a single cell separated by ', '.

    Only cells containing ', ' are modified. Cells holding a single value
    (or a non-text value) are left exactly as they are, so a curated numeric
    column can still contain text such as '2.5'.

    Parameters
    ----------

    dfname: df
        ### dataframe name (it is not modified, a copy is returned)

    colname: 'molecular_weight'
        ### column name to be curated

    t: float, str or int
        ### you need to choose between float or int or str
        ### str: keeps the first item and discards the other ones
        ### int: converts the items to integer and calculates the mean value.
        ###      nan items are removed first; if all are nan the cell becomes nan.
        ### float: converts the items to float and calculates the mean value.
        ###      nan items are removed first; if all are nan the cell becomes nan.
        ### any other value: the copy is returned without changes

    Returns
    -------

    Copy of the dataframe with the curated column.

    Example
    -------

    curatedDF = curateCol(dropdupDF, 'std_smiles', str)

    Raises
    ------
    KeyError: if colname is not a column of the dataframe.
    ValueError: if t is float or int and an item cannot be read as a number.

    '''

    df = dfname.copy()
    if t not in (float, int, str):
        return df

    # Cells are handled by position, so a repeated index label cannot make
    # one row overwrite another.
    curated = np.empty(len(df), dtype=object)
    changed = False
    for position, value in enumerate(df[colname]):
        newValue = _curateJoinedValue(value, t)
        changed = changed or newValue is not value
        curated[position] = newValue

    # Leave the column (and its dtype) alone when no cell needed curation
    if changed:
        df[colname] = curated
    return df

# 4. Dropping rows given a list of values
- e.g. drop every row whose parent_inkey value appears in pinkeylist

In [None]:
def dropRowsbycolumnlist(df, cn, cl):

    '''

    Info
    ----

    Removes from a dataframe every row whose value in a given column is
    present in a given list. The number of remaining rows is printed.

    Parameters
    ----------

    df: DF
        ## Dataframe containing all information (it is not modified)
    cn: colName
        ## Name of the column whose values are compared against cl
    cl: mollist
        ## List of values; rows holding any of them in cn are dropped

    Returns
    -------

    Dataframe with the rows of df whose cn value is not in cl.

    Example
    -------

    df = DF
    cn = 'parent_inkey'
    cl = ['INFDPOAKFNIJBF-UHFFFAOYSA-N', 'SYJFEGQWDCRVNX-UHFFFAOYSA-N',
          'CTSLUCNDVMMDHG-UHFFFAOYSA-N', 'ZFSLODLOARCGLH-UHFFFAOYSA-N']
    DF = dropRowsbycolumnlist(df, cn, cl)


    '''

    # True for the rows to keep, i.e. those whose value is not listed in cl
    keepMask = ~df[cn].isin(cl)
    remaining = df.loc[keepMask]
    print (len(remaining))
    return remaining