In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

##    Description    Functions to manage SDFiles, pandas Dataframes ...
##                   Applicability Domain analysis
##                   
##    Authors:       Kevin Pinto Gil (kevin.pinto@upf.edu)
##                   Manuel Pastor (manuel.pastor@upf.edu)
##
##    Copyright 2018 Manuel Pastor
##
##    This file is part of PhiTools
##
##    PhiTools is free software: you can redistribute it and/or modify
##    it under the terms of the GNU General Public License as published by
##    the Free Software Foundation version 3.
##
##    PhiTools is distributed in the hope that it will be useful,
##    but WITHOUT ANY WARRANTY; without even the implied warranty of
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##    GNU General Public License for more details.
##
##    You should have received a copy of the GNU General Public License
##    along with PhiTools.  If not, see <http://www.gnu.org/licenses/>

# 1. Importing libraries

In [2]:
### System libraries

import sys
import os
import getopt
import re
import shutil

### General libraries

import pandas as pd
import numpy as np
from math import * #math commands will be available every time you start an interactive session

## RDkit libraries

from rdkit import Chem
from rdkit.Chem import Draw, PandasTools, AllChem, Descriptors, Crippen, DataStructs

### Scikit learn libraries

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

## Dataframe visualization part

# pandas display limits: display.max_columns = 500, display.max_rows = 4000.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 4000

## Ignore Warnings 

# Install a global 'ignore' filter so warnings are not shown in the notebook.
import warnings
warnings.filterwarnings('ignore')


*** Could not find EPA module. Will use only the CACTVS web service to resolve CAS number structures. ***



# 3. Descriptors

## 3.1. Calculating Morgan Fingerprints

In [None]:
def calc_fp_arr( mols, radius ):

    '''
    Info
    ----
    Compute a Morgan fingerprint for every molecule in `mols` and return all
    of them stacked in a single numpy array (one entry per molecule, in the
    same order as the input).

    Parameters
    ----------

    mols: iterable of RDKit molecules
        e.g. mols = df.mol3DProt ## dataframe column containing molecules
    radius: int
        e.g. radius = 4 ## Morgan fingerprint radius; it is cast with int()

    Return
    ------

    numpy array holding the fingerprint array of each molecule

    Example
    -------

    res = calc_fp_arr( mols , radius)

    The result can then be fed to a PCA, e.g.:

        pca = PCA( n_components = 3 )
        pca.fit( res )
        x = pca.transform( res )
        x = pd.DataFrame( x )
        x.columns = [ 'PC1', 'PC2', 'PC3' ]
        df = df.join(x)

    Attention
    ---------

    If this function doesn't work, try to update your conda environment

    '''

    def _fingerprint_as_array( molecule ):
        # Bit-vector fingerprint first, then copy it into a numpy buffer.
        bitvect = AllChem.GetMorganFingerprintAsBitVect( molecule, int(radius) )
        holder = np.zeros( (1,) )
        DataStructs.ConvertToNumpyArray( bitvect, holder )
        return holder

    return np.asarray( [ _fingerprint_as_array( molecule ) for molecule in mols ] )

## 3.2. Running PCA using external descriptors

In [None]:
def getPCAscores(descriptors, n_components):

    '''

    Fit a PCA on `descriptors` and return the scores (the transformed
    descriptors) as a pandas DataFrame with one column per component,
    named 'PC1', 'PC2', ...

    Parameters
    ----------

    descriptors: numpy array or pandas DataFrame columns
        Numpy array with descriptors e.g. descriptors = [[0,0,1],[...],[1,0,1]]
        or Pandas DataFrame columns  e.g. descriptors = df_piv.iloc[:,2:]

    n_components: int (or any other value accepted by PCA)
        ## number of components one wants to calculate. The value is passed
        ## unchanged to PCA( n_components = ... ).

    Return
    ------

    pd.DataFrame
        PCA scores. The columns are labelled from the number of columns
        actually produced by the transformation, and the rows carry a fresh
        0..n-1 index (the index of `descriptors` is not kept).

    Example
    -------

    descriptors = df_piv.iloc[:,2:]
    n_components =  3

    getPCAscores(descriptors, n_components)

    '''

    pca = PCA( n_components = n_components )
    pca.fit( descriptors )
    pcaDF = pd.DataFrame( pca.transform( descriptors ) )

    # Label the columns from the real width of the score matrix instead of
    # range(n_components): n_components is not necessarily an integer.
    pcaDF.columns = [ 'PC'+str(i+1) for i in range(pcaDF.shape[1]) ]

    return pcaDF
    

# 4. Similarity Methods

## 4.1. Similarity between two Databases

In [None]:
def getMorganfpDF(inSDF, radius):

    '''
        Load an SDFile into a pandas DataFrame and add two columns to it:

            MorganFP   Morgan fingerprint (bit vector) of every molecule
            molID      sequential identifier, e.g. mol000001

        The molecules themselves are stored in the 'molRO' column.

        inSDF  = 'ref.sdf'     ## Database SD file
        radius = 4             ## Define morgan fingerprint radius

        Returns the resulting DataFrame.
    '''

    ## Dataframe with all Model information
    mfpDF = PandasTools.LoadSDF(inSDF, molColName='molRO')

    ## one Morgan fingerprint per molecule
    fingerprints = []
    for molecule in mfpDF['molRO']:
        fingerprints.append(AllChem.GetMorganFingerprintAsBitVect(molecule, int(radius)))
    mfpDF['MorganFP'] = fingerprints

    ## adding ID column e.g. mol000001 (1-based, zero padded to six digits)
    identifiers = []
    for position in range(len(mfpDF)):
        identifiers.append('mol%0.6d' % (position + 1))
    mfpDF['molID'] = identifiers

    return mfpDF

def getSimilarity(refDF,mdDF, RFname, MDname, refID, mdID, cutoff, maxsim,  molcol='molRO'):

    '''

    For every molecule of a Reference database, find the most similar molecule
    of a Model database (Morgan fingerprints compared with
    DataStructs.FingerprintSimilarity) and keep the pair when that best
    similarity lies within [cutoff, maxsim].

    Parameters
    ----------

    refDF: pd.DataFrame
        referenceDF ## Reference Dataframe including MorganFP column previously calculated
    mdDF: pd.DataFrame
        modelDF ## Model Dataframe including MorganFP column previously calculated
    RFname: str
        'Tox21' ## Reference database basename to name files and columns
    MDname: str
        'myModel' ## Model database basename to name files and columns
    refID: str
        'ToxCast_chid' ## Reference ID name column
    mdID: str
        'molID' ## Model ID name column
        mdDF['molID'] = [str('mol%0.6d'%(int(x)+1)) for x in range(len(mdDF))]
    cutoff: float
        0.6 ## minimum similarity (inclusive) for a pair to be reported
    maxsim: float
        1   ## maximum similarity (inclusive) for a pair to be reported
    molcol: str
        'mol3Dprot' ## molecule column name, present in BOTH dataframes

    Return
    ------

    pd.DataFrame
        One row per reference molecule that has a match, with the columns
        [refID, mdID, 'similarity', RFname+'_mol', MDname+'_mol'], sorted by
        model ID (ascending) and similarity (descending). When refID and mdID
        are the same name, the columns are called refID+'_ref' and
        mdID+'_model'. An empty DataFrame is returned when nothing matches.

    Side effects (only when at least one match is found)
    ------------
    - 'similarityAnalysis_<RFname>_vs_<MDname>_morganFP.pkl' is written with
      DataFrame.to_pickle
    - 'similarityAnalysis_<RFname>_vs_<MDname>_morganFP.sdf' is written through
      writeSDFfromPandasDF, using the reference molecule column

    Raises
    ------

    KeyError
        if 'MorganFP', molcol or the ID columns are missing from the inputs

    Example
    -------

    Run first morgan finger prints calculation:

    degdf['MorganFP']= [ AllChem.GetMorganFingerprintAsBitVect( mol, int(radius)) for mol in degdf.mol3DProt]
    mulDF['MorganFP']= [ AllChem.GetMorganFingerprintAsBitVect( mol, int(radius)) for mol in mulDF.mol3DProt]

    Define input parameters to run the similarity analysis:

    refDF = mulDF.copy()
    mdDF = degdf.copy()
    RFname = 'Mulliner'
    MDname = 'Degeneration'
    refID = 'parent_nonstd_inkey'
    mdID = 'parent_nonstd_inkey'
    cutoff = 1.0
    maxsim = 1.0
    molcol = 'mol3DProt'

    simDF = getSimilarity(refDF, mdDF, RFname, MDname, refID, mdID, cutoff, maxsim, molcol)

    '''

    ## Output ID column names; disambiguated when both inputs use the same name
    refIDout = refID
    mdIDout = mdID
    if refID == mdID:
        refIDout += '_ref'
        mdIDout += '_model'

    columns = [refIDout, mdIDout, 'similarity', RFname+'_mol', MDname+'_mol']

    ## Model data is extracted once instead of re-iterating the dataframe
    ## for every reference molecule.
    model_entries = list(zip(mdDF['MorganFP'], mdDF[molcol], mdDF[mdID]))

    ## Loop to find similarity between two databases

    records = []
    for ref_fp, ref_mol, ref_key in zip(refDF['MorganFP'], refDF[molcol], refDF[refID]):
        max_similarity = 0
        max_similarity_id = ''
        max_similarity_mol = None   ## reset per reference: never reuse a previous hit
        try:
            for model_fp, model_mol, model_key in model_entries:
                sim = DataStructs.FingerprintSimilarity(model_fp, ref_fp)
                if sim > max_similarity:
                    max_similarity = sim
                    max_similarity_mol = model_mol
                    max_similarity_id = model_key
        except Exception as err:
            ## Best effort: a reference whose comparison fails is treated as
            ## having no similar molecule, but the failure is reported.
            print ('Similarity calculation failed for reference', ref_key, ':', err)
            max_similarity = 0
            max_similarity_id = ''
            max_similarity_mol = None

        if max_similarity >= cutoff and max_similarity <= maxsim:
            records.append({refIDout: ref_key, mdIDout: max_similarity_id,
                            'similarity': float(max_similarity),
                            RFname+'_mol': ref_mol,
                            MDname+'_mol': max_similarity_mol})

    counter = len(records)
    print ('Total similar molecules found = ', counter)

    ## Build the dataframe once (DataFrame.append no longer exists in pandas 2)
    simDF = pd.DataFrame(records)

    if counter > 0:
        simDF = simDF[columns]
        simDF = simDF.sort_values([mdIDout, 'similarity'], ascending=[True, False])
        simDF = simDF.reset_index(drop=True)
        simDF.to_pickle('similarityAnalysis_'+RFname+'_vs_'+MDname+'_morganFP.pkl')

        writeSDFfromPandasDF(simDF,'similarityAnalysis_'+RFname+'_vs_'+MDname+'_morganFP.sdf',
                             RFname+'_mol',list(simDF.columns))

    return (simDF)

## 4.2. Similarity by SMARTS

In [None]:
def smartsFinding(DF,molDFname,ID, smarts):

    '''
        Substructure search: return the rows of a Dataframe whose molecule
        matches a SMARTS pattern.

        DF = DF                ## Dataframe with molecules
        molDFname  = 'molCol'  ## molecule column
        ID = 'name'            ## ID or name column (kept for backward
                               ## compatibility; not used by the search)
        smarts = '[NX3;H2,H1;!$(NC=O);!$(Nc)]' # aliphatic primary and secondary amines but no aniline nor amides
           or
        smarts = '[NX4;H0]'  # quaternary amines (charged)

        smartsDF = smartsFinding(DF, molDFname, ID, smarts)

        Returns a new Dataframe (index reset to 0..n-1) holding the matching
        rows plus a 'HasSubMatch' column set to 'YES'. The number of matches
        is printed. Entries of the molecule column that are not molecule
        objects (e.g. None) are counted as non-matching.

        Raises ValueError when the SMARTS pattern cannot be parsed.

        If one wants an SDFile, use writeSDFfromPandasDF(df, output, molColName, props) function
    '''

    target = Chem.MolFromSmarts(str(smarts))
    if target is None:
        raise ValueError('Invalid SMARTS pattern: ' + str(smarts))

    DF = DF.reset_index(drop=True)

    ## One boolean per row; missing molecules cannot match
    hits = [ bool(hasattr(mol, 'HasSubstructMatch') and mol.HasSubstructMatch(target))
             for mol in DF[molDFname] ]
    count = sum(hits)

    print ('Total match molecules: ',count)

    ## Select all matching rows at once (DataFrame.append no longer exists in
    ## pandas 2). A boolean Series is used so an empty input is still handled
    ## as a row mask and never as a column selection.
    mask = pd.Series(hits, index=DF.index, dtype=bool)
    smartsDF = DF.loc[mask].reset_index(drop=True)
    smartsDF['HasSubMatch'] = 'YES'

    return (smartsDF)

# 7. etoxLAB

## 7.1. Automatically creating a new model

In [None]:
def creatingNewModel(endpoint, descriptor, tag):
    '''
    This function uses eTOXlab to create a new model automatically.

    It runs ~/soft/eTOXlab/src/manage.py twice: first with --new to create the
    endpoint with the tag /toxicity/<descriptor>/<tag>/1, then with --get=model
    for the same endpoint. Finally the file 'imodel.py' of the current
    directory is renamed to '<descriptor>_imodel.py'.

        input:
            endpoint = 'ratDeG' ## endpoint name
            descriptor = 'adriana' ## descriptor name
            tag = 'degeneration' ## tag information

        e.g. creatingNewModel('ratDeG', 'adriana','degeneration')

    The commands are started without a shell, every value being passed as a
    single argument, so spaces or shell metacharacters in the inputs are not
    interpreted. A command that cannot be started or that returns a non-zero
    status is reported with a message; the function still returns None.
    '''
    import subprocess

    manage = os.path.expanduser('~/soft/eTOXlab/src/manage.py')

    def _run(arguments):
        # Run one command; report (instead of raising) when it cannot start.
        try:
            return subprocess.call(arguments)
        except OSError as err:
            print ('Unable to run', arguments[0], ':', err)
            return -1

    status = _run([manage, '--new', '-e', endpoint,
                   '-t', '/toxicity/'+descriptor+'/'+tag+'/1'])
    if status == 0:
        print ('Tag is set up')
    else:
        print ('manage.py --new failed with exit status', status)

    status = _run([manage, '-v', '0', '-e', endpoint, '--get=model'])
    if status != 0:
        print ('manage.py --get=model failed with exit status', status)

    # Equivalent of "mv imodel.py <descriptor>_imodel.py", without a shell.
    if os.path.exists('imodel.py'):
        shutil.move('imodel.py', descriptor+'_imodel.py')
    else:
        print ('imodel.py not found, nothing to rename')

## 7.2. RandomSplit

In [None]:
def randomsplit(df, dirModelname, activity, molcol, name, endpoint,category, vpath,
                test_size=0.2, random_state=1987):
    '''
        Write the SDFiles needed to build a model (global, training and test
        set), each containing only three fields: name, activity and molecule.

        df = df                   ## pandas dataframe
        dirModelname = '5-model'  ## folder name to be created to store model files
        activity = 'activity'     ## columns with activity values
        molcol = 'mol3Dprot'      ## molecule column
        name = 'parent_inkey'     ## column name
        endpoint = 'BioTF'        ## endpoint name for basename files.sdf
        category = 'activity'     ## column used to stratify the split
        vpath = os.getcwd()       ## current directory path
        test_size = 0.2           ## fraction of compounds sent to the test set
        random_state = 1987       ## random seed of the split

        With the default values the split is 80/20 train/test, stratified by
        the `category` column, with a random seed of 1987.

        Files written in vpath/dirModelname (the folder is prepared with
        createDir):
            global-<endpoint>.sdf   ## all compounds
            tr-<endpoint>.sdf       ## training set
            pr-<endpoint>.sdf       ## test set

        Returns ( train, test, df ): train and test are the reduced dataframes
        (columns 'name', 'activity', 'mol', index reset) and df is the input
        dataframe, unchanged.

        e.g. trainDF, testDF, globalDF = randomsplit(df, dirModelname, activity, molcol, name, endpoint, category, vpath)
    '''

    ## Model Building preparation inputs parameters

    print ('\033[1m' + '\n Writing Model SDFile with less fields and Randomsplit'+'\033[0m')
    createDir(vpath, dirModelname)
    basepath = vpath+'/'+dirModelname+'/'

    ## Writing model Global SDfile
    globaldf = df[[name, activity, molcol]].copy()
    globaldf.columns = ['name', 'activity', 'mol']
    props = list(globaldf.columns)

    outModelfile = basepath+'global-'+endpoint+'.sdf'
    writeSDFfromPandasDF(globaldf, outModelfile, 'mol', props)

    ## Writing model training and test set SDfile:

    ## explicit copy: renaming the columns of a plain slice triggers pandas
    ## chained-assignment warnings
    valdf = df[[name, activity, molcol]].copy()
    valdf.columns = ['name', 'activity', 'mol']

    train, test, _, _ = train_test_split(valdf, df[category], test_size=test_size,
                                         random_state=random_state, stratify=df[category])

    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)

    trainfile = basepath+'tr-'+endpoint+'.sdf'
    testfile = basepath+'pr-'+endpoint+'.sdf'

    writeSDFfromPandasDF(train, trainfile, 'mol', list(train.columns))
    writeSDFfromPandasDF(test, testfile, 'mol', list(test.columns))
    print ('Test : ' + str(len(test)))
    print ('Train : ' + str(len(train)))
    print ('Global : ' + str(len(df)))

    return ( train, test, df)

In [None]:
def randomsplit_quantitative(df, dirModelname, activity, molcol, name, endpoint, catcol, vpath,
                             test_size=0.2, random_state=1987):
    '''
        Write the SDFiles needed to build a model (global, training and test
        set), keeping ALL the columns of the dataframe as SDFile fields.

        df = df                   ## pandas dataframe
        dirModelname = '5-model'  ## folder name to be created to store model files
        activity = 'activity'     ## column with the activity values
        molcol = 'mol3Dprot'      ## molecule column
        name = 'parent_inkey'     ## column name (kept for backward
                                  ## compatibility; not used by the code)
        endpoint = 'BioTF'        ## endpoint name for basename files.sdf
        catcol = 'category'       ## column used to stratify the split
        vpath = os.getcwd()       ## current directory path
        test_size = 0.2           ## fraction of compounds sent to the test set
        random_state = 1987       ## random seed of the split

        With the default values the split is 80/20 train/test, stratified by
        the `catcol` column, with a random seed of 1987.

        Files written in vpath/dirModelname (the folder is prepared with
        createDir):
            global-<endpoint>.sdf   ## all compounds
            tr-<endpoint>.sdf       ## training set
            pr-<endpoint>.sdf       ## test set

        Returns ( train, test, df ): train and test have their index reset and
        df is a copy of the input dataframe.

        e.g. trainDF, testDF, globalDF = randomsplit_quantitative(df, dirModelname, activity, molcol, name, endpoint, catcol, vpath)
    '''
    df = df.copy()

    ## Model Building preparation inputs parameters

    print ('\033[1m' + '\n Writing Model SDFile with less fields and Randomsplit'+'\033[0m')
    createDir(vpath, dirModelname)
    basepath = vpath+'/'+dirModelname+'/'

    ## Writing model Global SDfile
    outModelfile = basepath+'global-'+endpoint+'.sdf'
    writeSDFfromPandasDF(df, outModelfile, molcol, list(df.columns))

    ## Writing model training and test set SDfile:
    train, test, _, _ = train_test_split(df, df[activity], test_size=test_size,
                                         random_state=random_state, stratify=df[catcol])

    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)

    trainfile = basepath+'tr-'+endpoint+'.sdf'
    testfile = basepath+'pr-'+endpoint+'.sdf'

    writeSDFfromPandasDF(train, trainfile, molcol, list(train.columns))
    writeSDFfromPandasDF(test, testfile, molcol, list(test.columns))
    print ('Test : ' + str(len(test)))
    print ('Train : ' + str(len(train)))
    print ('Global : ' + str(len(df)))

    return ( train, test, df)

In [None]:
def splitSDF (sdf, prop, seed):

    '''
        This function splits an SDFile at random into a training set and a
        prediction (test) set.

        Input parameters:
            sdf = file.sdf ### sd File input pathway
            prop = 70      ### percentage of compounds sent to the training set
                           ### ( 70 -> 70% trainset, 30% testset)
            seed = 483     ### random seed, or None for a non reproducible split

        Two files are written in the current directory, named after the
        basename of the input file:
            tr-<file.sdf>   training set
            pr-<file.sdf>   prediction set
        Compounds are counted by their '$$$$' terminator lines. Nothing is
        returned.

        The compounds are drawn with a private numpy RandomState, so the global
        numpy random state is left untouched; for a given seed the selection is
        the one obtained with np.random.seed(seed) + np.random.choice.

        Raises:
            IOError/OSError  if the input file cannot be opened (a message is
                             printed first)
            ValueError       if prop is not within 0-100

        e.g. splitSDF('file.sdf', 70, 2356)
    '''
    prop = float(prop)
    if not 0.0 <= prop <= 100.0:
        raise ValueError('prop must be a percentage between 0 and 100, got %s' % prop)

    ## the seed is optional: only convert it when one was given
    npseed = None if seed is None else int(float(seed))

    try:
        f = open (sdf,'r')
    except IOError:
        print ('unable to open file ',sdf, ' ABORT')
        raise

    ## every compound of an SDFile ends with a '$$$$' line
    with f:
        nmols = sum(1 for line in f if line.startswith('$$$$'))

    ntrai = int(np.round(prop*nmols/100.0))
    npred = nmols - ntrai

    print (nmols, "compounds found. Creating series of ", ntrai, " for training and ", npred, " for prediction")

    ## indexes (0-based) of the compounds selected for training; stored in a
    ## set so the membership test done for every line is constant time
    rng = np.random.RandomState(npseed)
    elements = set(rng.choice(nmols, ntrai, False).tolist())

    basename = sdf.split("/")[-1]

    with open (sdf,'r') as f, \
         open ('pr-'+basename,'w') as fp, \
         open ('tr-'+basename,'w') as ft:
        i = 0
        for line in f:
            if i in elements :
                ft.write(line)
            else:
                fp.write(line)
            ## the terminator still belongs to compound i
            if line.startswith('$$$$'):
                i=i+1