In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

##    Description    Functions to manage SDFiles, pandas Dataframes ...
##                   Applicability Domain analysis
##                   
##    Authors:       Kevin Pinto Gil (kevin.pinto@upf.edu)
##                   Manuel Pastor (manuel.pastor@upf.edu)
##
##    Copyright 2018 Manuel Pastor
##
##    This file is part of PhiTools
##
##    PhiTools is free software: you can redistribute it and/or modify
##    it under the terms of the GNU General Public License as published by
##    the Free Software Foundation version 3.
##
##    PhiTools is distributed in the hope that it will be useful,
##    but WITHOUT ANY WARRANTY; without even the implied warranty of
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##    GNU General Public License for more details.
##
##    You should have received a copy of the GNU General Public License
##    along with PhiTools.  If not, see <http://www.gnu.org/licenses/>

# 1. Importing libraries

In [None]:
### Future statements
### A __future__ import must be the first statement of the cell/module;
### Python raises a SyntaxError when it appears after any other code.

from __future__ import print_function

### System libraries

import sys
import os
import getopt
import re
import shutil
import types
import warnings
from base64 import b64encode

### General libraries

import pandas as pd
import numpy as np
from math import * #math commands will be available every time you start an interactive session

## RDkit libraries

from rdkit import Chem
from rdkit.Chem import Draw, PandasTools, AllChem, Descriptors, Crippen, DataStructs
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem import SDWriter
from rdkit.Chem import rdchem
from rdkit.Chem.Scaffolds import MurckoScaffold

### Python 2/3 compatibility names (string_types is used by LoadSDFintoDF)

try:
    from rdkit.six import BytesIO, string_types, PY3
except ImportError:
    # rdkit.six is not shipped with recent RDKit releases; provide the
    # Python 3 equivalents of the three names so the notebook still runs.
    from io import BytesIO
    string_types = (str,)
    PY3 = True

## Dataframe visualization part

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
# NOTE: this assignment overrides the 500-row limit set two lines above.
pd.options.display.max_rows = 4000

## Ignore Warnings
## (silences every warning category for the whole session)

warnings.filterwarnings('ignore')






# 2. Function to create directories

In [None]:
def createDir(vpath, dirname):
    '''

    Info
    ----
    Checks whether ``dirname`` exists inside ``vpath``; when it does not, the
    directory (and any missing intermediate directory) is created. A message
    telling which of the two cases happened is printed.

    Parameters
    ----------

    vpath: str or path-like object
        ## parent directory, e.g. os.getcwd()
    dirname: str
        ## name of the directory to be created, relative to vpath,
        ## e.g. '2-2Dcoord'

    Return
    ------

    None

    Example
    -------

    createDir(os.getcwd(), '2-2Dcoord')

    '''

    # os.path.join uses the platform separator and, unlike string
    # concatenation, also accepts pathlib.Path objects for vpath.
    directory = os.path.join(vpath, dirname)

    if os.path.exists(directory):
        print(dirname + ' already exists')
    else:
        os.makedirs(directory)
        print(dirname + ' is created')

# 3. Loading SDF into Pandas Dataframe

In [None]:
def LoadSDFintoDF(filename, idName='ID', molColName='ROMol', includeFingerprints=False,
                  isomericSmiles=False, smilesName=None, embedProps=False):
    '''

    Info
    ----
    Read a file in SDF format and return it as a pandas DataFrame, keeping
    the hydrogens of every molecule (the supplier is called with
    removeHs=False). Adapted from PandasTools.LoadSDF. No sanitize argument
    is passed to the supplier, so its default setting applies.

    Parameters
    ----------

    filename: str or open binary file object
        ## path of the SD file (a name ending in '.gz' is opened with gzip)
        ## or an already opened file, which is left open for the caller
    idName: str
        ## column that receives the '_Name' property of each molecule
    molColName: str or None
        ## column that receives the molecule; with None no molecule column
        ## is added and only the properties are returned
    includeFingerprints: bool
        ## when True the molecule is stored through
        ## PandasTools._MolPlusFingerprint instead of as a plain molecule
    isomericSmiles: bool
        ## forwarded to Chem.MolToSmiles when smilesName is given
    smilesName: str or None
        ## when given, name of the column that receives the SMILES
    embedProps: bool
        ## when False (and molColName is not None) the SD properties are
        ## cleared from the molecule objects after being copied to the row

    Return
    ------

    pandas DataFrame with one row per readable molecule. The index is the
    position of the record in the file, so unreadable records leave gaps.

    '''

    if isinstance(filename, string_types):
        if filename.lower()[-3:] == ".gz":
            import gzip
            f = gzip.open(filename, "rb")
        else:
            f = open(filename, 'rb')
        # Every file opened here must be closed here, gzip or not.
        close = f.close
    else:
        f = filename
        close = None  # don't close an open file that was passed in

    records = []
    indices = []
    try:
        for i, mol in enumerate(Chem.ForwardSDMolSupplier(f, removeHs=False)):
            if mol is None:
                # record could not be parsed by the supplier: skip it
                continue
            row = dict((k, mol.GetProp(k)) for k in mol.GetPropNames())
            if molColName is not None and not embedProps:
                for prop in mol.GetPropNames():
                    mol.ClearProp(prop)
            if mol.HasProp('_Name'):
                row[idName] = mol.GetProp('_Name')
            if smilesName is not None:
                row[smilesName] = Chem.MolToSmiles(mol, isomericSmiles=isomericSmiles)
            if molColName is not None and not includeFingerprints:
                row[molColName] = mol
            elif molColName is not None:
                row[molColName] = PandasTools._MolPlusFingerprint(mol)
            records.append(row)
            indices.append(i)
    finally:
        # Release the handle even when reading a record raises.
        if close is not None:
            close()

    PandasTools.RenderImagesInAllDataFrames(images=True)
    return pd.DataFrame(records, index=indices)

# 4. Writing an SDFile from a pandas DataFrame

In [None]:
def writeSDFfromPandasDF(df, output, molColName, props):

    '''

    Info
    ----

    Writes the molecules stored in a pandas DataFrame to an SD file,
    exporting the selected columns as SD properties. The work is delegated
    to PandasTools.WriteSDF.

    Parameters
    ----------

    df: pandas DataFrame
        ### DataFrame holding the molecules, e.g. inDF
    output: str
        ## output SD file name, e.g. 'inditex3D.sdf'
    molColName: str
        ## name of the column where the molecule is stored, e.g. 'mol2D'
    props: list
        ## column names to write as properties, e.g. ['smiles','cas', 'annotation']
        ## to write every column: props = list(df.columns)


    Return
    ------

    None. The SD file is written to ``output``.

    Example
    -------

    writeSDFfromPandasDF(df, output, molColName, props)

    '''

    writer_options = dict(molColName=molColName, properties=props)
    PandasTools.WriteSDF(df, output, **writer_options)
    

# 5. Adding parent SMILES, InChI and InChIKey to the DataFrame

In [None]:
def addParentInfo(dfname, molcol):
    '''
       Calculate SMILES, InChI and InChIKey for every molecule and return a
       copy of the DataFrame with that information added. The input
       DataFrame is not modified.

       input parameters:
       dfname = DF ## dataframe which contains the molecule
       molcol = 'parent_mol' ## name of the molecule column

       added columns:
       parent_smiles, parent_std_inchi, parent_nonstd_inchi,
       parent_std_inkey, parent_nonstd_inkey
    '''
    df = dfname.copy()
    mols = df[molcol]

    df['parent_smiles'] = mols.apply(lambda mol: addParentsmi(mol))
    df['parent_std_inchi'] = mols.apply(lambda mol: addParentinchi(mol, 'STD'))
    df['parent_nonstd_inchi'] = mols.apply(lambda mol: addParentinchi(mol, 'nonSTD'))

    # Each key is derived from the InChI string computed just above rather
    # than generating the InChI of every molecule a second time.
    df['parent_std_inkey'] = df['parent_std_inchi'].apply(
        lambda inchi: Chem.InchiToInchiKey(inchi))
    df['parent_nonstd_inkey'] = df['parent_nonstd_inchi'].apply(
        lambda inchi: Chem.InchiToInchiKey(inchi))
    return df

In [None]:
def addParentsmi(mol):
    '''
       Return the SMILES of the molecule, generated with isomericSmiles=True.
       mol = molecule object, as accepted by Chem.MolToSmiles
    '''
    return Chem.MolToSmiles(mol, isomericSmiles=True)

In [None]:
def addParentinchi(mol, cat):

    '''
       This function returns the standard or non-standard InChI of a molecule.
       mol = molecule object (not a column name), as accepted by Chem.MolToInchi
       cat = 'STD'    ## InChI generated with the default options
             'nonSTD' ## InChI generated with the '/FixedH' option

       Raises ValueError when cat is neither 'STD' nor 'nonSTD'.
    '''

    # Validate first: an unknown category used to fall through both branches
    # and fail with an UnboundLocalError on the return statement.
    if cat not in ('STD', 'nonSTD'):
        raise ValueError("cat must be 'STD' or 'nonSTD', got %r" % (cat,))

    if cat == 'STD':
        return Chem.MolToInchi(mol)
    return Chem.MolToInchi(mol, options='/FixedH')

In [None]:
def addParentinkey(mol, cat):
    '''
       This function returns the standard or non-standard InChIKey of a molecule.
       mol = molecule object (not a column name), as accepted by Chem.MolToInchi
       cat = 'STD'    ## key of the InChI generated with the default options
             'nonSTD' ## key of the InChI generated with the '/FixedH' option

       Raises ValueError when cat is neither 'STD' nor 'nonSTD'.
    '''

    # Validate first: an unknown category used to fall through both branches
    # and fail with an UnboundLocalError on the return statement.
    if cat not in ('STD', 'nonSTD'):
        raise ValueError("cat must be 'STD' or 'nonSTD', got %r" % (cat,))

    if cat == 'STD':
        pinchi = Chem.MolToInchi(mol)
    else:
        pinchi = Chem.MolToInchi(mol, options='/FixedH')

    # The key is always derived from the InChI string chosen above.
    return Chem.InchiToInchiKey(pinchi)