In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

##    Description    Functions to manage SDFiles, pandas Dataframes ...
##                   Applicability Domain analysis
##                   
##    Authors:       Kevin Pinto Gil (kevin.pinto@upf.edu)
##                   Manuel Pastor (manuel.pastor@upf.edu)
##
##    Copyright 2018 Manuel Pastor
##
##    This file is part of PhiTools
##
##    PhiTools is free software: you can redistribute it and/or modify
##    it under the terms of the GNU General Public License as published by
##    the Free Software Foundation version 3.
##
##    PhiTools is distributed in the hope that it will be useful,
##    but WITHOUT ANY WARRANTY; without even the implied warranty of
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##    GNU General Public License for more details.
##
##    You should have received a copy of the GNU General Public License
##    along with PhiTools.  If not, see <http://www.gnu.org/licenses/>

# 1. Importing libraries

In [1]:
### General libraries

import pandas as pd
import numpy as np
from math import * #math commands will be available every time you start an interactive session

## Dataframe visualization part
## Raise the pandas display limits for rows and columns.

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
# NOTE(review): this assigns the same 'display.max_rows' option a second time,
# so the effective row limit is 4000 and the 500 set above is overridden.
pd.options.display.max_rows = 4000

## Ignore Warnings 
## The filter below has no category/module restriction, so it silences every
## warning for the rest of the session (including deprecation warnings).

import warnings
warnings.filterwarnings('ignore')


*** Could not find EPA module. Will use only the CACTVS web service to resolve CAS number structures. ***



# 2. Curating columns of duplicated molecules

In [88]:
def curateColsum(dfname, colname, t):
    """Collapse comma-separated cells of a column into their sum.

    Cells of ``colname`` that are strings containing ', ' (e.g. '1, 2, nan')
    are split, parsed as ``t`` and replaced by the sum of their non-NaN
    members; when every member is NaN the cell becomes NaN.  Every other
    cell (single values, non-string cells such as a float NaN) is kept as is.

    Parameters
    ----------
    dfname : pandas.DataFrame
        Input frame; it is not modified.
    colname : str
        Name of the column to curate.
    t : type
        ``int`` or ``float``.  Any other value returns an unmodified copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dfname`` with ``colname`` curated.

    Raises
    ------
    ValueError
        If a member other than 'nan' cannot be parsed as ``t``.
    """
    df = dfname.copy()
    if t not in (int, float):
        return df

    def _collapse(cell):
        # Only multi-valued string cells are aggregated; the isinstance check
        # also protects non-string cells, for which `', ' in cell` would raise.
        if not (isinstance(cell, str) and ', ' in cell):
            return cell
        # Literal 'nan' members are dropped before parsing because they cannot
        # be cast to int.
        members = [m for m in cell.split(', ') if m.strip().lower() != 'nan']
        # Builtin int/float replace np.int/np.float (removed in NumPy 1.24).
        values = np.array(members, dtype=str).astype(t)
        if t is float:
            values = values[~np.isnan(values)]
        return np.sum(values) if len(values) else np.nan

    curated = [_collapse(cell) for cell in df[colname]]
    # Positional assignment: rows sharing an index label are never overwritten
    # with each other's value, and the whole pass is linear.
    df[colname] = pd.Series(curated, dtype=object).to_numpy()
    return df

In [None]:
def curateColmean(dfname, colname, t):
    """Collapse comma-separated cells of a column into their mean.

    Cells of ``colname`` that are strings containing ', ' (e.g. '1, 2, nan')
    are split, parsed as ``t`` and replaced by the mean of their non-NaN
    members; when every member is NaN the cell becomes NaN.  Every other
    cell (single values, non-string cells such as a float NaN) is kept as is.

    Parameters
    ----------
    dfname : pandas.DataFrame
        Input frame; it is not modified.
    colname : str
        Name of the column to curate.
    t : type
        ``int`` or ``float``.  Any other value returns an unmodified copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dfname`` with ``colname`` curated.

    Raises
    ------
    ValueError
        If a member other than 'nan' cannot be parsed as ``t``.
    """
    df = dfname.copy()
    if t not in (int, float):
        return df

    def _collapse(cell):
        # Only multi-valued string cells are aggregated; the isinstance check
        # also protects non-string cells, for which `', ' in cell` would raise.
        if not (isinstance(cell, str) and ', ' in cell):
            return cell
        # Literal 'nan' members are dropped before parsing because they cannot
        # be cast to int.
        members = [m for m in cell.split(', ') if m.strip().lower() != 'nan']
        # Builtin int/float replace np.int/np.float (removed in NumPy 1.24).
        values = np.array(members, dtype=str).astype(t)
        if t is float:
            values = values[~np.isnan(values)]
        return np.mean(values) if len(values) else np.nan

    curated = [_collapse(cell) for cell in df[colname]]
    # Positional assignment: rows sharing an index label are never overwritten
    # with each other's value, and the whole pass is linear.
    df[colname] = pd.Series(curated, dtype=object).to_numpy()
    return df

In [None]:
def curateColmin(dfname, colname, t):
    """Collapse comma-separated cells of a column into their minimum.

    Cells of ``colname`` that are strings containing ', ' (e.g. '1, 2, nan')
    are split, parsed as ``t`` and replaced by the smallest of their non-NaN
    members; when every member is NaN the cell becomes NaN.  Every other
    cell (single values, non-string cells such as a float NaN) is kept as is.

    Parameters
    ----------
    dfname : pandas.DataFrame
        Input frame; it is not modified.
    colname : str
        Name of the column to curate.
    t : type
        ``int`` or ``float``.  Any other value returns an unmodified copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dfname`` with ``colname`` curated.

    Raises
    ------
    ValueError
        If a member other than 'nan' cannot be parsed as ``t``.
    """
    df = dfname.copy()
    if t not in (int, float):
        return df

    def _collapse(cell):
        # Only multi-valued string cells are aggregated; the isinstance check
        # also protects non-string cells, for which `', ' in cell` would raise.
        if not (isinstance(cell, str) and ', ' in cell):
            return cell
        # Literal 'nan' members are dropped before parsing because they cannot
        # be cast to int.
        members = [m for m in cell.split(', ') if m.strip().lower() != 'nan']
        # Builtin int/float replace np.int/np.float (removed in NumPy 1.24).
        values = np.array(members, dtype=str).astype(t)
        if t is float:
            values = values[~np.isnan(values)]
        return np.min(values) if len(values) else np.nan

    curated = [_collapse(cell) for cell in df[colname]]
    # Positional assignment: rows sharing an index label are never overwritten
    # with each other's value, and the whole pass is linear.
    df[colname] = pd.Series(curated, dtype=object).to_numpy()
    return df

In [None]:
def curateColmax(dfname, colname, t):
    """Collapse comma-separated cells of a column into their maximum.

    Cells of ``colname`` that are strings containing ', ' (e.g. '1, 2, nan')
    are split, parsed as ``t`` and replaced by the largest of their non-NaN
    members; when every member is NaN the cell becomes NaN.  Every other
    cell (single values, non-string cells such as a float NaN) is kept as is.

    Parameters
    ----------
    dfname : pandas.DataFrame
        Input frame; it is not modified.
    colname : str
        Name of the column to curate.
    t : type
        ``int`` or ``float``.  Any other value returns an unmodified copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dfname`` with ``colname`` curated.

    Raises
    ------
    ValueError
        If a member other than 'nan' cannot be parsed as ``t``.
    """
    df = dfname.copy()
    if t not in (int, float):
        return df

    def _collapse(cell):
        # Only multi-valued string cells are aggregated; the isinstance check
        # also protects non-string cells, for which `', ' in cell` would raise.
        if not (isinstance(cell, str) and ', ' in cell):
            return cell
        # Literal 'nan' members are dropped before parsing because they cannot
        # be cast to int.
        members = [m for m in cell.split(', ') if m.strip().lower() != 'nan']
        # Builtin int/float replace np.int/np.float (removed in NumPy 1.24).
        values = np.array(members, dtype=str).astype(t)
        if t is float:
            values = values[~np.isnan(values)]
        return np.max(values) if len(values) else np.nan

    curated = [_collapse(cell) for cell in df[colname]]
    # Positional assignment: rows sharing an index label are never overwritten
    # with each other's value, and the whole pass is linear.
    df[colname] = pd.Series(curated, dtype=object).to_numpy()
    return df

In [1]:
def curateMolCol(dfname, colname):
    """Keep only the first entry of comma-separated cells in a column.

    String cells of ``colname`` are split on ', ' and replaced by their first
    member (a string without ', ' is therefore unchanged).  Non-string cells,
    such as a float NaN, are kept as is.

    Parameters
    ----------
    dfname : pandas.DataFrame
        Input frame; it is not modified.
    colname : str
        Name of the column to curate.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dfname`` with ``colname`` curated.
    """
    df = dfname.copy()

    # The isinstance guard avoids the TypeError that `', ' in cell` raises for
    # non-string cells.
    first_entries = [
        cell.split(', ')[0] if isinstance(cell, str) else cell
        for cell in df[colname]
    ]
    # Positional assignment: rows sharing an index label are never overwritten
    # with each other's value, and the whole pass is linear.
    df[colname] = pd.Series(first_entries, dtype=object).to_numpy()
    return df