In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

##    Description    Functions to manage SDFiles, pandas Dataframes ...
##                   Applicability Domain analysis
##                   
##    Authors:       Kevin Pinto Gil (kevin.pinto@upf.edu)
##                   Manuel Pastor (manuel.pastor@upf.edu)
##
##    Copyright 2018 Manuel Pastor
##
##    This file is part of PhiTools
##
##    PhiTools is free software: you can redistribute it and/or modify
##    it under the terms of the GNU General Public License as published by
##    the Free Software Foundation version 3.
##
##    PhiTools is distributed in the hope that it will be useful,
##    but WITHOUT ANY WARRANTY; without even the implied warranty of
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##    GNU General Public License for more details.
##
##    You should have received a copy of the GNU General Public License
##    along with PhiTools.  If not, see <http://www.gnu.org/licenses/>

# 0. Importing libraries

In [2]:
### General libraries

import glob
import os
import shutil
from math import * #math commands will be available every time you start an interactive session

import numpy as np
import pandas as pd

### Graphical libraries

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib as mpl ## to come back to matplotlib default parameters
import seaborn as sns

# mpl.use('Agg') ## ERROOOOOOOOOOOOOOOOOOOOR

from pylab import *
%matplotlib inline

## Dataframe visualization part: raise the pandas display limits for this session

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
## NOTE: this sets 'display.max_rows' a second time, so the effective limit is 4000, not 500
pd.options.display.max_rows = 4000

## Ignore Warnings: this filter silences every warning category for the rest of the session

import warnings
warnings.filterwarnings('ignore')


*** Could not find EPA module. Will use only the CACTVS web service to resolve CAS number structures. ***



# 1. FOUR FOLD graph

In [None]:
def FourfoldDisplay(TP, TN, FP, FN, label, name, endpoint):
    """Draw a fourfold display of a confusion matrix and save it to ``name``.

    The figure has two panels:

    * left: a polar bar chart with one quadrant per confusion-matrix cell
      (FP, TP, FN, TN), false cells in red and true cells in light blue;
    * right: a bar chart with the sensitivity and the specificity.

    Parameters
    ----------
    TP, TN, FP, FN : number
        True-positive, true-negative, false-positive and false-negative counts.
    label : any
        Not used by the function; kept so existing calls keep working.
    name : str
        Output path handed to ``plt.savefig``.
    endpoint : any
        Not used by the function; kept so existing calls keep working.

    Returns
    -------
    None
        The counts and the two rates are printed, the figure is saved.
    """
    print (TP, TN, FP, FN)

    # A class with no members makes its rate undefined; report NaN instead of
    # raising ZeroDivisionError so the remaining panel can still be drawn.
    positives = TP + FN
    negatives = TN + FP
    sensitivity = TP / positives if positives else float('nan')
    specificity = TN / negatives if negatives else float('nan')
    print ('sens', sensitivity, 'spec', specificity)

    # --- left panel: one 90-degree wedge per confusion-matrix cell ---------
    width = np.pi / 2.0
    theta = np.radians([45,135,225,315])
    table = [FP,TP,FN,TN]
    plt.figure()
    plt.clf()
    ax = plt.subplot(121, polar=True, adjustable='box', aspect=2)
    ax.bar(theta, table, width=width, color=["red", "lightblue", "red", "lightblue"])

    # Eight labels are supplied, one per 45-degree step; only the positions
    # matching the wedge centres (45, 135, 225, 315 degrees) carry text.
    ax.set_xticklabels(["","FP (%s) \n\n" % str(int(FP)), "",  "TP (%s) \n\n" % str(int(TP)), "", "\n\n\nFN (%s)" % str(int(FN)),
                        "",  "\n\n\nTN (%s)" % str(int(TN))], fontsize=14)
    ax.set_yticks([])
    ax.grid(False)
    ax.axes.spines['polar'].set_visible(False)

    # --- right panel: sensitivity / specificity bars -----------------------
    ax2 = plt.subplot(122, adjustable='box', aspect=4)
    plt.ylim([0,1])

    bar_width = 0.5
    # Zero-height bars at both ends act as horizontal padding.
    y = [0, sensitivity, specificity, 0]
    index = np.arange(4)
    ax2.bar(index, y, bar_width, color=["lightgreen","lightgrey"])
    plt.xticks( index + bar_width / 2.0, ("", 'Sens', 'Spec', ""))

    plt.savefig(name)


# 2. Histogram plot plus adding category column to Dataframe

In [None]:
def categorizeByhist(df, activity, bins):

    '''
    Plot a histogram of an activity column and tag every row with its bin.

    The activity values are rounded to 4 decimals, counted per bin and drawn
    as a bar plot that is written to 'histogram/histogram_<bins>.png' (the
    'histogram' folder is created when missing). A copy of the dataframe is
    returned with an extra 'histcat' column holding the bin number of each
    row as a string ('0' ... str(bins-1)).

    Input parameters:

    df = protDF.copy()                   #### dataframe to be used (not modified)
    activity = 'quantitative_activity'   #### activity name column
    bins = 10                            #### int number of bins to be histogrammed

    e.g. df = categorizeByhist(df, activity, bins)

    '''
    xdf = df.copy()
    xdf[activity] = np.around(xdf[activity], 4) ## round to 4 decimals
    hist = xdf[activity].value_counts(bins=bins).sort_index()
    print (hist)

    my_dpi = 96
    fig = plt.figure(figsize=(1250/my_dpi, 800/my_dpi))
    # x and y are passed by keyword (recent seaborn releases no longer accept
    # them positionally) and the interval index is turned into plain text
    # labels so that every bin is drawn as one category.
    myplot = sns.barplot(x=hist.index.astype(str), y=hist.values, alpha=0.8)
    for item in myplot.get_xticklabels():
        item.set_rotation(75)
    plt.title(activity+' histogram')
    plt.ylabel('Number of Compounds', fontsize=12)
    plt.xlabel(activity, fontsize=12)

    directory = 'histogram'

    if not os.path.exists(directory):
        os.makedirs(directory)
        print (directory + ' is created')
    else:
        print(directory + ' already exists')

    # Save before showing: some backends release the figure once it is shown.
    fig.savefig(directory+'/histogram_'+str(bins)+'.png')
    plt.show()

    xdf['histcat'] = pd.cut(xdf[activity], bins=bins, right=True, labels=[str(x) for x in range(bins)])

    return (xdf)

# 3. Applicability Domain

## 3.1. Histogram Applicability Domain analysis functions

In [None]:
def scatterDC1vsDC2(vpath, RFD1, RFD2, MD1, MD2, RFname, MDname, DC1, DC2):

    '''
        Scatter plot of two descriptors (e.g. LogP vs MW): the reference
        database is drawn in grey and the model database on top of it in red.
        The figure is saved as
        vpath + "scatter_<RFname>_vs_<MDname>_<DC1>_<DC2>.png", shown, closed
        and returned. matplotlib rcParams are reset to their defaults afterwards.

        vpath =              ## path prefix where the file is saved
        RFD1 = RFdf.LogP     ## ReF Database descriptors df column containing logp, PC1,...
        RFD2 = RFdf.mw       ## ReF Database descriptors df column containing mw, PC2, ...
        MD1 = MDdf.LogP      ## Model Database descriptors df column containing logp, PC1, ...
        MD2 = MDdf.mw        ## Model Database descriptors df column containing mw, PC2, ...
        RFname = 'DrugBank'   ## ReF Database name
        MDname = 'Inditex'     ## Model Database name
        DC1 = 'PC1', 'LogP' ... ## name of descriptors or principal component used ...
        DC2 = 'PC2', 'MW' ...   ## name of descriptors or principal component used ...
    '''

    # Axis limits: extend the reference range by half of each extreme's
    # magnitude. Using abs() makes the padding always point outwards; adding
    # min/2 directly would move a positive minimum *into* the data and clip it.
    x_low, x_high = RFD1.min(), RFD1.max()
    y_low, y_high = RFD2.min(), RFD2.max()
    xmin = x_low - abs(x_low) / 2
    xmax = x_high + abs(x_high) / 2
    ymin = y_low - abs(y_low) / 2
    ymax = y_high + abs(y_high) / 2

    my_dpi = 96 ### size of my figure

    fig0 = plt.figure(figsize=(1250/my_dpi, 800/my_dpi))

    plt.title('Scatter Plot '+RFname+' vs '+MDname)
    # Reference database in grey (colour given once, through the keyword).
    plt.plot(RFD1, RFD2, '.', color = '0.75')
    # Model database projected on top, in red.
    plt.plot(MD1, MD2, '.r')
    plt.axis([xmin, xmax, ymin, ymax])
    plt.xlabel(DC1)
    plt.ylabel(DC2)

    # Save before showing: some backends release the figure once it is shown.
    fig0.savefig(vpath+"scatter_"+RFname+"_vs_"+MDname+"_"+DC1+"_"+DC2+".png")
    plt.show()
    plt.close(fig0)
    mpl.rcParams.update(mpl.rcParamsDefault) ## set this to change to default matplotlib params

    return fig0

In [None]:
def countPieChart(vpath, A, B, C, bins, RFname, MDname):

    '''
        This function generates a Counts pie chart of the applicability domain where:
            - A = Model + Reference compound found
            - B = Only Reference compound found
            - C = Only Model compound found
        The chart is saved as
        vpath + "Counts_piechart_<RFname>_vs_<MDname>_<bins>bins.png", shown,
        closed and returned. matplotlib rcParams are reset to their defaults
        afterwards.

        vpath = '1-histogramAnalysisResults'+str(bins)+'/'
        bins = 75        ## Same bins for model and ref database
        RFname = DrugBank ## ReF Database name
        MDname = Inditex ## Model Database name
    '''

    # All calls go through the `plt` namespace: the bare name `figure` is
    # rebound by `from bokeh.plotting import figure` further down the
    # notebook, so an unqualified call would reach Bokeh instead of matplotlib.

    # make a square figure and axes
    fig1 = plt.figure(0, figsize=(6,6))
    plt.axes([0.1, 0.1, 0.8, 0.8])

    # The slices will be ordered and plotted counter-clockwise.
    labels = 'A', 'B', 'C' # A = DB & MD, B = DB, C = MD
    fracs = [A, B, C]
    explode=(0.075, 0.075 , 0.075)
    colors= ["red", "lightblue", "yellow"]

    # startangle=90 makes the first slice start on the positive y-axis.
    plt.pie(fracs, explode=explode, labels=labels, colors= colors,
                autopct='%1.1f%%', shadow=False, startangle=90)

    plt.title('Counts Pie Chart using '+str(bins)+' bins')

    fig1.savefig(vpath+"Counts_piechart_"+RFname+"_vs_"+MDname+ "_" + str(bins) + "bins.png")
    plt.show()
    plt.close()
    mpl.rcParams.update(mpl.rcParamsDefault)

    return fig1

In [None]:
def compoundPieChart(vpath, Ap, Bp, bins, RFname, MDname):

    '''
        This function generates a compounds pie chart of the applicability domain
        with two slices:
            - A' (Ap) = compounds in cells where Model + Reference are found
            - B' (Bp) = compounds in cells where only the Reference is found
        The chart is saved as
        vpath + "Compounds_piechart_<RFname>_vs_<MDname>_<bins>bins.png", shown,
        closed and returned. matplotlib rcParams are reset to their defaults
        afterwards.

        vpath = '1-histogramAnalysisResults'+str(bins)+'/'
        bins = 75        ## Same bins for model and ref database
        RFname = DrugBank ## ReF Database name
        MDname = Inditex ## Model Database name
    '''

    # All calls go through the `plt` namespace: the bare name `figure` is
    # rebound by `from bokeh.plotting import figure` further down the
    # notebook, so an unqualified call would reach Bokeh instead of matplotlib.

    # make a square figure and axes
    fig2 = plt.figure(1, figsize=(6,6))
    plt.axes([0.1, 0.1, 0.8, 0.8])

    # The slices will be ordered and plotted counter-clockwise.
    labels = "A'", "B'" # A' = DB' & MD', B'= DB'
    fracs = [Ap, Bp]
    explode=(0.1, 0)
    colors= ["red", "lightblue"]

    # startangle=90 makes the first slice start on the positive y-axis.
    plt.pie(fracs, explode=explode, labels=labels, colors= colors,
                autopct='%1.1f%%', shadow=False, startangle=90)

    plt.title('Compounds Pie Chart using '+str(bins)+' bins')

    fig2.savefig(vpath+"Compounds_piechart_"+RFname+"_vs_"+MDname+ "_" + str(bins) + "bins.png")
    plt.show()
    plt.close()
    mpl.rcParams.update(mpl.rcParamsDefault)

    return fig2

In [None]:
def histogramPlot(vpath, xedges,yedges, Hmasked, xmdedges, ymdedges, Hmdmasked,
                  RFname, MDname, bins, DC1, DC2):

    '''
        Draw the 2D histogram of the model database (default colour map, with a
        colour bar) over the 2D histogram of the reference database (grey
        scale), save it as vpath + "2DHist_<RFname>_vs_<MDname>_<bins>bins.png",
        show it, close it and return the figure. matplotlib rcParams are reset
        to their defaults afterwards.

        xedges, yedges     ## x, y bin edges for the Ref Database
        Hmasked            ## 2D counts for the Ref Database (empty cells masked)
        xmdedges, ymdedges ## x, y bin edges for the model Database
        Hmdmasked          ## 2D counts for the model database (empty cells masked)
        vpath              ## path prefix where the file is saved
        bins               ## number of bins, used in the title and file name
        RFname = DrugBank  ## ReF Database name
        MDname = Inditex   ## Model Database name
        DC1 = 'PC1', 'LogP' ... ## name of the descriptor on the x axis
        DC2 = 'PC2', 'MW' ...   ## name of the descriptor on the y axis
    '''

    dots_per_inch = 96
    fig3 = plt.figure(figsize=(1250/dots_per_inch, 800/dots_per_inch))
    panel = fig3.add_subplot(1, 1, 1)

    heading = ''.join(['Histogram Plot ', DC1, ' vs ', DC2, ' using ', str(bins), ' bins'])
    panel.set_title(heading)

    # Bottom layer: reference database in grey; vmin/vmax fix the grey scale.
    panel.pcolormesh(xedges, yedges, Hmasked, cmap=mpl.cm.gray, vmin=-7, vmax=7)

    # Top layer: model database; this mesh is the one the colour bar describes.
    model_layer = panel.pcolormesh(xmdedges, ymdedges, Hmdmasked)
    panel.set_xlabel(DC1)
    panel.set_ylabel(DC2)

    cmdbar = fig3.colorbar(model_layer, ax=panel)
    cmdbar.ax.set_ylabel('Counts ' + str(MDname) + ' Model')

    # Write the PNG, then display and release the figure.
    target = ''.join([vpath, "2DHist_", RFname, "_vs_", MDname, "_", str(bins), "bins.png"])
    fig3.savefig(target)
    plt.show()
    plt.close()
    mpl.rcParams.update(mpl.rcParamsDefault)

    return fig3

In [None]:
def ratioCSVtable(vpath, RFname, MDname, bins, A,B,C,D,Ap,Bp,Cp,Dp):

    '''
        This function generates a one-row table in CSV format (tab separated) of the
        applicability domain and returns it as a dataframe, where:
            For Counts:
            - A = Model + Reference compound found
            - B = Only Reference compound found
            - C = Only Model compound found
            - D = No Compound is found
            - AB = (A/B)*100, rounded to 2 decimals
            For Compounds:
            - Ap = Model + Reference compound found
            - Bp = Only Reference compound found
            - Cp = Only Model compound found
            - Dp = No Compound is found
            - ApBp = (Ap/Bp)*100, rounded to 2 decimals

        A zero denominator does not raise: the ratio is inf (or nan for 0/0),
        whatever numeric type the counts are given in.

        vpath = '1-histogramAnalysisResults'+str(bins)+'/' ## path prefix to save the file
        bins = 75          ## Same bins for model and ref database
        RFname = DrugBank  ## ReF Database name
        MDname = Inditex   ## Model Database name

        The file is written to vpath + 'ratio_<RFname>_vs_<MDname>_<bins>bins.csv'.
    '''

    ###### Ratio A/B for counts and A'/B' for compounds ######

    # Divide as NumPy floats with the floating-point warnings silenced so that
    # B == 0 or Bp == 0 yields inf/nan instead of ZeroDivisionError when the
    # counts are plain Python numbers.
    with np.errstate(divide='ignore', invalid='ignore'):
        counts_ratio = np.float64(A) / np.float64(B) * 100
        compounds_ratio = np.float64(Ap) / np.float64(Bp) * 100

    column_order = ['bins','AB', 'ApBp','A', 'B', 'C', 'D', 'Ap','Bp', 'Cp', 'Dp']
    record = {'bins':bins,'AB':counts_ratio, 'A':int(A),
              'B':int(B),'C':int(C), 'D':int(D), 'ApBp':compounds_ratio,
              'Ap':int(Ap), 'Bp':int(Bp), 'Cp':int(Cp), 'Dp':int(Dp)}

    ratioDF = pd.DataFrame([record], columns=column_order)
    ratioDF[['AB','ApBp']] = ratioDF[['AB','ApBp']].round(2)
    ratioDF.to_csv(vpath+'ratio_'+RFname+'_vs_'+MDname+ '_'+str(bins)+'bins.csv',
                   sep='\t', encoding='utf-8', index=False )

    return ratioDF

In [None]:
def histAnalysis (RFD1, RFD2, MD1, MD2, bins, RFname, MDname, DC1, DC2):

    '''
        This is the main function that generates and analyses the data to check
        the applicability domain. It bins the reference and the model databases
        on the same 2D grid, classifies every grid cell, and calls the other
        functions to generate and save the plots and the ratio table.

        All output goes to the folder 'histogramAnalysisResults-<bins>/', which
        is emptied and recreated on every call.

        RFD1 = RFdf.LogP ## ReF Database descriptors df column containing logp, PC1,...
        RFD2 = RFdf.mw   ## ReF Database descriptors df column containing mw, PC2, ...
        MD1 = MDdf.LogP  ## Model Database descriptors df column containing logp, PC1, ...
        MD2 = MDdf.mw    ## Model Database descriptors df column containing mw, PC2, ...
        bins = 75        ## number of grid edges per axis (bins-1 cells per axis),
                         ## same for model and ref database
        RFname = DrugBank ## ReF Database name
        MDname = Inditex  ## Model Database name
        DC1 = 'PC1', 'LogP' ... ## name of the first descriptor (axis labels, file names)
        DC2 = 'PC2', 'MW' ...   ## name of the second descriptor (axis labels, file names)

        Returns the one-row dataframe produced by ratioCSVtable.

        The grid spans the reference range widened by 5 units on the first
        descriptor and 100 units on the second one; model compounds outside
        that window are not counted.

        Pie Chart For Counts (number of grid cells):
        - A = Model + Reference compound found
        - B = Only Reference compound found
        - C = Only Model compound found
        - D = No Compound is found
        - A/B = ratio describing the percentage of counts covering the reference applicability domain
        Pie Chart For Compounds (number of compounds inside those cells):
        - Ap = Model + Reference compound found
        - Bp = Only Reference compound found
        - Cp = Only Model compound found
        - Dp = No Compound is found (always 0)
        - Ap/Bp = ratio describing the percentage of compounds covering the reference applicability domain


                                           TWO DIFFERENT VIEWS
                                           -------------------
                    Table1. Number of counts                 Table2. Number of compounds

                               REFERENCE                                 REFERENCE
                       _________________________                 _________________________
                      |            |            |               |            |            |
                      |     YES    |     NO     |               |     YES    |     NO     |
          ____________|____________|____________|   ____________|____________|____________|
         |            |            |            |  |            |            |            |
      M  |     YES    |      A     |      C     |  |     YES    |      Ap    |      Cp    |
      O  |____________|____________|____________|  |____________|____________|____________|
      D  |            |            |            |  |            |            |            |
      E  |      NO    |      B     |      D     |  |      NO    |      Bp    |      Dp    |
      L  |____________|____________|____________|  |____________|____________|____________|

    '''

    ##### Folder where the PNG images and tables are saved (recreated empty) ######
    vpath = 'histogramAnalysisResults-'+str(bins)+'/'
    shutil.rmtree(vpath, ignore_errors=True)   # no-op when the folder is missing
    os.makedirs(vpath, exist_ok=True)          # also succeeds if the removal was partial

    ## Grid limits: reference range plus a fixed margin on each side.
    xmin = RFD1.min() - 5
    xmax = RFD1.max() + 5
    ymin = RFD2.min() - 100
    ymax = RFD2.max() + 100

    ###### 2D histogram of the Reference database ######
    xedges = np.linspace(xmin, xmax, bins)
    yedges = np.linspace(ymin, ymax, bins)
    Href, xedges, yedges = np.histogram2d(RFD1, RFD2, (xedges, yedges))
    # histogram2d puts the first variable on axis 0; transpose so that rows are
    # y and columns are x, as pcolormesh expects (same as rot90 then flipud).
    Href = Href.T
    Hmasked = np.ma.masked_where(Href==0,Href) # Mask pixels with a value of zero

    ###### 2D histogram of the Model database, on the very same grid ######
    Hmd, xmdedges, ymdedges = np.histogram2d(MD1, MD2, (xedges, yedges))
    Hmd = Hmd.T
    Hmdmasked = np.ma.masked_where(Hmd==0,Hmd) # Mask pixels with a value of zero

    ###### Classify every grid cell (see the two tables in the docstring) ######
    ref_filled = Href > 0
    md_filled = Hmd > 0
    both = ref_filled & md_filled          # A / Ap
    only_ref = ref_filled & ~md_filled     # B / Bp
    only_md = ~ref_filled & md_filled      # C / Cp
    neither = ~ref_filled & ~md_filled     # D / Dp

    # Kept as NumPy floats so that downstream ratios with a zero denominator
    # produce inf/nan rather than raising.
    A = np.float64(np.count_nonzero(both))
    B = np.float64(np.count_nonzero(only_ref))
    C = np.float64(np.count_nonzero(only_md))
    D = np.float64(np.count_nonzero(neither))
    Ap = np.float64(Href[both].sum() + Hmd[both].sum())
    Bp = np.float64(Href[only_ref].sum())
    Cp = np.float64(Hmd[only_md].sum())
    Dp = np.float64(0)                     # empty cells hold no compounds

    ### Creating Plots plus text info ####

    ratioDF = ratioCSVtable(vpath, RFname, MDname, bins, A,B,C,D,Ap,Bp,Cp,Dp)
    scatterDC1vsDC2(vpath, RFD1, RFD2, MD1, MD2, RFname, MDname, DC1,DC2)
    histogramPlot(vpath, xedges,yedges, Hmasked, xmdedges, ymdedges, Hmdmasked,
                  RFname, MDname, bins, DC1,DC2)
    countPieChart(vpath, A, B, C, bins, RFname, MDname)
    compoundPieChart(vpath, Ap, Bp, bins, RFname, MDname)

    return ratioDF

        
        

In [None]:
def gridSizeOptimization(RFD1, RFD2, MD1, MD2, RFname, MDname, DC1, DC2, start, stop, step):

    '''

        PART 4.1: Grid size optimization analysis:

        The optimum size of the grid (number of divisions of the e.g. MW and e.g. logP axes)
        is selected after analysing the results obtained with values taken from
        np.arange(start, stop, step) (stop itself is not included) and identifying
        which values are too low or too high, leading to extreme and poorly
        informative results.

        histAnalysis is run once per grid size. The collected ratio rows are
        written to
        'gridSizeOptimization/ratio_<RFname>_vs_<MDname>_from<start>to_<stop>bins.csv'
        (tab separated) and every 'histogramAnalysisResults-*' folder found in the
        working directory is moved into 'gridSizeOptimization/', replacing a folder
        of the same name left there by an earlier run.

        RFD1 = RFdf.LogP ## ReF Database descriptors df column containing logp, PC1,...
        RFD2 = RFdf.mw   ## ReF Database descriptors df column containing mw, PC2, ...
        MD1 = MDdf.LogP  ## Model Database descriptors df column containing logp, PC1, ...
        MD2 = MDdf.mw    ## Model Database descriptors df column containing mw, PC2, ...

        RFname = DrugBank ## ReF Database name
        MDname = Inditex  ## Model Database name
        DC1, DC2          ## descriptor names, forwarded to histAnalysis
        start = 4  ## beginning number 4
        stop = 300 ## final number 300 (excluded)
        step = 4   ## step from 4 to 4 (e.g. 4 8 12 16 ...)

        Returns the dataframe with one row per grid size (index 0..n-1).

    '''

    out_dir = 'gridSizeOptimization'
    os.makedirs(out_dir, exist_ok=True)

    # Collect the one-row tables and concatenate once: DataFrame.append no
    # longer exists in current pandas and re-copies the frame on every call.
    ratio_rows = [histAnalysis(RFD1, RFD2, MD1, MD2, bins, RFname,MDname, DC1,DC2)
                  for bins in np.arange(start, stop, step)]
    if ratio_rows:
        newDF = pd.concat(ratio_rows, ignore_index=True)
    else:
        newDF = pd.DataFrame([])   # empty range: keep returning an empty table

    newDF.to_csv('gridSizeOptimization/ratio_'+RFname+'_vs_'+MDname+'_from'+str(start)+'to_'+str(stop)+'bins.csv',
                 sep='\t', encoding='utf-8', index=False )

    # Portable replacement for `mv histogramAnalysisResults-* gridSizeOptimization/`.
    for result_dir in glob.glob('histogramAnalysisResults-*'):
        destination = os.path.join(out_dir, os.path.basename(result_dir))
        if os.path.isdir(destination):
            shutil.rmtree(destination, ignore_errors=True)
        shutil.move(result_dir, destination)

    return newDF

# 4. Bokeh interactive graphs

In [None]:
# Bokeh imports used by the interactive scatter plots defined below.
# NOTE: from this point on the bare names `figure` and `show` refer to Bokeh.
# Earlier cells call `figure(...)`, `axes(...)`, `pie(...)` and `title(...)`
# unqualified (presumably via `from pylab import *` -- confirm), so after this
# cell runs an unqualified `figure(...)` no longer reaches matplotlib.
from bokeh.plotting import figure, output_notebook, show, output_file, save
from bokeh.models import ColumnDataSource, HoverTool, CrosshairTool, WheelZoomTool
from bokeh.models import ColorBar, ResetTool, PanTool, LinearColorMapper, SaveTool

import bokeh.palettes as palettes
from bokeh.models.markers import *
# from bokeh.io import export_png
# Route Bokeh output to the notebook (executed as soon as this cell runs).
output_notebook()

In [None]:
def scatter_mol(df,activity, DC1, DC2, molcol, catcol, name, img):
    """
    Makes an interactive Bokeh scatter plot of two descriptors, coloured by
    category, with the structure drawing of each molecule in the hover tooltip.

    - df = pcaDF        ## dataframe with molecules, descriptors / principal components and names
                        ## (it is not modified; a copy is used)
    - activity = 'LogP' ## name of the activity column shown in the tooltip
    - DC1 = 'PC1'       ## column name of the first descriptor e.g. PC1, LogP, ... (x axis)
    - DC2 = 'PC2'       ## column name of the second descriptor e.g. PC2, MW, ... (y axis)
    - molcol = 'mol'    ## molecule column
    - catcol = 'State'  ## category column used to colour the points; when it is missing
                        ## every point gets the single category 'None'
    - name = 'name'     ## name column (also used as the SVG file name of each molecule)
    - img = 'INCimg/'   ## folder where the SVG drawings are written (created if missing)

    Colours are picked at random from a fixed list (4 colours for fewer than 5
    categories, 10 colours otherwise); with more categories than colours the
    list is cycled, so colours repeat.

    Returns the plotted dataframe: the input without the molecule column, plus
    the 'img_path' and 'colors' columns.

    NOTE(review): `Draw` is expected to be RDKit's `rdkit.Chem.Draw` module; it
    is not imported in the cells visible here -- confirm it is imported elsewhere.
    """
    # Local import: the global names `figure` and `show` are also provided by
    # `from pylab import *`, so whichever import cell ran last would win.
    from bokeh.plotting import figure, show

    def get_structures(mol_df, molcol, name, imgdir):
        """Write one SVG per molecule; return a copy of mol_df with an 'img_path' column."""
        if not os.path.exists(imgdir):  #  Checks if folder exists then creates it
            os.makedirs(imgdir)

        img_path = []
        # Iterate over the values themselves so that any index (filtered,
        # shuffled, non-integer) works, not only a default 0..n-1 index.
        for mol, mol_name in zip(mol_df[molcol], mol_df[name]):
            svg_file = os.path.join(imgdir, str(mol_name) + ".svg")
            Draw.MolToFile(mol,
                           svg_file,
                           imageType="svg",
                           fitImage=False,
                           size=(200, 200))
            img_path.append(svg_file)

        with_images = mol_df.copy()   # leave the caller's dataframe untouched
        with_images["img_path"] = img_path
        return with_images

    bokehDF = get_structures(df, molcol, name, img)
    bokehDF = bokehDF.drop(molcol, axis = 1)

    ## Choosing categories
    try:
        categories = np.unique(bokehDF[catcol]) ## creating categories by column
    except (KeyError, TypeError):
        # KeyError: the column does not exist; TypeError: its values cannot be
        # sorted together. Either way fall back to a single category.
        bokehDF[catcol] = 'None'
        categories = np.array(['None'])

    ## Choosing colors
    if len(categories) < 5:
        mycolorlist = ['#d62728','#2ca02c','#6a3d9a','#c7c7c7'] ## red, green, violet, grey
    else:
        mycolorlist = ['#ffff33', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
                       '#000000', '#e377c2', '#7f7f7f', '#6a3d9a', '#17becf']

    if len(categories) <= len(mycolorlist):
        # enough colours: draw distinct ones at random
        catcolors = np.random.choice((len(mycolorlist)), len(categories),replace=False)
    else:
        # more categories than colours: cycle through the list
        catcolors = np.arange(len(categories)) % len(mycolorlist)
    pal = [mycolorlist[i] for i in catcolors]

    ## Creating color dictionary by category
    colordict = dict(zip(categories, pal)) ## dictionary with category and its assigned color
    bokehDF["colors"] = [colordict[x] for x in bokehDF[catcol]]

    source = ColumnDataSource(bokehDF)
    hover = HoverTool(tooltips =
                f"""
                <div>
                    <img src="@img_path" width="170" height="170"></img>
                </div>
                <div>
                    <span style="font-size: 12px; font-weight: bold;">@{name}</span>
                </div>
                <div>
                    <span style="font-size: 12px; font-weight: bold;">PCA:</span>
                    <span style="font-size: 12px;">@{DC1}, @{DC2}</span>
                </div>
                <div>
                    <span style="font-size: 12px; font-weight: bold;">Activity:</span>
                    <span style="font-size: 12px;">@{activity}</span>
                </div>

                """)

    TOOLS=[hover, CrosshairTool(), WheelZoomTool(), ResetTool(), PanTool(), SaveTool()]
    p = figure(plot_width=800, plot_height=800, tools=TOOLS, x_axis_label=DC1, y_axis_label=DC2)
    p.scatter(x=DC1,y=DC2, size=5, alpha = 0.9, color = 'colors', legend=catcol, source=source)
    p.legend.location = "top_left"
    p.legend.click_policy="mute"
    show(p)
    return bokehDF

In [None]:
def scatter_mol_RefvsMD(refDF,refDC1, refDC2, refmolcol, refName, refACT,refimg,
                        mdDF,mdDC1, mdDC2, mdmolcol, mdName, mdACT,mdimg):
    """
    Makes an interactive Bokeh scatter plot of a model dataset (small red points)
    on top of a reference dataset (large grey points), with the structure drawing
    of each molecule in the hover tooltip. Neither input dataframe is modified.

    Reference Dataframe parameters:

    - refDF = pcaDF      ## dataframe contains molecules, principal components or descriptors, name of the molecules
    - refDC1 = 'PC1'     ## column name first descriptor e.g. PC1, LogP, ...
    - refDC2 = 'PC2'     ## column name second descriptor e.g. PC2, MW, ...
    - refmolcol = 'mol'  ## molecule column
    - refName = 'name'   ## name column
    - refACT = 'LogP'    ## name of the activity field
    - refimg = 'REFimg/' ## Reference images folder path (created if missing)

    Model Dataframe parameters:

    - mdDF = pcaDF      ## dataframe contains molecules, principal components or descriptors, name of the molecules
    - mdDC1 = 'PC1'     ## column name first descriptor e.g. PC1, LogP, ...
    - mdDC2 = 'PC2'     ## column name second descriptor e.g. PC2, MW, ...
    - mdmolcol = 'mol'  ## molecule column
    - mdName = 'name'   ## name column
    - mdACT = 'LogP'    ## name of the activity field
    - mdimg = 'MDimg/'  ## Model images folder path (created if missing)

    The model descriptor columns are renamed to refDC1 / refDC2 before the two
    tables are stacked, so the two dataframes may use different column names.

    e.g. :
    scatter_mol_RefvsMD(refDF, refDC1, refDC2, refmolcol, refName, refACT,
                        refimg, mdDF, mdDC1, mdDC2, mdmolcol, mdName, mdACT, mdimg)

    Returns None; the plot is displayed with Bokeh's show().

    NOTE(review): `Draw` is expected to be RDKit's `rdkit.Chem.Draw` module; it
    is not imported in the cells visible here -- confirm it is imported elsewhere.
    """
    # Local import: the global names `figure` and `show` are also provided by
    # `from pylab import *`, so whichever import cell ran last would win.
    from bokeh.plotting import figure, show

    def get_structures(mol_df, molcol, name, imgdir):
        """Write one SVG per molecule; return a copy of mol_df with an 'img_path' column."""
        if not os.path.exists(imgdir):  #  Checks if folder exists then creates it
            os.makedirs(imgdir)

        img_path = []
        # Iterate over the values themselves so that any index (filtered,
        # shuffled, non-integer) works, not only a default 0..n-1 index.
        for mol, mol_name in zip(mol_df[molcol], mol_df[name]):
            svg_file = os.path.join(imgdir, str(mol_name) + ".svg")
            Draw.MolToFile(mol,
                           svg_file,
                           imageType="svg",
                           fitImage=False,
                           size=(200, 200))
            img_path.append(svg_file)

        with_images = mol_df.copy()
        with_images['img_path'] = img_path
        return with_images

    def prepare(frame, dc1, dc2, molcol, namecol, actcol, imgdir, origin):
        """Select and rename the needed columns, draw the molecules, tag the origin."""
        # .copy() so the renaming and the new columns never write into a view
        # of the caller's dataframe.
        subset = frame[[dc1, dc2, molcol, namecol, actcol]].copy()
        # Both datasets share the reference descriptor names after this step.
        subset.columns = [refDC1, refDC2, 'mol', 'name', 'activity']
        subset = get_structures(subset, 'mol', 'name', imgdir)
        subset['origin'] = origin
        return subset.drop('mol', axis = 1)

    bokeh_refDF = prepare(refDF, refDC1, refDC2, refmolcol, refName, refACT, refimg, 'ref')
    bokeh_mdDF = prepare(mdDF, mdDC1, mdDC2, mdmolcol, mdName, mdACT, mdimg, 'model')

    ######### Concatenating reference and model Dataframes ########
    # Reference rows first, so the model points are listed after them.
    bokehDF = pd.concat([bokeh_refDF, bokeh_mdDF], ignore_index=True)

    ### colormap / sizemap: grey and large for the reference, red and small for the model ###
    colormap = {'ref': '#cccccc', 'model': '#d9534f'}
    sizemap = {'ref': 10, 'model': 4}
    bokehDF['colors'] = [colormap[x] for x in bokehDF['origin']]
    bokehDF['sizes'] = [sizemap[x] for x in bokehDF['origin']]

    source = ColumnDataSource(bokehDF)
    hover = HoverTool(tooltips =
                f"""
                <div>
                    <img src="@img_path" width="170" height="170"></img>
                </div>
                <div>
                    <span style="font-size: 12px; font-weight: bold;">@name</span>
                </div>
                <div>
                    <span style="font-size: 12px; font-weight: bold;">PCA:</span>
                    <span style="font-size: 12px;">@{refDC1}, @{refDC2}</span>
                </div>
                <div>
                    <span style="font-size: 12px; font-weight: bold;">Info:</span>
                    <span style="font-size: 12px;">@activity</span>
                </div>
                <div>
                    <span style="font-size: 12px; font-weight: bold;">Source:</span>
                    <span style="font-size: 12px;">@origin</span>
                </div>

                """)


    TOOLS=[hover, CrosshairTool(), WheelZoomTool(), ResetTool(), PanTool(), SaveTool()]
    p = figure(plot_width=800, plot_height=800,tools=TOOLS)
    p.scatter(x=refDC1,y=refDC2, size='sizes',alpha=0.9, color='colors',legend='origin',source=source)
    p.legend.location = "top_left"
    p.legend.click_policy="mute"

    show(p)