In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import Axes3D
%matplotlib widget


In [2]:
def std_PCA(**argv):
    """Assemble an unfitted scale-then-project pipeline.

    Every keyword argument is handed unchanged to ``PCA``; the scaler is
    created with its default settings.

    Returns:
        Pipeline: two steps registered under the names ``'scaler'`` and
        ``'pca'``, in that order.
    """
    stages = [
        ('scaler', MinMaxScaler()),  # min-max rescaling of the features before PCA
        ('pca', PCA(**argv)),
    ]
    return Pipeline(stages)

def PCA_process(df, n_components=2):
    """Project the feature columns of ``df`` onto principal components.

    The first two columns of ``df`` are skipped and every remaining column
    is used as a feature. (Presumably the two leading columns are
    identifiers rather than numeric descriptors — TODO confirm against the
    CSV layout.) The features are run through the pipeline built by
    ``std_PCA``.

    Args:
        df: DataFrame whose columns from position 2 onward are the features.
        n_components: Forwarded to ``std_PCA`` as the PCA ``n_components``.

    Returns:
        tuple: ``(PCA_result, explained_var_ratio)`` where ``PCA_result`` is
        the output of the pipeline's ``fit_transform`` and
        ``explained_var_ratio`` is the ``explained_variance_ratio_``
        attribute of the fitted PCA step.

    Raises:
        ValueError: If ``df`` has no columns beyond the first two.
    """
    # Select by position instead of re-indexing by label: a label lookup
    # returns every matching column, so repeated column names would
    # duplicate features. This also avoids shadowing the built-in
    # ``property`` with a local variable.
    features = df.iloc[:, 2:]
    if features.shape[1] == 0:
        raise ValueError(
            'PCA_process needs at least one feature column after the first '
            'two columns; got a frame with {} column(s)'.format(df.shape[1])
        )

    pipeline = std_PCA(n_components=n_components)
    PCA_result = pipeline.fit_transform(features)
    # Variance explanation ratio of each retained component.
    explained_var_ratio = pipeline.named_steps['pca'].explained_variance_ratio_

    return PCA_result, explained_var_ratio

In [5]:
def PCA_plot(path_train, path_gen):
    """Plot generated and training molecules in one shared 2D PCA space.

    Both CSV files are read, stacked with the generated rows first, and
    projected together by ``PCA_process`` so the two sets share the same
    components. The explained-variance ratios are printed, and the scatter
    plot is saved as ``PCA_2D_Molecular_Discriptors_ChEMBL.pdf`` and
    ``PCA_2D_Molecular_Discriptors_ChEMBL.png`` (relative paths).

    Args:
        path_train: Path of the training-set CSV (plotted in red).
        path_gen: Path of the generated-set CSV (plotted in blue).
    """
    mol_train = pd.read_csv(path_train)
    mol_gen = pd.read_csv(path_gen)
    n_gen = len(mol_gen)
    # Generated rows are stacked first, so the first ``n_gen`` rows of the
    # projection belong to the generated set and the rest to the training set.
    mol = pd.concat([mol_gen, mol_train])

    PCA_all, ex_var_ratio_all = PCA_process(mol, n_components=2)

    print('explained_variance of PCA 1: {:.2%}'.format(ex_var_ratio_all[0]))
    print('explained_variance of PCA 2: {:.2%}'.format(ex_var_ratio_all[1]))
    print('sum explained_variance: {:.2%}'.format(sum(ex_var_ratio_all)))

    # Close the current figure before opening a new one (presumably so that
    # re-running the cell does not accumulate open figures).
    plt.close()
    fig, ax = plt.subplots()
    s1 = ax.scatter(PCA_all[:n_gen, 0], PCA_all[:n_gen, 1], c='b', marker='o', s=2, alpha=1)
    s2 = ax.scatter(PCA_all[n_gen:, 0], PCA_all[n_gen:, 1], c='r', marker='o', s=2, alpha=1)

    # Set figure title
    font_title = {
        'family': 'Times New Roman',
        'style': 'normal',
        'weight': 'bold',
        'color': 'black',
        'size': 18,
    }
    ax.set_title("PCA of Molecular Descriptors (WGAN-GP)", fontdict=font_title,
                 verticalalignment='bottom', pad=None)

    # Set legend
    ax.legend((s1, s2), ('Generating set', 'Training set'),
              prop={'family': 'Times New Roman', 'size': 10, 'weight': 'bold'},
              loc='upper right')

    # Set tick labels
    ax.tick_params(axis='both', labelsize=12, width=1.5, direction='in')
    for label in ax.get_xticklabels() + ax.get_yticklabels():
        label.set_fontname('Times New Roman')
        label.set_fontweight('bold')

    # Set the thickness of the coordinate axis
    for side in ('bottom', 'left', 'right', 'top'):
        ax.spines[side].set_linewidth(1.5)

    # Set axis labels (both axes use the same font settings)
    axis_label_font = {
        'family': 'Times New Roman',
        'style': 'normal',
        'weight': 'semibold',
        'color': 'black',
        'size': 16,
    }
    ax.set_xlabel("PC 1", fontdict=axis_label_font)
    ax.set_ylabel("PC 2", fontdict=axis_label_font)

    # Save. The file names keep their original spelling so that anything
    # already referring to these output files still finds them.
    fig.savefig('PCA_2D_Molecular_Discriptors_ChEMBL.pdf')
    fig.savefig('PCA_2D_Molecular_Discriptors_ChEMBL.png', dpi=1000)


In [6]:
# Draw and save the 2D PCA comparison of the training set and the generated set.
PCA_plot(path_train='ChEMBL25_property_smi.csv', 
         path_gen='3000_property_smi.csv')

explained_variance of PCA 1: 43.71%
explained_variance of PCA 2: 25.19%
sum explained_variance: 68.90%


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [34]:
def PCA_plot3D(path_train, path_gen):
    """Plot generated and training molecules in one shared 3D PCA space.

    Both CSV files are read, stacked with the generated rows first, and
    projected together by ``PCA_process`` onto three components. The
    explained-variance ratios are printed, and the 3D scatter plot is saved
    as ``PCA_3D_Molecular_Discriptors_ChEMBL.pdf`` (relative path).

    Args:
        path_train: Path of the training-set CSV (plotted in red).
        path_gen: Path of the generated-set CSV (plotted in blue).
    """
    mol_train = pd.read_csv(path_train)
    mol_gen = pd.read_csv(path_gen)
    n_gen = len(mol_gen)
    # Generated rows are stacked first, so the first ``n_gen`` rows of the
    # projection belong to the generated set and the rest to the training set.
    mol = pd.concat([mol_gen, mol_train])

    PCA_all, ex_var_ratio_all = PCA_process(mol, n_components=3)

    print('explained_variance of PCA 1: {:.2%}'.format(ex_var_ratio_all[0]))
    print('explained_variance of PCA 2: {:.2%}'.format(ex_var_ratio_all[1]))
    print('explained_variance of PCA 3: {:.2%}'.format(ex_var_ratio_all[2]))
    print('sum explained_variance: {:.2%}'.format(sum(ex_var_ratio_all)))

    gen_points = PCA_all[:n_gen]
    train_points = PCA_all[n_gen:]

    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')
    ax.scatter(gen_points[:, 0], gen_points[:, 1], gen_points[:, 2],
               c='b', marker='o', s=1, alpha=0.3)
    ax.scatter(train_points[:, 0], train_points[:, 1], train_points[:, 2],
               c='r', marker='o', s=1, alpha=0.3)

    # Label the axes so the saved figure can be read on its own, matching
    # the "PC n" labels used by the 2D plot.
    ax.set_xlabel("PC 1")
    ax.set_ylabel("PC 2")
    ax.set_zlabel("PC 3")

    # Save
    fig.savefig('PCA_3D_Molecular_Discriptors_ChEMBL.pdf')

In [35]:
# Draw and save the 3D PCA comparison of the training set and the generated set.
# NOTE(review): this call reads 'ChEMBL_property_smi.csv' while the 2D call
# reads 'ChEMBL25_property_smi.csv' — confirm the different training file is intended.
PCA_plot3D(path_train='ChEMBL_property_smi.csv', 
           path_gen='3000_property_smi.csv')

explained_variance of PCA 1: 53.46%
explained_variance of PCA 2: 12.38%
explained_variance of PCA 3: 9.67%
sum explained_variance: 75.51%


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …