# Marginalized Graph Kernel (MGK) Architecture

In [1]:
# How to define the architecture of MGK is still an open question.
# In the mgktools package, the architecture of an MGK is defined in a JSON file.
# By default, mgktools provides 8 predefined sets of choices (imported below).
# For details, please refer to:
# Xiang Y, Tang Y H, Liu H, et al. JPCA, 2021, 125(20): 4488-4497.
# Xiang Y, Tang Y H, Lin G, et al. JCIM, 2021, 61(11): 5414-5424.
from mgktools.hyperparameters import product, product_norm, product_msnorm, product_pnorm
from mgktools.hyperparameters import additive, additive_norm, additive_msnorm, additive_pnorm
import json
# `product_msnorm` is passed to open(), i.e. it is the path of a JSON file.
# The name means: different features are combined through a product, and
# molecular-size normalization is used. This is the setup used in the JPCA paper.
# Read the file inside a context manager so the handle is closed right away
# (a bare `json.load(open(...))` leaves it open until garbage collection).
with open(product_msnorm) as config_file:
    product_msnorm_config = json.load(config_file)
# Keep the parsed dict as the cell's last expression so the notebook displays it.
product_msnorm_config

{'Normalization': [10000, [1000, 30000]],
 'a_type': ['Tensorproduct', 'fixed'],
 'atom_AtomicNumber': {'kDelta': [0.75, [0.75, 1.0], 0.05]},
 'atom_AtomicNumber_list_1': {'kConv': [0.9, [0.75, 1.0], 0.05]},
 'atom_AtomicNumber_list_2': {'kConv': [0.9, [0.75, 1.0], 0.05]},
 'atom_AtomicNumber_list_3': {'kConv': [0.9, [0.75, 1.0], 0.05]},
 'atom_AtomicNumber_list_4': {'kConv': [0.9, [0.75, 1.0], 0.05]},
 'atom_MorganHash': {'kDelta': [0.9, [0.75, 1.0], 0.05]},
 'atom_Ring_count': {'kDelta': [0.9, [0.75, 1.0], 0.05]},
 'atom_RingSize_list': {'kConv': [0.9, [0.75, 1.0], 0.05]},
 'atom_Hcount': {'kDelta': [0.9, [0.75, 1.0], 0.05]},
 'atom_AtomicNumber_count_1': {'kDelta': [0.9, [0.75, 1.0], 0.05]},
 'atom_AtomicNumber_count_2': {'kDelta': [0.9, [0.75, 1.0], 0.05]},
 'atom_Chiral': {'kDelta': [0.9, [0.75, 1.0], 0.05]},
 'b_type': ['Tensorproduct', 'fixed'],
 'bond_Order': {'kDelta': [0.9, [0.75, 1.0], 0.05]},
 'bond_Stereo': {'kDelta': [0.9, [0.75, 1.0], 0.05]},
 'bond_RingStereo': {'kDelta

# Bayesian Optimization

In [2]:
# Load the previously saved dataset ('dataset.pkl', looked up via path='freesolv').
# NOTE(review): the '.pkl' extension suggests a pickle file - only load files
# that come from a trusted source.
from mgktools.data.data import Dataset

dataset = Dataset.load(filename='dataset.pkl', path='freesolv')
# Select the 'graph' kernel type on the dataset object; the same value is
# passed to get_kernel_config in the following cells.
dataset.graph_kernel_type = 'graph'



In [3]:
# Kernel configuration for the Bayesian optimization below.
from mgktools.kernels.utils import get_kernel_config

# Label reused by later cells to build the 'freesolv/hyperopt/<kernel_type>'
# paths; keep it in sync with the hyperparameters file passed below.
kernel_type = 'additive_pnorm'

kernel_config = get_kernel_config(
    dataset,
    graph_kernel_type='graph',
    # hyperparameter file(s) for the marginalized graph kernel
    mgk_hyperparameters_files=[additive_pnorm],
)


The sympy.printing.cxxcode submodule is deprecated. It has been renamed to
sympy.printing.cxx.

See https://docs.sympy.org/latest/explanation/active-deprecations.html#deprecated-printing-code-submodules
for details.

This has been deprecated since SymPy version 1.7. It
will be removed in a future version of SymPy.

  from sympy.printing.cxxcode import CXX11CodePrinter


In [4]:
# Bayesian optimization of the hyperparameters (uses the hyperopt Python package).
import os

from mgktools.hyperparameters.hyperopt import bayesian_optimization

# Directory that receives the optimization results for this kernel type.
save_dir = 'freesolv/hyperopt/%s' % kernel_type
# os.makedirs creates any missing parent directories and, with exist_ok=True,
# does nothing when the directory is already there. The previous nested
# `if not os.path.exists(...)` checks only created the kernel-specific
# subdirectory when 'freesolv/hyperopt' itself was missing, so running this
# cell for a second kernel type left save_dir non-existent.
os.makedirs(save_dir, exist_ok=True)

# NOTE(review): no random seed is passed here; if reproducible results matter,
# check whether bayesian_optimization exposes one.
best_hyperdict, results, hyperdicts = bayesian_optimization(
    save_dir=save_dir,
    dataset=dataset,
    kernel_config=kernel_config,
    model_type='gpr',
    task_type='regression',
    metric='rmse',
    split_type='loocv',
    alpha_bounds=(0.001, 0.02),
    d_alpha=0.001,
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [12:37<00:00,  7.57s/trial, best loss: 1.0801048910064448]       


# Gradient-Based Optimization

In [5]:
# Rebuild the kernel configuration, this time starting from the
# hyperparameters file stored in the hyperopt output directory.
hyperopt_dir = 'freesolv/hyperopt/%s' % kernel_type
kernel_config = get_kernel_config(
    dataset,
    graph_kernel_type='graph',
    # arguments for marginalized graph kernel
    mgk_hyperparameters_files=['%s/hyperparameters_0.json' % hyperopt_dir],
)
kernel = kernel_config.kernel
# The first line of the 'alpha' file is parsed as a float. Use a context
# manager so the file handle is closed immediately instead of being left
# open until garbage collection.
with open('%s/alpha' % hyperopt_dir, 'r') as alpha_file:
    alpha = float(alpha_file.readline())

In [6]:
# Gradient-based optimization of the kernel hyperparameters.
from mgktools.models.regression.GPRgraphdot import GPR

# GPR model built on the kernel and alpha loaded in the previous cell,
# using the 'L-BFGS-B' optimizer.
gpr = GPR(
    kernel=kernel,
    alpha=alpha,
    optimizer='L-BFGS-B',
    normalize_y=True,
)
# Fit with loss='loocv' (presumably leave-one-out cross-validation);
# verbose=True prints the progress table. The call is the cell's last
# expression, so its return value is displayed as the cell output.
gpr.fit(dataset.X, dataset.y, verbose=True, loss='loocv')

|     Sq.Err.|    d(SqErr)|     log|K| |     Cond(K)| t_GPU (s)| t_CPU (s)|
|------------|------------|------------|------------|----------|----------|
|      25.323|      25.323|     -2912.6|   1.023e+06|        77|      0.22|
|      26.002|      26.002|       -2411|  1.1529e+06|        13|      0.23|
|       23.31|       23.31|     -2602.8|  1.0993e+06|        13|      0.23|
|      24.434|      24.434|     -1977.7|  1.1291e+06|        13|      0.22|
|      23.439|      23.439|     -2294.9|  1.1098e+06|        13|      0.23|
|       23.28|       23.28|     -2487.9|  1.1021e+06|        13|      0.22|
|      23.259|      23.259|     -2491.3|  1.1047e+06|        13|      0.22|
|      23.158|      23.158|     -2549.6|  1.1094e+06|        13|      0.22|
|      23.221|      23.221|     -2537.3|  1.1155e+06|        13|      0.22|
|      23.081|      23.081|     -2541.2|  1.1121e+06|        13|      0.21|
|      23.023|      23.023|     -2516.6|  1.1157e+06|        13|      0.22|
|      23.04

<mgktools.models.regression.GPRgraphdot.gpr.GPR at 0x7fabaa990ac0>

In [7]:
# Persist the optimized hyperparameters.
# First synchronize kernel_config with the kernel's current theta
# (as the method name indicates), then write the hyperparameters to disk.
kernel_config.update_from_theta()
# NOTE(review): this is the same directory the earlier cell reads
# 'hyperparameters_0.json' from; if save_hyperparameters writes that same
# filename, the Bayesian-optimization result is overwritten - confirm.
optimized_dir = 'freesolv/hyperopt/%s' % kernel_type
kernel_config.save_hyperparameters(optimized_dir)