# Marginalized Graph Kernel (MGK) Architecture

In [1]:
# How to define the architecture of MGK is still an open question.
# In the mgktools package, the architecture of MGK is defined in a JSON file.
# By default, mgktools provides 8 predefined choices.
# For details, please refer to:
# Xiang Y, Tang Y H, Liu H, et al. JPCA, 2021, 125(20): 4488-4497.
# Xiang Y, Tang Y H, Lin G, et al. JCIM, 2021, 61(11): 5414-5424.
from mgktools.hyperparameters import product, product_norm, product_msnorm, product_pnorm
from mgktools.hyperparameters import additive, additive_norm, additive_msnorm, additive_pnorm
import json
# product_msnorm combines the different features through a product and uses
# molecular-sized normalization. This is the architecture used in the JPCA paper.
# product_msnorm is handed to open(), i.e. it is used as the path of a JSON file.
# Read it inside a context manager so the file handle is closed deterministically
# instead of being left to the garbage collector.
with open(product_msnorm) as hyperparameters_file:
    product_msnorm_hyperparameters = json.load(hyperparameters_file)
# Bare last expression so the notebook displays the parsed dictionary.
product_msnorm_hyperparameters

{'Normalization': [10000.0, [1000.0, 30000.0], None, None],
 'a_type': ['Tensorproduct', 'fixed', None, None],
 'atom_AtomicNumber': {'kDelta': [0.75, [0.75, 1.0], 0.05, None]},
 'atom_AtomicNumber_list_1': {'kConv': [0.9, [0.75, 1.0], 0.05, None]},
 'atom_AtomicNumber_list_2': {'kConv': [0.9, [0.75, 1.0], 0.05, None]},
 'atom_AtomicNumber_list_3': {'kConv': [0.9, [0.75, 1.0], 0.05, None]},
 'atom_AtomicNumber_list_4': {'kConv': [0.9, [0.75, 1.0], 0.05, None]},
 'atom_MorganHash': {'kDelta': [0.9, [0.75, 1.0], 0.05, None]},
 'atom_Ring_count': {'kDelta': [0.9, [0.75, 1.0], 0.05, None]},
 'atom_RingSize_list': {'kConv': [0.9, [0.75, 1.0], 0.05, None]},
 'atom_Hcount': {'kDelta': [0.9, [0.75, 1.0], 0.05, None]},
 'atom_AtomicNumber_count_1': {'kDelta': [0.9, [0.75, 1.0], 0.05, None]},
 'atom_AtomicNumber_count_2': {'kDelta': [0.9, [0.75, 1.0], 0.05, None]},
 'atom_Chiral': {'kDelta': [0.9, [0.75, 1.0], 0.05, None]},
 'b_type': ['Tensorproduct', 'fixed', None, None],
 'bond_Order': {'kDel

# Bayesian Optimization

In [2]:
# Dataset.
import os
import pandas as pd
from mgktools.data.data import Dataset

# Build the dataset once and cache it on disk; later runs reload the pickle.
# The guard tests for the cached pickle itself rather than for the directory,
# so a 'freesolv' directory that exists without a cached dataset (for example
# after an interrupted run) triggers a rebuild instead of a failing load.
# assumes Dataset.save writes to os.path.join(path, filename) — TODO confirm;
# if it does not, the only consequence is that the dataset is rebuilt every run.
dataset_dir = 'freesolv'
dataset_filename = 'dataset.pkl'
if os.path.exists(os.path.join(dataset_dir, dataset_filename)):
    dataset = Dataset.load(path=dataset_dir, filename=dataset_filename)
else:
    # exist_ok=True keeps the cell re-runnable when the directory is already there.
    os.makedirs(dataset_dir, exist_ok=True)
    df = pd.read_csv('../datasets/freesolv.csv')
    dataset = Dataset.from_df(
        df,
        pure_columns = ['smiles'],
        target_columns = ['freesolv'],
        n_jobs = 1
    )
    dataset.save(path=dataset_dir, filename=dataset_filename, overwrite=True)
# Set on both paths (freshly built and reloaded) before the dataset is used.
dataset.graph_kernel_type = 'graph'

In [3]:
# kernel
from mgktools.kernels.utils import get_kernel_config
# Label of the chosen architecture; later cells use it to build output paths.
# Keep it in sync with the hyperparameter file selected below.
kernel_type = 'additive_pnorm'
# arguments for marginalized graph kernel
mgk_files = [additive_pnorm]
kernel_config = get_kernel_config(dataset,
                                  graph_kernel_type='graph',
                                  mgk_hyperparameters_files=mgk_files)

In [4]:
# Use Optuna Python package.
import os
from mgktools.hyperparameters.optuna import bayesian_optimization
# Output directory for this kernel type. It is created unconditionally:
# os.makedirs builds any missing parent and, with exist_ok=True, does nothing
# when the directory is already there. Creating the kernel-specific directory
# only when 'freesolv/hyperopt' is missing would skip it for every kernel_type
# tried after the first one.
hyperopt_dir = 'freesolv/hyperopt/%s' % kernel_type
os.makedirs(hyperopt_dir, exist_ok=True)
# Search settings: GPR regression scored by leave-one-out RMSE, 50 iterations,
# with alpha searched in [0.001, 0.02] using d_alpha=0.001.
bayesian_optimization(save_dir=hyperopt_dir,
                      datasets=[dataset],
                      kernel_config=kernel_config,
                      model_type='gpr',
                      task_type='regression',
                      metric='rmse',
                      cross_validation='leave-one-out',
                      num_iters=50,
                      alpha_bounds=(0.001, 0.02),
                      d_alpha=0.001)

  from .autonotebook import tqdm as notebook_tqdm
[I 2024-05-23 19:47:28,124] Using an existing study with name 'optuna-study' instead of creating a new one.


In [5]:
# The details of the hyperparameter optimization are saved at freesolv/hyperopt/additive_pnorm/optuna.db
# You can visualize them using optuna-dashboard: https://github.com/optuna/optuna-dashboard.

# Gradient-Based Optimization

In [6]:
# Rebuild the kernel configuration from the files stored in the per-kernel
# hyperopt directory (presumably written by the Bayesian optimization step —
# verify against that cell's save_dir).
hyperopt_dir = 'freesolv/hyperopt/%s' % kernel_type
kernel_config = get_kernel_config(
    dataset,
    graph_kernel_type = 'graph',
    # arguments for marginalized graph kernel
    mgk_hyperparameters_files = ['%s/graph_hyperparameters.json' % hyperopt_dir],
)
kernel = kernel_config.kernel
# The first line of the 'alpha' file is parsed as a float. The context manager
# closes the file handle as soon as the value has been read.
with open('%s/alpha' % hyperopt_dir, 'r') as alpha_file:
    alpha = float(alpha_file.readline())

In [7]:
# optimization
from mgktools.models.regression.GPRgraphdot import GPR
# Collect the constructor arguments in one place, then build the model from them.
gpr_settings = dict(
    kernel=kernel,
    optimizer='L-BFGS-B',
    alpha=alpha,
    normalize_y=True,
)
gpr = GPR(**gpr_settings)
# Fit on the full dataset with the 'loocv' loss; verbose=True prints the progress
# table. The call stays the last expression so its return value is displayed.
gpr.fit(dataset.X, dataset.y, loss='loocv', verbose=True)

|     Sq.Err.|    d(SqErr)|     log|K| |     Cond(K)| t_GPU (s)| t_CPU (s)|
|------------|------------|------------|------------|----------|----------|
|      25.789|      25.789|     -2079.5|  4.3462e+05|        76|      0.39|
|      25.916|      25.916|       -1727|  5.1566e+05|        10|      0.39|
|      23.564|      23.564|     -1841.7|  4.9085e+05|       9.9|      0.39|
|      23.389|      23.389|     -1787.5|   4.959e+05|       9.9|      0.41|
|      24.009|      24.009|     -1611.5|  5.1222e+05|        10|       0.4|
|      23.246|      23.246|       -1700|  5.0321e+05|        10|      0.41|
|      23.142|      23.142|     -1651.3|   5.045e+05|        10|      0.39|
|      22.991|      22.991|     -1628.9|  5.0359e+05|        10|       0.4|
|      22.839|      22.839|     -1671.7|   4.869e+05|        10|       0.4|
|      22.751|      22.751|     -1679.8|  4.9067e+05|        10|      0.42|
|      23.002|      23.002|     -1753.7|  4.9973e+05|        10|       0.4|
|      22.65

<mgktools.models.regression.GPRgraphdot.gpr.GPR at 0x7623764aec80>

In [9]:
# save optimized hyperparameters
# NOTE(review): this writes to 'freesolv/hyperopt', not to the kernel-specific
# sub-directory the hyperparameters were read from — confirm this is intended.
optimized_dir = 'freesolv/hyperopt'
# Refresh the configuration from the current theta before writing it out.
kernel_config.update_from_theta()
kernel_config.save(optimized_dir)