In [1]:
import sys, os, math
import numpy as np
import warnings
warnings.filterwarnings("ignore")

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw

import torch
import espaloma as esp

from openff.toolkit.topology import Molecule



In [3]:
# grab data
if not os.path.exists("zinc"):
    os.system("wget http://data.wangyq.net/esp_datasets/zinc")
ds = esp.data.dataset.GraphDataset.load("zinc")

--2022-04-30 09:57:43--  http://data.wangyq.net/esp_datasets/zinc
data.wangyq.net (data.wangyq.net) をDNSに問いあわせています... 52.21.33.16, 35.168.187.155, 52.2.56.64
data.wangyq.net (data.wangyq.net)|52.21.33.16|:80 に接続しています... 接続しました。
HTTP による接続要求を送信しました、応答を待っています... 404 Not Found
2022-04-30 09:57:43 エラー 404: Not Found。



FileNotFoundError: [Errno 2] No such file or directory: 'zinc'

In [None]:
# assign GAFF-1.81 atom typing
typing = esp.graphs.legacy_force_field.LegacyForceField('gaff-1.81')
ds.apply(typing, in_place=True) # this modify the original data

In [None]:
# split the data into training, test, and validation (80:10:10) and batch the the datasets
ds_tr, ds_te, ds_vl = ds.split([8, 1, 1])

ds_tr = ds_tr.view('graph', batch_size=100, shuffle=True)
ds_te = ds_te.view('graph', batch_size=100)
ds_vl = ds_vl.view('graph', batch_size=100)

In [None]:
# define a layer
layer = esp.nn.layers.dgl_legacy.gn("SAGEConv")

# define a representation
representation = esp.nn.Sequential(
        layer,
        [128, "relu", 128, "relu", 128, "relu"],
)

# define a readout
readout = esp.nn.readout.node_typing.NodeTyping(
        in_features=128,
        n_classes=100
)

net = torch.nn.Sequential(
    representation,
    readout
)

In [None]:
# define loss
loss_fn = esp.metrics.TypingAccuracy()

In [None]:
# define optimizer
optimizer = torch.optim.Adam(net.parameters(), 1e-5)

# train the model
for _ in range(3000):
    for g in ds_tr:
        optimizer.zero_grad()
        net(g.heterograph)
        loss = loss_fn(g.heterograph)
        loss.backward()
        optimizer.step()