In [1]:
import torch
import rdkit
from torch_geometric.datasets import MoleculeNet
from rdkit import Chem
from rdkit.Chem import Draw
from torch import nn as nn
import torch.nn.functional as F
from torch.nn import Linear

print(torch.__version__)
print(torch.version.cuda)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

2.0.1+cu117
11.7
cuda


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = MoleculeNet(root=".", name="ESOL")

In [3]:
print('dataset type: ', type(data))
print('dataset length: ', len(data))
print('dataset feature: ', data.num_features)
print('dataset target: ', data.num_classes)
print('dataset sample: ', data[0])
print('dataset sample x: ', data[0].x)
print('dataset sample y: ', data[0].y)
print('dataset sample edge_index: ', data[0].edge_index.t())

dataset type:  <class 'torch_geometric.datasets.molecule_net.MoleculeNet'>
dataset length:  1128
dataset feature:  9
dataset target:  734
dataset sample:  Data(x=[32, 9], edge_index=[2, 68], edge_attr=[68, 3], smiles='OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)C(O)C3O ', y=[1, 1])
dataset sample x:  tensor([[8, 0, 2, 5, 1, 0, 4, 0, 0],
        [6, 0, 4, 5, 2, 0, 4, 0, 0],
        [6, 0, 4, 5, 1, 0, 4, 0, 1],
        [8, 0, 2, 5, 0, 0, 4, 0, 1],
        [6, 0, 4, 5, 1, 0, 4, 0, 1],
        [8, 0, 2, 5, 0, 0, 4, 0, 0],
        [6, 0, 4, 5, 2, 0, 4, 0, 0],
        [6, 0, 4, 5, 1, 0, 4, 0, 1],
        [8, 0, 2, 5, 0, 0, 4, 0, 1],
        [6, 0, 4, 5, 1, 0, 4, 0, 1],
        [8, 0, 2, 5, 0, 0, 4, 0, 0],
        [6, 0, 4, 5, 1, 0, 4, 0, 0],
        [6, 0, 2, 5, 0, 0, 2, 0, 0],
        [7, 0, 1, 5, 0, 0, 2, 0, 0],
        [6, 0, 3, 5, 0, 0, 3, 1, 1],
        [6, 0, 3, 5, 1, 0, 3, 1, 1],
        [6, 0, 3, 5, 1, 0, 3, 1, 1],
        [6, 0, 3, 5, 1, 0, 3, 1, 1],
        [6, 0, 3, 5, 1, 0, 3, 

In [4]:
from torch_geometric.nn import GCNConv, TopKPooling, global_mean_pool
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp
embedding_size = 64

class GNN(nn.Module):
    def __init__(self):
        super().__init__()
        torch.manual_seed(12345)

        self.initial_conv = GCNConv(data.num_features, embedding_size)
        self.conv1 = GCNConv(embedding_size, embedding_size)
        self.conv2 = GCNConv(embedding_size, embedding_size)
        self.conv3 = GCNConv(embedding_size, embedding_size)
        self.out = nn.Linear(embedding_size*2, 1)
    
    def forward(self, x, edge_index, batch_index):
        x = self.initial_conv(x, edge_index)
        x = F.relu(x)
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.conv3(x, edge_index)
        x = F.relu(x)

        x = torch.cat([gmp(x, batch_index), gap(x, batch_index)], dim=1)
        out = self.out(x)

        return out, x




In [9]:
from torch_geometric.data import DataLoader
import warnings
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

model = GNN().to(device)
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

data_size = len(data)
batch_size = 64
train_loader = DataLoader(data[:int(data_size*0.8)], batch_size=batch_size, shuffle=True)
test_loader = DataLoader(data[int(data_size*0.8):], batch_size=batch_size, shuffle=True)
epochs = 1000
loss_over_time = []


for epoch in range(epochs+1):
    for batch in train_loader:
        batch.to(device)
        model.zero_grad()
        pred, embedding = model(batch.x.float(), batch.edge_index, batch.batch)
        loss = loss_fn(pred, batch.y)
        loss.backward()
        optimizer.step()
    if epoch % 100 == 0:
        print('epoch: ', epoch, 'loss: ', loss.item())
    loss_over_time.append(loss.item())

plt.plot(loss_over_time)
    

epoch:  0 loss:  1.8305706977844238
epoch:  100 loss:  0.8861204981803894
epoch:  200 loss:  0.28484317660331726
epoch:  300 loss:  0.5201655030250549
epoch:  400 loss:  0.23308336734771729
epoch:  500 loss:  0.9278721809387207
epoch:  600 loss:  0.2553911805152893
epoch:  700 loss:  1.4415853023529053


In [180]:
import pandas as pd
import numpy as np

df = pd.DataFrame()
values = np.array([])
predictions = np.array([])
with torch.no_grad():
    total_loss = 0
    for batch in test_loader:
        batch.to(device)
        next_predictions, embedding = model(batch.x.float(), batch.edge_index, batch.batch)
        loss = loss_fn(next_predictions, batch.y)
        values = np.append(values, batch.y.cpu().numpy())
        predictions = np.append(predictions, next_predictions.cpu().numpy())
    
values = values.reshape(-1, 1)
predictions = predictions.reshape(-1, 1)
value_pred = np.concatenate((values, predictions), axis=1)
avg_loss = np.mean(np.abs(values - predictions))
print('avg_loss: ', avg_loss)
print('value vs prediction')
print(value_pred[:15])
        
        


avg_loss:  0.6565232526230733
[[-0.85000002 -1.10964048]
 [-1.13999999 -0.57325733]
 [-1.05999994 -0.9082278 ]
 [ 0.1        -0.23873915]
 [-1.88999999 -1.50209045]
 [-2.34899998 -2.01733398]
 [-3.36999989 -3.47919822]
 [ 0.15000001  0.25600481]
 [-2.44000006 -2.11246109]
 [-3.95300007 -3.28013277]
 [-2.08999991 -1.63523877]
 [-0.92000002 -0.79530114]
 [-1.08000004 -1.36468256]
 [-8.39999962 -7.684443  ]
 [-0.49000001 -0.72971666]]
