In [1]:
import rdkit

import pandas as pd
import numpy as np
from copy import deepcopy

from torch_geometric.datasets import MoleculeNet
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import AllChem

In [2]:
import custom_regression_functions
import custom_models
from custom_regression_functions import *
from custom_models import *
import custom_general_functions
from custom_general_functions import *

from copy import deepcopy

In [3]:
from torch_geometric.nn import global_mean_pool, global_max_pool

model_type = "GNN"

filename_basic = "results_regression/model_gnn_conv"

apply_scaffold_split = False

hidden_channels=[64]
gcn_layers=4
linear_sizes=[[512], [512, 256]]
aggregations=[global_mean_pool, global_max_pool]
apply_random_aggregations=False

learning_rate=0.001

In [4]:
param_combinations = dict()

hidden_channels_list = [[64], [64, 128]]
linear_sizes_list = [[], [512], [512, 256]]
gcn_layers_list = [1, 2, 3, 4]
aggregations_list = [[global_mean_pool, global_max_pool], [global_mean_pool], [global_max_pool]]
apply_scaffold_split_list = [False, True]

In [5]:
create_new_data = True

In [6]:
data, data_y = load_esol()

pytorch_graph_list = create_pytorch_graph(data)

processed_data = process_pytorch_graph(pytorch_graph_list, data_y)

train_dataset, test_dataset, val_dataset = create_train_test_graphs(processed_data, train_percentage= 0.8, test_percentage=0.1, apply_scaffold_split = False )
loader, test_loader, val_loader = create_dataloader_val(train_dataset, test_dataset, val_dataset, batch_size=64)



In [7]:
train_dataset, test_dataset, val_dataset 

(<torch.utils.data.dataset.Subset at 0x22af5715610>,
 <torch.utils.data.dataset.Subset at 0x22af57155e0>,
 <torch.utils.data.dataset.Subset at 0x22af5715040>)

In [8]:
datasets = [train_dataset, test_dataset, val_dataset ]

In [9]:
loaders = [loader, test_loader, val_loader]

In [10]:
import torch_geometric.data

In [11]:
from torch_geometric.data import *

In [12]:
train_dataset.dataset

[Data(x=[32, 9], edge_index=[2, 68], edge_attr=[68, 3], smiles='OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)C(O)C3O ', y=[1, 1]),
 Data(x=[15, 9], edge_index=[2, 32], edge_attr=[32, 3], smiles='Cc1occc1C(=O)Nc2ccccc2', y=[1, 1]),
 Data(x=[11, 9], edge_index=[2, 20], edge_attr=[20, 3], smiles='CC(C)=CCCC(C)=CC(=O)', y=[1, 1]),
 Data(x=[22, 9], edge_index=[2, 52], edge_attr=[52, 3], smiles='c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43', y=[1, 1]),
 Data(x=[5, 9], edge_index=[2, 10], edge_attr=[10, 3], smiles='c1ccsc1', y=[1, 1]),
 Data(x=[9, 9], edge_index=[2, 20], edge_attr=[20, 3], smiles='c2ccc1scnc1c2 ', y=[1, 1]),
 Data(x=[17, 9], edge_index=[2, 36], edge_attr=[36, 3], smiles='Clc1cc(Cl)c(c(Cl)c1)c2c(Cl)cccc2Cl', y=[1, 1]),
 Data(x=[20, 9], edge_index=[2, 46], edge_attr=[46, 3], smiles='CC12CCC3C(CCc4cc(O)ccc34)C2CCC1O', y=[1, 1]),
 Data(x=[19, 9], edge_index=[2, 46], edge_attr=[46, 3], smiles='ClC4=C(Cl)C5(Cl)C3C1CC(C2OC12)C3C4(Cl)C5(Cl)Cl', y=[1, 1]),
 Data(x=[29, 9], edge_index=[2, 66], 

In [13]:
len(train_dataset)

902

In [14]:
len(test_dataset)

112

In [15]:
test_dataset.dataset

[Data(x=[32, 9], edge_index=[2, 68], edge_attr=[68, 3], smiles='OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)C(O)C3O ', y=[1, 1]),
 Data(x=[15, 9], edge_index=[2, 32], edge_attr=[32, 3], smiles='Cc1occc1C(=O)Nc2ccccc2', y=[1, 1]),
 Data(x=[11, 9], edge_index=[2, 20], edge_attr=[20, 3], smiles='CC(C)=CCCC(C)=CC(=O)', y=[1, 1]),
 Data(x=[22, 9], edge_index=[2, 52], edge_attr=[52, 3], smiles='c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43', y=[1, 1]),
 Data(x=[5, 9], edge_index=[2, 10], edge_attr=[10, 3], smiles='c1ccsc1', y=[1, 1]),
 Data(x=[9, 9], edge_index=[2, 20], edge_attr=[20, 3], smiles='c2ccc1scnc1c2 ', y=[1, 1]),
 Data(x=[17, 9], edge_index=[2, 36], edge_attr=[36, 3], smiles='Clc1cc(Cl)c(c(Cl)c1)c2c(Cl)cccc2Cl', y=[1, 1]),
 Data(x=[20, 9], edge_index=[2, 46], edge_attr=[46, 3], smiles='CC12CCC3C(CCc4cc(O)ccc34)C2CCC1O', y=[1, 1]),
 Data(x=[19, 9], edge_index=[2, 46], edge_attr=[46, 3], smiles='ClC4=C(Cl)C5(Cl)C3C1CC(C2OC12)C3C4(Cl)C5(Cl)Cl', y=[1, 1]),
 Data(x=[29, 9], edge_index=[2, 66], 

In [16]:
len(test_dataset.dataset)

1128

In [17]:
len(test_dataset)

112

In [18]:
import dill as pickle
with open('datasets_class_esol.pkl', 'wb') as file: 
      
    # A new file will be created 
    pickle.dump(datasets, file) 

In [19]:
data, data_y = load_esol()

pytorch_graph_list = create_pytorch_graph(data)

processed_data = process_pytorch_graph(pytorch_graph_list, data_y)

train_dataset, test_dataset, val_dataset = create_train_test_graphs(processed_data, train_percentage= 0.8, test_percentage=0.1, apply_scaffold_split = True )
loader, test_loader, val_loader = create_dataloader_val(train_dataset, test_dataset, val_dataset, batch_size=64)

datasets_scaffold = [train_dataset, test_dataset, val_dataset ]

loaders_scaffold = [loader, test_loader, val_loader]
import dill as pickle
with open('datasets_scaffold_class_esol.pkl', 'wb') as file: 
      
    # A new file will be created 
    pickle.dump(datasets_scaffold, file) 

Start initializing RDKit molecule instances...
Creating RDKit molecule instance 1000/1128
Start computing Bemis-Murcko scaffolds.
Computing Bemis-Murcko for compound 1000/1128


