In [51]:
# !pip install torch_geometric rdkit mols2grid --quiet

In [52]:
from rdkit import Chem

import torch
import torch.nn.functional as F
from torch.nn import GRU, Linear, ReLU, Sequential

import torch_geometric.transforms as T
from torch_geometric.datasets import QM9
from torch_geometric.nn import NNConv, Set2Set
from torch_geometric.utils import remove_self_loops

from tqdm.notebook import tqdm

In [53]:
import os
import copy
from rdkit import Chem
from rdkit.Chem import RDConfig
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import rdDepictor, rdMolDraw2D

# One-off preprocessing: render every molecule of the SDF file to a PNG in ./images.
# Set generate_images to True on the first run; the rendering loop rewrites all PNGs.
generate_images = True
if generate_images:
    # NOTE(review): `opts` is configured here but never passed to the drawer
    # created in the loop below, so it does not appear to affect the output — confirm.
    opts = Draw.DrawingOptions()
    Draw.SetComicMode(opts)

    # Path of the input SDF file, relative to the current working directory.
    path = os.path.join('qm9/raw/gdb9.sdf')
    os.makedirs('images',exist_ok=True)

    # Parallel lists: parsed molecules and their '_Name' property (used as file stem).
    mol_list = []
    mol_name_list = []

    # Entries that the supplier yields as None are skipped.
    for mol in tqdm(Chem.SDMolSupplier(path)):
        if mol is not None: 
            mol_list.append(mol)
            mol_name_list.append(mol.GetProp('_Name'))

    # A single 224x224 panel per canvas, i.e. one molecule per image.
    panelx = 224
    panely = 224
    canvasx = panelx * 1
    canvasy = panely * 1

    for idx, mol in enumerate(tqdm(mol_list)):

        # The image is saved as images/<molecule name>.png.
        save_path = os.path.join("images",mol_name_list[idx]+".png")
        drawer = rdMolDraw2D.MolDraw2DCairo(canvasx,canvasy,panelx,panely)
        drawer.DrawMolecules([mol])
        drawer.FinishDrawing()
        # GetDrawingText() is written in binary mode as the PNG file content.
        with open(save_path,'wb') as out:
            out.write(drawer.GetDrawingText())


In [110]:
# create image dataloader
from torch.utils.data import Dataset
from torchvision.io import read_image
import torchvision.transforms as transforms

class QM9Dataset(Dataset):
    """Image-regression view of QM9: one rendered molecule PNG paired with one target.

    Args:
        root: directory handed to the ``QM9`` dataset constructor.
        img_folder: directory containing ``<molecule name>.png`` files.
        target: column of ``qm9.y`` used as the regression target.
        mode: ``"train"`` selects the leading part of the data; any other value
            selects the trailing part.
        test_split: fraction of the data assigned to the trailing (test) part.

    Attributes:
        mean, std: Python floats with the target's mean / standard deviation,
            computed over ALL targets before the train/test split.
        y: standardised targets for the selected split.
        img_names: molecule names for the selected split.
    """

    def __init__(self,root='cnn_qm9',img_folder="images",target=2, mode="train", test_split=0.1):
        qm9 = QM9(root)
        names = qm9.name  # molecule names; original note: 130831 entries
        y = qm9.y[:,target]  # original note said 138031 — presumably a typo for 130831, confirm

        # Standardise the target. The statistics use the whole dataset, so the
        # train and test instances end up with identical mean/std values.
        mean = y.mean(dim=0, keepdim=True)
        std = y.std(dim=0, keepdim=True)
        y = (y - mean) / std
        self.mean, self.std = mean.item(), std.item()

        # Contiguous, unshuffled split: the first (1 - test_split) share is train.
        split = int((1-test_split)*len(names))
        if mode=="train":
            names = names[:split]
            y = y[:split]
        else:
            names = names[split:]
            y = y[split:]

        self.y = y
        self.img_names = names
        self.img_folder = img_folder

        # NOTE(review): the statistics below are 3-channel, so read_image is
        # presumably expected to return a 3-channel tensor — confirm.
        self.augment_tensor = transforms.Compose([
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])


    def __len__(self):
        """Number of molecules in the selected split."""
        return len(self.img_names)
    
    def __getitem__(self,index):
        """Return ``{'img': tensor, 'target': tensor}`` for ``index``.

        If the PNG for ``index`` is missing, the next sample whose image exists is
        returned instead (image and target stay paired). The search wraps around
        the end of the split and is bounded, so a missing last image no longer
        runs past the end of the data.

        Raises:
            IndexError: if ``index`` is outside ``[-len(self), len(self))``.
            FileNotFoundError: if no sample in this split has an image on disk.
        """
        n_items = len(self.img_names)
        position = index + n_items if index < 0 else index
        if not 0 <= position < n_items:
            raise IndexError(f"index {index} is out of range for dataset of size {n_items}")

        # Probe position, position+1, ... (mod n_items) until an image is found.
        for offset in range(n_items):
            candidate = (position + offset) % n_items
            img_path = os.path.join(self.img_folder,self.img_names[candidate]+".png")
            if os.path.exists(img_path):
                break
        else:
            raise FileNotFoundError(
                f"no image found in '{self.img_folder}' for any of the {n_items} samples")

        # dtype=float is torch.float64; pixel values are scaled into [0, 1].
        img = read_image(img_path).to(dtype=float)/255.0
        img = self.augment_tensor(img)

        return {'img':img,
                'target':self.y[candidate]}
    
# Train / test views of the same image folder, both regressing on target column 2.
train_dataset = QM9Dataset(root='cnn_qm9',img_folder='images',target=2, mode="train", test_split=0.1)
test_dataset = QM9Dataset(root='cnn_qm9',img_folder='images',target=2, mode="test", test_split=0.1)

from torch.utils.data import DataLoader

# Training batches are shuffled; the trailing partial batch is dropped.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=8, drop_last=True)
# Evaluation keeps the trailing partial batch (drop_last=False) so that every
# test sample contributes to the reported error.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=8, drop_last=False)

In [111]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"using device: {device}")

class Net(torch.nn.Module):
    """Image regressor: a torch.hub ResNet-18 followed by a linear head.

    The head maps the backbone's 1000-dimensional output to a single value,
    so the network yields one scalar per input image.
    """

    def __init__(self,device):
        """Build the head and the pretrained backbone and move both to ``device``."""
        super().__init__()
        # Head is created and registered before the backbone.
        head = torch.nn.Linear(1000, 1)
        self.regressor = head.to(device)
        backbone = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)
        self.resnet = backbone.to(device)

    def forward(self, data):
        """Run ``data`` through backbone and head; the last (size-1) dim is squeezed."""
        backbone_out = self.resnet(data)
        prediction = self.regressor(backbone_out)
        return prediction.squeeze(-1)
    
# Global model instance; train() and test() below read it from module scope.
model = Net(device)
# print(model)

# Adam over all model parameters (linear head and ResNet backbone alike).
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# Plateau scheduler in 'min' mode: configured with a 0.7 reduction factor,
# a patience of 5 scheduler steps and a learning-rate floor of 1e-5.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                       factor=0.7, patience=5,
                                                       min_lr=0.00001)
# Directory that receives the periodic checkpoints.
os.makedirs("output",exist_ok=True)

def train(epoch):
    """Run one optimisation pass over ``train_loader`` and return the mean loss.

    Uses the module-level ``model``, ``optimizer``, ``train_loader`` and ``device``.

    Args:
        epoch: epoch number, only shown in the progress bar.

    Returns:
        Per-sample mean of the MSE loss over the samples actually processed
        (0.0 if the loader yields no batches).
    """
    model.train()
    loss_sum = 0.0
    n_seen = 0

    pbar = tqdm(train_loader)
    for data in pbar:
        imgs = data['img'].to(device,dtype=torch.float32)
        gt = data['target'].to(device,dtype=torch.float32)
        optimizer.zero_grad()
        loss = F.mse_loss(model(imgs), gt)
        loss.backward()
        optimizer.step()

        # F.mse_loss averages over the batch by default, so weight each batch by
        # its size before dividing by the sample count. Dividing a sum of batch
        # means by the dataset length would shrink the value by ~batch_size.
        batch_loss = loss.item()
        batch_size = imgs.size(0)
        loss_sum += batch_loss * batch_size
        n_seen += batch_size
        pbar.set_postfix(epoch=epoch, loss=batch_loss)
    # Count processed samples rather than len(dataset): the loader may drop a
    # trailing partial batch.
    return loss_sum / max(n_seen, 1)


def test(loader):
    """Evaluate the module-level ``model`` on ``loader`` and return the MAE.

    Predictions and targets are multiplied by ``train_dataset.std`` before the
    absolute difference is taken, i.e. the error is reported on the
    un-standardised target scale.

    Args:
        loader: iterable of batches with ``'img'`` and ``'target'`` entries.

    Returns:
        Mean absolute error over the samples actually evaluated
        (0.0 if the loader yields no batches).
    """
    model.eval()
    abs_error_sum = 0.0
    n_seen = 0

    # Inference only: disabling autograd avoids building graphs for every batch.
    with torch.no_grad():
        for data in tqdm(loader):
            imgs = data['img'].to(device,dtype=torch.float32)
            gt = data['target'].to(device,dtype=torch.float32)

            abs_error_sum += (model(imgs) * train_dataset.std - gt * train_dataset.std).abs().sum().item()  # MAE
            n_seen += imgs.size(0)
    # Divide by the evaluated sample count; len(loader.dataset) overcounts when
    # the loader drops its trailing partial batch.
    return abs_error_sum / max(n_seen, 1)

# NOTE(review): best_val_error is initialised here but never updated or read in
# the visible code — presumably intended for best-checkpoint tracking; confirm.
best_val_error = None
for epoch in range(1, 201):
    # Learning rate in effect for this epoch, captured before training for logging.
    lr = scheduler.optimizer.param_groups[0]['lr']
    loss = train(epoch)
    test_error = test(test_loader)
    # The plateau scheduler is driven by the error measured on test_loader.
    scheduler.step(test_error)

    print(f'Epoch: {epoch:03d}, LR: {lr:7f}, Loss: {loss:.7f}, Test MAE: {test_error:.7f}')
    # Checkpoint every 10th epoch; torch.save(model, ...) stores the whole
    # module object rather than only its state_dict.
    if epoch%10==0:
        torch.save(model, f"output/cnn_epoch{epoch}.pt")


using device: cuda


Using cache found in /home/jielong/.cache/torch/hub/pytorch_vision_v0.10.0


  0%|          | 0/3679 [00:00<?, ?it/s]