In [1]:
import pandas as pd
import numpy as np
import torch
from rdkit import Chem
from torch_geometric.data import Data

In [2]:
# Read the compound table and keep only the compound name and SMILES columns;
# the bare last expression renders the resulting frame as the cell output.
smiles_df = pd.read_csv('umap_smiles.csv')[['Compounds', 'SMILES']]
smiles_df

Unnamed: 0,Compounds,SMILES
0,Compound 1,[H][C@]12CC[C@]3([H])[C@]([H])(CC[C@]4(C)[C@@H...
1,Compound 2,[H][C@@]12CCC3=CC(=O)CC[C@]3(C)[C@@]1([H])CC[C...
2,Compound 3,[H][C@@]12CC[C@]([H])([C@H](C)CC[C@H](O)C(C)C)...
3,Compound 4,*C1CCC2C3CCC4CCCCC4(C)C3[C@@H](O)CC12C
4,Compound 5,CC(C)CC[C@H](O)C(C)C1CCC2C3CCC4CCCCC4(C)C3CCC21C
...,...,...
795,Compound 796,[H][C@]12CC[C@]3([H])[C@]([H])(CC[C@@]4(C)[C@@...
796,Compound 797,[H][C@@]12CCC3C[C@H](O)CC[C@]3(C)[C@@]1([H])CC...
797,Compound 798,[H][C@@]12CC[C@]([H])(C(=O)C=O)[C@@]1(C)CC(=O)...
798,Compound 799,[H][C@@]12CC[C@]3([H])[C@]([H])(CC[C@]4(C)C(=O...


In [3]:
def smiles_to_graph(smiles, label):
    """Convert a SMILES string into a labelled ``Data`` graph.

    Each atom becomes a node whose single feature is its atomic number, and
    each bond becomes two directed edges (one per direction) so the graph is
    effectively undirected.

    Args:
        smiles: SMILES string handed to ``Chem.MolFromSmiles``.
        label: Integer class label stored on the graph as ``y``.

    Returns:
        A ``Data`` object with
          * ``x``: float tensor of shape ``(num_atoms, 1)``,
          * ``edge_index``: long tensor of shape ``(2, 2 * num_bonds)``,
          * ``y``: long tensor of shape ``(1,)``,
        or ``None`` when the SMILES cannot be parsed or yields no atoms.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Node features: atomic number only (basic featurisation).
    atom_feats = [[atom.GetAtomicNum()] for atom in mol.GetAtoms()]
    if not atom_feats:
        # A molecule without atoms would produce a 1-D, featureless ``x``
        # tensor; treat it the same way as an unparseable input.
        return None

    edges = []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        edges.append((i, j))
        edges.append((j, i))  # add the reverse direction: undirected graph

    x = torch.tensor(atom_feats, dtype=torch.float)
    if edges:
        # (num_edges, 2) -> (2, num_edges), stored contiguously.
        edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
    else:
        # No bonds (e.g. a single atom): transposing an empty 1-D tensor
        # would give shape (0,), so build the empty (2, 0) index explicitly.
        edge_index = torch.empty((2, 0), dtype=torch.long)
    y = torch.tensor([label], dtype=torch.long)  # classification label

    return Data(x=x, edge_index=edge_index, y=y)



In [4]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool

class GNNClassifier(torch.nn.Module):
    """Graph-level classifier: two GCNConv layers, mean pooling, linear head.

    Args:
        in_channels: Number of features per node (default 1).
        hidden_channels: Output width of the first GCNConv layer.
        out_channels: Output width of the second GCNConv layer, which is
            also the size of the pooled per-graph embedding.
        num_classes: Number of output logits per graph (default 2: 0 or 1).

    The defaults reproduce the fixed 1 -> 32 -> 64 -> 2 architecture, so
    ``GNNClassifier()`` builds the same network as before.
    """

    def __init__(self, in_channels=1, hidden_channels=32, out_channels=64,
                 num_classes=2):
        super().__init__()
        # Layers are created in the same order as before so that seeded
        # parameter initialisation is unchanged for the default sizes.
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
        self.lin = torch.nn.Linear(out_channels, num_classes)

    def forward(self, data):
        """Return class logits for every graph in ``data``.

        Args:
            data: Object exposing ``x`` (node features), ``edge_index``
                (graph connectivity) and ``batch`` (node-to-graph assignment),
                e.g. a batch produced by a torch_geometric ``DataLoader``.

        Returns:
            Output of the linear head applied to the pooled embeddings, one
            row of ``num_classes`` logits per graph.
        """
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = global_mean_pool(x, batch)  # one embedding per graph
        return self.lin(x)


In [5]:
from torch_geometric.loader import DataLoader

def train(model, loader, optimizer, criterion):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module called as ``model(batch)`` to produce predictions.
        loader: Iterable of batches; each batch must expose the targets as
            ``batch.y``. It does not need to support ``len()``.
        optimizer: Optimizer whose gradients are reset before, and stepped
            after, every batch.
        criterion: Loss callable invoked as ``criterion(out, batch.y)``.

    Returns:
        The sum of the per-batch losses divided by the number of batches
        actually processed, or ``0.0`` if ``loader`` yielded no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in loader:
        optimizer.zero_grad()
        out = model(batch)
        loss = criterion(out, batch.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        # Count the batches seen instead of calling len(loader), so plain
        # iterables/generators without __len__ are supported as well.
        num_batches += 1

    if num_batches == 0:
        # Nothing to average; avoid a ZeroDivisionError on an empty loader.
        return 0.0
    return total_loss / num_batches
