In [2]:
import os
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.data.utils import split_dataset
from torch.utils.data import DataLoader
from dgllife.data import Tox21
from dgllife.utils import smiles_to_bigraph, CanonicalAtomFeaturizer, CanonicalBondFeaturizer
from dgl.nn.pytorch import GraphConv, GATConv, GATv2Conv, SAGEConv, GINConv
from torch.nn import Linear, Sequential, BatchNorm1d, ReLU, Dropout
from rdkit import Chem
import numpy as np
import networkx as nx
from dgl.dataloading import GraphDataLoader
import concurrent.futures
import time
import torch.optim as optim

In [16]:
# !conda list --export > local_env.txt

In [17]:
# !conda env export > environment.yml

In [19]:
# !pip --version

pip 24.1 from C:\Users\jenif\anaconda3\envs\py3.10.12\lib\site-packages\pip (python 3.10)



In [3]:
import rdkit
rdkit.__file__

'C:\\Users\\jenif\\anaconda3\\envs\\py3.10.12\\lib\\site-packages\\rdkit\\__init__.py'

In [15]:
import rdkit.rdBase
dir(rdkit.rdBase)

['AttachFileToLog',
 'BlockLogs',
 'DisableLog',
 'EnableLog',
 'LogDebugMsg',
 'LogErrorMsg',
 'LogInfoMsg',
 'LogMessage',
 'LogStatus',
 'LogToCppStreams',
 'LogToPythonLogger',
 'LogToPythonStderr',
 'SeedRandomNumberGenerator',
 'WrapLogs',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_iostreamsEnabled',
 '_listclass std::vector<int,class std::allocator<int> >',
 '_listclass std::vector<unsigned int,class std::allocator<unsigned int> >',
 '_listint',
 '_multithreadedEnabled',
 '_serializationEnabled',
 '_vectclass std::basic_string<char,struct std::char_traits<char>,class std::allocator<char> >',
 '_vectclass std::vector<double,class std::allocator<double> >',
 '_vectclass std::vector<int,class std::allocator<int> >',
 '_vectclass std::vector<unsigned int,class std::allocator<unsigned int> >',
 '_vectdouble',
 '_vectint',
 '_vectunsigned int',
 '_version',
 'boostVersion',
 'ostream',
 'rdkitBuild',
 'rdkitVersion',
 'std_ostream',
 'streambu

In [16]:
rdkit.rdBase.__file__

'C:\\Users\\jenif\\anaconda3\\envs\\py3.10.12\\lib\\site-packages\\rdkit\\rdBase.pyd'

In [5]:
# Use CanonicalAtomFeaturizer to generate node features
node_featurizer = CanonicalAtomFeaturizer()
edge_featurizer = CanonicalBondFeaturizer()

def collate(samples):
    graphs, labels = [], []
    for sample in samples:
        graphs.append(dgl.add_self_loop(sample[1]))  # Add self-loops during graph construction
        labels.append(sample[2])  # Extract the labels (3rd element is the labels tensor)
    batched_graph = dgl.batch(graphs)
    labels = torch.stack(labels)
    return batched_graph, labels

# Load the Tox21 dataset
dataset = Tox21(smiles_to_graph=smiles_to_bigraph, node_featurizer=node_featurizer)
# , edge_featurizer=edge_featurizer
train_set, val_set, test_set = split_dataset(dataset, frac_list=[0.8, 0.1, 0.1], shuffle=True, random_state=42)

train_loader = DataLoader(train_set, batch_size=32, shuffle=True, collate_fn=collate)
val_loader = DataLoader(val_set, batch_size=32, shuffle=False, collate_fn=collate)
test_loader = DataLoader(test_set, batch_size=32, shuffle=False, collate_fn=collate)

Processing dgl graphs from scratch...
Processing molecule 1000/7831
Processing molecule 2000/7831
Processing molecule 3000/7831
Processing molecule 4000/7831
Processing molecule 5000/7831
Processing molecule 6000/7831
Processing molecule 7000/7831


In [6]:
# Declare variables
hidden_size = 64
num_epochs = 50
checkpoint_dir = 'saveStates'
lr = 0.01

In [7]:
# Function to save a checkpoint
def save_checkpoint(model, optimizer, epoch, directory, filename):
    if not os.path.exists(directory):
        os.makedirs(directory)
    path = os.path.join(directory, filename)
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch
    }
    torch.save(checkpoint, path)

# Function to load a checkpoint
def load_checkpoint(model, optimizer, path):
    if os.path.isfile(path):
        checkpoint = torch.load(path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch = checkpoint['epoch']
        return model, optimizer, epoch
    return model, optimizer, -1
    

In [8]:
class GCN(nn.Module):
    def __init__(self, in_feats, hidden_size, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_feats, hidden_size)
        self.conv2 = GraphConv(hidden_size, hidden_size)
        self.classify = nn.Linear(hidden_size, num_classes)

    def forward(self, g):
        h = g.ndata['h'].float()
        h = F.relu(self.conv1(g, h))
        h = self.conv2(g, h)
        with g.local_scope():
            g.ndata['h'] = h
            hg = dgl.mean_nodes(g, 'h')
        return self.classify(hg)

In [9]:
# Check the first graph to get the feature dimension
first_graph = dataset[0][1]
print(first_graph.ndata)
in_feats = first_graph.ndata['h'].shape[1]


# Determine the number of tasks
num_classes = dataset.labels.shape[1]
model = GCN(in_feats, hidden_size, num_classes)

# Use DataParallel to wrap the model for parallel training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nn.DataParallel(model)
model.to(device)
best_val_acc = 0

def train(model, dataloader, criterion, optimizer):
    model.train()
    total_loss = 0
    for batched_graph, labels in dataloader:

        batched_graph = batched_graph.to(device)
        labels = labels.to(device)
        
        logits = model(batched_graph)
        loss = criterion(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Save checkpoint at the end of each epoch
        # save_checkpoint(model, optimizer, epoch, checkpoint_dir, 'saveState1.pth')

    return total_loss / len(dataloader)

def evaluate(model, dataloader):
    model.eval()
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for batched_graph, labels in dataloader:
            
            batched_graph = batched_graph.to(device)
            labels = labels.to(device)
            
            logits = model(batched_graph)
            preds = (logits > 0.5).float()
            total_correct += (preds == labels).sum().item()
            total_samples += labels.numel()
    return total_correct / total_samples

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr)

# Check if a checkpoint exists and load it
#model, optimizer, start_epoch = load_checkpoint(model, optimizer, os.path.join(checkpoint_dir, 'saveState1.pth'))
#if start_epoch == -1:
#    start_epoch = 0  # No checkpoint found, start from scratch
#else:
#    print(f"Resuming from epoch {start_epoch + 1}")

# Measure the time for training
start_time = time.time()

# Train the model
# num_epochs = 50
for epoch in range(num_epochs):
    loss = train(model, train_loader, criterion, optimizer)
    val_acc = evaluate(model, val_loader)
    if best_val_acc < val_acc:
            best_val_acc = val_acc
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss:.4f}, Validation Accuracy: {val_acc:.4f}, (best {best_val_acc:.3f})")

end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")

# Measure the time for evaluation on the test set
start_time = time.time()

# Evaluate the model on the test set
test_acc = evaluate(model, test_loader)
print(f"Test Accuracy with GCN with parallel: {test_acc:.4f}")

end_time = time.time()
print(f"Evaluation time: {end_time - start_time:.2f} seconds")


{'h': tensor([[1., 0., 0.,  ..., 0., 1., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 1., 0.,  ..., 1., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.]])}
Epoch 1/50, Loss: 0.2259, Validation Accuracy: 0.9297, (best 0.930)
Epoch 2/50, Loss: 0.2060, Validation Accuracy: 0.9297, (best 0.930)
Epoch 3/50, Loss: 0.2020, Validation Accuracy: 0.9294, (best 0.930)
Epoch 4/50, Loss: 0.1987, Validation Accuracy: 0.9301, (best 0.930)
Epoch 5/50, Loss: 0.1955, Validation Accuracy: 0.9297, (best 0.930)
Epoch 6/50, Loss: 0.1947, Validation Accuracy: 0.9302, (best 0.930)
Epoch 7/50, Loss: 0.1944, Validation Accuracy: 0.9303, (best 0.930)
Epoch 8/50, Loss: 0.1912, Validation Accuracy: 0.9302, (best 0.930)
Epoch 9/50, Loss: 0.1904, Validation Accuracy: 0.9316, (best 0.932)
Epoch 10/50, Loss: 0.1886, Validation Accuracy: 0.9306, (best 0.932)
Epoch 11/50, Loss: 0.1887, Validation Accuracy: 0.9304, (b

In [10]:
# Check the first graph to get the feature dimension
first_graph = dataset[0][1]
print(first_graph.ndata)
in_feats = first_graph.ndata['h'].shape[1]
# hidden_size = 16

# Determine the number of tasks
num_classes = dataset.labels.shape[1]
model = GCN(in_feats, hidden_size, num_classes)

def train(model, dataloader, criterion, optimizer):
    model.train()
    total_loss = 0
    for batched_graph, labels in dataloader:
        
        logits = model(batched_graph)
        loss = criterion(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader):
    model.eval()
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for batched_graph, labels in dataloader:
            
            logits = model(batched_graph)
            preds = (logits > 0.5).float()
            total_correct += (preds == labels).sum().item()
            total_samples += labels.numel()
    return total_correct / total_samples

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr)

# Measure the time for training
start_time = time.time()

# Train the model
# num_epochs = 50
for epoch in range(num_epochs):
    loss = train(model, train_loader, criterion, optimizer)
    val_acc = evaluate(model, val_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss:.4f}, Validation Accuracy: {val_acc:.4f}")

end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")

# Measure the time for evaluation on the test set
start_time = time.time()

# Evaluate the model on the test set
test_acc = evaluate(model, test_loader)
print(f"Test Accuracy with GCN w/o parallel: {test_acc:.4f}")

end_time = time.time()
print(f"Evaluation time: {end_time - start_time:.2f} seconds")

{'h': tensor([[1., 0., 0.,  ..., 0., 1., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 1., 0.,  ..., 1., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.]])}
Epoch 1/50, Loss: 0.2245, Validation Accuracy: 0.9297
Epoch 2/50, Loss: 0.2038, Validation Accuracy: 0.9297
Epoch 3/50, Loss: 0.2024, Validation Accuracy: 0.9294
Epoch 4/50, Loss: 0.1987, Validation Accuracy: 0.9304
Epoch 5/50, Loss: 0.1955, Validation Accuracy: 0.9294
Epoch 6/50, Loss: 0.1945, Validation Accuracy: 0.9315
Epoch 7/50, Loss: 0.1908, Validation Accuracy: 0.9295
Epoch 8/50, Loss: 0.1905, Validation Accuracy: 0.9322
Epoch 9/50, Loss: 0.1906, Validation Accuracy: 0.9295
Epoch 10/50, Loss: 0.1891, Validation Accuracy: 0.9305
Epoch 11/50, Loss: 0.1878, Validation Accuracy: 0.9312
Epoch 12/50, Loss: 0.1868, Validation Accuracy: 0.9319
Epoch 13/50, Loss: 0.1868, Validation Accuracy: 0.9307
Epoch 14/50, Loss: 0.1858, Valida

In [11]:
class GAT(nn.Module):
    def __init__(self, in_feats, hidden_size, num_classes):
        super(GAT, self).__init__()
        self.conv1 = GATConv(in_feats, hidden_size, num_heads=2)
        self.conv2 = GATConv(hidden_size * 2, hidden_size, num_heads=2)
        self.classify = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, g):
        h = g.ndata['h'].float()
        h = F.elu(self.conv1(g, h))
        h = h.view(h.size(0), -1)  # Flatten the output of conv1
        h = self.conv2(g, h)
        h = h.view(h.size(0), -1)  # Flatten the output of conv2
        with g.local_scope():
            g.ndata['h'] = h
            hg = dgl.mean_nodes(g, 'h')
        return self.classify(hg)

In [12]:
# GAT

In [13]:
# Check the first graph to get the feature dimension
first_graph = dataset[0][1]
print(first_graph.ndata)
in_feats = first_graph.ndata['h'].shape[1]
# hidden_size = 16

# Determine the number of tasks
num_classes = dataset.labels.shape[1]
model = GAT(in_feats, hidden_size, num_classes)

# Use DataParallel to wrap the model for parallel training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nn.DataParallel(model)
model.to(device)

def train(model, dataloader, criterion, optimizer):
    model.train()
    total_loss = 0
    for batched_graph, labels in dataloader:

        batched_graph = batched_graph.to(device)
        labels = labels.to(device)
        
        logits = model(batched_graph)
        loss = criterion(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader):
    model.eval()
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for batched_graph, labels in dataloader:

            batched_graph = batched_graph.to(device)
            labels = labels.to(device)
            
            logits = model(batched_graph)
            preds = (logits > 0.5).float()
            total_correct += (preds == labels).sum().item()
            total_samples += labels.numel()
    return total_correct / total_samples

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr)

# Measure the time for training
start_time = time.time()

# Train the model
# num_epochs = 50
for epoch in range(num_epochs):
    loss = train(model, train_loader, criterion, optimizer)
    val_acc = evaluate(model, val_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss:.4f}, Validation Accuracy: {val_acc:.4f}")

end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")

# Measure the time for evaluation on the test set
start_time = time.time()

# Evaluate the model on the test set
test_acc = evaluate(model, test_loader)
print(f"Test Accuracy with GAT w/parallel: {test_acc:.4f}")

end_time = time.time()
print(f"Evaluation time: {end_time - start_time:.2f} seconds")


{'h': tensor([[1., 0., 0.,  ..., 0., 1., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 1., 0.,  ..., 1., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.]])}
Epoch 1/50, Loss: 0.2177, Validation Accuracy: 0.9297
Epoch 2/50, Loss: 0.2037, Validation Accuracy: 0.9301
Epoch 3/50, Loss: 0.2007, Validation Accuracy: 0.9298
Epoch 4/50, Loss: 0.2013, Validation Accuracy: 0.9297
Epoch 5/50, Loss: 0.1969, Validation Accuracy: 0.9308
Epoch 6/50, Loss: 0.1981, Validation Accuracy: 0.9300
Epoch 7/50, Loss: 0.1955, Validation Accuracy: 0.9302
Epoch 8/50, Loss: 0.1943, Validation Accuracy: 0.9301
Epoch 9/50, Loss: 0.1925, Validation Accuracy: 0.9317
Epoch 10/50, Loss: 0.1915, Validation Accuracy: 0.9311
Epoch 11/50, Loss: 0.1925, Validation Accuracy: 0.9323
Epoch 12/50, Loss: 0.1926, Validation Accuracy: 0.9339
Epoch 13/50, Loss: 0.1904, Validation Accuracy: 0.9304
Epoch 14/50, Loss: 0.1903, Valida

In [14]:
# Check the first graph to get the feature dimension
first_graph = dataset[0][1]
print(first_graph.ndata)
in_feats = first_graph.ndata['h'].shape[1]
# hidden_size = 16

# Determine the number of tasks
num_classes = dataset.labels.shape[1]
model = GAT(in_feats, hidden_size, num_classes)

# Use DataParallel to wrap the model for parallel training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nn.DataParallel(model)
model.to(device)

def train(model, dataloader, criterion, optimizer):
    model.train()
    total_loss = 0
    for batched_graph, labels in dataloader:
        
        batched_graph = batched_graph.to(device)
        labels = labels.to(device)
        
        logits = model(batched_graph)
        # Debugging statement
        loss = criterion(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader):
    model.eval()
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for batched_graph, labels in dataloader:

            batched_graph = batched_graph.to(device)
            labels = labels.to(device)
            
            logits = model(batched_graph)
            preds = (logits > 0.5).float()
            total_correct += (preds == labels).sum().item()
            total_samples += labels.numel()
    return total_correct / total_samples

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr)

# Measure the time for training
start_time = time.time()

# Train the model
# num_epochs = 100
for epoch in range(num_epochs):
    loss = train(model, train_loader, criterion, optimizer)
    val_acc = evaluate(model, val_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss:.4f}, Validation Accuracy: {val_acc:.4f}")

end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")

# Measure the time for evaluation on the test set
start_time = time.time()

# Evaluate the model on the test set
test_acc = evaluate(model, test_loader)
print(f"Test Accuracy with GAT w/o parallel: {test_acc:.4f}")

end_time = time.time()
print(f"Evaluation time: {end_time - start_time:.2f} seconds")


{'h': tensor([[1., 0., 0.,  ..., 0., 1., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 1., 0.,  ..., 1., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.]])}
Epoch 1/50, Loss: 0.2162, Validation Accuracy: 0.9299
Epoch 2/50, Loss: 0.2040, Validation Accuracy: 0.9300
Epoch 3/50, Loss: 0.2003, Validation Accuracy: 0.9295
Epoch 4/50, Loss: 0.1981, Validation Accuracy: 0.9317
Epoch 5/50, Loss: 0.1962, Validation Accuracy: 0.9301
Epoch 6/50, Loss: 0.1952, Validation Accuracy: 0.9311
Epoch 7/50, Loss: 0.1923, Validation Accuracy: 0.9300
Epoch 8/50, Loss: 0.1937, Validation Accuracy: 0.9295
Epoch 9/50, Loss: 0.1935, Validation Accuracy: 0.9323
Epoch 10/50, Loss: 0.1931, Validation Accuracy: 0.9321
Epoch 11/50, Loss: 0.1901, Validation Accuracy: 0.9310
Epoch 12/50, Loss: 0.1906, Validation Accuracy: 0.9304
Epoch 13/50, Loss: 0.1898, Validation Accuracy: 0.9327
Epoch 14/50, Loss: 0.1906, Valida


KeyboardInterrupt



In [None]:
class GATv2(nn.Module):
    def __init__(self, in_feats, hidden_size, num_classes):
        super(GATv2, self).__init__()
        self.conv1 = GATv2Conv(in_feats, hidden_size, num_heads=2)
        self.conv2 = GATv2Conv(hidden_size * 2, hidden_size, num_heads=2)
        self.classify = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, g):
        h = g.ndata['h'].float()
        h = F.elu(self.conv1(g, h))
        h = h.view(h.size(0), -1)  # Flatten the output of conv1
        h = self.conv2(g, h)
        h = h.view(h.size(0), -1)  # Flatten the output of conv2
        with g.local_scope():
            g.ndata['h'] = h
            hg = dgl.mean_nodes(g, 'h')
        return self.classify(hg)

In [None]:
# Check the first graph to get the feature dimension
first_graph = dataset[0][1]
print(first_graph.ndata)
in_feats = first_graph.ndata['h'].shape[1]
# hidden_size = 16

# Determine the number of tasks
num_classes = dataset.labels.shape[1]
model = GATv2(in_feats, hidden_size, num_classes)

# Use DataParallel to wrap the model for parallel training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nn.DataParallel(model)
model.to(device)

def train(model, dataloader, criterion, optimizer):
    model.train()
    total_loss = 0
    for batched_graph, labels in dataloader:

        batched_graph = batched_graph.to(device)
        labels = labels.to(device)
        
        logits = model(batched_graph)
        # Debugging statement
        loss = criterion(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader):
    model.eval()
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for batched_graph, labels in dataloader:

            batched_graph = batched_graph.to(device)
            labels = labels.to(device)
        
            logits = model(batched_graph)
            preds = (logits > 0.5).float()
            total_correct += (preds == labels).sum().item()
            total_samples += labels.numel()
    return total_correct / total_samples

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr)

# Measure the time for training
start_time = time.time()

# Train the model
# num_epochs = 100
for epoch in range(num_epochs):
    loss = train(model, train_loader, criterion, optimizer)
    val_acc = evaluate(model, val_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss:.4f}, Validation Accuracy: {val_acc:.4f}")

end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")

# Measure the time for evaluation on the test set
start_time = time.time()

# Evaluate the model on the test set
test_acc = evaluate(model, test_loader)
print(f"Test Accuracy with GATv2 with parallel: {test_acc:.4f}")

end_time = time.time()
print(f"Evaluation time: {end_time - start_time:.2f} seconds")


In [None]:
# Check the first graph to get the feature dimension
first_graph = dataset[0][1]
print(first_graph.ndata)
in_feats = first_graph.ndata['h'].shape[1]
# hidden_size = 16

# Determine the number of tasks
num_classes = dataset.labels.shape[1]
model = GATv2(in_feats, hidden_size, num_classes)

def train(model, dataloader, criterion, optimizer):
    model.train()
    total_loss = 0
    for batched_graph, labels in dataloader:

        batched_graph = batched_graph.to(device)
        labels = labels.to(device)
        
        logits = model(batched_graph)
        # Debugging statement
        loss = criterion(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader):
    model.eval()
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for batched_graph, labels in dataloader:

            batched_graph = batched_graph.to(device)
            labels = labels.to(device)
            
            logits = model(batched_graph)
            preds = (logits > 0.5).float()
            total_correct += (preds == labels).sum().item()
            total_samples += labels.numel()
    return total_correct / total_samples

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr)

# Measure the time for training
start_time = time.time()

# Train the model
# num_epochs = 100
for epoch in range(num_epochs):
    loss = train(model, train_loader, criterion, optimizer)
    val_acc = evaluate(model, val_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss:.4f}, Validation Accuracy: {val_acc:.4f}")

end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")

# Measure the time for evaluation on the test set
start_time = time.time()

# Evaluate the model on the test set
test_acc = evaluate(model, test_loader)
print(f"Test Accuracy with GATv2 w/o parallel: {test_acc:.4f}")

end_time = time.time()
print(f"Evaluation time: {end_time - start_time:.2f} seconds")


In [None]:
torch.cuda.is_available()

In [None]:
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, hidden_size, num_classes, aggregator_type):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, hidden_size, aggregator_type)
        self.conv2 = SAGEConv(hidden_size, hidden_size, aggregator_type)
        self.classify = nn.Linear(hidden_size, num_classes)

    def forward(self, g):
        h = g.ndata['h'].float()
        h = F.elu(self.conv1(g, h))
        #h = h.view(h.size(0), -1)  # Flatten the output of conv1
        h = self.conv2(g, h)
        #h = h.view(h.size(0), -1)  # Flatten the output of conv2
        with g.local_scope():
            g.ndata['h'] = h
            hg = dgl.mean_nodes(g, 'h')
        return self.classify(hg)

In [None]:
def train(model, dataloader, criterion, optimizer):
    model.train()
    total_loss = 0
    for batched_graph, labels in dataloader:

        batched_graph = batched_graph.to(device)
        labels = labels.to(device)
        
        logits = model(batched_graph)
        # Debugging statement
        loss = criterion(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader):
    model.eval()
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for batched_graph, labels in dataloader:

            batched_graph = batched_graph.to(device)
            labels = labels.to(device)
            
            logits = model(batched_graph)
            preds = (logits > 0.5).float()
            total_correct += (preds == labels).sum().item()
            total_samples += labels.numel()
    return total_correct / total_samples

In [None]:
# Check the first graph to get the feature dimension
first_graph = dataset[0][1]
print(first_graph.ndata)
in_feats = first_graph.ndata['h'].shape[1]
# hidden_size = 16
aggregator_type = 'mean'

# Determine the number of tasks
num_classes = dataset.labels.shape[1]
model = GraphSAGE(in_feats, hidden_size, num_classes, aggregator_type)

# Use DataParallel to wrap the model for parallel training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nn.DataParallel(model)
model.to(device)



criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr)

# Measure the time for training
start_time = time.time()

# Train the model
# num_epochs = 100
for epoch in range(num_epochs):
    loss = train(model, train_loader, criterion, optimizer)
    val_acc = evaluate(model, val_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss:.4f}, Validation Accuracy: {val_acc:.4f}")

end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")

# Measure the time for evaluation on the test set
start_time = time.time()

# Evaluate the model on the test set
test_acc = evaluate(model, test_loader)
print(f"Test Accuracy with GraphSAGE with mean aggregation: {test_acc:.4f}")

end_time = time.time()
print(f"Evaluation time: {end_time - start_time:.2f} seconds")

In [None]:
# Check the first graph to get the feature dimension
first_graph = dataset[0][1]
print(first_graph.ndata)
in_feats = first_graph.ndata['h'].shape[1]
# hidden_size = 64
aggregator_type = 'pool'

# Determine the number of tasks
num_classes = dataset.labels.shape[1]
model = GraphSAGE(in_feats, hidden_size, num_classes, aggregator_type)

# Use DataParallel to wrap the model for parallel training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nn.DataParallel(model)
model.to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr)

# Measure the time for training
start_time = time.time()

# Train the model
# num_epochs = 100
for epoch in range(num_epochs):
    loss = train(model, train_loader, criterion, optimizer)
    val_acc = evaluate(model, val_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss:.4f}, Validation Accuracy: {val_acc:.4f}")

end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")

# Measure the time for evaluation on the test set
start_time = time.time()

# Evaluate the model on the test set
test_acc = evaluate(model, test_loader)
print(f"Test Accuracy with GraphSAGE with pool aggregation: {test_acc:.4f}")

end_time = time.time()
print(f"Evaluation time: {end_time - start_time:.2f} seconds")


In [None]:
# Check the first graph to get the feature dimension
first_graph = dataset[0][1]
print(first_graph.ndata)
in_feats = first_graph.ndata['h'].shape[1]
# hidden_size = 64
aggregator_type = 'lstm'

# Determine the number of tasks
num_classes = dataset.labels.shape[1]
model = GraphSAGE(in_feats, hidden_size, num_classes, aggregator_type)

# Use DataParallel to wrap the model for parallel training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nn.DataParallel(model)
model.to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr)

# Measure the time for training
start_time = time.time()

# Train the model
# num_epochs = 100
for epoch in range(num_epochs):
    loss = train(model, train_loader, criterion, optimizer)
    val_acc = evaluate(model, val_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss:.4f}, Validation Accuracy: {val_acc:.4f}")

end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")

# Measure the time for evaluation on the test set
start_time = time.time()

# Evaluate the model on the test set
test_acc = evaluate(model, test_loader)
print(f"Test Accuracy with GraphSAGE with lstm aggregation: {test_acc:.4f}")

end_time = time.time()
print(f"Evaluation time: {end_time - start_time:.2f} seconds")

In [None]:
# Check the first graph to get the feature dimension
first_graph = dataset[0][1]
print(first_graph.ndata)
in_feats = first_graph.ndata['h'].shape[1]
# hidden_size = 64
aggregator_type = 'gcn'

# Determine the number of tasks
num_classes = dataset.labels.shape[1]
model = GraphSAGE(in_feats, hidden_size, num_classes, aggregator_type)

# Use DataParallel to wrap the model for parallel training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nn.DataParallel(model)
model.to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr)

# Measure the time for training
start_time = time.time()

# Train the model
# num_epochs = 100
for epoch in range(num_epochs):
    loss = train(model, train_loader, criterion, optimizer)
    val_acc = evaluate(model, val_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss:.4f}, Validation Accuracy: {val_acc:.4f}")

end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")

# Measure the time for evaluation on the test set
start_time = time.time()

# Evaluate the model on the test set
test_acc = evaluate(model, test_loader)
print(f"Test Accuracy with GraphSAGE with gcn aggregation: {test_acc:.4f}")

end_time = time.time()
print(f"Evaluation time: {end_time - start_time:.2f} seconds")

In [None]:
# GINConv
# classdgl.nn.pytorch.conv.GINConv(apply_func=None, aggregator_type='sum', init_eps=0, learn_eps=False, activation=None)

In [None]:
class GIN(nn.Module):
    def __init__(self, in_feats, hidden_size, num_classes):
        super(GIN, self).__init__()
        self.conv1 = GINConv(
            Sequential(Linear(in_feats, hidden_size),
                       BatchNorm1d(hidden_size), ReLU(),
                       Linear(hidden_size, hidden_size), ReLU()))
        self.conv2 = GINConv(
            Sequential(Linear(hidden_size, hidden_size), BatchNorm1d(hidden_size), ReLU(),
                       Linear(hidden_size, hidden_size), ReLU()))

        # self.conv3 = GINConv(
        #     Sequential(Linear(dim_h, dim_h), BatchNorm1d(dim_h), ReLU(),
        #                Linear(dim_h, dim_h), ReLU()))
        
        #self.lin1 = Linear(dim_h*3, dim_h*3)
        self.lin1 = Linear(hidden_size*2, num_classes)
        self.classify = nn.Linear(hidden_size, num_classes)

    def forward(self, g):
        h = g.ndata['h'].float()
        h = F.relu(self.conv1(g, h))
        h = self.conv2(g, h)
        with g.local_scope():
            g.ndata['h'] = h
            hg = dgl.mean_nodes(g, 'h') # try sum_nodes
        return self.classify(hg)

In [None]:
# Check the first graph to get the feature dimension
first_graph = dataset[0][1]
print(first_graph.ndata)
in_feats = first_graph.ndata['h'].shape[1]
# hidden_size = 16

# Determine the number of tasks
num_classes = dataset.labels.shape[1]
model = GIN(in_feats, hidden_size, num_classes)

# Use DataParallel to wrap the model for parallel training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nn.DataParallel(model)
model.to(device)

def train(model, dataloader, criterion, optimizer):
    model.train()
    total_loss = 0
    for batched_graph, labels in dataloader:

        batched_graph = batched_graph.to(device)
        labels = labels.to(device)
        
        logits = model(batched_graph)
        # Debugging statement
        loss = criterion(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader):
    model.eval()
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for batched_graph, labels in dataloader:

            batched_graph = batched_graph.to(device)
            labels = labels.to(device)
            
            logits = model(batched_graph)
            preds = (logits > 0.5).float()
            total_correct += (preds == labels).sum().item()
            total_samples += labels.numel()
    return total_correct / total_samples

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr)

# Measure the time for training
start_time = time.time()

# Train the model
# num_epochs = 100
for epoch in range(num_epochs):
    loss = train(model, train_loader, criterion, optimizer)
    val_acc = evaluate(model, val_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss:.4f}, Validation Accuracy: {val_acc:.4f}")

end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")

# Measure the time for evaluation on the test set
start_time = time.time()

# Evaluate the model on the test set
test_acc = evaluate(model, test_loader)
print(f"Test Accuracy with GIN: {test_acc:.4f}")

end_time = time.time()
print(f"Evaluation time: {end_time - start_time:.2f} seconds")


In [None]:
# Explore GIN with edge features
# GINEConv
# classdgl.nn.pytorch.conv.GINEConv(apply_func=None, init_eps=0, learn_eps=False)

In [None]:
from tox21_recent import moltree_to_dglgraph

In [None]:
def smiles_to_weighted_graph(smiles):
    for graph in train_set:
        smiles = graph[0]
        mol = Chem.MolFromSmiles(smiles)
        # print(smiles)
    
    print(smiles_to_bigraph(smiles, node_featurizer=featurize_atoms, edge_featurizer=featurize_bonds))

    # Generate the molecule's adjacency matrix with bond order 
    G = Chem.GetAdjacencyMatrix(mol, useBO=True)
    
    # Convert the weighted adjacency matrix to a NumPy array
    adjacency_matrix = np.asarray(G)
    
    # Print the weighted adjacency matrix
    print(adjacency_matrix)
    
    # Convert weighted adjacency matrix to NetworkX graph
    G = nx.from_numpy_array(adjacency_matrix)
    

In [None]:
# loader takes away SMILES strings

In [None]:
list(train_loader)

In [None]:
list(train_set)

In [None]:
dir(Chem)

In [None]:
for graph in train_set:
    smiles = graph[0]
    mol = Chem.MolFromSmiles(smiles)
    # print(smiles)
    
    graph = smiles_to_bigraph(smiles, node_featurizer=featurize_atoms, edge_featurizer=featurize_bonds)
    dir(graph)
    # Generate the molecule's adjacency matrix with bond order 
    # G = Chem.GetAdjacencyMatrix(mol, useBO=True)
    
    # # Convert the weighted adjacency matrix to a NumPy array
    # adjacency_matrix = np.asarray(G)
    
    # # Print the weighted adjacency matrix
    # print(adjacency_matrix)
    
    # # Convert weighted adjacency matrix to NetworkX graph
    # G = nx.from_numpy_array(adjacency_matrix)
    # print(G)
    # print(type(G))
    # G = dgl.from_networkx(G)
    # print(G)
    # print(type(G))
    # #nx.draw(G, with_labels=True)
    

In [None]:
smiles = train_set[0][0]
mol = Chem.MolFromSmiles(smiles)
# graph = smiles_to_bigraph(smiles, node_featurizer=featurize_atoms, edge_featurizer=featurize_bonds)

node_featurizer = CanonicalAtomFeaturizer()
edge_featurizer = CanonicalBondFeaturizer()
graph = smiles_to_bigraph(smiles, node_featurizer=node_featurizer, edge_featurizer=edge_featurizer)

print(graph.adjacency_matrix())
print(dir(graph))
# Generate the molecule's adjacency matrix with bond order 
# G = Chem.GetAdjacencyMatrix(mol, useBO=True)

# # Convert the weighted adjacency matrix to a NumPy array
# adjacency_matrix = np.asarray(G)

# # Print the weighted adjacency matrix
# print(adjacency_matrix)
    
    # # Convert weighted adjacency matrix to NetworkX graph
    # G = nx.from_numpy_array(adjacency_matrix)
    # print(G)
    # print(type(G))
    # G = dgl.from_networkx(G)
    # print(G)

In [None]:
dir(dgl)

In [None]:
print("Edge Features (Bond Order):")
print(graph.edata['type'])

In [None]:
# Verify the bond orders match RDKit adjacency matrix with bond order
adj_matrix_rdkit = Chem.GetAdjacencyMatrix(mol, useBO=True)
print("RDKit Adjacency Matrix with Bond Order:")
print(np.asarray(adj_matrix_rdkit))

In [None]:
# Featurize bonds and create the graph
smiles = train_set[0][0]
mol = Chem.MolFromSmiles(smiles)
graph = smiles_to_bigraph(smiles, node_featurizer=featurize_atoms, edge_featurizer=featurize_bonds)

# Print edge features
print(graph.edata['type'])

# Print the adjacency matrix from DGL
print(graph.adjacency_matrix())

# Generate the molecule's adjacency matrix with bond order from RDKit
G = Chem.GetAdjacencyMatrix(mol, useBO=True)

# Convert the weighted adjacency matrix to a NumPy array
adjacency_matrix = np.asarray(G)

# Print the weighted adjacency matrix
print(adjacency_matrix)


In [None]:
train_set[0]

In [None]:
dir(graph)

In [None]:
graph

In [None]:
graph.ntypes

In [None]:
import dgl.dgl_sparse
dgl.__version__
help(dgl.dgl_sparse)

In [None]:
from dgl.dgl_sparse import spmatrix
dir(spmatrix)

In [None]:
# !pip install torch==2.3.0

In [None]:
# !pip show dgl

In [None]:
dir(dgl)

In [None]:
# version that weights graph by index of bond types (Anh's version)

In [None]:
def featurize_atoms(mol):
    feats = []
    for atom in mol.GetAtoms():
        feats.append(atom.GetAtomicNum())
    return {'atomic': torch.tensor(feats).reshape(-1,1).float()}

def featurize_bonds(mol):
    feats = []
    bond_types = [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,  #enumeration value 
                  Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC]
    for bond in mol.GetBonds():
        btype = bond_types.index(bond.GetBondType())
        # One bond between atom u and v corresponds to two edges (u, v) and (v, u). Dataset[0][1] has 17 bonds and 34 edges. 
        feats.extend([btype, btype])
        # print('bond type:', int(bond.GetBondType()))
        # print('b type: ', btype)
    return {'type': torch.tensor(feats).reshape(-1, 1).float()}

In [None]:
# Version that weights graph by bond type integers (test version)

In [None]:
def featurize_bondsm(mol):
    feats = []
    bond_types = [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,  #enumeration value 
                  Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC]
    for bond in mol.GetBonds():
        btype = int(bond.GetBondType())
        # One bond between atom u and v corresponds to two edges (u, v) and (v, u). Dataset[0][1] has 17 bonds and 34 edges. 
        feats.extend([btype, btype])
    return {'type': torch.tensor(feats).reshape(-1, 1).float()}

In [None]:
featurize_bonds(mol)

In [None]:
mol