# CPSC 583 Final Project

### Import Libraries

In [None]:
!pip install torch_geometric
!pip install rdkit

In [None]:
import os
import torch

from torch_geometric.nn import GCNConv, SAGEConv, GATConv, global_mean_pool
import torch.nn.functional as F
from torch.nn import Linear
import numpy as np
from torch_geometric.datasets import MoleculeNet
from torch_geometric.datasets import QM9
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from torch_geometric.loader import DataLoader

### 1. Load & Preprocess the Data

In [None]:
# Load the Tox21 dataset
Tox21 = MoleculeNet('MoleculeNet', "Tox21")
# Load the QM9 dataset.
# The dataset class is imported under an alias because this cell rebinds the
# name `QM9` to the dataset *instance*. Calling `QM9(...)` directly works only
# once: on a second execution of the cell `QM9` is no longer the class and the
# call fails with "object is not callable".
from torch_geometric.datasets import QM9 as QM9Dataset
QM9 = QM9Dataset(root='data/QM9')

# Preprocess the data
def preprocess_data(dataset, train_frac=0.7, val_frac=0.15):
    """Split a dataset into contiguous train / validation / test lists.

    Samples keep their existing order (nothing is shuffled here): the first
    ``train_frac`` of them become the training split, the next ``val_frac``
    the validation split, and everything that remains the test split.

    Args:
        dataset: Any iterable of samples; it is materialized into a list.
        train_frac: Fraction of the samples assigned to the training split.
        val_frac: Fraction of the samples assigned to the validation split.

    Returns:
        A ``(train_data, val_data, test_data)`` tuple of lists.

    Raises:
        ValueError: If a fraction is negative or the two fractions add up to
            more than 1.
    """
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1:
        raise ValueError(
            "train_frac and val_frac must be non-negative and sum to at most 1, "
            f"got train_frac={train_frac}, val_frac={val_frac}"
        )

    data_list = list(dataset)
    total = len(data_list)

    # Compute both cut points once so adjacent splits always share a boundary.
    train_end = int(total * train_frac)
    val_end = int(total * (train_frac + val_frac))

    train_data = data_list[:train_end]
    val_data = data_list[train_end:val_end]
    test_data = data_list[val_end:]

    return train_data, val_data, test_data

def _make_loaders(train_split, val_split, test_split, batch_size=64):
    """Wrap three splits in DataLoaders; only the training loader shuffles."""
    return (
        DataLoader(train_split, batch_size=batch_size, shuffle=True),
        DataLoader(val_split, batch_size=batch_size, shuffle=False),
        DataLoader(test_split, batch_size=batch_size, shuffle=False),
    )


# Tox21: split the dataset, then build one loader per split.
Tox21_train, Tox21_val, Tox21_test = preprocess_data(Tox21)
Tox21_train_loader, Tox21_val_loader, Tox21_test_loader = _make_loaders(
    Tox21_train, Tox21_val, Tox21_test
)

# QM9: same procedure.
QM9_train, QM9_val, QM9_test = preprocess_data(QM9)
QM9_train_loader, QM9_val_loader, QM9_test_loader = _make_loaders(
    QM9_train, QM9_val, QM9_test
)

### 2. GNN

In [None]:
class GNN(torch.nn.Module):
    """Graph-level model: three graph convolutions, pooling, and a linear head.

    The layers are two ``GCNConv`` layers followed by one ``SAGEConv`` layer,
    each followed by ReLU. Node features are then pooled with
    ``global_mean_pool`` and projected to ``out_channels`` by a linear layer.
    No activation is applied to the final output.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        """Create the layers.

        Args:
            in_channels: Size of each input node feature vector.
            hidden_channels: Width shared by all three convolution layers.
            out_channels: Size of the output produced by the linear head.
        """
        super().__init__()

        width = hidden_channels
        self.conv1 = GCNConv(in_channels, width)
        self.conv2 = GCNConv(width, width)
        self.conv3 = SAGEConv(width, width)
        self.linear = Linear(width, out_channels)
        self.relu = torch.nn.ReLU()

    def forward(self, x, edge_index, batch):
        """Run the convolutions, pool with ``batch``, and apply the head.

        Args:
            x: Node feature tensor passed to the first convolution.
            edge_index: Edge index tensor passed to every convolution.
            batch: Assignment vector handed to ``global_mean_pool``.

        Returns:
            The output of the linear head applied to the pooled features.
        """
        hidden = x
        for conv in (self.conv1, self.conv2, self.conv3):
            hidden = self.relu(conv(hidden, edge_index))

        pooled = global_mean_pool(hidden, batch)
        return self.linear(pooled)

class Discriminator(torch.nn.Module):
    """Three-layer fully connected network with one sigmoid output per row.

    ReLU follows the first two linear layers; the third maps to a single
    value that is passed through a sigmoid.
    """

    def __init__(self, in_channels, hidden_channels):
        """Create the layers.

        Args:
            in_channels: Size of each input vector.
            hidden_channels: Width of the two hidden layers.
        """
        super().__init__()

        width = hidden_channels
        self.fc1 = Linear(in_channels, width)
        self.fc2 = Linear(width, width)
        self.fc3 = Linear(width, 1)
        self.relu = torch.nn.ReLU()
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid score computed for each row of ``x``."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.sigmoid(self.fc3(hidden))

### 3. Training

In [None]:
def train_gnn(model, loader, optimizer, criterion):
    """Train ``model`` for one pass over ``loader``.

    Target entries that are NaN are excluded from the loss. Without this a
    single NaN target makes the loss NaN, and the following optimizer step
    writes NaN into every weight.

    Args:
        model: Module called as ``model(x, edge_index, batch)``.
        loader: Iterable of batches exposing ``x``, ``edge_index``, ``batch``
            and ``y``.
        optimizer: Optimizer that is stepped once per contributing batch.
        criterion: Loss called as ``criterion(prediction, target)``.

    Returns:
        The mean loss over the batches that produced an update, or ``None``
        when no batch did (empty loader, or every target was NaN).
    """
    model.train()
    total_loss = 0.0
    num_updates = 0

    for data in loader:
        out = model(data.x.float(), data.edge_index, data.batch)
        target = data.y

        # Masking flattens both tensors, so it is applied only when NaNs are
        # actually present and the shapes line up element for element.
        if torch.is_floating_point(target) and out.shape == target.shape:
            valid = ~torch.isnan(target)
            if not bool(valid.all()):
                if not bool(valid.any()):
                    # Nothing in this batch is labelled; there is no loss to take.
                    continue
                out = out[valid]
                target = target[valid]

        optimizer.zero_grad()
        loss = criterion(out, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_updates += 1

    return total_loss / num_updates if num_updates else None

def train_discriminator(model, discriminator, loader, optimizer, criterion):
    """Run one pass of discriminator training over ``loader``.

    For each batch the model output (computed under ``torch.no_grad()``) is
    given the target 0, and standard-normal noise of the same shape is given
    the target 1. The two losses are summed before the optimizer step.

    Args:
        model: Module called as ``model(x, edge_index, batch)``; it is put in
            eval mode.
        discriminator: Module scoring a batch of vectors; it is put in train
            mode.
        loader: Iterable of batches exposing ``x``, ``edge_index`` and
            ``batch``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(prediction, target)``.
    """
    model.eval()
    discriminator.train()

    for batch in loader:
        optimizer.zero_grad()

        with torch.no_grad():
            generated = model(batch.x.float(), batch.edge_index, batch.batch)
        generated = generated.detach()
        noise = torch.randn_like(generated)

        score_generated = discriminator(generated)
        score_noise = discriminator(noise)

        loss_generated = criterion(score_generated, torch.zeros_like(score_generated))
        loss_noise = criterion(score_noise, torch.ones_like(score_noise))
        total = loss_generated + loss_noise

        total.backward()
        optimizer.step()

### 4. Evaluation

In [None]:
def evaluate(model, loader, criterion):
    """Compute accuracy and mean per-task ROC AUC of ``model`` on ``loader``.

    The model output is treated as logits: a sigmoid turns it into
    probabilities, which are thresholded at 0.5 for accuracy and used directly
    as scores for ROC AUC. Label entries that are NaN are left out of both
    metrics instead of being counted as negatives.

    Args:
        model: Module called as ``model(x, edge_index, batch)``.
        loader: Iterable of batches exposing ``x``, ``edge_index``, ``batch``
            and ``y``.
        criterion: Unused; kept so existing call sites continue to work.

    Returns:
        ``(accuracy, roc_auc)``. ``accuracy`` is taken over every labelled
        (sample, task) entry. ``roc_auc`` is the mean of the ROC AUC of each
        task that has both classes among its labelled samples. Either value
        is NaN when it cannot be computed.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    model.eval()
    label_chunks = []
    prob_chunks = []
    with torch.no_grad():
        for data in loader:
            logits = model(data.x.float(), data.edge_index, data.batch)
            # A NaN logit becomes 0, i.e. probability 0.5, which the strict
            # `> 0.5` threshold below maps to the negative class.
            logits = torch.where(torch.isnan(logits), torch.zeros_like(logits), logits)
            prob_chunks.append(torch.sigmoid(logits).cpu().numpy())
            label_chunks.append(data.y.cpu().numpy())

    if not label_chunks:
        raise ValueError("evaluate() received a loader with no batches")

    y_true = np.concatenate(label_chunks)
    y_prob = np.concatenate(prob_chunks)

    # Work on (samples, tasks) arrays so the single-task case needs no special path.
    y_true = y_true.reshape(len(y_true), -1)
    y_prob = y_prob.reshape(len(y_prob), -1)

    labelled = ~np.isnan(y_true)
    y_pred = (y_prob > 0.5).astype(int)

    if labelled.any():
        accuracy = accuracy_score(y_true[labelled], y_pred[labelled])
    else:
        accuracy = float('nan')

    task_aucs = []
    for task in range(y_true.shape[1]):
        keep = labelled[:, task]
        task_labels = y_true[keep, task]
        # ROC AUC is undefined unless both classes are present.
        if np.unique(task_labels).size < 2:
            continue
        task_aucs.append(roc_auc_score(task_labels, y_prob[keep, task]))

    roc_auc = float(np.mean(task_aucs)) if task_aucs else float('nan')
    return accuracy, roc_auc

### 5. Training Loop - Tox21

In [None]:
# Hyper-parameters for the Tox21 experiment.
lr = 0.001
epochs = 50

# Loss functions: the GNN is trained on raw outputs (BCE-with-logits), the
# discriminator on its sigmoid outputs (plain BCE).
criterion_gnn = torch.nn.BCEWithLogitsLoss()
criterion_discriminator = torch.nn.BCELoss()

# Predictor and its optimizer.
gnn = GNN(
    in_channels=Tox21.num_features,
    hidden_channels=64,
    out_channels=Tox21.num_classes,
)
optimizer_gnn = torch.optim.Adam(gnn.parameters(), lr=lr)

# Discriminator, fed with the predictor's outputs, and its optimizer.
discriminator = Discriminator(in_channels=Tox21.num_classes, hidden_channels=64)
optimizer_discriminator = torch.optim.Adam(discriminator.parameters(), lr=lr)

for epoch in range(epochs):
    # One pass of each training routine, then validation metrics.
    train_gnn(gnn, Tox21_train_loader, optimizer_gnn, criterion_gnn)
    train_discriminator(
        gnn,
        discriminator,
        Tox21_train_loader,
        optimizer_discriminator,
        criterion_discriminator,
    )

    accuracy, roc_auc = evaluate(gnn, Tox21_val_loader, criterion_gnn)
    print(f'Epoch: {epoch + 1}, Accuracy: {accuracy}, ROC AUC: {roc_auc}')

In [None]:
# Repeat the Tox21 experiment with freshly initialized models and report the
# test metrics as mean and standard error over the runs.
# `lr`, `epochs`, `criterion_gnn` and `criterion_discriminator` are the values
# set by the preceding training cell.
num_runs = 3
results = []

for run in range(num_runs):
    # New model and optimizer state for every run so runs are independent.
    gnn = GNN(in_channels=Tox21.num_features, hidden_channels=64, out_channels=Tox21.num_classes)
    optimizer_gnn = torch.optim.Adam(gnn.parameters(), lr=lr)

    discriminator = Discriminator(in_channels=Tox21.num_classes, hidden_channels=64)
    optimizer_discriminator = torch.optim.Adam(discriminator.parameters(), lr=lr)

    for epoch in range(epochs):
        train_gnn(gnn, Tox21_train_loader, optimizer_gnn, criterion_gnn)
        train_discriminator(gnn, discriminator, Tox21_train_loader, optimizer_discriminator, criterion_discriminator)

    # The test split is evaluated once per run, after training has finished.
    accuracy, roc_auc = evaluate(gnn, Tox21_test_loader, criterion_gnn)
    results.append((accuracy, roc_auc))

mean_results = np.mean(results, axis=0)
# The standard error of the mean is the *sample* standard deviation (ddof=1)
# divided by sqrt(n); ddof=0 underestimates it, noticeably so for 3 runs.
# With a single run the sample deviation is undefined, so fall back to ddof=0.
std_error_results = np.std(results, axis=0, ddof=1 if num_runs > 1 else 0) / np.sqrt(num_runs)

print(f'Mean Results: Accuracy={mean_results[0]}, ROC AUC={mean_results[1]}')
print(f'Std Error Results: Accuracy={std_error_results[0]}, ROC AUC={std_error_results[1]}')

### 6. Training Loop - QM9

In [None]:
lr = 0.001
epochs = 50


def evaluate_regression(model, loader):
    """Return ``(MAE, RMSE)`` of ``model`` over every target entry in ``loader``.

    Both errors are accumulated element-wise over all batches and all output
    columns, then divided by the total number of entries.

    Raises:
        ValueError: If ``loader`` yields no target entries.
    """
    model.eval()
    abs_error = 0.0
    squared_error = 0.0
    count = 0
    with torch.no_grad():
        for data in loader:
            out = model(data.x.float(), data.edge_index, data.batch)
            diff = out - data.y
            abs_error += diff.abs().sum().item()
            squared_error += diff.pow(2).sum().item()
            count += diff.numel()
    if count == 0:
        raise ValueError("evaluate_regression() received a loader with no targets")
    return abs_error / count, (squared_error / count) ** 0.5


gnn = GNN(in_channels=QM9.num_features, hidden_channels=64, out_channels=QM9.num_classes)
optimizer_gnn = torch.optim.Adam(gnn.parameters(), lr=lr)

discriminator = Discriminator(in_channels=QM9.num_classes, hidden_channels=64)
optimizer_discriminator = torch.optim.Adam(discriminator.parameters(), lr=lr)

# QM9 targets are continuous molecular properties, so this is a regression
# task: binary cross-entropy is not a valid loss for them, and accuracy /
# ROC AUC reject continuous targets. Train with mean squared error and report
# MAE / RMSE instead.
# NOTE(review): the targets presumably differ in unit and scale; consider
# standardizing them per target before training -- confirm against the
# dataset documentation.
criterion_gnn = torch.nn.MSELoss()
criterion_discriminator = torch.nn.BCELoss()

for epoch in range(epochs):
    train_gnn(gnn, QM9_train_loader, optimizer_gnn, criterion_gnn)
    train_discriminator(gnn, discriminator, QM9_train_loader, optimizer_discriminator, criterion_discriminator)

    mae, rmse = evaluate_regression(gnn, QM9_val_loader)
    print(f'Epoch: {epoch + 1}, MAE: {mae}, RMSE: {rmse}')

In [None]:
# Repeat the QM9 experiment with freshly initialized models and report the
# test errors as mean and standard error over the runs.
# `lr`, `epochs` and `criterion_discriminator` are the values set by an
# earlier training cell.
num_runs = 3
results = []

# QM9 targets are continuous, so the predictor is trained with a regression
# loss and scored with MAE / RMSE. Classification metrics such as accuracy
# and ROC AUC reject continuous targets.
criterion_qm9 = torch.nn.MSELoss()


def evaluate_regression(model, loader):
    """Return ``(MAE, RMSE)`` of ``model`` over every target entry in ``loader``.

    Defined in this cell so the cell does not rely on an evaluation helper
    from anywhere else in the notebook.

    Raises:
        ValueError: If ``loader`` yields no target entries.
    """
    model.eval()
    abs_error = 0.0
    squared_error = 0.0
    count = 0
    with torch.no_grad():
        for data in loader:
            out = model(data.x.float(), data.edge_index, data.batch)
            diff = out - data.y
            abs_error += diff.abs().sum().item()
            squared_error += diff.pow(2).sum().item()
            count += diff.numel()
    if count == 0:
        raise ValueError("evaluate_regression() received a loader with no targets")
    return abs_error / count, (squared_error / count) ** 0.5


for run in range(num_runs):
    # New model and optimizer state for every run so runs are independent.
    gnn = GNN(in_channels=QM9.num_features, hidden_channels=64, out_channels=QM9.num_classes)
    optimizer_gnn = torch.optim.Adam(gnn.parameters(), lr=lr)

    discriminator = Discriminator(in_channels=QM9.num_classes, hidden_channels=64)
    optimizer_discriminator = torch.optim.Adam(discriminator.parameters(), lr=lr)

    for epoch in range(epochs):
        train_gnn(gnn, QM9_train_loader, optimizer_gnn, criterion_qm9)
        train_discriminator(gnn, discriminator, QM9_train_loader, optimizer_discriminator, criterion_discriminator)

    # The test split is evaluated once per run, after training has finished.
    mae, rmse = evaluate_regression(gnn, QM9_test_loader)
    results.append((mae, rmse))

mean_results = np.mean(results, axis=0)
# The standard error of the mean is the *sample* standard deviation (ddof=1)
# divided by sqrt(n); with a single run fall back to ddof=0.
std_error_results = np.std(results, axis=0, ddof=1 if num_runs > 1 else 0) / np.sqrt(num_runs)

print(f'Mean Results: MAE={mean_results[0]}, RMSE={mean_results[1]}')
print(f'Std Error Results: MAE={std_error_results[0]}, RMSE={std_error_results[1]}')