In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import torch 
from models import NodeClassifier 

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from torch_geometric.data import Data, DataLoader
from torch_geometric.datasets import Planetoid 


# Load the Cora, PubMed and CiteSeer Planetoid datasets (split="public").
# All three are stored under the same root directory.
cora_dataset, pubmed_dataset, citeseer_dataset = (
    Planetoid(root="/home/jik19004/FilesToRun/AdversarialGNN", name=benchmark_name, split="public")
    for benchmark_name in ("Cora", "PubMed", "CiteSeer")
)

In [4]:
# Report the Cora class count, the attribute names stored on the data object,
# and how many nodes each of the three split masks selects.
print("Cora has {} number of classes".format(cora_dataset.num_classes))
print("")
for key, value in cora_dataset.data:
    print(key)

print("\n".join(
    template.format(cora_dataset.data[mask_key].sum())
    for template, mask_key in (
        ("\n\nThe length of cora train mask: {}", "train_mask"),
        ("The length of cora validation mask: {}", "val_mask"),
        ("The length of cora test mask: {}", "test_mask"),
    )
))

Cora has 7 number of classes

x
edge_index
y
train_mask
val_mask
test_mask


The length of cora train mask: 140
The length of cora validation mask: 500
The length of cora test mask: 1000




In [5]:
import torch.nn.functional as F 

# Slice Cora into train / validation / test parts using the masks stored on the
# data object. Labels are expanded to one-hot vectors with 7 classes.
train_mask, val_mask, test_mask = (
    cora_dataset.data[mask_key]
    for mask_key in ("train_mask", "val_mask", "test_mask")
)

# Feature rows selected by each mask.
train_cora_x, val_cora_x, test_cora_x = (
    cora_dataset.data["x"][split_mask]
    for split_mask in (train_mask, val_mask, test_mask)
)

# One-hot label rows selected by each mask.
train_cora_y, val_cora_y, test_cora_y = (
    F.one_hot(cora_dataset.data["y"][split_mask], num_classes=7)
    for split_mask in (train_mask, val_mask, test_mask)
)

In [6]:
from mechanisms import RandomizedResopnse, MultiBit 
from models import NodeClassifier 

# Candidate privacy budgets; only eps[4] (= 1.5) is used in this cell.
eps = [0.1, 0.2, 0.5, 1, 1.5, 2]
# NOTE(review): the second argument (7) is presumably the number of classes --
# confirm against mechanisms.RandomizedResopnse.
random_response = RandomizedResopnse(eps[4],7)
# Print the mechanism's output for the first test label, then the original
# one-hot label for comparison. The first line is random, so it can change
# between runs.
print(random_response(test_cora_y[0]))
print(test_cora_y[0])

# Build a MultiBit mechanism and echo back the eps value it stores.
multibit = MultiBit(eps = 0.1,input_range = (100, 500))
print(multibit.eps)

tensor([0, 0, 0, 1, 0, 0, 0])
tensor([0, 0, 0, 1, 0, 0, 0])
0.1


In [7]:
# Structural summary of the Cora graph, followed by every edge whose source
# node (first row of edge_index) is node 30.
edge_index = cora_dataset.data["edge_index"].numpy()

print("\n".join(
    template.format(statistic)
    for template, statistic in (
        ("The dataset contains loops?: {}", cora_dataset.data.contains_self_loops()),
        ("The dataset contains isolated nodes?: {}", cora_dataset.data.contains_isolated_nodes()),
        ("The dataset is directed?: {}", cora_dataset.data.is_directed()),
        ("Number of edgwes in the dataset: {}", cora_dataset.data.num_edges),
        ("The number of average degree in the nodes: {}",
         cora_dataset.data.num_edges / cora_dataset.data.num_nodes),
    )
))
# A boolean column mask selects the same columns as np.where(...)[0] would.
print(edge_index[:, edge_index[0] == 30])

The dataset contains loops?: False
The dataset contains isolated nodes?: False
The dataset is directed?: False
Number of edgwes in the dataset: 10556
The number of average degree in the nodes: 3.8980797636632203
[[  30   30   30   30   30   30]
 [ 697  738 1358 1416 2162 2343]]




# Preprocessing and Perturbing the Data

In [8]:
# Reload Cora and replace the stored "public" masks with a random split of the
# labeled nodes: 50% train, 25% validation, remainder test.
# NOTE: from here on `cora_dataset` is the underlying data object, not the
# Planetoid dataset wrapper.
cora_dataset = Planetoid(root = "/home/jik19004/FilesToRun/AdversarialGNN", name = "Cora", split = "public")
cora_dataset = cora_dataset.data

# A dedicated, seeded generator makes the split identical after every kernel
# restart without touching the global RNG state.
split_generator = torch.Generator().manual_seed(0)

labeled_nodes = torch.where(cora_dataset.y >= 0)[0]  # Get indices of labeled nodes
num_labeled = labeled_nodes.size(0)
perm = torch.randperm(num_labeled, generator=split_generator)  # Reproducible random permutation of indices

train_split = int(0.5 * num_labeled)
val_split = int(0.25 * num_labeled)

# Consecutive slices of the permutation, so the three index sets are disjoint.
train_idx = labeled_nodes[perm[:train_split]]
val_idx = labeled_nodes[perm[train_split:train_split + val_split]]
test_idx = labeled_nodes[perm[train_split + val_split:]]

# Start from all-False masks and switch on the selected node indices.
cora_dataset.train_mask = torch.zeros(cora_dataset.num_nodes, dtype=torch.bool)
cora_dataset.val_mask = torch.zeros(cora_dataset.num_nodes, dtype=torch.bool)
cora_dataset.test_mask = torch.zeros(cora_dataset.num_nodes, dtype=torch.bool)
cora_dataset.train_mask[train_idx] = True
cora_dataset.val_mask[val_idx] = True
cora_dataset.test_mask[test_idx] = True

In [9]:
from transforms import FeaturePerturbation, LabelPerturbation 
from sklearn.preprocessing import MinMaxScaler 


# Reload Cora, build a reproducible random 50/25/25 split, min-max scale the
# features (scaler fitted on the training rows only), then apply the feature
# and label perturbation transforms.
# NOTE: from here on `cora_dataset` is the underlying data object, not the
# Planetoid dataset wrapper.
cora_dataset = Planetoid(root = "/home/jik19004/FilesToRun/AdversarialGNN", name = "Cora", split = "public")
cora_dataset = cora_dataset.data

#### splitting the data
# A dedicated, seeded generator makes the split identical after every kernel
# restart without touching the global RNG state.
split_generator = torch.Generator().manual_seed(0)

labeled_nodes = torch.where(cora_dataset.y >= 0)[0]  # Get indices of labeled nodes
num_labeled = labeled_nodes.size(0)
perm = torch.randperm(num_labeled, generator=split_generator)  # Reproducible random permutation of indices

train_split = int(0.5 * num_labeled)
val_split = int(0.25 * num_labeled)

# Consecutive slices of the permutation, so the three index sets are disjoint.
train_idx = labeled_nodes[perm[:train_split]]
val_idx = labeled_nodes[perm[train_split:train_split + val_split]]
test_idx = labeled_nodes[perm[train_split + val_split:]]

cora_dataset.train_mask = torch.zeros(cora_dataset.num_nodes, dtype=torch.bool)
cora_dataset.val_mask = torch.zeros(cora_dataset.num_nodes, dtype=torch.bool)
cora_dataset.test_mask = torch.zeros(cora_dataset.num_nodes, dtype=torch.bool)
cora_dataset.train_mask[train_idx] = True
cora_dataset.val_mask[val_idx] = True
cora_dataset.test_mask[test_idx] = True ### modifying our train, val, and test partitions.

#### scaling the features
scaler = MinMaxScaler(feature_range=(0,1))
data = cora_dataset.x
train_mask = cora_dataset.train_mask
val_mask = cora_dataset.val_mask
test_mask = cora_dataset.test_mask

# Sanity check: the three masks together cover every labeled node exactly once.
assert int(train_mask.sum() + val_mask.sum() + test_mask.sum()) == num_labeled
assert int((train_mask | val_mask | test_mask).sum()) == num_labeled

train_data = data[train_mask].numpy()
val_data = data[val_mask].numpy()
test_data = data[test_mask].numpy()

# Fit on the training rows only, so validation/test statistics do not leak
# into the scaling parameters.
scaler.fit(train_data)
train_data = torch.Tensor(scaler.transform(train_data)) # Rescale our data into a fixed range.
val_data = torch.Tensor(scaler.transform(val_data))
test_data = torch.Tensor(scaler.transform(test_data))

# Write the scaled rows back in place; `data` is the same tensor as cora_dataset.x.
data[train_mask] = train_data
data[val_mask] = val_data
data[test_mask] = test_data
cora_dataset.x = data

#### perturbing features and labels
feature_preprocess = FeaturePerturbation(mechanism = "mbm", x_eps = 0.1)
cora_transform = feature_preprocess(cora_dataset)
cora_transform = LabelPerturbation(y_eps = 2)(cora_transform)
print(cora_transform)

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708, 7], train_mask=[2708], val_mask=[2708], test_mask=[2708], T=[7, 7])




# Regular GCN model

In [10]:
from torch_geometric.nn import GCNConv 

class GCN(torch.nn.Module):
    """Two-layer graph convolutional network.

    Pipeline: GCNConv -> ReLU -> dropout (p=0.5) -> GCNConv -> log-softmax
    over dimension 1.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        """Create the two convolution layers and the dropout module.

        Args:
            in_channels: Size of each input node feature vector.
            hidden_channels: Output size of the first convolution.
            out_channels: Output size of the second convolution.
        """
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
        self.dropout = torch.nn.Dropout(p=0.5)

    def forward(self, x, edge_index):
        """Return log-softmax scores (along dim 1) for the given node features and edges."""
        hidden = self.dropout(F.relu(self.conv1(x, edge_index)))
        scores = self.conv2(hidden, edge_index)
        return F.log_softmax(scores, dim=1)

In [11]:
import torch 
import math 
def Train_and_Evaluate(data, num_epochs, num_callbacks, criterion, device, y_eps = 2, num_classes = 7): 
    """Train a ``NodeClassifier`` (GCN backbone) on ``data`` with early stopping.

    The model computes its own losses through ``training_step`` and
    ``validation_step``. Whenever the checkpoint condition below holds, the
    whole model object is written with ``torch.save`` to a fixed path.

    Args:
        data: Graph data object exposing ``x`` and ``to(device)``, and accepted
            by the model's ``training_step`` / ``validation_step``.
        num_epochs: Maximum number of training epochs.
        num_callbacks: Patience, i.e. how many consecutive epochs without a new
            checkpoint are allowed before training stops.
        criterion: Unused; kept so existing callers keep working.
        device: Device the model and data are moved to.
        y_eps: Label privacy budget used to compute the accuracy ceiling.
        num_classes: Number of classes; used for both the model output size
            and the accuracy ceiling.

    Returns:
        None.
    """
    # Model dimensions follow the data and the ``num_classes`` argument rather
    # than hard-coded Cora constants (1433 features, 7 classes).
    # TODO: try a very high privacy budget and tune the hyper-parameters.
    # TODO: try different train / validation / test split proportions.
    model = NodeClassifier(input_dim = data.x.size(1), model = "gcn", hidden_dim = 16, num_classes = num_classes, dropout = 0.01,
                           x_steps = 4, y_steps = 2, forward_correction = True)

    # Move everything to the target device before building the optimizer.
    model = model.to(device)
    data = data.to(device)
    optimizer = torch.optim.Adam(params = model.parameters(), lr = 0.001, weight_decay = 0.001)

    best_val_loss = np.inf
    # Accuracy ceiling in percent: e^eps / (e^eps + k - 1) * 100.
    # NOTE(review): presumably the probability that randomized response keeps a
    # label unchanged -- confirm against the label mechanism.
    limit_val_accuracy = math.exp(y_eps) / (math.exp(y_eps) + num_classes - 1) * 100

    callback = 0  # consecutive epochs without a new checkpoint
    for i in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        training_loss, train_metrics = model.training_step(data)
        training_loss.backward()
        optimizer.step()  # have the optimizer update the weights.
        train_loss_val = training_loss.item()
        train_accuracy = train_metrics["train/acc"]

        with torch.no_grad():
            model.eval()
            val_loss, val_metrics = model.validation_step(data)
            val_accuracy = val_metrics["val/acc"]

        if (i % 10 == 0):
            print("Training loss at epoch {}: {}. Validation loss: {}".format(i, train_loss_val, val_loss))

        # Checkpoint only when the validation loss improves AND both accuracies
        # stay at or below the ceiling computed above.
        if val_loss < best_val_loss and val_accuracy <= limit_val_accuracy and train_accuracy <= limit_val_accuracy:
            best_val_loss = val_loss
            torch.save(model, "/home/jik19004/FilesToRun/AdversarialGNN/LPGNN")
            callback = 0
        else:
            callback += 1
            if callback >= num_callbacks:
                break
def Evaluate(model, data, device):
    """Print the test accuracy reported by ``model.validation_step(data)``.

    The model is switched to evaluation mode and gradient tracking is disabled
    for the duration of the call, so layers such as dropout are inactive and no
    autograd graph is built. The model's previous train/eval mode is restored
    afterwards.

    Args:
        model: Module exposing ``validation_step(data)`` that returns a
            ``(loss, metrics)`` pair whose metrics contain ``"test/acc"``.
        data: Graph data object with a ``to(device)`` method.
        device: Device the model and data are moved to.

    Returns:
        None. The accuracy is printed.
    """
    model = model.to(device)
    data = data.to(device)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            _, metrics = model.validation_step(data)
    finally:
        # Leave the caller's model in the mode it arrived in.
        model.train(was_training)
    print(metrics["test/acc"])    

In [12]:
import torch 
def Train_and_Evaluate_normal(model, data, num_epochs, num_callbacks, criterion, device): 
    """Full-batch training of ``model`` on ``data`` with early stopping.

    Each epoch runs one optimisation step on the training nodes and then one
    gradient-free pass on the validation nodes. The whole model object is saved
    with ``torch.save`` to a fixed path whenever both the validation loss and
    the training loss fall below the best validation loss recorded so far.

    Args:
        model: Module called as ``model(data.x, data.edge_index)``.
        data: Graph data object exposing ``x``, ``edge_index``, ``y``,
            ``train_mask``, ``val_mask`` and ``to(device)``.
        num_epochs: Maximum number of epochs.
        num_callbacks: Number of consecutive epochs without a checkpoint after
            which training stops.
        criterion: Zero-argument callable that returns the loss function.
        device: Device the model, data and targets are moved to.

    Returns:
        None.
    """
    # Masks are captured before the data is moved to ``device``.
    train_mask, val_mask = data.train_mask, data.val_mask

    optimizer = torch.optim.Adam(params = model.parameters())
    train_targets = data.y[train_mask].float()
    val_targets = data.y[val_mask].float()
    loss_fn = criterion()

    train_history, val_history = [], []

    model = model.to(device)
    data = data.to(device)
    train_targets = train_targets.to(device)
    val_targets = val_targets.to(device)

    stale_epochs = 0        # consecutive epochs without a checkpoint
    best_val = np.inf       # best validation loss seen so far
    for epoch in range(num_epochs):
        # ---- optimisation step on the training nodes ----
        model.train()
        train_out = model(data.x, data.edge_index)[train_mask]
        optimizer.zero_grad()
        train_loss = loss_fn(train_out, train_targets)
        train_loss.backward()
        optimizer.step()
        train_value = train_loss.item()
        train_history.append(train_value)

        # ---- gradient-free pass on the validation nodes ----
        with torch.no_grad():
            model.eval()
            val_out = model(data.x, data.edge_index)[val_mask]
            val_value = loss_fn(val_out, val_targets).item()
            val_history.append(val_value)

        if epoch % 10 == 0:
            print("Training loss at epoch {}: {}. Validation loss: {}".format(epoch, train_value, val_value))

        improved = val_value < best_val and train_value < best_val
        if not improved:
            stale_epochs += 1
            if stale_epochs >= num_callbacks:
                break
            continue

        # New best: remember it, checkpoint the model, reset the patience counter.
        best_val = val_value
        torch.save(model, "/home/jik19004/FilesToRun/AdversarialGNN/LPGNN_normal")
        stale_epochs = 0
def Evaluate_normal(model, data, loss, device):
    """Compute the test loss and accuracy of ``model`` on ``data``.

    The forward pass runs in evaluation mode with gradient tracking disabled,
    so dropout is inactive and the result is deterministic. The model's
    previous train/eval mode is restored afterwards.

    Args:
        model: Module called as ``model(data.x, data.edge_index)``.
        data: Graph data object exposing ``x``, ``edge_index``, ``y``,
            ``test_mask`` and ``to(device)``. ``y`` must be 2-D, since the true
            class is taken as ``argmax`` over dim 1.
        loss: Zero-argument callable that returns the loss function.
        device: Device the model and data are moved to.

    Returns:
        Tuple ``(test_loss, rate)``: the loss tensor on the test nodes and the
        fraction of test nodes whose predicted class matches the true class.
    """
    model = model.to(device)
    data = data.to(device)

    test_mask = data.test_mask
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            prediction = model(data.x, data.edge_index)
    finally:
        # Leave the caller's model in the mode it arrived in.
        model.train(was_training)

    test_preds = prediction[test_mask]
    test_output = data.y[test_mask]
    test_numeric = test_output.argmax(dim = 1)
    _, test_preds_max = test_preds.max(dim = 1)
    correct = (test_preds_max == test_numeric).sum().item()  # Count correct predictions

    rate = correct/len(test_output)
    LossFunc = loss()
    # Targets are cast to float, matching how Train_and_Evaluate_normal feeds the loss.
    return LossFunc(test_preds, test_output.float()), rate
    

In [86]:
# Unperturbed Cora data and a plain GCN baseline. Neither is used by the
# training call below, which trains on the perturbed `cora_transform`.
regular_cora = Planetoid(root = "/home/jik19004/FilesToRun/AdversarialGNN", name = "Cora", split = "public")
regular_cora = regular_cora.data 
regular_model = GCN(in_channels = 1433, hidden_channels = 16, out_channels=7) 

# Up to 500 epochs; patience of np.inf disables early stopping.
# Use the first CUDA device when present, otherwise fall back to the CPU
# instead of failing on a machine without a GPU.
Train_and_Evaluate(cora_transform, 500, np.inf, torch.nn.L1Loss,
                   torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))

Training loss at epoch 0: 1.8882899284362793. Validation loss: 1.9425396919250488
Training loss at epoch 10: 1.7497422695159912. Validation loss: 1.9324451684951782
Training loss at epoch 20: 1.6503822803497314. Validation loss: 1.9229793548583984
Training loss at epoch 30: 1.566489338874817. Validation loss: 1.9139456748962402
Training loss at epoch 40: 1.4923007488250732. Validation loss: 1.904600977897644
Training loss at epoch 50: 1.424039363861084. Validation loss: 1.8946433067321777
Training loss at epoch 60: 1.3640096187591553. Validation loss: 1.8845158815383911
Training loss at epoch 70: 1.3092103004455566. Validation loss: 1.8745923042297363
Training loss at epoch 80: 1.2603241205215454. Validation loss: 1.8650163412094116
Training loss at epoch 90: 1.2161146402359009. Validation loss: 1.8557467460632324
Training loss at epoch 100: 1.1771162748336792. Validation loss: 1.8467885255813599
Training loss at epoch 110: 1.1407994031906128. Validation loss: 1.838183045387268
Trainin

Finding $K_{x}$ and $K_{y}$ with GraphSAGE, together with the optimal weight decay and dropout rate.

In [87]:
# Use the first CUDA device when present, otherwise fall back to the CPU.
eval_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# NOTE: torch.load unpickles a whole model object here, which can execute
# arbitrary code -- only load checkpoints you created yourself.
# map_location lets a checkpoint saved from a GPU run be restored on whatever
# device is available.
model = torch.load("/home/jik19004/FilesToRun/AdversarialGNN/LPGNN", map_location = eval_device)
Evaluate(model, cora_transform, eval_device)

67.79911373707533
