In [3]:
import networkx as nx
import numpy as np
import pandas as pd
import math
import pyflagser
import statistics
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import time

from sklearn.model_selection import train_test_split

In [50]:
# Define the file path structure
file_path_template = "Features/{datasetname}{attribute}_degcent_subm2.csv"

# Function to load dataset by name
def load_dataset(datasetname):
    # Construct the file path
    b0_path = file_path_template.format(datasetname=datasetname,attribute='B0')
    b1_path = file_path_template.format(datasetname=datasetname,attribute='B1')
    c0_path = file_path_template.format(datasetname=datasetname,attribute='c0')
    c1_path = file_path_template.format(datasetname=datasetname,attribute='c1')
    
    try:
        # Load the CSV file into a DataFrame
        b0_data = pd.read_csv(b0_path)
        b1_data = pd.read_csv(b1_path)
        c0_data = pd.read_csv(c0_path)
        c1_data = pd.read_csv(c1_path)
        b1_data = b1_data.drop(columns=['Unnamed: 0'])
        b0_data = b0_data.drop(columns=['Unnamed: 0'])
        c1_data = c1_data.drop(columns=['Unnamed: 0'])
        c0_data = c0_data.drop(columns=['Unnamed: 0'])
        b0_array = b0_data.to_numpy()
        b1_array = b1_data.to_numpy()
        c0_array = c0_data.to_numpy()
        c1_array = c1_data.to_numpy()

        # Combine the arrays into a 3D array
        combined_array = np.stack((b0_array, b1_array,c0_array,c1_array), axis=-1)
        print(f"Successfully loaded dataset: {datasetname} features")
        return combined_array
    except FileNotFoundError:
        print(f"Error: File for dataset '{datasetname}' not found.")
        return None
def load_label(dataset):
    if dataset=='PROTEINS':
        url='https://raw.githubusercontent.com/AstritTola/TopER/refs/heads/main/Datasets/PROTEINS/PROTEINS_graph_labels.txt'
        graph_label=np.loadtxt(url)
        max_value = np.max(graph_label)
        graph_label[graph_label == max_value] = 0 #start graph label with 0
    elif dataset=='BZR':
        url='https://raw.githubusercontent.com/AstritTola/TopER/refs/heads/main/Datasets/BZR/BZR_graph_labels.txt'
        graph_label=np.loadtxt(url)
        min_value = np.min(graph_label)
        graph_label[graph_label == min_value] = 0 #start graph label with 0
    elif dataset=='COX2':
        url='https://raw.githubusercontent.com/AstritTola/TopER/refs/heads/main/Datasets/COX2/COX2_graph_labels.txt'
        graph_label=np.loadtxt(url)
        min_value = np.min(graph_label)
        graph_label[graph_label == min_value] = 0 #start graph label with 0
    elif dataset=='MUTAG':
        url='https://raw.githubusercontent.com/AstritTola/TopER/refs/heads/main/Datasets/MUTAG/MUTAG_graph_labels.txt'
        graph_label=np.loadtxt(url)
        min_value = np.min(graph_label)
        graph_label[graph_label == min_value] = 0 #start graph label with 0
    elif dataset=='IMDB-BINARY':
        url='https://raw.githubusercontent.com/AstritTola/TopER/refs/heads/main/Datasets/IMDB-BINARY/IMDB-BINARY_graph_labels.txt'
        graph_label=np.loadtxt(url)
    elif dataset=='IMDB-MULTI':
        url='https://raw.githubusercontent.com/AstritTola/TopER/refs/heads/main/Datasets/IMDB-MULTI/IMDB-MULTI_graph_labels.txt'
        graph_label=np.loadtxt(url)
        max_value = np.max(graph_label)
        graph_label[graph_label == max_value] = 0 #start graph label with 0
    elif dataset=='REDDIT-BINARY':
        graph_label=np.loadtxt('REDDIT-BINARY/REDDIT-BINARY_graph_labels.txt')
        min_value = np.min(graph_label)
        graph_label[graph_label == min_value] = 0 #start graph label with 0
        
    else:
        print('Label not avilable')
#     graph_label=np.loadtxt(url)
#     max_value = np.max(graph_label)
#     graph_label[graph_label == max_value] = 0 #start graph label with 0 
    return graph_label
    

In [45]:
def stat(acc_list):
    mean = statistics.mean(acc_list)
    stdev = statistics.stdev(acc_list)
    print(f'Final Accuracy using 10 fold CV: {mean*100:.2f} \u00B1 {stdev*100:.2f}%')
    #print("Final Accuracy using 10 fold CV:", mean:.2f, "\u00B1", stdev,"\n")
def print_stat(train_acc, test_acc):
    argmax=np.argmax(train_acc)
    best_result=test_acc[argmax]
    print(f'Test Accuracy = {best_result:.2f}%\n')
    return best_result
    

In [80]:
features=read_feature('REDDIT-BINARY')
graph_label=load_label('REDDIT-BINARY')

In [99]:
features=read_feature('PROTEINS')#BZR,COX2,REDDIT-BINARY,
graph_label=load_label('PROTEINS')

# Normalized features

In [100]:
normalized_list = [
    (array - np.mean(array, axis=0)) / np.std(array, axis=0)
    for array in features]
normalized_features=np.nan_to_num(normalized_list, nan=0)

  (array - np.mean(array, axis=0)) / np.std(array, axis=0)


In [101]:

X = torch.tensor(normalized_features, dtype=torch.float32)
y = torch.tensor(graph_label, dtype=torch.long)

num_samples = len(X)
print(num_samples)
num_timesteps = len(X[0])
print(num_timesteps)
num_features = len(X[0][0])
print(num_features)
num_classes = len(np.unique(y))
print(num_classes)
     

1113
20
4
2


# original features

In [66]:
features=load_dataset('REDDIT-BINARY')# BZR, PROTEINS,COX2,MUTAG,IMDB-BINARY,IMDB-MULTI
graph_label=load_label('REDDIT-BINARY')

X = torch.tensor(features, dtype=torch.float32)
y = torch.tensor(graph_label, dtype=torch.long)

num_samples = len(X)
print(num_samples)
num_timesteps = len(X[0])
print(num_timesteps)
num_features = len(X[0][0])
print(num_features)
num_classes = len(np.unique(y))
print(num_classes)
     

Successfully loaded dataset: REDDIT-BINARY features
2000
19
4
2


In [67]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score

# Define the Transformer model
class TransformerClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_heads, n_layers, num_timesteps):
        super(TransformerClassifier, self).__init__()
        self.embedding = nn.Linear(input_dim, hidden_dim)
        self.positional_encoding = nn.Parameter(torch.zeros(1, num_timesteps, hidden_dim))
        self.transformer = nn.Transformer(d_model=hidden_dim, nhead=n_heads, num_encoder_layers=n_layers, num_decoder_layers=n_layers)
        self.fc = nn.Linear(hidden_dim * num_timesteps, output_dim)  # Flatten the output of the transformer

    def forward(self, src):
        src_emb = self.embedding(src) + self.positional_encoding[:, :src.size(1), :]
        src_emb = src_emb.permute(1, 0, 2)  # (seq_len, batch, feature)
        transformer_output = self.transformer.encoder(src_emb)
        transformer_output = transformer_output.permute(1, 0, 2).contiguous().view(src.size(0), -1)  # Flatten
        predictions = self.fc(transformer_output)
        return predictions

def reset_weights(model):
    for layer in model.children():
        if hasattr(layer, 'reset_parameters'):
            layer.reset_parameters()

# Define input and output dimensions (example placeholders)
input_dim = num_features
hidden_dim = 16
output_dim = num_classes
n_heads = 2
n_layers = 2
num_timesteps = num_timesteps  # Adjust based on your sequence length

# Initialize model, loss function, and optimizer
model = TransformerClassifier(input_dim, hidden_dim, output_dim, n_heads, n_layers, num_timesteps)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# K-Fold Cross Validation
kfold = KFold(n_splits=10, shuffle=True)
loss_per_fold = []
acc_per_fold = []
pre_per_fold=[]
rec_per_fold=[]
f1_per_fold=[]
fold_no = 1

for train_idx, test_idx in kfold.split(X):
    # Split data
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Create DataLoader
    train_data = TensorDataset(X_train, y_train)
    test_data = TensorDataset(X_test, y_test)
    train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=1, shuffle=False)

    # Lists to store metrics
    train_losses = []
    train_accuracies = []
    test_accuracies = []
    precisions = []
    recalls = []
    f1_scores = []

    # Train the model
    reset_weights(model)
    model.train()
    for epoch in tqdm(range(100), desc="Processing"):
        epoch_train_loss = 0
        correct_train = 0
        total_train = 0

        # Training loop
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output, y_batch)
            loss.backward()
            optimizer.step()

            # Track training loss and accuracy
            epoch_train_loss += loss.item()
            _, predicted = torch.max(output, 1)
            total_train += y_batch.size(0)
            correct_train += (predicted == y_batch).sum().item()

        avg_train_loss = epoch_train_loss / len(train_loader)
        train_accuracy = correct_train / total_train
        train_losses.append(avg_train_loss)
        train_accuracies.append(train_accuracy)

        # Evaluate on the test set
        model.eval()
        correct_test = 0
        total_test = 0
        all_preds = []
        all_targets = []
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                output = model(X_batch)
                _, predicted = torch.max(output, 1)
                total_test += y_batch.size(0)
                correct_test += (predicted == y_batch).sum().item()

                # Store predictions and targets for metrics
                all_preds.extend(predicted.cpu().numpy())
                all_targets.extend(y_batch.cpu().numpy())

        test_accuracy = correct_test / total_test
        test_accuracies.append(test_accuracy)

        # Calculate precision, recall, and F1-score
        precision = precision_score(all_targets, all_preds, average='weighted',zero_division=0)
        recall = recall_score(all_targets, all_preds, average='weighted',zero_division=0)
        f1 = f1_score(all_targets, all_preds, average='weighted',zero_division=0)

        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)

        model.train()  # Switch back to training mode

        # Print metrics for this epoch
        #print(f'Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Train Accuracy = {train_accuracy:.2f}%, Test Accuracy = {test_accuracy:.2f}%')
        #print(f'Precision = {precision:.2f}, Recall = {recall:.2f}, F1-Score = {f1:.2f}')
    #print(f'Score for fold {fold_no}: ')
    #accuracy=print_stat(train_accuracies,test_accuracies)
    accuracy=np.max(test_accuracies)
    pre=np.max(precisions)
    rec=np.max(recalls)
    f1=np.max(f1_scores)
    acc_per_fold.append(accuracy)
    pre_per_fold.append(pre)
    rec_per_fold.append(rec)
    f1_per_fold.append(f1)
    
    
    print(f'Score for fold {fold_no}: Test Accuracy = {accuracy:.2f}%')
#     with open("out_protiens.txt", "w") as file:
#         with redirect_stdout(file):
#             print(f'Score for fold {fold_no}: Test Accuracy = {accuracy:.2f}%')
    fold_no += 1
stat(acc_per_fold)


Processing: 100%|█████████████████████████████| 100/100 [01:59<00:00,  1.20s/it]


Score for fold 1: Test Accuracy = 0.90%


Processing: 100%|█████████████████████████████| 100/100 [02:00<00:00,  1.20s/it]


Score for fold 2: Test Accuracy = 0.90%


Processing: 100%|█████████████████████████████| 100/100 [02:00<00:00,  1.21s/it]


Score for fold 3: Test Accuracy = 0.88%


Processing: 100%|█████████████████████████████| 100/100 [02:00<00:00,  1.21s/it]


Score for fold 4: Test Accuracy = 0.91%


Processing: 100%|█████████████████████████████| 100/100 [02:01<00:00,  1.21s/it]


Score for fold 5: Test Accuracy = 0.90%


Processing: 100%|█████████████████████████████| 100/100 [02:01<00:00,  1.21s/it]


Score for fold 6: Test Accuracy = 0.91%


Processing: 100%|█████████████████████████████| 100/100 [02:02<00:00,  1.22s/it]


Score for fold 7: Test Accuracy = 0.84%


Processing: 100%|█████████████████████████████| 100/100 [02:02<00:00,  1.22s/it]


Score for fold 8: Test Accuracy = 0.91%


Processing: 100%|█████████████████████████████| 100/100 [02:01<00:00,  1.22s/it]


Score for fold 9: Test Accuracy = 0.93%


Processing: 100%|█████████████████████████████| 100/100 [02:02<00:00,  1.23s/it]

Score for fold 10: Test Accuracy = 0.91%
Final Accuracy using 10 fold CV: 89.75 ± 2.18%





In [68]:
stat(pre_per_fold)
stat(rec_per_fold)
stat(f1_per_fold)
    

Final Accuracy using 10 fold CV: 90.31 ± 1.88%
Final Accuracy using 10 fold CV: 89.75 ± 2.18%
Final Accuracy using 10 fold CV: 89.72 ± 2.22%


# random spliting