In [1]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import gzip
import pickle
import itertools
from sklearn.metrics import f1_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load the pickle file
with gzip.open('data/combined_graph_with3embedding_processed.pkl.gz', 'rb') as f:
    joinned_graph = pickle.load(f)

with gzip.open('data/combined_graph_with3embedding_processed_0.pkl.gz', 'rb') as f:
    mag_graph = pickle.load(f)

with gzip.open('data/combined_graph_with3embedding_processed_1.pkl.gz', 'rb') as f:
    arxiv_graph = pickle.load(f)

In [3]:
joinned_graph.ndata['label'].shape, mag_graph.ndata['label'].shape, arxiv_graph.ndata['label'].shape

(torch.Size([369343, 389]),
 torch.Size([200000, 389]),
 torch.Size([169343, 389]))

In [4]:
def train_test_val_split(graph, feat):
    """
    Get the indices of the training, validation, and test sets.
    """
    train_idx = graph.ndata['train_mask'].nonzero(as_tuple=True)[0]
    val_idx = graph.ndata['valid_mask'].nonzero(as_tuple=True)[0]
    test_idx = graph.ndata['test_mask'].nonzero(as_tuple=True)[0]

    feat_map = {
        'orig': 'feat',
        'e5': 'e5_feat',
        'ga': 'ga_embedding'
    }

    # Original features
    X_train = graph.ndata[feat_map[feat]][train_idx]
    X_test = graph.ndata[feat_map[feat]][test_idx]
    X_val = graph.ndata[feat_map[feat]][val_idx]

    # Labels
    y_train = graph.ndata['label'][train_idx].argmax(dim=1).long()
    y_test = graph.ndata['label'][test_idx].argmax(dim=1).long()
    y_val = graph.ndata['label'][val_idx].argmax(dim=1).long()

    train_dataset = TensorDataset(X_train, y_train)
    test_dataset = TensorDataset(X_test, y_test)
    val_dataset = TensorDataset(X_val, y_val)

    return train_dataset, test_dataset, val_dataset

In [9]:
class MLP(nn.Module):
    def __init__(self, in_feats, hidden_size, num_classes, dropout_rate=0.3):
        super(MLP, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_feats, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, num_classes)
        )

    def forward(self, x):
        x = self.linear_relu_stack(x)
        return x

def evaluate(model, dataloader, criterion, device):
    model.eval()  
    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == labels).sum().item()
            total_samples += labels.size(0)
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())

    avg_loss = total_loss / total_samples
    accuracy = correct_predictions / total_samples
    f1 = f1_score(all_labels, all_predictions, average='macro')
    return avg_loss, accuracy, f1

def train(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=100):
    model.to(device)
    best_val_accuracy = 0.0
    best_val_f1 = 0.0
    best_model_state = None
    best_val_epoch = 0
    
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for batch in train_loader:
            features, labels = batch
            features, labels = features.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.detach().item()
        train_loss /= len(train_loader)

        # Validation
        val_loss, val_accuracy, val_f1 = evaluate(model, val_loader, criterion, device)
        if epoch % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')
              
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_val_f1 = val_f1
            best_val_epoch = epoch
            best_model_state = model.state_dict()
            #print(f'Epoch [{epoch+1}/{num_epochs}]: Best validation accuracy improved to {best_val_accuracy:.4f}. Saving model.')
        
    return best_model_state, best_val_accuracy, best_val_f1, best_val_epoch


In [6]:
# Hyperparameters
in_feats = 128
hidden_size = 512
num_classes = 389
learning_rate = 0.005
dropout_rate = 0.3
batch_size = 1024
num_epochs = 100

In [7]:
def experiment(graph, feat, in_feats, hidden_size, num_classes, dropout_rate, batch_size, num_epochs):
    """
    Run the experiment with the specified graph and feature type.
    """

    # Model initialization
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = MLP(in_feats, hidden_size, num_classes, dropout_rate).to(device)
    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Split the dataset
    train_dataset, test_dataset, val_dataset = train_test_val_split(graph, feat)
    
    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Training
    best_model_state, best_val_accuracy, best_val_f1, best_val_epoch = train(model, train_loader, val_loader, criterion, optimizer, device, num_epochs)
    
    # Load best model state
    model.load_state_dict(best_model_state)
    train_loss, train_accuracy, train_f1 = evaluate(model, train_loader, criterion, device)

    # Evaluate on test set
    test_loss, test_accuracy, test_f1 = evaluate(model, test_loader, criterion, device)
    print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}, Test F1: {test_f1:.4f}')

    result = {
        'Best Val Epoch': best_val_epoch,
        'Training Loss (Best Val Model)': train_loss,
        'Best_Val_Accuracy': best_val_accuracy,
        'Train_Accuracy': train_accuracy,
        'Test_Accuracy': test_accuracy,
        'Test_f1': test_f1
    }

    return result


In [10]:
graphs = {"joinned": joinned_graph, "mag":mag_graph, "arxiv":arxiv_graph}
features = ['orig', 'e5', 'ga']

combinations = list(itertools.product(graphs.keys(), features))

results_dict = {}
for pairs in combinations:
    print(f"Running experiment for graph: {pairs[0]} with feature: {pairs[1]}")
    results_dict[pairs] = experiment(graphs[pairs[0]], pairs[1], in_feats, hidden_size, num_classes, dropout_rate, batch_size, num_epochs)


Running experiment for graph: joinned with feature: orig
Epoch [1/100], Train Loss: 3.0223, Validation Loss: 2.3373
Epoch [11/100], Train Loss: 2.2915, Validation Loss: 2.1744
Epoch [21/100], Train Loss: 2.2205, Validation Loss: 2.1686
Epoch [31/100], Train Loss: 2.1814, Validation Loss: 2.1768
Epoch [41/100], Train Loss: 2.1558, Validation Loss: 2.1958
Epoch [51/100], Train Loss: 2.1348, Validation Loss: 2.2090
Epoch [61/100], Train Loss: 2.1213, Validation Loss: 2.2242
Epoch [71/100], Train Loss: 2.1122, Validation Loss: 2.2255
Epoch [81/100], Train Loss: 2.1080, Validation Loss: 2.2288
Epoch [91/100], Train Loss: 2.0967, Validation Loss: 2.2528
Test Loss: 2.0686, Test Accuracy: 0.4501, Test F1: 0.0775
Running experiment for graph: joinned with feature: e5
Epoch [1/100], Train Loss: 2.9115, Validation Loss: 2.3713
Epoch [11/100], Train Loss: 2.4801, Validation Loss: 2.3668
Epoch [21/100], Train Loss: 2.4441, Validation Loss: 2.3959
Epoch [31/100], Train Loss: 2.4258, Validation Loss:

In [11]:
import pandas as pd
pd.options.display.float_format = '{:.4f}'.format

results_df = pd.DataFrame.from_dict(results_dict, orient='index')
results_df.reset_index(inplace=True)
results_df.columns = ['Graph', 'Feature', 'Best Val Epoch', 'Training Loss (Best Val Model)', 'Best_Val_Accuracy', 'Train_Accuracy', 'Test_Accuracy', 'Test_f1']
results_df['Model'] = 'MLP'
results_df['Graph'] = results_df['Graph'].map(lambda x: {'joinned': 'Joinned', 'mag': 'MAG', 'arxiv': 'Arxiv'}[x])
results_df['Feature'] = results_df['Feature'].map(lambda x: {'orig': 'Original', 'e5': 'E5 Embedding', 'ga': 'Graph Algin Embedding'}[x])
results_df[['Model','Graph', 'Feature', 'Best Val Epoch', 'Best_Val_Accuracy', 'Train_Accuracy', 'Test_Accuracy', 'Test_f1']]

Unnamed: 0,Model,Graph,Feature,Best Val Epoch,Best_Val_Accuracy,Train_Accuracy,Test_Accuracy,Test_f1
0,MLP,Joinned,Original,18,0.4276,0.4774,0.4501,0.0775
1,MLP,Joinned,E5 Embedding,10,0.3814,0.4285,0.1935,0.0472
2,MLP,Joinned,Graph Algin Embedding,28,0.2747,0.3899,0.1565,0.031
3,MLP,MAG,Original,56,0.3427,0.4281,0.3372,0.0502
4,MLP,MAG,E5 Embedding,24,0.3327,0.447,0.3355,0.0473
5,MLP,MAG,Graph Algin Embedding,11,0.3457,0.4027,0.2997,0.0393
6,MLP,Arxiv,Original,21,0.5755,0.7742,0.5462,0.3339
7,MLP,Arxiv,E5 Embedding,6,0.5072,0.6857,0.1242,0.0931
8,MLP,Arxiv,Graph Algin Embedding,77,0.2016,0.4633,0.0862,0.0232


In [12]:
results_df.to_csv('Experiment_Results/MLP.csv', index=False)