In [None]:
# The code is partly adapted and inspired from https://www.kaggle.com/code/awadelrahman/tutorial-graph-neural-networks-on-social-networks
# The dataset is from https://snap.stanford.edu/data/github-social.html

In [1]:
!pip install -U sentence-transformers
import torch
!pip uninstall torch-scatter torch-sparse torch-geometric torch-cluster  --y
!pip install torch-scatter -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-cluster -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install git+https://github.com/pyg-team/pytorch_geometric.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m94.0 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.98-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224

In [2]:
%matplotlib inline

import json
import collections
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch_geometric.data import Data
from torch_geometric.datasets import SNAPDataset

from torch_geometric.utils.convert import to_networkx
from torch_geometric.nn import GCNConv

import networkx as nx

In [3]:
from torch_geometric.transforms import RandomNodeSplit as masking

In [4]:
#upload the data files to /content
!unzip git_web_ml.zip

Archive:  git_web_ml.zip
   creating: git_web_ml/
  inflating: git_web_ml/musae_git_edges.csv  
  inflating: git_web_ml/musae_git_features.json  
  inflating: git_web_ml/musae_git_target.csv  
  inflating: git_web_ml/citing.txt   
  inflating: git_web_ml/README.txt   


In [5]:
with open("git_web_ml/musae_git_features.json") as json_data:
    data_raw = json.load(json_data)

edges=pd.read_csv("git_web_ml/musae_git_edges.csv")
target_df=pd.read_csv("git_web_ml/musae_git_target.csv")#.to_numpy()[:,2]

In [6]:
features=[]
features_ctr=[]
for i in range(len(data_raw)):
    features_ctr +=[len(data_raw[str(i)])]
    features += data_raw[str(i)]

In [7]:
def processing():
  res = {}
  print(len(data_raw))
  for i in range(len(data_raw)):
    if i % 500 == 0:
      print(i)
    featuresrep = np.array([0]*(max(features)+1))
    feature = data_raw[str(i)]
    featuresrep[feature] = 1
    res[str(i)] = list(featuresrep)
  return res

In [8]:
from tqdm import tqdm

In [None]:
enc = processing()

In [10]:
def getgraph(enc):
    node_features_list=list(enc.values())
    node_features=torch.tensor(node_features_list)
    node_labels=torch.tensor(target_df['ml_target'].values)
    edges_list=edges.values.tolist()
    edge_index01=torch.tensor(edges_list, dtype = torch.long).T
    edge_index02=torch.zeros(edge_index01.shape, dtype = torch.long)#.T
    edge_index02[0,:]=edge_index01[1,:]
    edge_index02[1,:]=edge_index01[0,:]
    edge_index0=torch.cat((edge_index01,edge_index02),axis=1)
    g = Data(x=node_features, y=node_labels, edge_index=edge_index0)
    g_light = Data(x=node_features[:,0:2],
                     y=node_labels   ,
                     edge_index=edge_index0[:,:55])
    return g

In [11]:
g = getgraph(enc)

In [12]:
mask=masking(split="train_rest", num_splits = 1, num_val = 0.3, num_test= 0.6)
g=mask(g)
print(g)
print()
print("training samples",torch.sum(g.train_mask).item())
print("validation samples",torch.sum(g.val_mask ).item())
print("test samples",torch.sum(g.test_mask ).item())

Data(x=[37700, 4005], edge_index=[2, 578006], y=[37700], train_mask=[37700], val_mask=[37700], test_mask=[37700])

training samples 3770
validation samples 11310
test samples 22620


In [13]:
# First try GCN

In [14]:
from torch.nn import Embedding, Linear
class SocialGCN(torch.nn.Module):
    def __init__(self,hiddensize):
        super(SocialGCN, self).__init__()


        self.conv1 = GCNConv(-1, hiddensize)

        self.conv2 = GCNConv(hiddensize, hiddensize)

        self.linear = Linear(hiddensize, 2)


    def forward(self, data):
        x = data.x.float()
        edge_index =  data.edge_index
          
        x = F.relu(self.conv1(x=x, edge_index=edge_index))
        x = F.relu(self.conv2(x=x, edge_index=edge_index))
        return self.linear(x)

In [16]:
def masked_loss(predictions,labels,mask):
    criterion=nn.CrossEntropyLoss()
    mask=mask.float()
    mask=mask/torch.mean(mask)
    loss=criterion(predictions,labels)
    loss=loss*mask
    loss=torch.mean(loss)
    return (loss)    

In [19]:
import torch
from sklearn.metrics import roc_auc_score

def masked_accuracy(predictions, labels, mask):
    mask = mask.float()
    mask /= torch.mean(mask)
    predictions = torch.softmax(predictions, dim=1)[:, 1]  # Use the probability of positive class
    auc = roc_auc_score(labels.detach().cpu().numpy(), predictions.detach().cpu().numpy(), sample_weight=mask.cpu().numpy())
    return auc


In [17]:
def train_social(net,data,epochs=10,lr=0.01):
    optimizer = torch.optim.Adam(net.parameters(), lr=lr) # 00001
    best_accuracy=0.0
    
    train_losses=[]
    train_accuracies=[]

    val_losses=[]
    val_accuracies=[]

    test_losses=[]
    test_accuracies=[]
    
    for ep in range(epochs+1):
        optimizer.zero_grad()
        out=net(data)
        loss=masked_loss(predictions=out,
                         labels=data.y,
                         mask=data.train_mask)
        loss.backward()
        optimizer.step()
        train_losses+=[loss]
        train_accuracy=masked_accuracy(predictions=out,
                                       labels=data.y, 
                                       mask=data.train_mask)
        train_accuracies+=[train_accuracy]
        
        val_loss=masked_loss(predictions=out,
                             labels=data.y, 
                             mask=data.val_mask)
        val_losses+=[val_loss]
        
        val_accuracy=masked_accuracy(predictions=out,
                                     labels=data.y, 
                                     mask=data.val_mask)
        val_accuracies+=[val_accuracy]

        test_accuracy=masked_accuracy(predictions=out,
                                      labels=data.y, 
                                      mask=data.test_mask)
        test_accuracies+=[test_accuracy]
        if ep % 10 == 0:
          print("Epoch {}/{}, Train_Loss: {:.4f}, Train_Accuracy: {:.4f}, Val_Accuracy: {:.4f}, Test_Accuracy: {:.4f}"
                    .format(ep,epochs, loss.item(), train_accuracy, val_accuracy,  test_accuracy))
          best_accuracy=val_accuracy
    return val_accuracy, test_accuracy


In [20]:
device = 'cuda'
net=SocialGCN(64).to(device)
valloss, testloss = train_social(net,g.to(device),epochs=100,lr=0.01)

Epoch 0/100, Train_Loss: 0.6897, Train_Accuracy: 0.3620, Val_Accuracy: 0.3652, Test_Accuracy: 0.3710
Epoch 10/100, Train_Loss: 0.4063, Train_Accuracy: 0.8960, Val_Accuracy: 0.8907, Test_Accuracy: 0.8910
Epoch 20/100, Train_Loss: 0.3337, Train_Accuracy: 0.9167, Val_Accuracy: 0.9086, Test_Accuracy: 0.9095
Epoch 30/100, Train_Loss: 0.3075, Train_Accuracy: 0.9308, Val_Accuracy: 0.9204, Test_Accuracy: 0.9229
Epoch 40/100, Train_Loss: 0.2887, Train_Accuracy: 0.9388, Val_Accuracy: 0.9288, Test_Accuracy: 0.9313
Epoch 50/100, Train_Loss: 0.2693, Train_Accuracy: 0.9482, Val_Accuracy: 0.9385, Test_Accuracy: 0.9409
Epoch 60/100, Train_Loss: 0.2472, Train_Accuracy: 0.9568, Val_Accuracy: 0.9482, Test_Accuracy: 0.9505
Epoch 70/100, Train_Loss: 0.2249, Train_Accuracy: 0.9651, Val_Accuracy: 0.9570, Test_Accuracy: 0.9597
Epoch 80/100, Train_Loss: 0.2053, Train_Accuracy: 0.9718, Val_Accuracy: 0.9637, Test_Accuracy: 0.9668
Epoch 90/100, Train_Loss: 0.1857, Train_Accuracy: 0.9772, Val_Accuracy: 0.9700, Tes

In [None]:
valloss, testloss

(tensor(0.9385, device='cuda:0'), tensor(0.9359, device='cuda:0'))

In [23]:
from torch_geometric.nn import SAGEConv, GCNConv
from torch_geometric.nn import GATConv, HypergraphConv

In [22]:
from torch.nn import Embedding, Linear
class SocialGAT(torch.nn.Module):
    def __init__(self,hiddensize):
        super(SocialGAT, self).__init__()


        self.conv1 = GATConv(-1, hiddensize)

        self.conv2 = GATConv(hiddensize, hiddensize)

        self.linear = Linear(hiddensize, 2)


    def forward(self, data):
        x = data.x.float()
        edge_index =  data.edge_index
          
        x = F.relu(self.conv1(x=x, edge_index=edge_index))
        x = F.relu(self.conv2(x=x, edge_index=edge_index))
        return self.linear(x)

In [24]:
device = 'cuda'
net=SocialGAT(64).to(device)
valloss, testloss = train_social(net,g.to(device),epochs=100,lr=0.01)

Epoch 0/100, Train_Loss: 0.6891, Train_Accuracy: 0.7034, Val_Accuracy: 0.6945, Test_Accuracy: 0.7037
Epoch 10/100, Train_Loss: 0.3388, Train_Accuracy: 0.9056, Val_Accuracy: 0.8967, Test_Accuracy: 0.8959
Epoch 20/100, Train_Loss: 0.2991, Train_Accuracy: 0.9307, Val_Accuracy: 0.9252, Test_Accuracy: 0.9250
Epoch 30/100, Train_Loss: 0.2663, Train_Accuracy: 0.9466, Val_Accuracy: 0.9388, Test_Accuracy: 0.9409
Epoch 40/100, Train_Loss: 0.2327, Train_Accuracy: 0.9603, Val_Accuracy: 0.9526, Test_Accuracy: 0.9544
Epoch 50/100, Train_Loss: 0.1967, Train_Accuracy: 0.9740, Val_Accuracy: 0.9663, Test_Accuracy: 0.9675
Epoch 60/100, Train_Loss: 0.1578, Train_Accuracy: 0.9835, Val_Accuracy: 0.9786, Test_Accuracy: 0.9793
Epoch 70/100, Train_Loss: 0.1272, Train_Accuracy: 0.9895, Val_Accuracy: 0.9865, Test_Accuracy: 0.9869
Epoch 80/100, Train_Loss: 0.1022, Train_Accuracy: 0.9940, Val_Accuracy: 0.9917, Test_Accuracy: 0.9919
Epoch 90/100, Train_Loss: 0.0717, Train_Accuracy: 0.9968, Val_Accuracy: 0.9956, Tes

In [None]:
valloss, testloss

(tensor(0.9708, device='cuda:0'), tensor(0.9707, device='cuda:0'))

In [25]:
from torch.nn import Embedding, Linear
class SocialSAGE(torch.nn.Module):
    def __init__(self,hiddensize):
        super(SocialSAGE, self).__init__()


        self.conv1 = SAGEConv(-1, hiddensize)

        self.conv2 = SAGEConv(hiddensize, hiddensize)

        self.linear = Linear(hiddensize, 2)


    def forward(self, data):
        x = data.x.float()
        edge_index =  data.edge_index
          
        x = F.relu(self.conv1(x=x, edge_index=edge_index))
        x = F.relu(self.conv2(x=x, edge_index=edge_index))
        return self.linear(x)

In [None]:
torch.cuda.empty_cache()

In [26]:
net=SocialSAGE(64).to(device)
valloss, testloss = train_social(net,g.to(device),epochs=100,lr=0.01)

Epoch 0/100, Train_Loss: 0.7229, Train_Accuracy: 0.3445, Val_Accuracy: 0.3282, Test_Accuracy: 0.3480
Epoch 10/100, Train_Loss: 0.4023, Train_Accuracy: 0.9147, Val_Accuracy: 0.9069, Test_Accuracy: 0.9100
Epoch 20/100, Train_Loss: 0.3156, Train_Accuracy: 0.9339, Val_Accuracy: 0.9212, Test_Accuracy: 0.9262
Epoch 30/100, Train_Loss: 0.2492, Train_Accuracy: 0.9587, Val_Accuracy: 0.9464, Test_Accuracy: 0.9508
Epoch 40/100, Train_Loss: 0.2060, Train_Accuracy: 0.9717, Val_Accuracy: 0.9624, Test_Accuracy: 0.9665
Epoch 50/100, Train_Loss: 0.1589, Train_Accuracy: 0.9837, Val_Accuracy: 0.9776, Test_Accuracy: 0.9808
Epoch 60/100, Train_Loss: 0.1060, Train_Accuracy: 0.9930, Val_Accuracy: 0.9916, Test_Accuracy: 0.9924
Epoch 70/100, Train_Loss: 0.0527, Train_Accuracy: 0.9981, Val_Accuracy: 0.9981, Test_Accuracy: 0.9979
Epoch 80/100, Train_Loss: 0.0217, Train_Accuracy: 0.9997, Val_Accuracy: 0.9997, Test_Accuracy: 0.9996
Epoch 90/100, Train_Loss: 0.0072, Train_Accuracy: 1.0000, Val_Accuracy: 1.0000, Tes

In [None]:
valloss, testloss

(tensor(0.9996), tensor(0.9997))