In [None]:
# Mount Google Drive so the dataset files under /content/drive are readable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import networkx as nx
import json
import pandas as pd
import torch
import os
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
import numpy as np
# Show the installed torch build: the torch-scatter / torch-sparse wheels
# installed in the next cell come from an index that is specific to one
# torch + CUDA version.
print(torch.__version__)

# The PyG built-in GCNConv is imported in a later cell, once torch-geometric has been installed.


In [None]:
# Install torch geometric
if 'IS_GRADESCOPE_ENV' not in os.environ:
  !pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.13.1+cu116.html
  !pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-1.13.1+cu116.html
  !pip install torch-geometric
  !pip install ogb

!pip install node2vec

In [None]:
from torch_geometric.loader import DataLoader
from torch_geometric.data import Data
from tqdm import tqdm
from node2vec import Node2Vec
from torch_geometric.nn import GCNConv

import torch_geometric.transforms as T
from ogb.nodeproppred import PygNodePropPredDataset, Evaluator

In [None]:
# /content/drive/MyDrive/CS286-Reddit/reddit_edges.json
# /content/drive/MyDrive/CS286-Reddit/reddit_target.csv

In [5]:
# Load the per-graph edge lists (JSON) and the per-graph labels (CSV).
# The context manager closes the JSON file handle as soon as parsing finishes;
# the original left it open for the lifetime of the kernel.
with open('/content/drive/MyDrive/CS286-Reddit/reddit_edges.json') as f:
  content = json.load(f)
df = pd.read_csv('/content/drive/MyDrive/CS286-Reddit/reddit_target.csv')

# NOTE(review): this assumes the JSON values and the CSV rows list the graphs
# in the same order -- confirm against the dataset description.
G_list = list(content.values())
# Only the first 1000 graphs (and their labels) are used further down.
Gs = [nx.Graph(i) for i in G_list[0:1000]]
y = list(df.target)[0:1000]

In [None]:
# Build one (num_nodes, 128) node2vec feature matrix per graph.
emb_list = []
for graph in Gs:
  node2vec = Node2Vec(graph, dimensions=128, walk_length=10, num_walks=150)
  n2v_model = node2vec.fit(window=10)
  # `wv.vectors` is stored in the word2vec vocabulary order, which is not the
  # node-id order, so using it directly pairs feature rows with the wrong
  # nodes once edge_index is attached. Look the vectors up by node key so
  # that row k holds the embedding of the k-th smallest node id.
  # node2vec stores walk tokens as strings, hence str(node).
  # Assumes node ids are the contiguous integers 0..n-1 so that row k matches
  # index k in edge_index -- TODO confirm for this dataset.
  node_keys = [str(node) for node in sorted(graph.nodes())]
  v = torch.tensor(n2v_model.wv[node_keys], dtype=torch.float)
  emb_list.append(v)
# torch.save(emb_list,'/content/drive/MyDrive/CS286-Reddit/node2vec_embedding.pt')

In [None]:
# emb_list = torch.load('/content/drive/MyDrive/CS286-Reddit/node2vec_embedding.pt')

In [18]:
# Turn the labels into an (N, 1) tensor so each graph later receives a
# length-1 label tensor. as_tensor + reshape yields the same result as the
# nested-list construction on the first run and leaves an already converted
# tensor unchanged, so re-running this cell is harmless.
y = torch.as_tensor(y).reshape(-1, 1)

In [19]:
# Convert every edge list into a PyG-style (2, num_edges) index tensor.
edge_indices = []
for pairs in G_list[0:1000]:
  one_way = torch.tensor(pairs, dtype=torch.long).T
  # The graphs are treated as undirected (they are loaded into nx.Graph), but
  # message passing only follows the (source, target) columns that are present.
  # Add the reversed copy of every edge, then drop duplicate columns so an edge
  # that was already listed in both directions is not counted twice.
  both_ways = torch.cat([one_way, one_way.flip(0)], dim=1)
  edge_indices.append(torch.unique(both_ways, dim=1))
# torch.save(edge_indices,'/content/drive/MyDrive/CS286-Reddit/edge_indices.pt')

In [20]:
# One (node_features, edge_index, label) triple per graph.
raw_data = list(zip(emb_list, edge_indices, y))

In [21]:
# Wrap every (features, edge_index, label) triple in a PyG Data object.
data = [
  Data(x=features, y=label, edge_index=edges)
  for features, edges, label in raw_data
]

In [22]:
# Hold out 25% of the graphs for testing; the fixed seed keeps the split stable.
trainset, testset = train_test_split(
  data,
  test_size=0.25,
  random_state=42,
)
# Optional validation split (currently disabled):
# trainset, valset= train_test_split(trainset, test_size=0.2, random_state=42)

In [23]:
# Both loaders share the batch size and worker count; only the training loader
# shuffles.
loader_options = dict(batch_size=16, num_workers=0)
train_loader = DataLoader(trainset, shuffle=True, **loader_options)
# valid_loader = DataLoader(valset, batch_size=32, shuffle=False, num_workers=0)
test_loader = DataLoader(testset, shuffle=False, **loader_options)

In [None]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GraphConv
from torch_geometric.nn import global_mean_pool, global_add_pool, global_max_pool


class GCN(torch.nn.Module):
    """Graph-level classifier: two GraphConv layers, mean pooling, MLP head.

    Despite the class name, the convolutions are ``GraphConv`` layers rather
    than ``GCNConv``.

    Args:
        hidden_channels: Width of every hidden layer (convolutional and linear).
        in_channels: Size of each input node feature vector. Defaults to 128,
            the dimensionality of the node2vec embeddings built in this notebook.
        num_classes: Number of logits produced per graph. Defaults to 2.
    """

    def __init__(self, hidden_channels, in_channels=128, num_classes=2):
        super().__init__()
        # Re-seed before creating any layer so every instance starts from the
        # same initial weights. This resets torch's global RNG as a side effect.
        torch.manual_seed(123)
        self.conv1 = GraphConv(in_channels, hidden_channels)
        self.bn1 = torch.nn.BatchNorm1d(hidden_channels)
        self.conv2 = GraphConv(hidden_channels, hidden_channels)
        self.bn2 = torch.nn.BatchNorm1d(hidden_channels)
        self.hidden1 = Linear(hidden_channels, hidden_channels)
        self.re1 = torch.nn.ReLU()
        self.hidden2 = Linear(hidden_channels, hidden_channels)
        self.re2 = torch.nn.ReLU()
        self.lin = Linear(hidden_channels, num_classes)

    def forward(self, x, edge_index, batch):
        """Compute per-graph class logits for one mini-batch.

        Args:
            x: Node feature matrix fed to the first convolution.
            edge_index: Edge index tensor passed to both convolutions.
            batch: Assignment of every node to its graph, consumed by
                ``global_mean_pool``.

        Returns:
            The un-normalised logits produced by the final linear layer
            (one row per pooled graph).
        """
        # 1. Obtain node embeddings
        x = self.conv1(x, edge_index)
        x = self.bn1(x)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = self.bn2(x)
        x = x.relu()

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier (dropout is only active in training mode)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.hidden1(x)
        x = self.re1(x)
        x = self.hidden2(x)
        x = self.re2(x)
        x = self.lin(x)

        return x

# Build an instance with 256 hidden channels and print it to show the layer
# layout.
model = GCN(hidden_channels=256)
print(model)

In [None]:
from IPython.display import Javascript
# Colab-only call; presumably caps this cell's output frame at 300px so the
# per-epoch log scrolls -- confirm against the Colab output API.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

# Fresh model for training (replaces the instance created in the previous
# cell), together with the Adam optimizer and the cross-entropy loss that
# train() uses.
model = GCN(hidden_channels=256)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

def train():
    """Run one pass over ``train_loader``, updating the global ``model``.

    Uses the module-level ``model``, ``criterion`` and ``optimizer``.
    Returns nothing.
    """
    model.train()

    for batch in train_loader:
        # Forward pass over the whole mini-batch of graphs.
        logits = model(batch.x, batch.edge_index, batch.batch)
        # Loss against the labels, then backpropagate.
        criterion(logits, batch.y).backward()
        # Apply the update, then clear the gradients for the next batch.
        optimizer.step()
        optimizer.zero_grad()

def test(loader):
     model.eval()

     correct = 0
     for data in loader:  # Iterate in batches over the training/test dataset.
         out = model(data.x, data.edge_index, data.batch)  
         pred = out.argmax(dim=1)  # Use the class with highest probability.
         correct += int((pred == data.y).sum())  # Check against ground-truth labels.
     return correct / len(loader.dataset)  # Derive ratio of correct predictions.


# range(1, 120) runs epochs 1..119. After each training pass the accuracy is
# measured on the training loader first, then on the test loader.
for epoch in range(1, 120):
    train()
    train_acc, test_acc = test(train_loader), test(test_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')