### GCN implementation for Fake News Detection on Gossipcop dataset

# Setup


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): nothing in the visible notebook reads from this mount (the
# dataset is downloaded to /tmp/test) -- confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load PyTorch and report which build this runtime provides; the wheel URLs
# used when installing the PyG extensions must match this version.
import os
import torch

print(f"Using torch {torch.__version__}")

Using torch 2.1.0+cu118


In [None]:
# Install PyTorch Geometric and its compiled extensions (skipped when the
# IS_GRADESCOPE_ENV environment variable is set).
# The wheel index URLs below are hard-coded for torch 2.1.0+cu118.
# NOTE(review): they must be updated by hand if the runtime's torch build changes.
import os
if 'IS_GRADESCOPE_ENV' not in os.environ:
  !pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-2.1.0+cu118.html
  !pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-2.1.0+cu118.html
  !pip install torch-geometric
  # NOTE(review): deepsnap is installed but not imported anywhere in the visible notebook.
  !pip install -q git+https://github.com/snap-stanford/deepsnap.git

# Import after installation and display the installed version as the cell output.
import torch_geometric
torch_geometric.__version__

Looking in links: https://pytorch-geometric.com/whl/torch-2.1.0+cu118.html
Looking in links: https://pytorch-geometric.com/whl/torch-2.1.0+cu118.html
  Preparing metadata (setup.py) ... [?25l[?25hdone


'2.4.0'

In [None]:
import argparse
import os.path as osp
import numpy as np
from math import ceil
import pandas as pd
import copy
from tqdm import tqdm, trange

import torch
import torch.nn.functional as F
from torch.nn import Linear

from torch_geometric.datasets import UPFD
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GATConv, GCNConv, SAGEConv, global_max_pool, DenseSAGEConv
from torch_geometric.explain import Explainer, GNNExplainer
from torch_geometric.transforms import ToUndirected
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from torch_geometric.nn import global_mean_pool as gmp

import torch.nn.functional as F
from torch_geometric.data import DenseDataLoader
import torch_geometric.transforms as T
from torch_geometric.nn import DenseSAGEConv, dense_diff_pool
from torch.utils.data import random_split
import matplotlib.pyplot as plt
from torch_geometric.nn import global_mean_pool

# GCN
## Load Data

In [None]:
def load_data(split, root='/tmp/test', name='gossipcop'):
  """
  Load one split of the UPFD dataset from PyG and concatenate the node
  features *profile* and *bert*, which are Twitter user's profile attributes and
  historical tweets encoded through BERT respectively.

  -------------------------------------
  split: 'train', 'val', or 'test' for retrieving the respective portion of UPFD.
  root: directory where PyG downloads/caches the dataset (default '/tmp/test').
  name: UPFD sub-dataset to load (default 'gossipcop').

  -------------------------------------
  Return: PyG dataset object (the *profile* dataset whose node feature matrix
  has been widened with the *bert* features along the feature dimension).
  """

  # Both variants are made undirected on access via the ToUndirected transform.
  data_profile = UPFD(root, name, "profile", split, ToUndirected())
  data_bert = UPFD(root, name, "bert", split, ToUndirected())

  # Column-wise concatenation: every node keeps its row, features are appended.
  # Assumes both feature variants index nodes identically (same graphs, same
  # node order) -- TODO confirm against the UPFD loader.
  data_profile.data.x = torch.cat((data_profile.data.x, data_bert.data.x), dim=1)

  return data_profile

In [None]:
# Load the three UPFD splits (profile + BERT node features concatenated).
train_data, test_data, val_data = (
    load_data(split_name) for split_name in ('train', 'test', 'val')
)

# Mini-batch loaders for the GNN: only the training split is shuffled.
train_loader, val_loader, test_loader = (
    DataLoader(dataset, batch_size=128, shuffle=reshuffle)
    for dataset, reshuffle in (
        (train_data, True),
        (val_data, False),
        (test_data, False),
    )
)



In [None]:
# Walk through the training loader once and describe each mini-batch.
# `data` stays bound to the last batch afterwards; later cells inspect it.
for step, data in enumerate(train_loader):
    print(
        f'Step {step + 1}, number of graphs in the current batch: {data.num_graphs}',
        f'Step {step + 1}, number of nodes in the current batch: {data.num_nodes}',
        f'Step {step + 1}, the graph id to which each node belongs is: {data.batch}',
        '',
        sep='\n',
    )

Step 1, number of graphs in the current batch: 128
Step 1, number of nodes in the current batch: 7349
Step 1, the graph id to which each node belongs is: tensor([  0,   0,   0,  ..., 127, 127, 127])

Step 2, number of graphs in the current batch: 128
Step 2, number of nodes in the current batch: 8649
Step 2, the graph id to which each node belongs is: tensor([  0,   0,   0,  ..., 127, 127, 127])

Step 3, number of graphs in the current batch: 128
Step 3, number of nodes in the current batch: 7022
Step 3, the graph id to which each node belongs is: tensor([  0,   0,   0,  ..., 127, 127, 127])

Step 4, number of graphs in the current batch: 128
Step 4, number of nodes in the current batch: 7612
Step 4, the graph id to which each node belongs is: tensor([  0,   0,   0,  ..., 127, 127, 127])

Step 5, number of graphs in the current batch: 128
Step 5, number of nodes in the current batch: 6995
Step 5, the graph id to which each node belongs is: tensor([  0,   0,   0,  ..., 127, 127, 127])



In [None]:
# Shape of the node-feature matrix of `data`, the last batch left bound by the
# inspection loop above: (nodes in that batch, features per node).
data.x.shape

torch.Size([3810, 778])

## GCN Architecture

In [None]:


class GCN(torch.nn.Module):
  """
  Two-layer GCN for graph-level classification.

  Pipeline: GCNConv -> BatchNorm -> ReLU -> (dropout) -> GCNConv -> BatchNorm
  -> global mean pooling over each graph -> Linear -> log-softmax.

  -------------------------------------
  args: namespace providing `num_features` (input node feature size),
        `hidden_dim` (width of both GCN layers) and `num_classes` (output
        size). An optional `dropout` probability is honoured if present
        (defaults to 0.0, i.e. no dropout).
  """

  def __init__(self, args):
    super(GCN, self).__init__()

    self.GCNConv1 = GCNConv(args.num_features, args.hidden_dim) #first GCNConv module
    self.batchnorm = torch.nn.BatchNorm1d(args.hidden_dim) #normalises the output of the first conv
    self.GCNConv2 = GCNConv(args.hidden_dim, args.hidden_dim) #second GCNConv module
    # Each conv layer gets its own BatchNorm: sharing one instance would mix the
    # running statistics and affine parameters of two different activations.
    self.batchnorm2 = torch.nn.BatchNorm1d(args.hidden_dim)
    self.Linear = torch.nn.Linear(args.hidden_dim, args.num_classes) #linear function
    self.relu = torch.nn.ReLU() #relu function
    # The dropout ratio is parsed by the notebook's argument parser; 0.0 disables it.
    self.dropout = getattr(args, 'dropout', 0.0)

  def forward(self, x, edge_index, batch):
    """
    x: node feature matrix of the batched graphs.
    edge_index: edge list of the batched graphs.
    batch: graph id of every node, used to pool node embeddings per graph.

    Return: log-probabilities, one row per graph and one column per class.
    """
    output = self.GCNConv1(x, edge_index)
    output = self.batchnorm(output)
    output = self.relu(output)
    output = F.dropout(output, p=self.dropout, training=self.training)
    output = self.GCNConv2(output, edge_index)
    output = self.batchnorm2(output)
    # Collapse node embeddings into a single embedding per graph.
    output = global_mean_pool(output, batch)
    output = self.Linear(output)
    output = F.log_softmax(output, dim=-1)

    return output


In [None]:
# Length of the node-to-graph assignment vector of the last inspected batch
# (one entry per node).
data.batch.shape

torch.Size([3810])

## GCN Model Training

In [None]:
"""
parser add argument (optimizer, loss_fn, eval_metrics default accuracy)
"""

def train(model, data, optimizer, loss_fn):
  """
  Run one training epoch over all batches and return the accumulated loss.
  --------------------------------------
  model: the model we specified; called as model(x, edge_index, batch).
  data: iterable of training batches (e.g. a PyG DataLoader). Each batch must
        expose `x`, `edge_index`, `batch`, `y` and a `.to(device)` method.
  optimizer: optimizer that updates the model parameters after every batch.
  loss_fn: callable taking (model output, labels) and returning a scalar loss.
  --------------------------------------
  Return: sum of the per-batch loss values as a Python float (0.0 when `data`
          is empty). Note that this is a sum, not a mean, so it scales with
          the number of batches.
  """
  model.train()

  # Move every batch to wherever the model lives so that running on a
  # non-CPU device does not fail with a device mismatch.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else None

  out_loss = 0.0
  for batch in data:
    if device is not None:
      batch = batch.to(device)

    optimizer.zero_grad()
    out = model(batch.x, batch.edge_index, batch.batch)
    loss = loss_fn(out, batch.y)

    loss.backward()
    optimizer.step()

    out_loss += loss.item()

  return out_loss


In [None]:
# Shape of the label tensor of the last inspected batch; it is compared
# against the model's per-graph output in the loss.
data.y.shape

torch.Size([69])

## GCN Model Performance

In [None]:
@torch.no_grad()
def evaluate(model, loader, eval_metric = 'all'):
  """
  Evaluate a graph classifier on every batch of `loader`.

  -------------------------------------
  model: model called as model(x, edge_index, batch) returning one row of
         class scores (logits or log-probabilities) per graph.
  loader: iterable of batches exposing `x`, `edge_index`, `batch`, `y` and
          `.to(device)`.
  eval_metric: currently unused; all three metrics are always returned.

  -------------------------------------
  Return: (accuracy, f1, auc). F1 and AUC treat the task as binary, with
          class 1 as the positive class.
  """

  # Put model in evaluation mode
  model.eval()

  # Evaluate on the device the model lives on.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else None

  pred = []   # hard class predictions, one int per graph
  score = []  # probability of class 1, one float per graph (for AUC)
  label = []  # ground-truth labels, one int per graph

  for batch in loader:
    if device is not None:
      batch = batch.to(device)
    out = model(batch.x, batch.edge_index, batch.batch)

    # sklearn expects flat sequences of plain numbers: no keepdim (which would
    # yield nested one-element lists) and no 0-d tensors for the labels.
    pred.extend(out.argmax(dim=-1).tolist())
    # softmax maps both logits and log-probabilities to class probabilities.
    score.extend(torch.softmax(out, dim=-1)[:, 1].tolist())
    label.extend(batch.y.view(-1).tolist())

  acc = accuracy_score(label, pred)
  f1 = f1_score(label, pred)
  # AUC is a ranking metric, so it is computed on scores, not hard predictions.
  auc = roc_auc_score(label, score)

  return acc, f1, auc


## Run GCN Model

In [None]:
def _str2bool(value):
  """Parse a command-line boolean; plain `type=bool` treats any non-empty string (even 'False') as True."""
  if isinstance(value, bool):
    return value
  lowered = str(value).strip().lower()
  if lowered in ('true', 't', 'yes', 'y', '1'):
    return True
  if lowered in ('false', 'f', 'no', 'n', '0'):
    return False
  raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')


parser = argparse.ArgumentParser()

parser.add_argument('--seed', type=int, default=777, help='random seed')
parser.add_argument('--device', type=str, default='cpu', help='specify cuda devices')

# hyper-parameters
# NOTE(review): several options (dataset, batch_size, concat, multi_gpu,
# feature, model, num_layers) are parsed but not consumed by the visible cells.
parser.add_argument('--dataset', type=str, default='gossipcop', help='[politifact, gossipcop]')
parser.add_argument('--batch_size', type=int, default=128, help='batch size')
parser.add_argument('--lr', type=float, default=0.01, help='learning rate')
parser.add_argument('--weight_decay', type=float, default=0.01, help='weight decay')
parser.add_argument('--hidden_dim', type=int, default=128, help='hidden size')
parser.add_argument('--dropout', type=float, default=0.0, help='dropout ratio')
parser.add_argument('--epochs', type=int, default=100, help='maximum number of epochs')
parser.add_argument('--concat', type=_str2bool, default=True, help='whether concat news embedding and graph embedding')
parser.add_argument('--multi_gpu', type=_str2bool, default=False, help='multi-gpu mode')
parser.add_argument('--feature', type=str, default='bert', help='feature type, [profile, spacy, bert, content]')
parser.add_argument('--model', type=str, default='sage', help='model type, [gcn, gat, sage]')
parser.add_argument('--eval', type = str, default = 'accuracy', help = 'evaluation type')
# A layer count is an integer; type=str would turn a user-supplied value into text.
parser.add_argument('--num_layers', type=int, default = 2, help = 'num of GNN layers')

# parse_known_args tolerates the extra arguments the notebook kernel is launched with.
args, _ = parser.parse_known_args()

# Seed the RNGs for reproducible initialisation and shuffling.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
  torch.cuda.manual_seed(args.seed)

# Model dimensions are taken from the dataset rather than from the command line.
args.num_classes = train_data.num_classes
args.num_features = train_data.num_features

model = GCN(args)
model = model.to(args.device)

In [None]:
#if __name__ == '__main__':

# Train for args.epochs epochs, keep the checkpoint with the best validation
# accuracy, then report accuracy / F1 / AUC of that checkpoint on the test split.

best_model = None
# Start below any reachable accuracy so the first epoch always yields a checkpoint.
best_val_value = float('-inf')

evaluator = args.eval

optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay = args.weight_decay)
# The model already emits log-probabilities; CrossEntropyLoss applies
# log-softmax again, which leaves normalised log-probabilities unchanged.
loss_fn = torch.nn.CrossEntropyLoss()

# Lists to store values for plotting
train_losses = []
train_eval_values = []
val_eval_values = []

# range is end-exclusive: "+ 1" makes the loop run exactly args.epochs times.
for epoch in tqdm(range(1, args.epochs + 1)):
  loss = train(model, train_loader, optimizer, loss_fn)
  train_eval = evaluate(model, train_loader)
  val_eval = evaluate(model, val_loader)

  # Append values for plotting (index 0 of the evaluate() result is accuracy)
  train_losses.append(loss)
  train_eval_values.append(train_eval[0])
  val_eval_values.append(val_eval[0])

  if val_eval[0] > best_val_value:
    best_val_value = val_eval[0]
    # Snapshot the weights; the live model keeps training afterwards.
    best_model = copy.deepcopy(model)
  print(f'loss_train:{loss:.4f}, train_{evaluator}:{train_eval[0]:.4f}, val_{evaluator}:{val_eval[0]:.4f} ')

# With zero epochs no checkpoint exists; fall back to the current model.
if best_model is None:
  best_model = copy.deepcopy(model)

acc, f1, auc = evaluate(best_model, test_loader)
print(f'Test results: acc {acc:.4f}, f1 score: {f1:.4f}, auc: {auc:.4f}' )



  0%|          | 0/99 [00:00<?, ?it/s]


ValueError: ignored

In [None]:
# Plotting: one figure with three panels -- training loss, train/validation
# accuracy per epoch, and the test metrics of the best checkpoint.
fig, (loss_ax, accuracy_ax, test_ax) = plt.subplots(1, 3, figsize=(12, 4))

# Panel 1: training loss per epoch
loss_ax.plot(train_losses, label='Train Loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training Loss')
loss_ax.legend()

# Panel 2: accuracy curves
accuracy_ax.plot(train_eval_values, label='Train Accuracy')
accuracy_ax.plot(val_eval_values, label='Validation Accuracy')
accuracy_ax.set_xlabel('Epochs')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.set_title('Training and Validation Accuracy')
accuracy_ax.legend()

# Re-score the best checkpoint on the test split; evaluate() returns (acc, f1, auc).
acc, f1, auc = evaluate(best_model, test_loader)

# Panel 3: test metrics as a bar chart
test_ax.bar(['Accuracy', 'F1 Score', 'AUC'], [acc, f1, auc])
test_ax.set_ylabel('Metrics')
test_ax.set_title('Test Metrics')

fig.tight_layout()
plt.show()