In [16]:
import pandas as pd
import numpy as np
import os
import json
import networkx as nx
from torch_geometric.utils import from_networkx
from torch_geometric.data import Data
from sklearn.preprocessing import StandardScaler

path = '../preprocessed_data/politifact/'
labels = ['fake', 'real']

data_list = []
for label in labels:
    for news_article in os.listdir(os.path.join(path, label)):
        path_n = os.path.join(path, label, news_article)
        
        with open(os.path.join(path_n, f"{news_article}_graph.txt")) as f:
            edges = json.load(f)

        # edges = np.array(edges).T
        
        node_features = pd.read_csv(os.path.join(path_n, f"{news_article}_nf.csv"))
        node_features['type'] = node_features['type'].map({'tweet': 1, 'retweet':2})
        ids = node_features['id']
        node_features = node_features.drop(['id'], axis=1)
        ss = StandardScaler()
        node_features = pd.DataFrame(data=ss.fit_transform(node_features), columns=node_features.columns)
        node_features = node_features.to_dict(orient="records")

        g = nx.DiGraph()
        for id, i in zip(ids, node_features):
            g.add_node(str(id), **i)
        g.add_edges_from(edges)
        nx.set_node_attributes(g, {news_article: {k: 0 for k in i.keys()}})
        g = nx.convert_node_labels_to_integers(g)
        data = from_networkx(g, group_node_attrs=list(i.keys()))

        data.y = 1 if label == "real" else 0

        data_list.append(data)

In [1]:
import torch
data_list = torch.load("gdl_data.txt")

In [4]:
import random

random.shuffle(data_list)

train_dataset = data_list[:500]
test_dataset = data_list[500:]

print(f'Number of training graphs: {len(train_dataset)}')
print(f'Number of test graphs: {len(test_dataset)}')

Number of training graphs: 500
Number of test graphs: 289


In [5]:
len([1 for i in train_dataset if i.y == 0])

245

In [6]:
from torch_geometric.loader import DataLoader

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# for step, data in enumerate(train_loader):
#     print(f'Step {step + 1}:')
#     print('=======')
#     print(f'Number of graphs in the current batch: {data.num_graphs}')
#     print(data)
#     print()

In [7]:
import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GraphConv, GCNConv
from torch_geometric.nn import global_mean_pool, global_add_pool


class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GraphConv(12, hidden_channels)
        self.conv2 = GraphConv(hidden_channels, hidden_channels)
        self.conv3 = GraphConv(hidden_channels, hidden_channels)
        self.conv4 = GraphConv(hidden_channels, hidden_channels)
        self.conv5 = GraphConv(hidden_channels, hidden_channels)
        self.lin1 = Linear(hidden_channels, 32)
        self.lin2 = Linear(32, 2)

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings 
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)
        x = x.relu()
        x = self.conv4(x, edge_index)
        x = x.relu()
        x = self.conv5(x, edge_index)

        # 2. Readout layer
        x = global_add_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = self.lin1(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin2(x)
        
        return x

model = GCN(hidden_channels=64)
print(model)

GCN(
  (conv1): GraphConv(12, 64)
  (conv2): GraphConv(64, 64)
  (conv3): GraphConv(64, 64)
  (conv4): GraphConv(64, 64)
  (conv5): GraphConv(64, 64)
  (lin1): Linear(in_features=64, out_features=32, bias=True)
  (lin2): Linear(in_features=32, out_features=2, bias=True)
)


In [8]:
model = GCN(hidden_channels=64)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()

    for data in train_loader:  # Iterate in batches over the training dataset.
         out = model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
         loss = criterion(out, data.y)  # Compute the loss.
         loss.backward()  # Derive gradients.
         optimizer.step()  # Update parameters based on gradients.
         optimizer.zero_grad()  # Clear gradients.
    
    return loss

def test(loader):
     model.eval()

     correct = 0
     for data in loader:  # Iterate in batches over the training/test dataset.
         out = model(data.x, data.edge_index, data.batch)  
         pred = out.argmax(dim=1)  # Use the class with highest probability.
         correct += int((pred == data.y).sum())  # Check against ground-truth labels.
     return correct / len(loader.dataset)  # Derive ratio of correct predictions.


for epoch in range(1, 20):
    loss = train()
    train_acc = test(train_loader)
    test_acc = test(test_loader)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.3f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

Epoch: 001, Loss: 231234368.000, Train Acc: 0.5100, Test Acc: 0.5156
Epoch: 002, Loss: 80198896.000, Train Acc: 0.4960, Test Acc: 0.5017
Epoch: 003, Loss: 90719328.000, Train Acc: 0.5240, Test Acc: 0.5156
Epoch: 004, Loss: 13584580.000, Train Acc: 0.5000, Test Acc: 0.5087
Epoch: 005, Loss: 104387.008, Train Acc: 0.5180, Test Acc: 0.4671
Epoch: 006, Loss: 181.353, Train Acc: 0.4400, Test Acc: 0.4187
Epoch: 007, Loss: 14.213, Train Acc: 0.4740, Test Acc: 0.4118
Epoch: 008, Loss: 7.058, Train Acc: 0.4780, Test Acc: 0.4083
Epoch: 009, Loss: 19.973, Train Acc: 0.5440, Test Acc: 0.5502
