# Problem 6

In [1]:
# Import libraries
import numpy as np
#import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch_geometric.datasets import Planetoid
import networkx as nx
from torch_geometric.utils import to_networkx
from torch.cuda import is_available

device = 'cpu'
if is_available():
	device = 'gpu'

print(f'Using device {device}')

Using device cpu


In [2]:
# Import dataset
dataset = Planetoid(root = '/tmp/Cora', name = 'PubMed')
dataset

PubMed()

## Exploratory Data Analysis

In this problem we utilize the PubMed dataset, a citation network with nodes representing scientific publications from the PubMed database, primarily in the biomedical field. Edges indicate citations between these publications. Each node features a TF-IDF weighted word vector from the publication's abstract from a dictionary which consists of 500 unique words and a class label that denotes the publication's subject category.

### 1. Network Structure

To begin with, assess the number of publications (nodes), citation links (edges), average citations per publication, citation distribution, and clustering coefficient.



In [3]:
data = dataset[0]
data

Data(x=[19717, 500], edge_index=[2, 88648], y=[19717], train_mask=[19717], val_mask=[19717], test_mask=[19717])

In [4]:
data.x

tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.1046, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0194, 0.0080,  ..., 0.0000, 0.0000, 0.0000],
        [0.1078, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0266, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])

In [5]:
data.edge_index.T

tensor([[ 1378,     0],
        [ 1544,     0],
        [ 6092,     0],
        ...,
        [12278, 19714],
        [ 4284, 19715],
        [16030, 19716]])

PubMed dataset classes:

...

In [6]:
from collections import Counter
print('Class distribution')
sorted(Counter(data.y.tolist()).items())

Class distribution


[(0, 4103), (1, 7739), (2, 7875)]

In [7]:
# Number of publications (nodes)
num_nodes = data.num_nodes
print(f'Number of publications (nodes): {num_nodes}')

# Number of citation links (Edges)
num_edges = data.num_edges
print(f'Number of citation links (edges): {num_edges}')

# Average citations per publication
avg_cit = num_edges / num_nodes
print(f'Average citations per publication: {avg_cit:.2f}')

# Citation distribution
citation_counts = Counter(data.edge_index[1].tolist())
citations = list(citation_counts.values())

# Citation Distribution
citation_counts = Counter(data.edge_index[1].tolist())
citations = list(citation_counts.values())
print(citations)
print(len(citations))

Number of publications (nodes): 19717
Number of citation links (edges): 88648
Average citations per publication: 4.50
[5, 3, 3, 1, 1, 2, 22, 17, 1, 9, 6, 1, 10, 1, 1, 6, 29, 6, 8, 8, 1, 2, 2, 1, 4, 1, 5, 3, 1, 2, 1, 1, 1, 1, 3, 17, 1, 1, 4, 1, 8, 4, 1, 1, 1, 1, 11, 31, 18, 2, 1, 1, 1, 1, 6, 2, 1, 3, 7, 3, 25, 1, 22, 1, 2, 3, 1, 2, 1, 7, 1, 3, 5, 1, 1, 1, 3, 9, 1, 2, 3, 2, 1, 2, 3, 7, 6, 1, 14, 1, 1, 2, 1, 1, 1, 4, 1, 2, 1, 13, 2, 19, 3, 2, 1, 15, 1, 1, 5, 2, 23, 1, 17, 2, 1, 2, 24, 27, 2, 1, 1, 1, 1, 1, 16, 1, 1, 2, 1, 7, 1, 2, 11, 1, 1, 1, 12, 1, 1, 3, 2, 6, 4, 2, 15, 9, 3, 7, 2, 1, 10, 1, 20, 1, 1, 2, 2, 1, 1, 7, 2, 1, 1, 2, 9, 2, 1, 3, 17, 2, 1, 1, 1, 4, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 14, 2, 1, 1, 1, 1, 2, 2, 5, 8, 2, 13, 1, 1, 1, 3, 5, 5, 1, 10, 9, 3, 1, 1, 1, 28, 6, 8, 2, 1, 1, 3, 1, 5, 1, 2, 2, 1, 1, 1, 1, 2, 2, 2, 1, 1, 7, 1, 1, 1, 4, 9, 7, 1, 2, 1, 1, 25, 1, 1, 6, 1, 9, 1, 2, 2, 2, 1, 10, 2, 1, 1, 2, 18, 1, 3, 1, 2, 4, 2, 7, 3, 3, 8, 2, 2, 2, 2, 1, 1, 1, 1, 1, 9, 1, 1, 1, 10,

In [8]:
import seaborn as sns

# Log-log plot
counts, bin_edges = np.histogram(citations, bins=np.logspace(np.log10(1), np.log10(max(citations)), num=100))
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

plt.figure(figsize=(10, 6))
plt.loglog(bin_centers, counts, marker='o', linestyle='none')
plt.title('Citation Distribution (Log-Log Plot)')
plt.xlabel('Number of Citations')
plt.ylabel('Frequency')
plt.grid(True, which="both", ls="--")
plt.show()

: 

In [None]:


# Clustering Coefficient
# Convert the graph to a NetworkX graph for easier computation of clustering coefficient
G = to_networkx(data, to_undirected=True)

clustering_coefficients = nx.clustering(G)
avg_clustering_coefficient = sum(clustering_coefficients.values()) / num_nodes
print(f'Average clustering coefficient: {avg_clustering_coefficient:.4f}')


### 2. Network Features

### 3. Class distribution

## Tasks

### Node Classification

Construct and train a Graph Neural Network (GNN) to categorize publications
within the PubMed dataset into their corresponding subject categories. Test different GNN architectures like GCN, GraphSAGE, and GAT.

### Link Prediction

Build a GNN-based model to predict the presence of citation links between
publications.