In [1]:
!pip install torch-geometric
!pip install pyg_lib torch_scatter torch_sparse -f https://data.pyg.org/whl/torch-2.6.0+cu124.html

Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1
Looking in links: https://data.pyg.org/whl/torch-2.6.0+cu124.html
Collecting pyg_lib
  Downloading https://data.pyg.org/whl/torch-2.6.0%2Bcu124/pyg_lib-0.4.0%2Bpt26cu124-cp311-cp311-linux_x86_64.whl (4.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB

In [2]:
import torch
import torch.nn.functional as F
from torch.nn import ModuleList
from torch.optim import Adam

from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures

from torch_geometric.nn import GCNConv
from torch_geometric.nn import HypergraphConv

import pandas as pd

import random
from collections import defaultdict
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# 1-hop 하이퍼엣지 생성
def generate_1hop_hyperedge_index(data):
    edge_index = data.edge_index
    num_nodes = data.num_nodes
    edge_dict = defaultdict(set)

    # 1-hop 이웃 관계 구성 (양방향으로 간주)
    for src, tgt in edge_index.t().tolist():
        edge_dict[src].add(tgt)
        edge_dict[tgt].add(src)

    # hyperedge 생성: 각 노드 + 그 이웃들 = 하나의 hyperedge
    node_list = []
    hyperedge_list = []
    for hyperedge_id, node in enumerate(range(num_nodes)):
        group = edge_dict[node] | {node}  # 자신 포함
        for n in group:
            node_list.append(n)
            hyperedge_list.append(hyperedge_id)

    hyperedge_index = torch.tensor([node_list, hyperedge_list], dtype=torch.long)
    return hyperedge_index

In [4]:
# gaussian noise
def add_gaussian_noise(data, sigma=0.1, seed=42):
    torch.manual_seed(seed)
    noise = torch.randn_like(data.x) * sigma
    data_noisy = data.clone()
    data_noisy.x = data.x + noise
    return data_noisy

# perturbation
def perturb_edges(data, perturb_ratio=0.05, seed=42):
    random.seed(seed)
    num_edges = data.edge_index.size(1)
    num_nodes = data.num_nodes
    data_perturbed = data.clone()

    # edge 삭제
    num_remove = int(num_edges * perturb_ratio)
    edge_indices = list(range(num_edges))
    remove_indices = random.sample(edge_indices, num_remove)

    mask = torch.ones(num_edges, dtype=torch.bool)
    mask[remove_indices] = False
    data_perturbed.edge_index = data.edge_index[:, mask]

    # edge 추가
    num_add = num_remove  # 삭제 수 == 추가 수
    added_edges = []
    while len(added_edges) < num_add:
        u = random.randint(0, num_nodes - 1)
        v = random.randint(0, num_nodes - 1)
        if u == v:
            continue
        # 중복 방지
        if ((data_perturbed.edge_index[0] == u) & (data_perturbed.edge_index[1] == v)).any():
            continue
        added_edges.append([u, v])
        added_edges.append([v, u])  # 무방향 그래프 가정

    if added_edges:
        added_edges = torch.tensor(added_edges).t().contiguous()
        data_perturbed.edge_index = torch.cat([data_perturbed.edge_index, added_edges], dim=1)

    return data_perturbed

# label noise
def add_label_noise(data, noise_ratio=0.1, seed=42):
    torch.manual_seed(seed)
    data_noisy = data.clone()
    num_nodes = data.y.size(0)
    num_noisy = int(num_nodes * noise_ratio)

    all_indices = torch.randperm(num_nodes)
    noisy_indices = all_indices[:num_noisy]

    num_classes = int(data.y.max().item() + 1)
    for idx in noisy_indices:
        original_label = data_noisy.y[idx].item()
        new_label = random.choice([c for c in range(num_classes) if c != original_label])
        data_noisy.y[idx] = new_label

    return data_noisy

In [5]:
# dataset: Cora, Citeseer, Pubmed
normalize = NormalizeFeatures()
datasets = {}

# 원본, gaussain noise, perturbation, label noise
augmentations = {
    "": lambda d: d,  # 원본
    "_gaussiannoise": lambda d: add_gaussian_noise(d.clone(), sigma=0.1),
    "_perturbation": lambda d: perturb_edges(d.clone(), perturb_ratio=0.05),
    "_labelnoise": lambda d: add_label_noise(d.clone(), noise_ratio=0.05),
}

planetoid_names = ['Cora', 'Citeseer', 'Pubmed']

for name in planetoid_names:
    base_data = Planetoid(root=f'data/{name}', name=name, transform=normalize)[0]

    for suffix, aug_func in augmentations.items():
        data_aug = aug_func(base_data)
        data_aug.hyperedge_index = generate_1hop_hyperedge_index(data_aug)
        datasets[name + suffix] = data_aug

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.y
Dow

In [6]:
# hypergraph 정보
def summarize_hypergraph(data):
    if not hasattr(data, 'hyperedge_index'):
        return "No hyperedge_index"

    he = data.hyperedge_index
    num_nodes = data.num_nodes
    num_hyperedges = he[1].max().item() + 1 if he.numel() > 0 else 0

    # 각 하이퍼엣지가 연결한 노드 수
    edge_sizes = Counter(he[1].tolist())
    avg_size = sum(edge_sizes.values()) / len(edge_sizes) if edge_sizes else 0

    return f"{num_hyperedges} hyperedges, {avg_size:.2f} avg size"

# dataset 정보
for name, data in datasets.items():
    print(f"\n Dataset: {name}")
    print(f" - Nodes         : {data.num_nodes}")
    print(f" - Edges         : {data.num_edges}")
    print(f" - Features      : {data.num_node_features}")
    print(f" - Classes       : {data.y.unique().numel()}")
    print(f" - Hypergraph    : {summarize_hypergraph(data)}")


 Dataset: Cora
 - Nodes         : 2708
 - Edges         : 10556
 - Features      : 1433
 - Classes       : 7
 - Hypergraph    : 2708 hyperedges, 4.90 avg size

 Dataset: Cora_gaussiannoise
 - Nodes         : 2708
 - Edges         : 10556
 - Features      : 1433
 - Classes       : 7
 - Hypergraph    : 2708 hyperedges, 4.90 avg size

 Dataset: Cora_perturbation
 - Nodes         : 2708
 - Edges         : 10557
 - Features      : 1433
 - Classes       : 7
 - Hypergraph    : 2708 hyperedges, 5.09 avg size

 Dataset: Cora_labelnoise
 - Nodes         : 2708
 - Edges         : 10556
 - Features      : 1433
 - Classes       : 7
 - Hypergraph    : 2708 hyperedges, 4.90 avg size

 Dataset: Citeseer
 - Nodes         : 3327
 - Edges         : 9104
 - Features      : 3703
 - Classes       : 6
 - Hypergraph    : 3327 hyperedges, 3.74 avg size

 Dataset: Citeseer_gaussiannoise
 - Nodes         : 3327
 - Edges         : 9104
 - Features      : 3703
 - Classes       : 6
 - Hypergraph    : 3327 hyperedg

In [7]:
# GCN
class GCN(torch.nn.Module):
    def __init__(self, in_channels, out_channels, hidden_channels=16, num_layers=2, dropout=0.5):
        super().__init__()
        self.convs = ModuleList()
        self.convs.append(GCNConv(in_channels, hidden_channels))
        for _ in range(num_layers - 2):
            self.convs.append(GCNConv(hidden_channels, hidden_channels))
        self.convs.append(GCNConv(hidden_channels, out_channels))
        self.dropout = dropout

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        for i, conv in enumerate(self.convs[:-1]):
            x = F.relu(conv(x, edge_index))
            x = F.dropout(x, p=self.dropout, training=self.training)
        return self.convs[-1](x, edge_index)

    def get_hidden_embeddings(self, data):
        x, edge_index = data.x, data.edge_index
        for conv in self.convs[:-1]:
            x = F.relu(conv(x, edge_index))
            x = F.dropout(x, p=self.dropout, training=False)
        return x

# HyperGCN
class HyperGCN(torch.nn.Module):
    def __init__(self, in_channels, out_channels, hidden_channels=16, num_layers=2, dropout=0.5):
        super().__init__()
        self.convs = ModuleList()
        self.convs.append(HypergraphConv(in_channels, hidden_channels))
        for _ in range(num_layers - 2):
            self.convs.append(HypergraphConv(hidden_channels, hidden_channels))
        self.convs.append(HypergraphConv(hidden_channels, out_channels))
        self.dropout = dropout

    def forward(self, data):
        x, hyperedge_index = data.x, data.hyperedge_index
        for i, conv in enumerate(self.convs[:-1]):
            x = F.relu(conv(x, hyperedge_index))
            x = F.dropout(x, p=self.dropout, training=self.training)
        return self.convs[-1](x, hyperedge_index)

    def get_hidden_embeddings(self, data):
        x, hyperedge_index = data.x, data.hyperedge_index
        for conv in self.convs[:-1]:
            x = F.relu(conv(x, hyperedge_index))
            x = F.dropout(x, p=self.dropout, training=False)
        return x

In [8]:
# train, test
def train(model, data, optimizer, epoch=None):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = F.cross_entropy(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    if epoch is not None and epoch % 10 == 0:
        print(f"[Epoch {epoch:>3}] Loss: {loss.item():.4f}")

@torch.no_grad()
def test(model, data):
    model.eval()
    out = model(data)
    accs = []
    for mask in [data.train_mask, data.val_mask, data.test_mask]:
        pred = out[mask].argmax(dim=1)
        acc = (pred == data.y[mask]).sum().item() / mask.sum().item()
        accs.append(acc)
    return accs

In [9]:
# training, validation, test
def run_experiment(name="Cora", model_class=GCN, epochs=200, lr=0.01, weight_decay=5e-4, verbose=False):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    data = datasets[name].to(device)

    num_features = data.num_features
    num_classes = data.y.unique().numel()

    model = model_class(num_features, num_classes).to(device)
    optimizer = Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    best_val_acc = 0.0
    test_acc_at_best_val = 0.0
    final_train_acc = 0.0

    for epoch in range(1, epochs + 1):
        train(model, data, optimizer, epoch if verbose else None)
        accs = test(model, data)
        train_acc, val_acc, test_acc = accs

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            test_acc_at_best_val = test_acc
            final_train_acc = train_acc

        if verbose and epoch % 10 == 0:
            print(f"[Epoch {epoch:>3}] Val Acc: {val_acc:.4f} | Test Acc: {test_acc:.4f}")

    return {
        "Model": model_class.__name__,
        "Dataset": name,
        "Train Acc @ Best Val": final_train_acc,
        "Best Val Acc": best_val_acc,
        "Test Acc @ Best Val": test_acc_at_best_val
    }

In [10]:
# 실행
def run_all_experiments(model_classes=[GCN, HyperGCN], dataset_dict=datasets, verbose=False):
    results = []

    for model_cls in model_classes:
        for name in dataset_dict.keys():
            print(f"\n---------- {model_cls.__name__} | {name} ----------")
            result = run_experiment(name=name, model_class=model_cls, verbose=verbose)
            results.append(result)

    return pd.DataFrame(results)

# 결과 저장 및 출력
df = run_all_experiments(verbose=True)
print("Final Results:")
print(df)


---------- GCN | Cora ----------
[Epoch  10] Loss: 1.8494
[Epoch  10] Val Acc: 0.6420 | Test Acc: 0.6860
[Epoch  20] Loss: 1.6934
[Epoch  20] Val Acc: 0.6860 | Test Acc: 0.7110
[Epoch  30] Loss: 1.5202
[Epoch  30] Val Acc: 0.7240 | Test Acc: 0.7600
[Epoch  40] Loss: 1.3259
[Epoch  40] Val Acc: 0.7300 | Test Acc: 0.7700
[Epoch  50] Loss: 1.1503
[Epoch  50] Val Acc: 0.7660 | Test Acc: 0.7980
[Epoch  60] Loss: 0.9695
[Epoch  60] Val Acc: 0.7740 | Test Acc: 0.7950
[Epoch  70] Loss: 0.8752
[Epoch  70] Val Acc: 0.7720 | Test Acc: 0.7980
[Epoch  80] Loss: 0.7267
[Epoch  80] Val Acc: 0.7780 | Test Acc: 0.8110
[Epoch  90] Loss: 0.6984
[Epoch  90] Val Acc: 0.7820 | Test Acc: 0.8140
[Epoch 100] Loss: 0.6075
[Epoch 100] Val Acc: 0.7860 | Test Acc: 0.8160
[Epoch 110] Loss: 0.5659
[Epoch 110] Val Acc: 0.7800 | Test Acc: 0.8100
[Epoch 120] Loss: 0.4933
[Epoch 120] Val Acc: 0.7820 | Test Acc: 0.8140
[Epoch 130] Loss: 0.4701
[Epoch 130] Val Acc: 0.7940 | Test Acc: 0.8130
[Epoch 140] Loss: 0.4784
[Epoc

In [12]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def extract_base_dataset(name):
    return name.split('_')[0]

def extract_aug_type(name):
    if '_gaussiannoise' in name:
        return 'Gaussian Noise'
    elif '_perturbation' in name:
        return 'Edge Perturbation'
    elif '_labelnoise' in name:
        return 'Label Noise'
    else:
        return 'Original'

def plot_grouped_test_accuracy(df, dataset_name, save_path="test_accuracy_plot"):
    df = df.copy()

    # 원하는 데이터셋만 필터링
    df = df[df['Dataset'].str.startswith(dataset_name)]

    df['Augmentation'] = df['Dataset'].apply(extract_aug_type)
    df['Augmentation'] = pd.Categorical(
        df['Augmentation'],
        ['Original', 'Gaussian Noise', 'Edge Perturbation', 'Label Noise'],
        ordered=True
    )

    df['Model'] = df['Model'].replace({'HyperGCN': 'HGCN'})

    plt.figure(figsize=(8, 5))
    sns.barplot(
        data=df,
        x="Augmentation",
        y="Test Acc @ Best Val",
        hue="Model",
        ci="sd",
        dodge=True
    )

    plt.title(f"Test Accuracy on {dataset_name}")
    plt.ylim(0, 1)
    plt.ylabel("Accuracy")
    plt.xlabel("Perturbation Type")
    plt.legend(title="Model")
    plt.grid(True, linestyle='--', alpha=0.3)
    plt.tight_layout()

    if save_path:
        save_path += "_" + dataset_name + ".png"
        plt.savefig(save_path, dpi=300)
        plt.close()
    else:
        plt.show()

plot_grouped_test_accuracy(df, 'Cora')
plot_grouped_test_accuracy(df, 'Citeseer')
plot_grouped_test_accuracy(df, 'Pubmed')


The `ci` parameter is deprecated. Use `errorbar='sd'` for the same effect.

  sns.barplot(

The `ci` parameter is deprecated. Use `errorbar='sd'` for the same effect.

  sns.barplot(

The `ci` parameter is deprecated. Use `errorbar='sd'` for the same effect.

  sns.barplot(
