In [1]:
import csv
import networkx as nx
import numpy as np
from GC2Tree import GC2Tree

#### Prepare Datasets
First, we check if the datasets for benchmarking exists. If not we download them.

In [2]:
import os
from zipfile import ZipFile
from urllib.request import urlretrieve

names = [
    'citeseer-doc-classification',
    'cora',
]

downloadurls = [
    'https://linqs-data.soe.ucsc.edu/public/datasets/citeseer-doc-classification/citeseer-doc-classification.zip',
    'https://linqs-data.soe.ucsc.edu/public/datasets/cora/cora.zip',
]

if not os.path.exists('datasets'):
    os.makedirs('datasets')
for name, url in zip(names, downloadurls):
    if not os.path.exists('datasets/' + name):
        urlretrieve(url, 'datasets/' + name + '.zip')
        with ZipFile('datasets/' + name + '.zip', 'r') as zipObj:
            zipObj.extractall('datasets/')

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

def evaluate_baseline(X, y, adj, max_depth=10, n_runs=10, test_ratio=0.2):
    train_accuracies = []
    test_accuracies = []
    for _ in range(n_runs):
        X_with_dummy = np.concatenate([X, np.ones((X.shape[0], 1))], axis=1)
        X_plus = np.concatenate([X_with_dummy, adj.dot(X_with_dummy), adj.dot(adj.dot(X_with_dummy))], axis=1)
        train_X, test_X, train_y, test_y = train_test_split(X_plus,  y, test_size=test_ratio, random_state=42)
        clf = DecisionTreeClassifier(max_depth=12)
        clf.fit(train_X, train_y),
        train_accuracies.append(clf.score(train_X, train_y))
        test_accuracies.append(clf.score(test_X, test_y))
    print(f'Decision Tree with access to ode features, 2-hop neighbor features, {n_runs} runs, {test_ratio} test ratio')
    print(f'Average train accuracy: {np.mean(train_accuracies):.3f}')
    print(f'Average test accuracy: {np.mean(test_accuracies):.3f}')
    print(f'Test Accuracy at best train accuracy: {test_accuracies[np.argmax(train_accuracies)]:.3f}')
    
def evaluate_gc2_tree(X, y, adj, depth=3, tree_depth=8, n_runs=10, test_ratio=0.2):
    train_accuracies = []
    test_accuracies = []
    for _ in range(n_runs):
        test_samples = np.random.choice(len(y), size=int(0.1*len(y)), replace=False)
        test_mask = np.zeros(len(y), dtype=bool)
        test_mask[test_samples] = True
        y_no_test = y.copy()
        y_no_test[test_samples] = -1
        clf = GC2Tree(depth=4, tree_depth=8)
        clf.fit(adj, X, y_no_test, test_mask=test_mask)
        train_accuracies.append(clf.training_accuracy(y, test_mask))
        test_accuracies.append(clf.score(y, test_mask))
    print(f'GC2Tree with access with {n_runs} runs, {test_ratio} test ratio')
    print(f'Average train accuracy: {np.mean(train_accuracies):.3f}')
    print(f'Average test accuracy: {np.mean(test_accuracies):.3f}')
    print(f'Test Accuracy at best train accuracy: {test_accuracies[np.argmax(train_accuracies)]:.3f}')


In [4]:
datasets = [
    'citeseer',
    'cora',
]

for name, dataset in zip(names, datasets):
    with open(f'datasets/{name}/{dataset}.content') as f:
        reader = csv.reader(f, delimiter='\t')
        content = list(reader)

    with open(f'datasets/{name}/{dataset}.cites') as f:
        reader = csv.reader(f, delimiter='\t')
        edges = list(reader)

    node_ids = {row[0]: i for i, row in enumerate(content)}
    X = np.array([
        row[1:-1] for row in content    
    ], dtype=float)
    y = np.array([
        row[-1] for row in content
    ])

    adj = np.zeros((len(node_ids), len(node_ids)))
    for edge in edges:
        if edge[0] in node_ids and edge[1] in node_ids:
            adj[node_ids[edge[0]], node_ids[edge[1]]] = 1
            adj[node_ids[edge[1]], node_ids[edge[0]]] = 1
    print(f'Loaded {dataset} dataset with {len(node_ids)} nodes and {len(edges)} edges')
    evaluate_baseline(X, y, adj, n_runs=5)
    evaluate_gc2_tree(X, y, adj, n_runs=5)

Loaded citeseer dataset with 3312 nodes and 4732 edges
Decision Tree with access to node features, 1-hop neighbor features, 5 runs, 0.2 test ratio
Average train accuracy: 0.8500566251415629
Average test accuracy: 0.6328808446455505
GC2Tree with access to node features, 1-hop neighbor features, 5 runs, 0.2 test ratio
Average train accuracy: 0.827
Average test accuracy: 0.640
Loaded cora dataset with 2708 nodes and 5429 edges
Decision Tree with access to node features, 1-hop neighbor features, 5 runs, 0.2 test ratio
Average train accuracy: 0.9240073868882732
Average test accuracy: 0.7450184501845019
GC2Tree with access to node features, 1-hop neighbor features, 5 runs, 0.2 test ratio
Average train accuracy: 0.898
Average test accuracy: 0.756


In [12]:
y = np.array([1 if formulas[1].evaluate(graph, i) else 0 for i in range(graph.number_of_nodes())])
y.mean()

In [9]:
from c2 import *

formulas = [
    GuardedExistsGeq(4, Var.y, Or(GuardedExistsLeq(7, Var.x, E(Var.x, Var.y)), GuardedExistsGeq(13, Var.x, E(Var.x, Var.y)))),
    GuardedExistsGeq(9, Var.y, GuardedExistsLeq(7, Var.x, GuardedExistsGeq(13, Var.y, E(Var.x, Var.y))))
]
graph = nx.fast_gnp_random_graph(10000, 0.001)
adj = nx.adjacency_matrix(graph)
X = np.zeros((graph.number_of_nodes(), 0))
X_with_dummy = np.concatenate([X, np.ones((X.shape[0], 1))], axis=1)
X_plus = np.concatenate([X_with_dummy, adj.dot(X_with_dummy), adj.dot(adj.dot(X_with_dummy))], axis=1)
for formula in formulas:
    y = np.array([1 if formula.evaluate(graph, i) else 0 for i in range(graph.number_of_nodes())])
    
    print(f'Formula: {formula}')
    evaluate_baseline(X_plus, y, adj, n_runs=5)
    evaluate_gc2_tree(X, y, adj, depth=3, n_runs=1)

KeyboardInterrupt: 

In [None]:
test_samples = np.random.choice(len(y), size=int(0.2*len(y)), replace=False)
test_mask = np.zeros(len(y), dtype=bool)
test_mask[test_samples] = True


strategy =  [
    [0, 1, 1],
    [1, 1, 0],
    [1, 0, 0],
]

In [None]:
clf = GC2Tree(2, 2, strategy=strategy)
clf.fit(adj, X, y, test_mask=test_mask)
clf.score(y, test_mask)

0.5505