In [1]:
%load_ext autoreload
%autoreload 2

import logging

# Keep third-party output at WARNING, but let the anonymigraph package report
# INFO-level progress; its metrics sub-package is dialled back to WARNING.
logging.basicConfig(level=logging.WARNING)
for _logger_name, _logger_level in (
    ('anonymigraph', logging.INFO),
    ('anonymigraph.metrics', logging.WARNING),
):
    logging.getLogger(_logger_name).setLevel(_logger_level)


In [2]:
from anonymigraph.metrics.evaluator import Evaluator
from anonymigraph.metrics.utility.structural.privacy_metrics import PercentageKDegreeAnonMetric

from anonymigraph.metrics.utility.structural import (
    DegreeCentralityMetric,
    EigenvectorMetric,
    PageRankMetric,
    ClosenessCentralityMetric,
    LocalClusteringCoefficientMetric,
    WLColorMetric,

    ConnectedComponentsMetric,
    NumberOfEdgesMetric,
    NumberOfNodesMetric,
    NumberOfTrianglesMetric,
    MeanDegreeMetric,
    MaxDegreeMetric,
    MedianDegreeMetric,
    AverageClusteringCoefficientMetric,
    TransitivityMetric,

    EdgeJaccardMetric,
    KatzCentralityMetric,

)

from anonymigraph.anonymization import (
    KDegreeAnonymizer,
    RandomEdgeAddDelAnonymizer,
    ConfigurationModelAnonymizer,
    NestModelAnonymizer,
    PygmalionModelAnonymizer,
    PrivateColorAnonymizer,
)


In [3]:
import pandas as pd
from collections import defaultdict
import numpy as np
import pickle

def get_latex_table(data, precision=5):
    """Helper function to get the latex code for the table"""

    G_values = {}
    cleaned_data = {}
    for key, metrics in data.items():
        new_entry = {}
        for metric_name, metric_value in metrics.items():
            if isinstance(metric_value, dict):
                G_values[metric_name] = metric_value["G"]
                new_entry[metric_name] = metric_value["Ga"]
            else:
                new_entry[metric_name] = metric_value

        cleaned_data[key] = new_entry

    for metric in list(cleaned_data.values())[0].keys():
        if metric not in G_values:
            G_values[metric] = None

    # Categories
    utility_scalar_metrics = ['|V|', '|E|', 'C']
    utility_distributions_metrics = ['Deg.','Katz','Ev.','CC','LCC']
    all_metrics = list(list(cleaned_data.values())[0].keys())
    privacy_metrics = [m for m in all_metrics
                    if m not in utility_scalar_metrics and m not in utility_distributions_metrics]

    df = pd.DataFrame(cleaned_data).T
    ordered_metrics = utility_scalar_metrics + utility_distributions_metrics + privacy_metrics
    df = df[ordered_metrics]

    df_str = df.copy()
    best_indices = {}
    for col in df.columns:
        best_indices[col] = df[col].nsmallest(3).index
    for c_idx, col in enumerate(df_str.columns):
        # Decide precision based on column:
        if c_idx in [0, 1]:
            df_str[col] = df_str[col].apply(lambda x: f"{int(x)}")
        elif c_idx in list(range(2,8)):
            df_str[col] = df_str[col].apply(lambda x: f"{x:.{precision}f}")
        else:
            df_str[col] = df_str[col].apply(lambda x: f"{x:.2f}")

        if c_idx in list(range(3,10)):
            for idx in best_indices[col]:
                df_str.loc[idx, col] = f"\\textbf{{{df_str.loc[idx, col]}}}"

    df_str.loc[len(df)] = G_values
    latex_str = df_str.to_latex(index=True,
                                caption="",
                                label="tab:",
                                bold_rows=True,
                                column_format="l" + "c"*(df.shape[1]),
                                formatters=[str] * len(df.columns)
                            )

    return latex_str

def get_statics_of_samples(sample_data):
    """Aggregate repeated runs into per-metric means and relative standard deviations.

    Parameters
    ----------
    sample_data : iterable of dict
        One entry per run, each shaped ``{method: {metric: value}}``. A value
        that is a dict contributes its ``"Ga"`` entry; any other value is used
        as it is.

    Returns
    -------
    means : dict
        ``{method: {metric: mean over all runs}}`` as Python floats.
    rel_stds : dict
        ``{method: {metric: std / mean over all runs}}`` as Python floats,
        where ``std`` is ``np.std`` with its default settings. A metric whose
        values do not vary at all reports ``0.0``; this also covers a metric
        that is exactly 0 in every run, which would otherwise be 0/0 (NaN plus
        a RuntimeWarning).
    """
    # method -> metric -> list of the values observed across runs
    collected = defaultdict(lambda: defaultdict(list))
    for run in sample_data:
        for method, metrics in run.items():
            for metric, value in metrics.items():
                collected[method][metric].append(value['Ga'] if isinstance(value, dict) else value)

    means = {}
    rel_stds = {}
    for method, metrics in collected.items():
        method_means = {}
        method_rel_stds = {}
        for metric, vals in metrics.items():
            mean = np.mean(vals)
            std = np.std(vals)
            method_means[metric] = float(mean)
            # No spread means no relative spread, whatever the mean is (avoids 0/0).
            method_rel_stds[metric] = 0.0 if std == 0 else float(std / mean)
        means[method] = method_means
        rel_stds[method] = method_rel_stds

    return means, rel_stds



# Polbooks (|V| = 105)

In [None]:
import networkx as nx
from scipy.sparse.linalg import eigs
import numpy as np
import os
import urllib.request

# Download the dataset once; later runs reuse the local copy.
if not os.path.exists('polbooks.gml'):
    urllib.request.urlretrieve(
        'https://networkdata.ics.uci.edu/data/polbooks/polbooks.gml',
        'polbooks.gml',
    )

# Load the graph and relabel its nodes with consecutive integers.
G = nx.convert_node_labels_to_integers(nx.read_gml('polbooks.gml'))

# Largest-magnitude eigenvalue of the adjacency matrix; its inverse is the
# upper bound used for the Katz attenuation factor below.
eigenvalues, _ = eigs(nx.adjacency_matrix(G).astype(np.float64), k=1, which='LM')
max_alpha = 1 / np.max(np.abs(eigenvalues))

# Use half of that bound for the Katz metric and the PrivateColor anonymizers.
alpha = 0.5 * max_alpha
beta = 1
print(G)
print("Alpha:", alpha)

In [None]:
polbooks_samples_data = []
for seed in range(42, 46):  # 4 samples with different seeds (42..45)
    # Metrics evaluated for every (G, Ga) pair; the keys become the LaTeX column headers.
    # Disabled alternatives: NumberOfTrianglesMetric, TransitivityMetric, WLColorMetric(depth=2),
    # ConnectedComponentsMetric, MedianDegreeMetric, MeanDegreeMetric, MaxDegreeMetric, PageRankMetric.
    metrics = {
        # graph level
        "|V|": NumberOfNodesMetric(),
        "|E|": NumberOfEdgesMetric(),
        "C": AverageClusteringCoefficientMetric(),
        # node level
        "Deg.": DegreeCentralityMetric(),
        "Katz": KatzCentralityMetric(alpha=alpha),
        "Ev.": EigenvectorMetric(),
        "LCC": LocalClusteringCoefficientMetric(),
        "CC": ClosenessCentralityMetric(),
        # edge overlap and k-degree anonymity
        r"\(\vert E \cap E'\vert\)": EdgeJaccardMetric(),
        r"\% 4-degree Anon": PercentageKDegreeAnonMetric(k=4),
        r"\% 16-degree Anon": PercentageKDegreeAnonMetric(k=16),
    }

    # Anonymizers under comparison; insertion order is the row order of the table.
    # Disabled variants: ConfigurationModelAnonymizer, PrivateColor with w=1e1,
    # 2% / 8% random edge add/del, and k-degree anonymity for k in {2, 8, 32}.
    methods = {
        r"NeSt \(d=1\)": NestModelAnonymizer(depth=1, r=10),
        r"NeSt \(d=2\)": NestModelAnonymizer(depth=2, r=10),

        r"Eager \(w=10^{-2}\)": PrivateColorAnonymizer(w=1e-2, alpha=alpha, is_eager=True, use_optimal1d=False),
        r"Eager \(w=10^{-3}\)": PrivateColorAnonymizer(w=1e-3, alpha=alpha, is_eager=True, use_optimal1d=False),
        r"Eager \(w=10^{-4}\)": PrivateColorAnonymizer(w=1e-4, alpha=alpha, is_eager=True, use_optimal1d=False),

        r"16\% Edge Add/Del": RandomEdgeAddDelAnonymizer(m=int(16/100*G.number_of_edges())),
        r"4\% Edge Add/Del": RandomEdgeAddDelAnonymizer(m=int(4/100*G.number_of_edges())),
        r"1\% Edge Add/Del": RandomEdgeAddDelAnonymizer(m=int(1/100*G.number_of_edges())),

        r"64-Degree Anonymity": KDegreeAnonymizer(k=64),
        r"16-Degree Anonymity": KDegreeAnonymizer(k=16),
        r"4-Degree Anonymity": KDegreeAnonymizer(k=4),

        r"Pygmalion \(\epsilon = 1\)": PygmalionModelAnonymizer(eps=1),
        r"Pygmalion \(\epsilon = 100\)": PygmalionModelAnonymizer(eps=100),
        # a very large epsilon stands in for "infinity"
        r"Pygmalion \(\epsilon = \infty\)": PygmalionModelAnonymizer(eps=100_000_000),
    }

    evaluator = Evaluator(metrics, use_igraph=True)

    # Anonymize with each method under the current seed and evaluate against the original graph.
    data = {}
    for method_name, method in methods.items():
        print(f"Anonymizing with method {method_name}")
        Ga = method.anonymize(G, random_seed=seed)
        print(f"Evaluating method {method_name}")
        data[method_name] = evaluator.evaluate(G, Ga)

    polbooks_samples_data.append(data)

# Persist all runs so the tables can be rebuilt without recomputing.
os.makedirs('cache', exist_ok=True)
with open('cache/exp3_comp_polbooks_samples_data.pkl', 'wb') as f:
    pickle.dump(polbooks_samples_data, f)

In [None]:
# Reload the cached runs written by the experiment cell (a local file produced by this notebook).
with open('cache/exp3_comp_polbooks_samples_data.pkl', 'rb') as f:
    polbooks_samples_data = pickle.load(f)

means, rel_stds = get_statics_of_samples(polbooks_samples_data)

# Three tables: the first run on its own, the means over all runs, and the relative stds.
for _table_input in (polbooks_samples_data[0], means, rel_stds):
    print(get_latex_table(_table_input))

\textbf{NeSt \(d=1\)} & 105 & 441 & 0.15275 & \textbf{0.00000} & 0.00182 & 0.01673 & 0.09551 & 0.33478 & 0.09 & \textbf{0.74} & 0.21 \\
\textbf{NeSt \(d=2\)} & 105 & 441 & 0.26719 & \textbf{0.00000} & \textbf{0.00028} & 0.01853 & 0.07457 & 0.22034 & 0.56 & \textbf{0.74} & 0.21 \\
\textbf{Eager \(w=10^{-2}\)} & 105 & 441 & 0.16057 & \textbf{0.00000} & 0.00036 & 0.01825 & 0.09209 & 0.32696 & 0.18 & \textbf{0.74} & 0.21 \\
\textbf{Eager \(w=10^{-3}\)} & 105 & 441 & 0.21006 & 0.00000 & \textbf{0.00017} & 0.01822 & 0.08972 & 0.27746 & 0.35 & 0.74 & 0.21 \\
\textbf{Eager \(w=10^{-4}\)} & 105 & 441 & 0.36911 & 0.00000 & \textbf{0.00002} & 0.01650 & 0.05881 & 0.11842 & 0.77 & 0.74 & 0.21 \\
\textbf{16\% Edge Add/Del} & 105 & 441 & 0.30970 & 0.00609 & 0.00334 & 0.01655 & 0.05986 & 0.17782 & 0.73 & 0.81 & 0.22 \\
\textbf{4\% Edge Add/Del} & 105 & 441 & 0.43467 & 0.00256 & 0.00138 & \textbf{0.00566} & \textbf{0.02893} & \textbf{0.05286} & 0.93 & 0.76 & 0.30 \\
\textbf{1\% Edge Add/Del} & 105 & 441 & 0.47648 & 0.00114 & 0.00067 & \textbf{0.00998} & \textbf{0.01200} & \textbf{0.01182} & 0.98 & 0.75 & 0.26 \\
\textbf{64-Degree Anonymity} & 105 & 315 & 0.13381 & 0.03626 & 0.02002 & 0.06270 & \textbf{0.02161} & 0.35372 & 0.45 & 1.00 & 1.00 \\
\textbf{16-Degree Anonymity} & 105 & 436 & 0.33699 & 0.00861 & 0.00293 & 0.01642 & 0.05555 & 0.15054 & 0.77 & 1.00 & 1.00 \\
\textbf{4-Degree Anonymity} & 105 & 439 & 0.44248 & 0.00147 & 0.00080 & \textbf{0.01443} & 0.03475 & \textbf{0.04505} & 0.93 & 1.00 & 0.21 \\
\textbf{Pygmalion \(\epsilon = 1\)} & 185 & 700 & 0.03193 & 0.08780 & 0.07194 & 0.06814 & 0.09623 & 0.45559 & \textbf{0.02} & 0.81 & 0.37 \\
\textbf{Pygmalion \(\epsilon = 100\)} & 111 & 420 & 0.12614 & 0.01263 & 0.00346 & 0.01819 & 0.07022 & 0.36139 & \textbf{0.03} & 0.76 & 0.35 \\
\textbf{Pygmalion \(\epsilon = \infty\)} & 105 & 415 & 0.13945 & 0.00476 & 0.00300 & 0.02131 & 0.08569 & 0.34807 & \textbf{0.04} & 0.79 & 0.37 \\

# CA-GrQc (|V| = 5242)
Arxiv GR-QC (General Relativity and Quantum Cosmology) collaboration network, from SNAP: https://snap.stanford.edu/data/ca-GrQc.html

In [None]:
import gzip
import networkx as nx
import urllib.request
import os
from scipy.sparse.linalg import eigs
import numpy as np


# Download the dataset once; later runs reuse the local copy.
if not os.path.exists('ca-GrQc.txt.gz'):
    urllib.request.urlretrieve(
        'https://snap.stanford.edu/data/ca-GrQc.txt.gz',
        'ca-GrQc.txt.gz',
    )

# Read the gzipped edge list and relabel the nodes with consecutive integers.
with gzip.open('ca-GrQc.txt.gz', 'rt') as f:
    G = nx.convert_node_labels_to_integers(nx.read_edgelist(f))

# Drop self loops (there are 12 of them in the original graph).
G.remove_edges_from(nx.selfloop_edges(G))

# Largest-magnitude eigenvalue of the adjacency matrix; its inverse is the
# upper bound used for the Katz attenuation factor below.
eigenvalues, _ = eigs(nx.adjacency_matrix(G).astype(np.float64), k=1, which='LM')
max_alpha = 1 / np.max(np.abs(eigenvalues))

# Use half of that bound for the Katz metric and the PrivateColor anonymizers.
alpha = 0.5 * max_alpha
beta = 1
print(G)
print("Alpha:", alpha)

In [None]:
ca_samples_data = []
for seed in range(55, 59):  # 4 samples with different seeds (55..58)
    # Metrics evaluated for every (G, Ga) pair; the keys become the LaTeX column headers.
    # Disabled alternatives: NumberOfTrianglesMetric, TransitivityMetric, WLColorMetric(depth=2),
    # ConnectedComponentsMetric, MedianDegreeMetric, MeanDegreeMetric, MaxDegreeMetric, PageRankMetric.
    metrics = {
        # graph level
        "|V|": NumberOfNodesMetric(),
        "|E|": NumberOfEdgesMetric(),
        "C": AverageClusteringCoefficientMetric(),
        # node level
        "Deg.": DegreeCentralityMetric(),
        "Katz": KatzCentralityMetric(alpha=alpha),
        "Ev.": EigenvectorMetric(),
        "LCC": LocalClusteringCoefficientMetric(),
        "CC": ClosenessCentralityMetric(),
        # edge overlap and k-degree anonymity
        r"\(\vert E \cap E'\vert\)": EdgeJaccardMetric(),
        r"\% 16-degree Anon": PercentageKDegreeAnonMetric(k=16),
        r"\% 64-degree Anon": PercentageKDegreeAnonMetric(k=64),
    }

    # Anonymizers under comparison; insertion order is the row order of the table.
    # Disabled variants: ConfigurationModelAnonymizer, PrivateColor with w=1e1,
    # 2% / 8% random edge add/del, and k-degree anonymity for k in {2, 8, 32}.
    methods = {
        r"NeSt \(d=1\)": NestModelAnonymizer(depth=1, r=10),
        r"NeSt \(d=2\)": NestModelAnonymizer(depth=2, r=10),

        r"Eager \(w=10^{-2}\)": PrivateColorAnonymizer(w=1e-2, alpha=alpha, is_eager=True, use_optimal1d=False),
        r"Eager \(w=10^{-3}\)": PrivateColorAnonymizer(w=1e-3, alpha=alpha, is_eager=True, use_optimal1d=False),
        r"Eager \(w=10^{-4}\)": PrivateColorAnonymizer(w=1e-4, alpha=alpha, is_eager=True, use_optimal1d=False),

        r"16\% Edge Add/Del": RandomEdgeAddDelAnonymizer(m=int(16/100*G.number_of_edges())),
        r"4\% Edge Add/Del": RandomEdgeAddDelAnonymizer(m=int(4/100*G.number_of_edges())),
        r"1\% Edge Add/Del": RandomEdgeAddDelAnonymizer(m=int(1/100*G.number_of_edges())),

        r"128-Degree Anonymity": KDegreeAnonymizer(k=128),
        r"64-Degree Anonymity": KDegreeAnonymizer(k=64),
        r"16-Degree Anonymity": KDegreeAnonymizer(k=16),

        r"Pygmalion \(\epsilon = 1\)": PygmalionModelAnonymizer(eps=1),
        r"Pygmalion \(\epsilon = 100\)": PygmalionModelAnonymizer(eps=100),
        # a very large epsilon stands in for "infinity"
        r"Pygmalion \(\epsilon = \infty\)": PygmalionModelAnonymizer(eps=100_000_000),
    }

    evaluator = Evaluator(metrics, use_igraph=True)

    # Anonymize with each method under the current seed and evaluate against the original graph.
    data = {}
    for method_name, method in methods.items():
        print(f"Anonymizing with method {method_name}")
        Ga = method.anonymize(G, random_seed=seed)
        print(f"Evaluating method {method_name}")
        data[method_name] = evaluator.evaluate(G, Ga)

    ca_samples_data.append(data)

# Persist all runs so the tables can be rebuilt without recomputing.
os.makedirs('cache', exist_ok=True)
with open('cache/exp3_comp_ca_samples_data.pkl', 'wb') as f:
    pickle.dump(ca_samples_data, f)

In [None]:
# Reload the cached runs written by the experiment cell (a local file produced by this notebook).
with open('cache/exp3_comp_ca_samples_data.pkl', 'rb') as f:
    ca_samples_data = pickle.load(f)

print(len(ca_samples_data))
means, rel_stds = get_statics_of_samples(ca_samples_data)

# Three tables: the first run on its own, the means (7 decimals), and the relative stds.
for _table_input, _table_options in (
    (ca_samples_data[0], {}),
    (means, {"precision": 7}),
    (rel_stds, {}),
):
    print(get_latex_table(_table_input, **_table_options))


# Enron (|V| = 36692)
Enron email communication network https://snap.stanford.edu/data/email-Enron.html


In [None]:
import gzip
import networkx as nx
import urllib.request
import os
from scipy.sparse.linalg import eigs
import numpy as np


# Download the dataset once; later runs reuse the local copy.
if not os.path.exists('email-Enron.txt.gz'):
    urllib.request.urlretrieve('https://snap.stanford.edu/data/email-Enron.txt.gz', 'email-Enron.txt.gz')

with gzip.open('email-Enron.txt.gz', 'rt') as f:
    G = nx.read_edgelist(f)

# Relabel the nodes with consecutive integers and remove self loops.
# The removal was announced by the original comment but never performed; it is
# now done exactly as for the CA-GrQc graph, and is a no-op if the edge list has none.
G = nx.convert_node_labels_to_integers(G)
G.remove_edges_from(nx.selfloop_edges(G))

# Largest-magnitude eigenvalue of the adjacency matrix; its inverse is the
# upper bound used for the Katz attenuation factor below.
eigenvalues, _ = eigs(nx.adjacency_matrix(G).astype(np.float64), k=1, which='LM')
max_alpha = 1 / np.abs(eigenvalues).max()

# Use half of that bound for the Katz metric and the PrivateColor anonymizers.
alpha=0.5*max_alpha
beta=1
print(G)
print("Alpha:", alpha)

In [12]:
import sys
# Raise the interpreter's recursion limit to 100,000 frames before the Enron runs.
# NOTE(review): presumably KDegreeAnonymizer recurses deeply on a graph of this size — confirm.
sys.setrecursionlimit(100_000) # for k-degree anon technique

In [None]:
enron_samples_data = []
for seed in range(42, 46):  # 4 samples with different seeds (42..45)
    # Metrics evaluated for every (G, Ga) pair; the keys become the LaTeX column headers.
    # Disabled alternatives: NumberOfTrianglesMetric, TransitivityMetric, WLColorMetric(depth=2),
    # ConnectedComponentsMetric, MedianDegreeMetric, MeanDegreeMetric, MaxDegreeMetric, PageRankMetric.
    metrics = {
        # graph level
        "|V|": NumberOfNodesMetric(),
        "|E|": NumberOfEdgesMetric(),
        "C": AverageClusteringCoefficientMetric(),
        # node level
        "Deg.": DegreeCentralityMetric(),
        "Katz": KatzCentralityMetric(alpha=alpha),
        "Ev.": EigenvectorMetric(),
        "LCC": LocalClusteringCoefficientMetric(),
        "CC": ClosenessCentralityMetric(),
        # edge overlap and k-degree anonymity
        r"\(\vert E \cap E'\vert\)": EdgeJaccardMetric(),
        r"\% 16-degree Anon": PercentageKDegreeAnonMetric(k=16),
        r"\% 64-degree Anon": PercentageKDegreeAnonMetric(k=64),
    }

    # Anonymizers under comparison; insertion order is the row order of the table.
    # Unlike the smaller graphs, the Eager variants run with use_optimal1d=True here.
    # Disabled variants: ConfigurationModelAnonymizer, PrivateColor with w=1e1,
    # 2% / 8% random edge add/del, and k-degree anonymity for k in {2, 8, 32}.
    methods = {
        r"NeSt \(d=1\)": NestModelAnonymizer(depth=1, r=10),
        r"NeSt \(d=2\)": NestModelAnonymizer(depth=2, r=10),

        r"Eager \(w=10^{-2}\)": PrivateColorAnonymizer(w=1e-2, alpha=alpha, is_eager=True, use_optimal1d=True),
        r"Eager \(w=10^{-3}\)": PrivateColorAnonymizer(w=1e-3, alpha=alpha, is_eager=True, use_optimal1d=True),
        r"Eager \(w=10^{-4}\)": PrivateColorAnonymizer(w=1e-4, alpha=alpha, is_eager=True, use_optimal1d=True),

        r"16\% Edge Add/Del": RandomEdgeAddDelAnonymizer(m=int(16/100*G.number_of_edges())),
        r"4\% Edge Add/Del": RandomEdgeAddDelAnonymizer(m=int(4/100*G.number_of_edges())),
        r"1\% Edge Add/Del": RandomEdgeAddDelAnonymizer(m=int(1/100*G.number_of_edges())),

        r"128-Degree Anonymity": KDegreeAnonymizer(k=128),
        r"64-Degree Anonymity": KDegreeAnonymizer(k=64),
        r"16-Degree Anonymity": KDegreeAnonymizer(k=16),

        r"Pygmalion \(\epsilon = 1\)": PygmalionModelAnonymizer(eps=1),
        r"Pygmalion \(\epsilon = 100\)": PygmalionModelAnonymizer(eps=100),
        # a very large epsilon stands in for "infinity"
        r"Pygmalion \(\epsilon = \infty\)": PygmalionModelAnonymizer(eps=100_000_000),
    }

    evaluator = Evaluator(metrics, use_igraph=True)

    # Anonymize with each method under the current seed and evaluate against the original graph.
    data = {}
    for method_name, method in methods.items():
        print(f"Anonymizing with method {method_name}")
        Ga = method.anonymize(G, random_seed=seed)
        print(f"Evaluating method {method_name}")
        data[method_name] = evaluator.evaluate(G, Ga)

    enron_samples_data.append(data)

# Persist all runs so the tables can be rebuilt without recomputing.
os.makedirs('cache', exist_ok=True)
with open('cache/exp3_comp_enron_samples_data.pkl', 'wb') as f:
    pickle.dump(enron_samples_data, f)

In [None]:
# Reload the cached runs written by the experiment cell (a local file produced by this notebook).
with open('cache/exp3_comp_enron_samples_data.pkl', 'rb') as f:
    enron_samples_data = pickle.load(f)

print(len(enron_samples_data))
means, rel_stds = get_statics_of_samples(enron_samples_data)

# Three tables: the first run on its own, the means (7 decimals), and the relative stds.
for _table_input, _table_options in (
    (enron_samples_data[0], {}),
    (means, {"precision": 7}),
    (rel_stds, {}),
):
    print(get_latex_table(_table_input, **_table_options))