# Define Graph Bound / Convergence tool for graphs

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from scipy.sparse.linalg import eigsh
import networkx as nx
import numpy as np
from variation_utils import calculate_katz, calculate_kmeans_cluster_loss

In [None]:
import numpy as np
import random
from scipy.sparse import dok_matrix, csr_matrix, issparse
from numba import get_num_threads, njit, prange
from numba.typed import Dict, List  # pylint: disable=no-name-in-module


def create_partition_matrix(colors):
    """Build the one-hot node-to-color indicator matrix.

    Args:
        colors: Sequence of non-negative integer color labels, one per node.

    Returns:
        Integer array ``H`` of shape ``(len(colors), max(colors) + 1)`` with
        ``H[j, c] == 1`` exactly when node ``j`` carries color ``c``.
    """
    num_nodes = len(colors)
    num_colors = np.max(colors) + 1

    # Reject labels outside the admissible integer range (e.g. negative ones).
    allowed = set(range(num_colors + 1))
    assert set(colors).issubset(allowed), "All colors must be within the range 0 to the maximum color."

    indicator = np.zeros((num_nodes, num_colors), dtype=int)
    # One entry per row: the column is the color of the corresponding node.
    indicator[np.arange(num_nodes), colors] = 1
    return indicator

@njit
def internal_sample_new_adjacency_matrix(N, colors, with_self_loops = False):
    """Sample new out-neighbors for every node, one color class at a time.

    Args:
        N: 2-D array indexed as ``N[i][c]``; the entry is read as the number
            of out-neighbors of color ``c`` that node ``i`` should receive.
        colors: Iterable of color labels; position ``idx`` holds the color of
            node ``idx``.
        with_self_loops: If False, node ``i`` is removed from its own
            candidate set before sampling.

    Returns:
        Tuple ``(row_indices, col_indices)`` of numba typed lists holding the
        source and target node of every sampled edge.
    """
    n = N.shape[0]

    # Build a nested typed Dict and immediately empty it again -- presumably
    # so that numba can infer the key/value types of the empty container.
    color_to_nodes = Dict()
    tmp = Dict()
    tmp[0] = 1
    color_to_nodes[0] = tmp
    del color_to_nodes[0]

    # Group node indices by color; the inner Dict is used as a set of nodes
    # (the stored value 1 is never read).
    for idx, color in enumerate(colors):
        if color not in color_to_nodes:
            tmp = Dict()
            tmp[0] = 1
            del tmp[0]
            color_to_nodes[color] = tmp
        color_to_nodes[color][idx] = 1

    # Same create-then-pop pattern to obtain empty typed lists.
    row_indices = List([-1])
    row_indices.pop()
    col_indices = List([-1])
    col_indices.pop()

    for c, nodes in color_to_nodes.items():
        if not nodes:
            continue
        nodes = set(nodes.keys())

        for i in range(n):
            num_neighbors = int(N[i][c])  # Colored degree from matrix N
            if num_neighbors > 0:
                possible_nodes = list(nodes) if with_self_loops else list(nodes - {i})
                # If more neighbors are requested than candidates exist, the
                # (node, color) pair is skipped and no edges are emitted for it.
                if num_neighbors <= len(possible_nodes):
                    # Sampling without replacement, so no duplicate targets
                    # for a given (node, color) pair.
                    sampled_neighbors = np.random.choice(np.array(possible_nodes), size=num_neighbors, replace=False)
                    row_indices.extend([i] * num_neighbors)
                    col_indices.extend(sampled_neighbors)

    return row_indices, col_indices

def calc_edge_probabilities(A, N, H, with_self_loops=False):
    """Return one ratio per counted edge: colored degree over candidate count.

    For every ``(node, color)`` entry of ``N`` (row-major order) the ratio
    ``N[i, c] / candidates[i, c]`` is computed, where ``candidates`` is the
    number of nodes of color ``c`` (minus the node itself when self-loops are
    disallowed). Each ratio is then repeated ``N[i, c]`` times.

    Args:
        A: Adjacency matrix; not used by the computation.
        N: Array of per-node, per-color out-degrees.
        H: One-hot node-to-color indicator matrix.
        with_self_loops: Whether a node counts as a candidate target for itself.

    Returns:
        1-D float array with ``N.sum()`` entries.
    """
    nodes_per_color = H.sum(axis=0)
    # Without self-loops a node cannot choose itself, so its own color class
    # offers one candidate fewer.
    excluded = np.zeros_like(H) if with_self_loops else H
    candidates = nodes_per_color - excluded

    # Entries with no candidates keep probability zero instead of dividing by 0.
    prob = np.divide(
        N,
        candidates,
        out=np.zeros_like(N, dtype=float),
        where=candidates != 0,
    )

    return np.repeat(prob.ravel(), N.ravel().astype(np.int64))

def sample_new_adjacency_matrix(N, colors, with_self_loops = False):
    """Sample a new adjacency matrix from the per-color out-degree matrix ``N``.

    Args:
        N: Array of per-node, per-color out-degrees; its first dimension
            fixes the number of nodes.
        colors: Color label of every node.
        with_self_loops: Forwarded to the sampler.

    Returns:
        ``(n, n)`` CSR matrix with a 1 for every sampled edge.
    """
    sources, targets = internal_sample_new_adjacency_matrix(N, colors, with_self_loops=with_self_loops)
    num_nodes = N.shape[0]
    unit_weights = np.ones(len(sources), dtype=int)
    return csr_matrix((unit_weights, (sources, targets)), shape=(num_nodes, num_nodes))


def _seed_numba_rng(seed):
    """Seed the random generator that numba-compiled code draws from.

    ``np.random.seed`` called from regular Python only seeds NumPy's own
    generator; numba keeps separate state that has to be seeded from inside
    jitted code.
    """
    seeder = getattr(_seed_numba_rng, "_compiled", None)
    if seeder is None:
        from numba import njit  # local import keeps this helper self-contained

        @njit
        def seeder(value):
            np.random.seed(value)

        # Cache the compiled function so that it is only compiled once.
        _seed_numba_rng._compiled = seeder
    seeder(seed)


def nest_preserve_colored_out_degree(A, colors, seed = None, with_self_loops = False):
    """Sample a graph that preserves every node's per-color out-degree.

    Args:
        A: Adjacency matrix (dense array or scipy sparse matrix).
        colors: Non-negative integer color label of every node.
        seed: Optional integer seed. When given (including ``0``) it seeds
            Python's ``random``, NumPy's global generator and the generator
            used by the numba-compiled sampler.
        with_self_loops: If False, ``A`` must have an all-zero diagonal and
            the sampled graph contains no self-loops.

    Returns:
        Tuple ``(A_prime, probs)``: the sampled adjacency matrix in CSR format
        and the array returned by ``calc_edge_probabilities``.

    Raises:
        AssertionError: If self-loops are present although disallowed, or if
            the sampled matrix does not satisfy ``A @ H == A_prime @ H``.
    """
    # `is not None` so that seed=0 is honoured as a valid seed.
    if seed is not None:
        np.random.seed(seed)
        random.seed(seed)
        # The sampler runs under @njit, which ignores the two seeds above.
        _seed_numba_rng(int(seed))

    if not with_self_loops:
        diag_elements = A.diagonal() if issparse(A) else np.diag(A)
        assert np.all(diag_elements == 0), "The adjacency matrix A should not contain any self-loops."

    H = create_partition_matrix(colors)
    N = A @ H  # N[i, c]: number of out-neighbors of node i with color c
    A_prime = sample_new_adjacency_matrix(N, colors, with_self_loops=with_self_loops)

    probs = calc_edge_probabilities(A, N, H, with_self_loops=with_self_loops)

    # Check A @ H == A' @ H: the sampled graph must reproduce all colored
    # out-degrees, otherwise the sampling step silently dropped edges.
    assert np.allclose(N, A_prime @ H)
    print("Fundamental Algebraic Identity is true!")
    return A_prime, probs

def calc_expected_number_of_edges_with_frobenius(A, colors):
    """Return the squared Frobenius norm of ``A @ P_par`` and print it.

    ``P_par`` is the matrix returned by ``get_P_parallel(colors)``. Per the
    printed message, the value is used as the expected number of edges under
    the out-degree-preserving model with self-loops allowed.

    Args:
        A: Adjacency matrix.
        colors: Color label of every node.

    Returns:
        ``||A @ P_par||_F ** 2``.
    """
    projected = A @ get_P_parallel(colors)
    out = np.linalg.norm(projected, ord="fro") ** 2

    print("SLOW; ASSUMES OUT-VAR NEST with_self_loops=TRUE: Using paper equation to calc the expected number of edges", out)
    return out

def get_P_parallel(colors):
    """Return the matrix that averages a vector within each color class.

    With ``H`` the one-hot node-to-color indicator matrix, the result is
    ``H (H^T H)^{-1} H^T``: entry ``(i, j)`` equals ``1 / class_size`` when
    nodes ``i`` and ``j`` share a color and ``0`` otherwise.

    Args:
        colors: Color label of every node; labels need not be contiguous.

    Returns:
        Dense ``(n, n)`` float array.
    """
    _, membership, class_sizes = np.unique(colors, return_inverse=True, return_counts=True)
    num_nodes = len(colors)

    indicator = np.zeros((num_nodes, len(class_sizes)), dtype=int)
    indicator[np.arange(num_nodes), membership.reshape(-1)] = 1

    # H^T H is diagonal with the class sizes on its diagonal, so applying its
    # inverse amounts to dividing each column of H by the matching class size.
    return (indicator / class_sizes) @ indicator.T



In [None]:
# Example usage: resample a small 6-node directed graph while keeping every
# node's per-color out-degree fixed.
# Color 0 is unused here, so the partition matrix gets an all-zero column.
colors = np.array([1, 2, 2, 3, 3, 3])
# Dense adjacency matrix with an all-zero diagonal (no self-loops); node 0
# has no outgoing edges.
Atest = np.array([
    [0, 0, 0, 0, 0, 0],
    [0, 0, 1, 1, 0, 0],
    [1, 0, 0, 1, 1, 0],
    [1, 1, 0, 0, 1, 1],
    [1, 1, 1, 1, 0, 1],
    [1, 1, 1, 1, 1, 0]
])

# No seed is passed, so the sampled matrix can differ from run to run.
A_prime, edge_probs = nest_preserve_colored_out_degree(Atest, colors, with_self_loops=False)

print("New Adjacency Matrix A':")
print(A_prime.toarray())

### Spectral norm != spectral radius for non-symmetric graphs

In [None]:
from anonymigraph.anonymization._external.nest_model._rewire import _rewire
import optimal1dclustering
import math
from scipy.sparse.linalg import eigs
from scipy.sparse.linalg import svds
from anonymigraph.anonymization._external.nest_model.fast_wl import WL_fast

def calc_min_cluster_size(arr):
    """Return how often the rarest label occurs in ``arr``.

    Args:
        arr: Array-like of cluster labels.

    Returns:
        The smallest number of occurrences among the distinct labels.
    """
    _, occurrences = np.unique(np.array(arr), return_counts=True)
    return occurrences.min()


def get_convergence_data_of_nest(G, min_cluster_size = 1, alpha=0.1, beta = 1,  printing = False, max_k = float("Inf"), max_iter=None, tol=None, random_seed=44, with_self_loops=False):
    """Collect Katz-centrality deviation metrics for resampled versions of ``G``.

    For every number of clusters ``k`` the Katz vector of ``G`` is clustered,
    a graph preserving the per-cluster out-degrees is sampled, and the change
    in Katz centrality plus related quantities is recorded. The same sampling
    is then repeated for the colorings returned by ``WL_fast``.

    Args:
        G: networkx graph.
        min_cluster_size: Forwarded to the 1-D clustering; also limits the
            largest ``k`` to ``number_of_nodes // min_cluster_size``.
        alpha: Katz attenuation factor; must be below ``1 / spectral_radius``.
        beta: Katz bias term.
        printing: Not used in the function body.
        max_k: Upper limit for ``k`` and for the number of WL colors.
        max_iter: Forwarded to ``calculate_katz`` as ``num_iters``.
        tol: Forwarded to ``calculate_katz``.
        random_seed: Seed passed to every sampling call.
        with_self_loops: Forwarded to the sampler.

    Returns:
        Dict with the keys ``"data_dict"`` (results per ``k``),
        ``"original_nest"`` (results per number of WL colors),
        ``"katz_cent_norm"``, ``"lam_G"`` (spectral radius of ``G``),
        ``"katz_alpha"`` and ``"katz_cent_norm_bound"``.

    Raises:
        AssertionError: If ``alpha`` is not below ``1 / spectral_radius`` or a
            sampled graph has a different number of stored entries than ``G``.
    """
    A_G = nx.adjacency_matrix(G).astype(np.float64)  # Get the adjacency matrix as a sparse matrix
    number_of_edges = G.number_of_edges()
    # Largest singular value = spectral norm.
    _, s, _ = svds(A_G, k=1)
    spectral_norm_G = s[0]
    eigenvalues, eigenvectors = eigs(A_G.astype(np.float64), k=1, which='LM')  # 'LM': Largest Magnitude
    spectral_radius_G = np.abs(eigenvalues).max()

    print("Spectral radius:", spectral_radius_G, "Spectral norm:", spectral_norm_G)
    print(f"Spectral Radius: {spectral_radius_G}, alpha_max = {1/spectral_radius_G}")


    assert alpha < 1/spectral_radius_G, f"for katz to converge alpha needs to be smaller than 1/spectral_radius ({1/spectral_radius_G})"

    katz_cent = calculate_katz(A_G, alpha = alpha, beta=beta, num_iters=max_iter, tol=tol)
    katz_cent_norm = np.linalg.norm(katz_cent)
    # NOTE(review): the denominator is only positive when alpha * spectral_norm_G < 1,
    # which the assertion above (on the spectral radius) does not guarantee.
    katz_cent_norm_bound = beta * np.sqrt(A_G.shape[0]) /(1 - alpha * spectral_norm_G) # 1/(1 - alpha * spectral_norm) * norm(beta)
    print("katz_cent_norm < katz_cent_norm_bound", katz_cent_norm, "< ", katz_cent_norm_bound)

    data_dict = {}
    # The upper limit of range() is exclusive, so the largest k evaluated is
    # min(number_of_nodes // min_cluster_size, max_k) - 1.
    for k in range(1, min(G.number_of_nodes()//min_cluster_size, max_k)):
        print(f"{k}: ")
        results = {}
        mode = 2
        # Cluster the Katz values into k groups; the labels serve as node colors.
        colors, centroids = optimal1dclustering.cluster(
            katz_cent, k, mode=mode, min_cluster_size=min_cluster_size
        )
        colors = np.array(colors)
        clusterLoss = calculate_kmeans_cluster_loss(katz_cent, colors, centroids, mode = mode)



        # Get graph spectrum statistics
        A_Ga, edge_probs = nest_preserve_colored_out_degree(A_G, colors, seed = random_seed, with_self_loops=with_self_loops)
        A_Ga = A_Ga.astype(np.float64)

        assert A_Ga.nnz == A_G.nnz # Check if A_G and A_Ga have same number of edges
        # Element-wise product is non-zero only where both graphs have the edge.
        number_overlapping_edges = A_G.multiply(A_Ga).sum()

        _, s, _ = svds(A_Ga, k=1)
        spectral_norm_Ga = s[0]

        print(f"Sampled A_Ga spectral norm {spectral_norm_Ga}")

        katz_cent_Ga = calculate_katz(A_Ga, alpha = alpha, beta=beta, num_iters=max_iter, tol=tol)

        katz_l2_diff = np.linalg.norm(katz_cent_Ga - katz_cent)
        # NOTE(review): this mixes the spectral *radius* of G with the spectral
        # *norm* of Ga; for non-symmetric matrices the two differ -- confirm
        # which quantity the bound is meant to use.
        katz_l2_diff_bound_neumann = (alpha * (spectral_radius_G + spectral_norm_Ga) * clusterLoss) / (1 - alpha * spectral_norm_Ga)

        # Katz centrality of A_G @ P_par, compared against the original Katz vector.
        P_par = get_P_parallel(colors)
        expected_katz_cent_Ga = calculate_katz(A_G @ P_par, alpha = alpha, beta=beta, num_iters=max_iter, tol=tol)
        expected_katz_l2_diff = np.linalg.norm(expected_katz_cent_Ga - katz_cent)

        results['katz_l2_diff'] = katz_l2_diff
        results['expected_katz_l2_diff'] = expected_katz_l2_diff
        results['katz_l2_diff_bound_neumann'] = katz_l2_diff_bound_neumann
        results['Clustering Loss'] = clusterLoss
        results['percentage_overlapping_edges'] = number_overlapping_edges/number_of_edges
        # Recomputes P_par internally (second call to get_P_parallel for this k).
        results['expected_percentage_overlapping_edges'] = calc_expected_number_of_edges_with_frobenius(A_G, colors)/number_of_edges #np.sum(edge_probs)/number_of_edges
        results['edge_probs'] = edge_probs

        data_dict[k] = results

    # NOTE(review): the edge list uses the raw node labels of G while A_G
    # follows G's node order -- presumably both agree only when the nodes are
    # labelled 0..n-1 in order; verify against WL_fast.
    edges = np.array(G.edges(), dtype=np.uint32)
    all_depth_colors = WL_fast(edges, labels=None, max_iter=None)

    print(f"Calculating Performance of Original Nest, with {len(all_depth_colors)} depths")
    original_nest = {}
    for i in range(len(all_depth_colors)):

        print(f"{i}: Calculating Nest at depth {i}")

        colors = all_depth_colors[i]

        # Results are keyed by the number of distinct colors; depths with the
        # same number of colors overwrite each other.
        k_approx = len(np.unique(colors))
        if k_approx > max_k:
            continue

        A_Ga, edge_probs = nest_preserve_colored_out_degree(A_G, colors, seed = random_seed, with_self_loops=with_self_loops)
        A_Ga = A_Ga.astype(np.float64)

        results = {}
        katz_cent_Ga = calculate_katz(A_Ga, alpha = alpha, beta=beta, num_iters=max_iter, tol=tol)
        katz_l2_diff = np.linalg.norm(katz_cent_Ga - katz_cent)

        results['katz_l2_diff'] = katz_l2_diff

        original_nest[k_approx] = results

    return {"data_dict": data_dict, "original_nest":original_nest, "katz_cent_norm":katz_cent_norm, "lam_G":spectral_radius_G, "katz_alpha":alpha, "katz_cent_norm_bound": katz_cent_norm_bound}


import plotly.graph_objects as go
import plotly.graph_objects as go
from plotly.offline import iplot

def plot_convergence_data(data, title, plot_nest=True):
    """Plot the convergence metrics and the edge-probability histograms.

    Two figures are shown. The first plots every per-``k`` metric on a
    logarithmic axis, with the edge-overlap percentages on a secondary linear
    axis. The second is a histogram of the stored ``edge_probs`` with a slider
    over a subsample of the ``k`` values.

    Args:
        data: Result dictionary providing ``"data_dict"``,
            ``"katz_cent_norm"``, ``"katz_cent_norm_bound"`` and, when
            ``plot_nest`` is true, ``"original_nest"``.
        title: Title of the first figure.
        plot_nest: Whether to add the markers stored in ``"original_nest"``.

    Raises:
        ValueError: If ``data["data_dict"]`` is empty.
    """
    data_dict = data["data_dict"]
    if not data_dict:
        raise ValueError("data['data_dict'] is empty; there is nothing to plot.")
    ks = list(data_dict)

    def metric_trace(metric, name, **extra):
        # Line trace of one recorded metric over all k.
        return go.Scatter(x=ks,
                          y=[data_dict[k][metric] for k in ks],
                          mode='lines',
                          name=name,
                          **extra)

    def constant_trace(value, name):
        # Horizontal reference line at `value`, spanning all k.
        return go.Scatter(x=ks, y=[value] * len(ks), mode='lines', name=name)

    traces = [
        metric_trace('Clustering Loss', r'$\ell$'),
        constant_trace(data["katz_cent_norm"], 'Norm of Katz Centrality for G'),
        constant_trace(data["katz_cent_norm_bound"], 'Bound for Norm of Katz Centrality for G'),
        metric_trace('katz_l2_diff', r'Katz L2 Norm Diff (Directed Katz Variation NeSt)'),
        metric_trace('expected_katz_l2_diff', r'Expected Katz L2 Norm Diff (Directed Katz Variation NeSt)'),
        metric_trace('katz_l2_diff_bound_neumann', r'Katz L2 Norm Diff Bound (Neumann)'),
        metric_trace('percentage_overlapping_edges',
                     'Percentage of overlapping edges between G and Ga',
                     yaxis='y2'),
        metric_trace('expected_percentage_overlapping_edges',
                     'Expected percentage of overlapping edges between G and Ga',
                     yaxis='y2'),
    ]

    if plot_nest:
        original_nest = data["original_nest"]
        traces.append(go.Scatter(x=list(original_nest),
                                 y=[original_nest[k]['katz_l2_diff'] for k in original_nest],
                                 mode='markers',
                                 name="Original Nest k=len(wl-colors)",
                                 marker=dict(symbol='x', size=10)))

    secondary_color = 'rgba(148, 103, 189, 1)'
    layout_loglog = go.Layout(
        title=title,
        xaxis_title='k',
        # Primary axis: log scale with scientific tick labels.
        yaxis=dict(
            title=dict(text='Metric Values (Log Scale)'),
            type='log',
            exponentformat='e',
        ),
        # Secondary axis on the right for the edge percentages. The title font
        # is set through `title.font`; the deprecated `titlefont` shortcut is
        # not accepted by newer plotly releases.
        yaxis2=dict(
            title=dict(text='Edge Percentages', font=dict(color=secondary_color)),
            tickfont=dict(color=secondary_color),
            overlaying='y',
            side='right',
            type='linear',
            range=[0, 1],
        ),
        hovermode='closest',
        height=900,
        legend=dict(
            orientation="h",
            x=0.5,
            y=-0.1,
            xanchor="center",
            yanchor="top",
        ),
    )

    fig = go.Figure(data=traces, layout=layout_loglog)
    fig.layout.template = 'simple_white+gridon'  # 'presentation'

    fig.show()

    # The slider only offers every 10th k plus the smallest and largest one.
    only_plot_every_xth = 10
    first_key = min(data_dict)
    last_key = max(data_dict)
    subsampled_data_dict = {
        key: results
        for key, results in data_dict.items()
        if key % only_plot_every_xth == 0 or key == last_key or key == first_key
    }
    slider_keys = list(subsampled_data_dict)

    initial_key = first_key
    fig = go.Figure(data=[go.Histogram(x=subsampled_data_dict[initial_key]["edge_probs"],
                                       xbins=dict(size=0.01),
                                       histnorm='probability')])

    fig.update_layout(
        title_text='Histogram of probabilities of an particular edge being present in the anonymized graph given it was present in the original graph',
        xaxis_title_text='Probability of an particular edge being present in the anonymized graph given it was present in the original graph',
        yaxis_title_text='Proportion of edges with the given conditional probability',
        template='simple_white+gridon',
        xaxis=dict(range=[0, 1]),
        sliders=[{
            "steps": [{
                "label": str(key),
                "method": "update",
                "args": [{
                    "x": [subsampled_data_dict[key]["edge_probs"]],
                    "name": f"Key {key}"
                }, {"title": f"Histogram of probabilities for key {key}"}]
            } for key in slider_keys],
            # Index into the slider steps, i.e. into the subsampled keys.
            "active": slider_keys.index(initial_key),
            "currentvalue": {
                "prefix": "Currently viewing: k=",
                "visible": True
            },
            "transition": {"duration": 300},
            "pad": {"t": 50}  # padding above the slider
        }]
    )

    # Display the plot
    iplot(fig)

# Erdős–Rényi graphs

In [None]:
# Sparse directed Erdos-Renyi graph with edge probability 3/n.
n = 400
p = 3/n

G = nx.erdos_renyi_graph(n, p, directed=True)

print(G, f"number of selfloops {nx.number_of_selfloops(G)}")

data_dict = get_convergence_data_of_nest(G, min_cluster_size = 1, alpha=0.04, beta=1, max_k=400, with_self_loops=True)
# Single quotes inside the f-string keep this cell valid on Python < 3.12.
plot_convergence_data(data_dict, f"spectral_radius*alpha = {data_dict['katz_alpha']*data_dict['lam_G']:.3f} | Directed Erdos Renyi Graph: n={n}, p={p}")

In [None]:
# Same graph model as before (edge probability 3/n) but with a much larger
# alpha; the run asserts that alpha stays below 1 / spectral_radius.
n = 400
p = 3/n

G = nx.erdos_renyi_graph(n, p, directed=True)

print(G, f"number of selfloops {nx.number_of_selfloops(G)}")

data_dict = get_convergence_data_of_nest(G, min_cluster_size = 1, alpha=0.3, beta=1, max_k=400, with_self_loops=True)
# Single quotes inside the f-string keep this cell valid on Python < 3.12.
plot_convergence_data(data_dict, f"spectral_radius*alpha = {data_dict['katz_alpha']*data_dict['lam_G']:.3f} | Directed Erdos Renyi Graph: n={n}, p={p}")

In [None]:
# Directed Erdos-Renyi graph with edge probability 8/400.
n = 400
p = 8/400

G = nx.erdos_renyi_graph(n, p, directed=True)

print(G, f"number of selfloops {nx.number_of_selfloops(G)}")

data_dict = get_convergence_data_of_nest(G, min_cluster_size = 1, alpha=0.04, beta=1, max_k=400, with_self_loops=True)
# Single quotes inside the f-string keep this cell valid on Python < 3.12.
plot_convergence_data(data_dict, f"spectral_radius*alpha = {data_dict['katz_alpha']*data_dict['lam_G']:.3f} | Directed Erdos Renyi Graph: n={n}, p={p}")

In [None]:
# Denser directed Erdos-Renyi graph with edge probability 20/n.
n = 400
p = 20/n

G = nx.erdos_renyi_graph(n, p, directed=True)

print(G, f"number of selfloops {nx.number_of_selfloops(G)}")

data_dict = get_convergence_data_of_nest(G, min_cluster_size = 1, alpha=0.04, beta=1, max_k=400, with_self_loops=True)
# Single quotes inside the f-string keep this cell valid on Python < 3.12.
plot_convergence_data(data_dict, f"spectral_radius*alpha = {data_dict['katz_alpha']*data_dict['lam_G']:.3f} | Directed Erdos Renyi Graph: n={n}, p={p}")

In [None]:
# Dense directed Erdos-Renyi graph (edge probability 90/n), sampled without
# self-loops.
n = 150
p = 90/n

G = nx.erdos_renyi_graph(n, p, directed=True)

print(G, f"number of selfloops {nx.number_of_selfloops(G)}")

data_dict = get_convergence_data_of_nest(G, min_cluster_size = 1, alpha=0.01, beta=1, max_k=150, with_self_loops=False)
# Single quotes inside the f-string keep this cell valid on Python < 3.12.
plot_convergence_data(data_dict, f"spectral_radius*alpha = {data_dict['katz_alpha']*data_dict['lam_G']:.3f} | Directed Erdos Renyi Graph: n={n}, p={p}")

In [None]:
import networkx as nx

n = 600

G = nx.scale_free_graph(n, seed=8)
# Rebuild as a simple DiGraph from the edge list, dropping self-loops.
G = nx.DiGraph([(u, v) for u, v in G.edges() if u != v])

print(G, f"number of selfloops {nx.number_of_selfloops(G)}")

data_dict = get_convergence_data_of_nest(G, min_cluster_size = 1, alpha=0.05, beta=1, max_k=600, with_self_loops=False)
# Single quotes inside the f-string keep this cell valid on Python < 3.12.
plot_convergence_data(data_dict, f"spectral_radius*alpha = {data_dict['katz_alpha']*data_dict['lam_G']:.3f} | Directed Scale-Free Graph: n={n}")

In [None]:
# LFR benchmark graph, converted to a directed graph after removing self-loops.
n = 400
tau1 = 3
tau2 = 1.5
mu = 0.1
G = nx.LFR_benchmark_graph(
    n, tau1, tau2, mu, average_degree=10, min_community=80, seed=10
)
G.remove_edges_from(nx.selfloop_edges(G))
G = G.to_directed()
print(G)

data_dict = get_convergence_data_of_nest(G, min_cluster_size = 1, alpha=0.02, beta=1, max_k=400, with_self_loops=False)
# Single quotes inside the f-string keep this cell valid on Python < 3.12.
plot_convergence_data(data_dict, f"spectral_radius*alpha = {data_dict['katz_alpha']*data_dict['lam_G']:.3f} | LFR Graph: n={n}, τ1={tau1}, τ2={tau2}, μ={mu}, AvgDeg=10, MinComm=80")

In [None]:
import networkx as nx
import numpy as np

# Parameters
n = 200  # number of degree samples drawn

np.random.seed(42)
degree_sequence = np.random.zipf(a=1.2, size=n)
# Samples >= n are discarded, so the sequence can end up shorter than n.
degree_sequence = [d for d in degree_sequence if 0 < d < n]
if sum(degree_sequence) % 2 == 1:
    degree_sequence[-1] += 1
print(degree_sequence)
# Directed configuration model with identical in- and out-degree sequences
G = nx.directed_configuration_model(degree_sequence, degree_sequence, seed=42)
# Rebuild as a simple DiGraph from the edge list, dropping self-loops.
G = nx.DiGraph([(u, v) for u, v in G.edges() if u != v])

print(G, f"number of selfloops {nx.number_of_selfloops(G)}")

data_dict = get_convergence_data_of_nest(G, min_cluster_size=1, alpha=0.02, beta=1, max_k=n, with_self_loops=False)
# Single quotes inside the f-string keep this cell valid on Python < 3.12.
plot_convergence_data(data_dict, f"spectral_radius*alpha = {data_dict['katz_alpha']*data_dict['lam_G']:.3f} | Directed Graph with Zipf Degrees: n={n}")


In [None]:
import networkx as nx
import numpy as np

# Parameters
n = 200  # number of degree samples drawn

np.random.seed(42)
degree_sequence = np.random.zipf(a=1.9, size=n)
# Samples >= n are discarded, so the sequence can end up shorter than n.
degree_sequence = [d for d in degree_sequence if 0 < d < n]
if sum(degree_sequence) % 2 == 1:
    degree_sequence[-1] += 1
print(degree_sequence)
# Directed configuration model with identical in- and out-degree sequences
G = nx.directed_configuration_model(degree_sequence, degree_sequence, seed=42)
# Rebuild as a simple DiGraph from the edge list, dropping self-loops.
G = nx.DiGraph([(u, v) for u, v in G.edges() if u != v])

print(G, f"number of selfloops {nx.number_of_selfloops(G)}")

data_dict = get_convergence_data_of_nest(G, min_cluster_size=1, alpha=0.03, beta=1, max_k=n, with_self_loops=False)
# Single quotes inside the f-string keep this cell valid on Python < 3.12.
plot_convergence_data(data_dict, f"spectral_radius*alpha = {data_dict['katz_alpha']*data_dict['lam_G']:.3f} | Directed Graph with Zipf Degrees: n={n}")


In [None]:
import networkx as nx
import numpy as np

# Parameters
n = 200  # number of degree samples drawn

np.random.seed(42)
degree_sequence = np.random.zipf(a=1.7, size=n)
# Samples >= n are discarded, so the sequence can end up shorter than n.
degree_sequence = [d for d in degree_sequence if 0 < d < n]
if sum(degree_sequence) % 2 == 1:
    degree_sequence[-1] += 1
print(degree_sequence)
# Directed configuration model with identical in- and out-degree sequences
G = nx.directed_configuration_model(degree_sequence, degree_sequence, seed=42)
# Rebuild as a simple DiGraph from the edge list, dropping self-loops.
G = nx.DiGraph([(u, v) for u, v in G.edges() if u != v])

print(G, f"number of selfloops {nx.number_of_selfloops(G)}")

data_dict = get_convergence_data_of_nest(G, min_cluster_size=1, alpha=0.03, beta=1, max_k=n, with_self_loops=False)
# Single quotes inside the f-string keep this cell valid on Python < 3.12.
plot_convergence_data(data_dict, f"spectral_radius*alpha = {data_dict['katz_alpha']*data_dict['lam_G']:.3f} | Directed Graph with Zipf Degrees: n={n}")


In [None]:
import networkx as nx
import numpy as np

# Parameters
n = 300  # number of nodes

# Random k-out graph: every node gets 10 out-edges, no self-loops
G = nx.random_k_out_graph(n, 10, 1, self_loops=False, seed=555)
# Rebuild as a simple DiGraph from the edge list.
G = nx.DiGraph([(u, v) for u, v in G.edges() if u != v])

print(G, f"number of selfloops {nx.number_of_selfloops(G)}")

data_dict = get_convergence_data_of_nest(G, min_cluster_size=1, alpha=0.03, beta=1, max_k=n, with_self_loops=False)
# The title now names the graph model actually used here (it was copied from
# the Zipf cells); single quotes keep the f-string valid on Python < 3.12.
plot_convergence_data(data_dict, f"spectral_radius*alpha = {data_dict['katz_alpha']*data_dict['lam_G']:.3f} | Directed Random k-Out Graph: n={n}, k=10")
