## Note: on detailed review, this algorithm appears to be broken in SecGraph as well, and it is not straightforward to fix.

In [None]:
import networkx as nx
import numpy as np
from scipy.optimize import linear_sum_assignment
import numpy as np
import networkx as nx
from scipy.linalg import pinv
from anonymigraph.anonymization.random_edge_add_del import RandomEdgeAddDelAnonymizer

class CachedShortestPath:
    """Lazily memoised shortest-path lengths, cached per source node.

    The first query for a given ``source`` computes the hop distances from
    that source to every node it can reach (via
    ``nx.single_source_shortest_path_length``) and stores the result, so
    later queries from the same source are plain dictionary lookups.

    Attributes:
        cache: Maps a source node to its ``{target: distance}`` dictionary.
        G: The graph that distances are measured in.
    """

    def __init__(self, G):
        self.cache = {}
        self.G = G

    def distance(self, source, target):
        """Return the shortest-path length from ``source`` to ``target``.

        Returns ``np.inf`` when ``target`` does not appear in the distances
        computed for ``source`` (i.e. it is not reachable from ``source``).
        """
        if source not in self.cache:
            self.cache[source] = nx.single_source_shortest_path_length(self.G, source)

        # ``np.Inf`` was removed in NumPy 2.0; ``np.inf`` works on every version.
        return self.cache[source].get(target, np.inf)

class CachedResistanceDistance:
    """Resistance distances between nodes, backed by a lazily computed matrix.

    The pseudoinverse of the adjusted Laplacian ``L + 1/n`` (``1/n`` added to
    every entry) is computed once on first use; individual pairwise distances
    are then derived from it and memoised.

    Attributes:
        G: The graph that distances are measured in.
        L_pinv: Pseudoinverse of the adjusted Laplacian, or ``None`` until it
            has been computed.
        cache: Maps ``(source, target)`` to the resistance distance already
            computed for that pair.
    """

    def __init__(self, G):
        self.G = G
        self.L_pinv = None  # Pseudoinverse of the adjusted Laplacian (lazy)
        self.cache = {}  # (source, target) -> resistance distance
        self._node_index = None  # node label -> row/column in L_pinv (lazy)

    def compute_laplacian_pseudoinverse(self):
        """Compute and store the pseudoinverse of the adjusted Laplacian.

        Also records which matrix row/column belongs to which node, so that
        ``distance`` works for arbitrary node labels and not only for graphs
        whose nodes happen to be labelled ``0..n-1`` in iteration order.
        """
        nodelist = list(self.G.nodes())
        n = len(nodelist)
        # Pass the node list explicitly so rows/columns are guaranteed to be
        # in the same order as the index map built below.
        L = nx.laplacian_matrix(self.G, nodelist=nodelist).toarray()
        adjusted_L = L + 1 / n
        self._node_index = {node: position for position, node in enumerate(nodelist)}
        self.L_pinv = pinv(adjusted_L)

    def distance(self, source, target):
        """Return the resistance distance between ``source`` and ``target``.

        Computed as ``P[i, i] + P[j, j] - 2 * P[i, j]`` where ``P`` is
        ``L_pinv`` and ``i``/``j`` are the matrix positions of the two nodes.
        Results are cached; the cache is consulted in both orientations
        because the value is symmetric in its arguments.
        """
        # Ensure the pseudoinverse and the node -> index map are available.
        if self.L_pinv is None or self._node_index is None:
            self.compute_laplacian_pseudoinverse()

        if (source, target) in self.cache:
            return self.cache[(source, target)]
        if (target, source) in self.cache:
            return self.cache[(target, source)]

        # Translate node labels into matrix positions before indexing.
        i = self._node_index[source]
        j = self._node_index[target]
        R_ij = self.L_pinv[i, i] + self.L_pinv[j, j] - 2 * self.L_pinv[i, j]
        self.cache[(source, target)] = R_ij
        return R_ij



def bayesian_using_cos_sim(G, Ga):
    """Match nodes of ``G`` to nodes of ``Ga`` by iterative fingerprint assignment.

    Both node sets are ranked by descending degree. Each round looks at the
    top ``n`` nodes of each graph (``n`` starts at 2 and doubles every round,
    capped at the number of nodes) and:

    1. builds a fingerprint per candidate: its degree followed by its
       shortest-path distance to every current anchor of its own graph;
    2. scores all candidate pairs with
       ``vectorized_compute_score_matrix_euclidean`` (despite this function's
       name, the active scoring is a squared Euclidean distance, where lower
       is better);
    3. solves the minimum-cost assignment with ``linear_sum_assignment``;
    4. keeps the better-scoring half of the matched pairs as anchors for the
       next round.

    The loop stops once a round has matched every node.

    Args:
        G: The reference graph.
        Ga: The graph whose nodes are to be re-identified.

    Returns:
        dict: Maps each node of ``G`` to the node of ``Ga`` it was matched
        with in the final round.

    Raises:
        ValueError: If the two graphs have different numbers of nodes.
    """
    if G.number_of_nodes() != Ga.number_of_nodes():
        raise ValueError("Graph node sizes do not match")

    sorted_G_nodes = sorted(G.nodes(), key=lambda x: G.degree(x), reverse=True)
    sorted_Ga_nodes = sorted(Ga.nodes(), key=lambda x: Ga.degree(x), reverse=True)

    n = 2
    g1_matched, g2_matched = [], []
    g1_anchor, g2_anchor = [], []

    G_lengths_cache = CachedShortestPath(G)
    # Distances for Ga's candidates must be measured in Ga itself; building
    # this cache from G compared Ga's nodes using the wrong graph.
    Ga_lengths_cache = CachedShortestPath(Ga)

    while len(g1_matched) < G.number_of_nodes():
        g1_nodes_consider = sorted_G_nodes[:n]
        g2_nodes_consider = sorted_Ga_nodes[:n]

        print("cand g1", g1_nodes_consider)
        print("cand g2", g2_nodes_consider)

        g1_cand_finger = get_candidate_fingerprint(g1_anchor, g1_nodes_consider, G, G_lengths_cache)
        g2_cand_finger = get_candidate_fingerprint(g2_anchor, g2_nodes_consider, Ga, Ga_lengths_cache)

        # NOTE(review): an anchor that is unreachable from a candidate puts
        # inf into the fingerprint, which presumably turns the standardised
        # scores into NaN and makes the assignment step fail - confirm on
        # disconnected inputs.
        score_matrix = vectorized_compute_score_matrix_euclidean(
            g1_nodes_consider, g2_nodes_consider, g1_cand_finger, g2_cand_finger
        )
        # row_ind[i] is matched with col_ind[i].
        row_ind, col_ind = linear_sum_assignment(score_matrix)
        print("row_ind", row_ind)
        print("col_ind", col_ind)

        # Best (lowest-cost) pairs first; sorted() is stable, like list.sort().
        matches = sorted(
            ((row, col, score_matrix[row, col]) for row, col in zip(row_ind, col_ind)),
            key=lambda match: match[2],
        )

        # Matches and anchors are rebuilt from scratch every round.
        g1_matched, g2_matched = [], []
        g1_anchor, g2_anchor = [], []
        for i, (g1_idx, g2_idx, _score) in enumerate(matches):
            g1_matched.append(g1_nodes_consider[g1_idx])
            g2_matched.append(g2_nodes_consider[g2_idx])

            # Only the better-scoring half is used as anchors next round.
            if i < len(matches) // 2:
                g1_anchor.append(g1_nodes_consider[g1_idx])
                g2_anchor.append(g2_nodes_consider[g2_idx])

        n = min(n * 2, G.number_of_nodes())

    return dict(zip(g1_matched, g2_matched))

def normalize(matrix):
    """Scale each entry by the geometric mean of its row total and column total.

    Entry ``(i, j)`` is divided by ``sqrt(row_sum[i] * col_sum[j])``.
    """
    row_totals = matrix.sum(axis=1)
    col_totals = matrix.sum(axis=0)
    scale = np.sqrt(np.outer(row_totals, col_totals))
    return matrix / scale

def get_candidate_fingerprint(anchors, nodes_consider, G, G_lengths_cache):
    """Build a numeric fingerprint for every candidate node.

    A fingerprint is a ``float64`` vector holding the candidate's degree in
    ``G`` followed by its distance to each anchor (in anchor order), where
    distances come from ``G_lengths_cache.distance(candidate, anchor)``.

    Returns:
        dict: Maps each node in ``nodes_consider`` to its fingerprint array.
    """
    return {
        candidate: np.array(
            [
                G.degree(candidate),
                *(G_lengths_cache.distance(candidate, anchor) for anchor in anchors),
            ],
            dtype=np.float64,
        )
        for candidate in nodes_consider
    }

import numpy as np


def vectorized_compute_score_matrix_euclidean(g1_nodes, g2_nodes, g1_finger, g2_finger):
    """Pairwise squared Euclidean distances between standardised fingerprints.

    The fingerprints of both node lists are stacked and each feature column
    is standardised with the mean and standard deviation of the pooled data
    (a zero standard deviation is replaced by 1 so constant columns do not
    divide by zero). Entry ``(i, j)`` of the result is the squared distance
    between the standardised fingerprints of ``g1_nodes[i]`` and
    ``g2_nodes[j]``; no square root is taken.
    """
    rows_g1 = np.array([g1_finger[node] for node in g1_nodes])
    rows_g2 = np.array([g2_finger[node] for node in g2_nodes])

    # Column statistics are taken over both graphs together so that the two
    # sides are put on exactly the same scale.
    pooled = np.concatenate([rows_g1, rows_g2], axis=0)
    centre = pooled.mean(axis=0)
    spread = pooled.std(axis=0)
    spread = np.where(spread == 0, 1, spread)

    z_g1 = (rows_g1 - centre) / spread
    z_g2 = (rows_g2 - centre) / spread

    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b, evaluated for all pairs at once.
    sq_norms_g1 = np.sum(z_g1**2, axis=1, keepdims=True)      # column vector
    sq_norms_g2 = np.sum(z_g2**2, axis=1, keepdims=True).T    # row vector
    cross_term = (2 * z_g1) @ z_g2.T

    return (sq_norms_g1 + sq_norms_g2) - cross_term


def vectorized_compute_score_matrix_cosine(g1_nodes, g2_nodes, g1_finger, g2_finger):
    """Pairwise cosine similarity between the fingerprints of two node lists.

    Entry ``(i, j)`` is the cosine of the angle between the fingerprints of
    ``g1_nodes[i]`` and ``g2_nodes[j]``. A zero-length fingerprint has its
    norm replaced by 1, which makes all of its similarities come out as 0.
    """
    rows_g1 = np.array([g1_finger[node] for node in g1_nodes])
    rows_g2 = np.array([g2_finger[node] for node in g2_nodes])

    lengths_g1 = np.linalg.norm(rows_g1, axis=1, keepdims=True)
    lengths_g2 = np.linalg.norm(rows_g2, axis=1, keepdims=True)
    # Guard against division by zero for all-zero fingerprints.
    lengths_g1 = np.where(lengths_g1 == 0, 1, lengths_g1)
    lengths_g2 = np.where(lengths_g2 == 0, 1, lengths_g2)

    inner_products = rows_g1 @ rows_g2.T
    length_products = lengths_g1 @ lengths_g2.T
    return inner_products / length_products


def compute_score_matrix(g1_nodes, g2_nodes, g1_finger, g2_finger):
    """Fill a square matrix with pairwise cosine similarities of fingerprints.

    The matrix side is the larger of the two node counts, so when the lists
    differ in length the surplus rows or columns stay at zero. Entry
    ``(i, j)`` is ``cosine_similarity`` of the fingerprints of
    ``g1_nodes[i]`` and ``g2_nodes[j]``.
    """
    side = max(len(g1_nodes), len(g2_nodes))
    scores = np.zeros((side, side))
    for row, left_node in enumerate(g1_nodes):
        for col, right_node in enumerate(g2_nodes):
            left_vec = g1_finger[left_node]
            right_vec = g2_finger[right_node]
            scores[row, col] = cosine_similarity(left_vec, right_vec)
    return scores

def cosine_similarity(v1, v2):
    """Return the cosine of the angle between ``v1`` and ``v2``.

    If either vector has zero length the similarity is defined as ``0``
    instead of dividing by zero.
    """
    inner = np.dot(v1, v2)
    length_1 = np.linalg.norm(v1)
    length_2 = np.linalg.norm(v2)

    if length_1 != 0 and length_2 != 0:
        return inner / (length_1 * length_2)
    return 0

import networkx as nx
import random

def sample_edges(G, s, seed=10):
    """Draw two independent edge samples of ``G`` over the full node set.

    Both returned graphs contain every node of ``G``. Each edge of ``G`` is
    copied into the first graph with probability ``s`` and, by a separate
    draw, into the second graph with probability ``s``.

    Note that this seeds the module-level ``random`` generator, so it also
    affects any later use of ``random`` by the caller.

    Returns:
        tuple: ``(G1, G2)``, the two sampled ``nx.Graph`` instances.
    """
    random.seed(seed)

    first_sample, second_sample = nx.Graph(), nx.Graph()
    for sample in (first_sample, second_sample):
        sample.add_nodes_from(G.nodes())

    for edge in G.edges():
        # Two draws per edge, first-sample draw before second-sample draw.
        keep_in_first = random.random() < s
        keep_in_second = random.random() < s
        if keep_in_first:
            first_sample.add_edge(*edge)
        if keep_in_second:
            second_sample.add_edge(*edge)

    return first_sample, second_sample

def shuffle_and_sort_graph(Ga):
    """
    Randomly permutes the node labels of Ga and rebuilds the graph with its
    nodes inserted in sorted label order.

    The permutation is drawn with the module-level ``random`` generator, so
    it depends on that generator's current state.

    Parameters:
    Ga (nx.Graph): The graph whose labels are to be permuted.

    Returns:
    tuple: ``(Ga_sorted, label_mapping)`` where ``Ga_sorted`` is a new
    nx.Graph carrying the permuted labels (nodes added in sorted order, edges
    and their data copied over) and ``label_mapping`` is a dict from each
    original label to the label it received.
    """
    permuted_labels = list(Ga.nodes())
    random.shuffle(permuted_labels)
    label_mapping = dict(zip(Ga.nodes(), permuted_labels))

    relabelled = nx.relabel_nodes(Ga, label_mapping)

    # Re-insert nodes in sorted order so that node iteration order no longer
    # reflects the original ordering.
    Ga_sorted = nx.Graph()
    Ga_sorted.add_nodes_from(sorted(relabelled.nodes(data=True)))
    Ga_sorted.add_edges_from(relabelled.edges(data=True))
    return Ga_sorted, label_mapping

import networkx as nx
import cProfile

def main():
    """Run a small re-identification experiment and print the accuracies.

    Generates a Barabasi-Albert graph, derives two edge-sampled copies,
    shuffles the labels of the second copy, and reports how many nodes the
    fingerprint matcher and a plain degree-rank baseline re-identify.
    """
    G = nx.barabasi_albert_graph(30, 4, seed=10)
    # Alternative input: nx.erdos_renyi_graph(1000, 12/1000)
    degree_values = [deg for _, deg in G.degree()]
    max_degree = max(degree_values)
    mean_degree = sum(degree_values) / len(degree_values)
    print(f"Max Degree: {max_degree}, Mean Degree: {mean_degree:.2f}")

    # Alternative perturbation (currently unused):
    #   RandomEdgeAddDelAnonymizer(m=int(1/100*G.number_of_edges())).anonymize(G, random_seed=10)

    # Generate the sampled graphs G1 and G2.
    # With s_square = 2 the sampling probability exceeds 1, so every edge is
    # kept in both copies.
    s_square = 2
    G, Ga = sample_edges(G, np.sqrt(s_square), seed=10)

    Ga, re_id_truth = shuffle_and_sort_graph(Ga)

    # Output the mapping of original labels to shuffled labels.
    print("Ground Truh Re-Identification:", re_id_truth)

    def count_correct(matching, nodes):
        # Number of the given G-nodes that were mapped to their true label.
        return sum(matching[node] == re_id_truth[node] for node in nodes)

    match_bayesian = bayesian_using_cos_sim(G, Ga)
    print(
        "accuracy reidentification bayesian",
        count_correct(match_bayesian, re_id_truth) / len(re_id_truth),
    )

    k = 5
    top_k_nodes = sorted(G.degree(), key=lambda pair: pair[1], reverse=True)[:k]
    print(
        f"accuracy reidentification bayesian top {k}",
        count_correct(match_bayesian, (node for node, _ in top_k_nodes)) / k,
    )

    # Baseline: pair nodes purely by their rank in the degree ordering.
    sorted_G_nodes = sorted(G.nodes(), key=lambda x: G.degree(x), reverse=True)
    sorted_Ga_nodes = sorted(Ga.nodes(), key=lambda x: Ga.degree(x), reverse=True)
    match_degree = dict(zip(sorted_G_nodes, sorted_Ga_nodes))
    print(
        "accuracy reidentification degree",
        count_correct(match_degree, re_id_truth) / len(re_id_truth),
    )

# Run the experiment as soon as this cell/script is executed.
main()
# To profile the run instead, use:
#cProfile.run('main()', 'profile_output.prof')