In [1]:
import requests
import gzip
import networkx as nx
import io

def get_hepph_graph():
    # Step 1: Download the .gz file
    url = 'https://snap.stanford.edu/data/cit-HepPh.txt.gz'
    response = requests.get(url)
    response.raise_for_status()  # Check that the request was successful

    # Step 2: Unzip the file
    with gzip.open(io.BytesIO(response.content), 'rt') as f:
        # Step 3: Parse the contents and create the graph
        G = nx.DiGraph()
        for line in f:
            if line.startswith('#'):
                continue  # Skip comment lines
            from_node, to_node = line.strip().split()
            G.add_edge(from_node, to_node)

    return G

# Example usage:
hepph_graph = get_hepph_graph()

mapping = {old_label: new_label for new_label, old_label in enumerate(hepph_graph.nodes())}
G = nx.relabel_nodes(hepph_graph, mapping)

print(G)


DiGraph with 34546 nodes and 421578 edges


In [2]:
assert sorted(G.nodes) == list(range(G.number_of_nodes()))

In [3]:
from scipy.sparse.linalg import eigs
import numpy as np

from randcolorgraphs.utils.calculate_katz import calculate_katz

def get_adj_and_katz(G, alpha=0.1, beta=1.0):
    A_G = nx.adjacency_matrix(G).astype(np.float64)

    eigenvalues, _ = eigs(A_G, k=1, which='LM')  # 'LM': Largest Magnitude, tol is tolerance
    spectral_radius_G = np.abs(eigenvalues).max()
    print(f"Spectral Radius: {spectral_radius_G}, alpha_max = {1/spectral_radius_G}")
    print("calculating katz centrality")
    katz_centrality = calculate_katz(A_G, alpha = alpha, beta=beta)
    print("done calculating katz")

    # Reorder nodes based on Katz centrality
    sorted_indices = np.argsort(katz_centrality)
    sorted_nodes = np.array(G.nodes())[sorted_indices]

    # Generate the adjacency matrix
    adj_matrix = nx.adjacency_matrix(G, nodelist=sorted_nodes)

    sorted_katz_vector = katz_centrality[sorted_indices]

    assert np.allclose(sorted_katz_vector, calculate_katz(adj_matrix.astype(np.float64), alpha = alpha, beta=beta))
    return adj_matrix, sorted_katz_vector

In [4]:
A_G, katz_centrality = get_adj_and_katz(G, alpha = 0.03)
A_G_dense = A_G.todense()

print("katz vector mean and max", np.mean(katz_centrality), np.max(katz_centrality))

Spectral Radius: 5.662920319737484, alpha_max = 0.17658733366150506
calculating katz centrality
Katz converged after 25 iterations.
done calculating katz
Katz converged after 25 iterations.
katz vector mean and max 1.649706382561998 28.68418325734212


In [5]:
import kmeans1d

from randcolorgraphs.objectives.get_cluster_loss_ell_sqr import get_cluster_loss_ell_sqr
from randcolorgraphs.objectives.get_expected_edgeoverlap import get_expected_edgeoverlap
from randcolorgraphs.algorithms.linear_scalarization.optimal_contiguous.optimal_contiguous_linear_scalarization_algo import optimal_contiguous_linear_scalarization_algo
import pandas as pd

from fast_pareto import is_pareto_front

def evaluate_cluster_assignment(katz_centrality, A, clusters):
    ell_sqr = get_cluster_loss_ell_sqr(katz_centrality, clusters)
    expected_edge_overlap = get_expected_edgeoverlap(A, clusters)
    return [ell_sqr, expected_edge_overlap]

### Analyze all possible clusters for their performance regarding specific merics and booleans
df = pd.DataFrame(columns=["ell_sqr", "expected_edge_overlap", "method_type"])

for k in range(1, 20, 1):
    print(f"Doing k={k}")
    colors, _ = kmeans1d.cluster(katz_centrality, k)
    clusters = np.array(colors)
    df.loc[len(df)] = evaluate_cluster_assignment(katz_centrality, A_G, clusters) + [f"Kmeans1d (k={k}) on Katz centrality"]

for k in range(35, 700, 30):
    print(f"Doing k={k}")
    colors, _ = kmeans1d.cluster(katz_centrality, k)
    clusters = np.array(colors)
    df.loc[len(df)] = evaluate_cluster_assignment(katz_centrality, A_G, clusters) + [f"Kmeans1d (k={k}) on Katz centrality"]

Doing k=1
Doing k=2
Doing k=3
Doing k=4
Doing k=5
Doing k=6
Doing k=7
Doing k=8
Doing k=9
Doing k=10
Doing k=11
Doing k=12
Doing k=13
Doing k=14
Doing k=15
Doing k=16
Doing k=17
Doing k=18
Doing k=19
Doing k=35
Doing k=65
Doing k=95
Doing k=125
Doing k=155
Doing k=185
Doing k=215
Doing k=245
Doing k=275
Doing k=305
Doing k=335
Doing k=365
Doing k=395
Doing k=425
Doing k=455
Doing k=485
Doing k=515
Doing k=545
Doing k=575
Doing k=605
Doing k=635
Doing k=665
Doing k=695


In [6]:

# OUT OF MEMORY
#for w in np.exp(-np.linspace(-1,12,3)):
#    clusters = optimal_contiguous_linear_scalarization_algo(katz_centrality, A_G_dense, w)
#    print(f"Done w={w}")
#    df.loc[len(df)] = evaluate_cluster_assignment(katz_centrality, A_G, clusters) + [f"Opt. Cont. Scal. (w={w})"]


In [7]:
#from greedy_algo_fast import greedy_search
#import scipy.sparse as sp
#
#def get_edge_vector_from_adj_matrix(adj_matrix):
#    sparse_matrix = sp.csr_matrix(adj_matrix)
#    row_indices, col_indices = sparse_matrix.nonzero()
#    edge_vector = np.vstack((row_indices, col_indices)).T
#    return edge_vector
#
#edges = get_edge_vector_from_adj_matrix(A_G)
#
#for w in np.exp(-np.linspace(-1,12,3)):
#    clusters, _ = greedy_search(katz_centrality, edges, np.array([0]*len(katz_centrality)), w=w, max_interaction_dist=1, max_iter=400)
#    print(f"Done w={w}")
#    df.loc[len(df)] = evaluate_cluster_assignment(katz_centrality, A_G, clusters) + [f"Greedy Search (w={w})"]

In [21]:
import pandas as pd

from greedy_algo_fast import greedy_search
import scipy.sparse as sp


def find_optimal_start_k(df, w):
    kmeans_rows = df[df['method_type'].str.contains(r'Kmeans1d \(k=')]
    obj = kmeans_rows['ell_sqr'] + w * kmeans_rows['expected_edge_overlap']
    min_index = obj.idxmin()
    method_type = kmeans_rows.loc[min_index, 'method_type']
    k = int(method_type.split('Kmeans1d (k=')[1].split(')')[0])
    return k

def get_edge_vector_from_adj_matrix(adj_matrix):
    sparse_matrix = sp.csr_matrix(adj_matrix)
    row_indices, col_indices = sparse_matrix.nonzero()
    edge_vector = np.vstack((row_indices, col_indices)).T
    return edge_vector

edges = get_edge_vector_from_adj_matrix(A_G)

for w in [0.005, 0.0005, 0.00005]:
    optimal_k = find_optimal_start_k(df, w)

    print(f'The optimal k for (w={w}) is: {optimal_k}')
    colors, _ = kmeans1d.cluster(katz_centrality, k)
    clusters = np.array(colors)

    print("Doing dist=1")
    clusters, _ = greedy_search(katz_centrality, edges, clusters, w=w, max_interaction_dist=1, max_iter=10_000)
    print("Doing dist=2")
    clusters, _ = greedy_search(katz_centrality, edges, clusters, w=w, max_interaction_dist=2, max_iter=10_000)
    print("Doing dist=3")
    clusters, _ = greedy_search(katz_centrality, edges, clusters, w=w, max_interaction_dist=2, max_iter=10_000)
    print(f"Done w={w}")
    df.loc[len(df)] = evaluate_cluster_assignment(katz_centrality, A_G, clusters) + [f"Warm Start Greedy Search (w={w})"]

The optimal k for (w=0.005) is: 125
Doing dist=1
!!!
TODO THIS CODE ISN'T WELL TESTED CLUSTERS ARE ALMOST NEVER EXECUTED!!! Cluster 591 that was deleted!
!!!
Iteration 0 Objective: 55.0127853087929 move_type merge-split expected_edge_overlap 10987.644542161821
!!!
TODO DELETING CLUSTERS: THIS CODE ISN'T WELL TESTED CLUSTERS ARE ALMOST NEVER EXECUTED!!!
!!!
!!!
TODO THIS CODE ISN'T WELL TESTED CLUSTERS ARE ALMOST NEVER EXECUTED!!! Cluster 588 that was deleted!
!!!
Iteration 1 Objective: 54.386042130841446 move_type merge-split expected_edge_overlap 10862.144542161821
!!!
TODO DELETING CLUSTERS: THIS CODE ISN'T WELL TESTED CLUSTERS ARE ALMOST NEVER EXECUTED!!!
!!!
!!!
TODO THIS CODE ISN'T WELL TESTED CLUSTERS ARE ALMOST NEVER EXECUTED!!! Cluster 600 that was deleted!
!!!
Iteration 2 Objective: 53.91801873937263 move_type merge-split expected_edge_overlap 10768.44454216182
!!!
TODO DELETING CLUSTERS: THIS CODE ISN'T WELL TESTED CLUSTERS ARE ALMOST NEVER EXECUTED!!!
!!!
!!!
TODO THIS CODE 

In [22]:
from utils.colour_refinement_out_neigh_algorithm import colour_refinement_out_neigh_algorithm

for d in range(3):
    clusters = colour_refinement_out_neigh_algorithm(A_G, d)
    print(f"Done depth {d}")
    df.loc[len(df)] = evaluate_cluster_assignment(katz_centrality, A_G, clusters) + [f"Out-NeSt Model (d={d})"]


Done depth 0
Done depth 1
Done depth 2


In [23]:

dfplot = df.copy()
dfplot["is_pareto_front"] = is_pareto_front(df[['ell_sqr', 'expected_edge_overlap']].values)

In [24]:
import pandas as pd
import plotly.express as px

def get_method_type_name(method_type):
    if 'Kmeans' in method_type:
        return 'Kmeans'
    elif 'Warm Start Greedy Search' in method_type:
        return 'Warm Start Greedy Search'
    elif 'Greedy Search' in method_type:
        return 'Greedy Search'
    elif 'Opt. Cont. Scal.' in method_type:
        return 'Opt. Cont. Scal.'
    elif 'Out-NeSt Model' in method_type:
        return 'Out-NeSt Model'
    else:
        return 'Other'

dfplot['method_type_name'] = dfplot['method_type'].apply(get_method_type_name)
dfplot['(Expected Edge Overlap)/|E|'] = dfplot['expected_edge_overlap'] / G.number_of_edges()

# Determining shape based on is_pareto_front
dfplot['shape'] = dfplot['is_pareto_front'].apply(lambda x: 'circle-open' if x else 'square-open')

# Plotting with Plotly
fig = px.scatter(
    dfplot,
    x='ell_sqr',
    y='(Expected Edge Overlap)/|E|',
    hover_data=['method_type'],
    symbol='shape',
    color='method_type_name',
    symbol_sequence=['circle-open', 'square-open']
)

fig.update_traces(marker=dict(size=10, line=dict(width=2)))

# Updating layout for legend
fig.update_layout(
    height=800,
    legend_title_text='Method Type',
    legend=dict(
        title='Method Type',
        itemsizing='constant',
        traceorder='normal',
        font=dict(size=10)
    )
)

fig.show()


In [25]:

fig.update_xaxes(type='log', tickformat=".0e")
fig.show()