In [None]:
# import urllib.request
# url = "https://raw.githubusercontent.com/DerwenAI/disparity_filter/main/disparity.py"
# urllib.request.urlretrieve(url, "disparity.py")


('disparity.py', <http.client.HTTPMessage at 0x111c4dc50>)

In [12]:
import disparity
import pickle
import networkx as nx
import pandas as pd
import numpy as np
import powerlaw
import matplotlib.pyplot as plt

In [13]:
def graph_stats(G):
    """Returns various statistics of a given graph G."""
    stats = {}
    stats['num_nodes'] = G.number_of_nodes()
    stats['num_edges'] = G.number_of_edges()
    stats['density'] = nx.density(G)
    
    degrees = [d for n, d in G.degree()]
    stats['mean_degree'] = np.mean(degrees)
    stats['std_degree'] = np.std(degrees)
    stats['global_clustering_coeff'] = nx.transitivity(G)
    
    stats['degree_assortativity'] = nx.degree_assortativity_coefficient(G)
    
    # Giant Connected Component (GCC)
    largest_cc = max(nx.connected_components(G), key=len)
    GCC = G.subgraph(largest_cc)
    stats['GCC_size'] = len(GCC)
    
    # Power-law fitting
    try:
        fit = powerlaw.Fit(degrees)
        stats['power_law_alpha'] = fit.alpha  # Power-law exponent
        stats['power_law_xmin'] = fit.xmin  # Minimum value where power law applies
        stats['ks_test_statistic'] = fit.D  # KS test statistic (lower is better)
        
        # Compare power law with other distributions
        distribution_list = ['lognormal', 'exponential', 'truncated_power_law']
        comparison_results = {}
        
        for dist in distribution_list:
            R, p = fit.distribution_compare('power_law', dist)
            comparison_results[dist] = (R, p)
        
        # Find the best alternative distribution (lowest R and p-value)
        best_fit = min(comparison_results.items(), key=lambda x: (x[1][0], x[1][1]))
        best_dist, (best_R, best_p) = best_fit
        
        stats['best_powerlaw_comparison'] = f"power law vs {best_dist}: R = {best_R:.3f}, p = {best_p:.3f}"
    
    except Exception as e:
        stats['power_law_test_error'] = str(e)
    
    return stats

In [14]:
with open('../shared-folder-gald/data/unipartite_og.pkl', "rb") as f:
    HC = pickle.load(f)

In [5]:
# min_alpha_ptile=0.5
# min_degree=2

# alpha_measures = disparity.disparity_filter(HC)
# quantiles, num_quant = disparity.calc_alpha_ptile(alpha_measures)
# alpha_cutoff = quantiles[round(num_quant * min_alpha_ptile)]

# disparity.cut_graph(HC, min_alpha_ptile, min_degree)

In [None]:
# def disparity_filter(G, alpha=0.05):
#     """
#     Apply disparity filter to a weighted network to extract the backbone.

#     Parameters:
#         G (networkx.Graph): Weighted graph (can be directed or undirected)
#         alpha (float): Significance level for edge filtering

#     Returns:
#         backbone (networkx.Graph): Backbone network with statistically significant edges
#     """
#     if not nx.is_weighted(G):
#         raise ValueError("Graph must be weighted.")

#     is_directed = G.is_directed()
#     backbone = nx.DiGraph() if is_directed else nx.Graph()
#     backbone.add_nodes_from(G.nodes(data=True))

#     for node in G.nodes():
#         neighbors = list(G[node])
#         k = len(neighbors)
#         if k <= 1:
#             continue

#         strength = sum(G[node][nbr].get("weight", 1.0) for nbr in neighbors)

#         for nbr in neighbors:
#             w = G[node][nbr].get("weight", 1.0)
#             p_ij = w / strength
#             alpha_ij = (1 - p_ij) ** (k - 1)

#             # Test significance
#             if alpha_ij < alpha:
#                 if is_directed or not backbone.has_edge(nbr, node):
#                     backbone.add_edge(node, nbr, weight=w)

#     return backbone


In [None]:
# backbone = disparity_filter(HC, alpha=0.05)

In [20]:
edge_data = [
    {'src': u, 'trg': v, 'nij': d['weight']}
    for u, v, d in HC.edges(data=True)
]

# Convert to DataFrame
df = pd.DataFrame(edge_data)

# Save to CSV
# df.to_csv('HC_edgelist.csv', index=False)

In [None]:
import backboning as bb

table, _, _ = bb.read(df, triangular_input = True, column_of_interest = 'weight', undirected = True, sep = ",")

TypeError: argument of type 'method' is not iterable

In [21]:
disparity_applied = bb.disparity_filter(df, undirected = True)

Calculating DF score...
  table_sum = table.groupby(table["src"]).sum().reset_index()
  table = table.drop("edge", 1)
  table = table.drop("score_min", 1)
  table = table.drop("variance_max", 1)


In [38]:
thresh_applied = bb.thresholding(disparity_applied, threshold=0.95).drop(columns=["score"])

In [42]:
backbone = nx.from_pandas_edgelist(
    thresh_applied,
    source="src",
    target="trg",
    edge_attr="nij",
    create_using=nx.Graph()  # ensures it's undirected
)

In [43]:
HC_stats = graph_stats(backbone)

Calculating best minimal value for power law fit
xmin progress: 99%

Assuming nested distributions


In [44]:
for key, val in HC_stats.items():
    print(key,'-', val)

num_nodes - 93923
num_edges - 1578479
density - 0.00035787347976596543
mean_degree - 33.61219296657901
std_degree - 152.96947193302572
global_clustering_coeff - 0.11255418217654023
degree_assortativity - -0.06542236704138689
GCC_size - 92258
power_law_alpha - 2.4141039457565894
power_law_xmin - 141.0
ks_test_statistic - 0.01988228975623818
best_powerlaw_comparison - power law vs truncated_power_law: R = -14.319, p = 0.000
