In [1]:
import networkx as nx
import pickle
import os
from collections import Counter
from collections import defaultdict
import numpy as np

In [None]:
original_graph = nx.read_edgelist()

In [2]:
def create_dict_of_pkl(folder):
    ret = {}
    for filename in os.listdir(folder):
        if filename.endswith('.pkl'):
            filepath = os.path.join(folder, filename)
            with open(filepath, 'rb') as f:
                o = pickle.load(f)
                ret[filename] = o
    return ret

In [3]:
graphs = create_dict_of_pkl('../data/trimmed_networks_pkl')
for graph_name, graph in graphs.items():
    print(f"Graph: {graph_name}, Nodes: {graph.number_of_nodes()}, Edges: {graph.number_of_edges()}")

graph_data = create_dict_of_pkl('../data/communities_and_modularity')

Graph: 1_resource_allocation_naive.pkl, Nodes: 9213, Edges: 15588
Graph: 1_simple_disparity_filter.pkl, Nodes: 16075, Edges: 443701
Graph: 2_resource_allocation_naive.pkl, Nodes: 12770, Edges: 42691
Graph: 2_simple_disparity_filter.pkl, Nodes: 14911, Edges: 336278


In [4]:
for name,graph in graphs.items():
    print(f"Density for {name}: {nx.density(graph)}")

Density for 1_resource_allocation_naive.pkl: 0.0003673376068732571
Density for 1_simple_disparity_filter.pkl: 0.0034343571556976664
Density for 2_resource_allocation_naive.pkl: 0.0005236227887221726
Density for 2_simple_disparity_filter.pkl: 0.0030251299674289227


In [45]:
for name,graph in graphs.items():
    if name == "2_simple_disparity_filter.pkl":
        components = nx.connected_components(graph)
        largest_component_nodes = max(components, key=len)
        # Create a subgraph of the largest connected component sp diameter could be observed
        largest_component_subgraph = graph.subgraph(largest_component_nodes).copy()

        print(f"Diameter for {name}: {nx.diameter(largest_component_subgraph)}")

#Diameter for 1_resource_allocation_naive.pkl: 53
#Diameter for 1_simple_disparity_filter.pkl: 12
#Diameter for 2_resource_allocation_naive.pkl: 21

Diameter for 2_simple_disparity_filter.pkl: 13


In [4]:
for name, graph in graphs.items():
    graph_nijs = [edge[2]["nij"] for edge in graph.edges(data=True)]
    graph_scores = [edge[2]["score"] for edge in graph.edges(data=True)]
    print(f"average nij for {name}: {np.average(graph_nijs)}")
    print(f"average score for {name}: {np.average(graph_scores)}")
    print()

average nij for 1_resource_allocation_naive.pkl: 0.13249297032906915
average score for 1_resource_allocation_naive.pkl: 0.06624648516453455

average nij for 1_simple_disparity_filter.pkl: 1.6234108104331522
average score for 1_simple_disparity_filter.pkl: 0.6423426981327841

average nij for 2_resource_allocation_naive.pkl: 0.07882542830326944
average score for 2_resource_allocation_naive.pkl: 0.0394127141516347

average nij for 2_simple_disparity_filter.pkl: 1.8015570450639053
average score for 2_simple_disparity_filter.pkl: 0.6740094634135435



In [85]:
original_graph = nx.read_edgelist("../projections_with_backbonings/projections/simple.csv",delimiter=",")

FileNotFoundError: [Errno 2] No such file or directory: '../projections_with_backbonings/projections/simple.csv'

In [None]:
graph = nx.read_edgelist("../projections_with_backbonings/projections/resource_allocation.csv")
components = nx.connected_components(graph)
largest_component_nodes = max(components, key=len)
# Create a subgraph of the largest connected component sp diameter could be observed
largest_component_subgraph = graph.subgraph(largest_component_nodes).copy()
name = "original projected graph"
print(f"Diameter for {name}: {nx.diameter(largest_component_subgraph)}")

FileNotFoundError: [Errno 2] No such file or directory: '../projections_with_backbonings/projections/resource_allocation.csv'

In [4]:
def calculate_average_purity(graph, discovered_communities, ground_truth_communities):
    # Create a mapping of nodes to ground truth labels
    ground_truth_labels = {}
    for i, ground_truth_community in enumerate(ground_truth_communities):
        for node in ground_truth_community:
            ground_truth_labels[node] = i  # Assign an integer label to each ground truth community

    total_purity = 0
    total_nodes = len(graph.nodes)

    print(f"ground truth labels: {ground_truth_labels}")
    print(len(discovered_communities))
    # Calculate purity for each discovered community
    for ci, community in enumerate(discovered_communities):
        # Get ground truth labels for nodes in this discovered community
        labels = [ground_truth_labels[node] for node in community if node in ground_truth_labels]
        if not labels:
            continue

        print(f"labels for community {ci}: {labels}")

        label_counts = Counter(labels)
        max_label_count = max(label_counts.values())

        # Purity for this community
        community_purity = max_label_count / len(community)

        # Weight by the size of the community
        total_purity += len(community) * community_purity

    # Average purity
    average_purity = total_purity / total_nodes
    return average_purity


In [4]:
def calculate_average_gini_impurity(graph, discovered_communities, ground_truth_communities):
    # Create a mapping of nodes to ground truth labels
    ground_truth_labels = {}
    for i, ground_truth_community in enumerate(ground_truth_communities):
        for node in ground_truth_community:
            ground_truth_labels[node] = i  # Assign an integer label to each ground truth community
    total_gini_impurity = 0
    total_nodes = len(graph.nodes)

    # Calculate Gini impurity for each discovered community
    for community in discovered_communities:
        # Get ground truth labels for nodes in this discovered community
        labels = [ground_truth_labels[node] for node in community if node in ground_truth_labels]
        if not labels:
            continue

        # Calculate label frequencies
        label_counts = Counter(labels)
        community_size = len(community)
        proportions = [count / community_size for count in label_counts.values()]

        # Gini impurity for this community
        gini_impurity = 1 - sum(p**2 for p in proportions)

        # Weight by the size of the community
        total_gini_impurity += len(community) * gini_impurity

    # Average Gini impurity
    average_gini_impurity = total_gini_impurity / total_nodes
    return average_gini_impurity

In [9]:
from collections import Counter

def calculate_average_gini_impurity_normal(graph, discovered_communities, ground_truth_communities):
    # Create a mapping of nodes to ground truth labels
    ground_truth_labels = {}
    for i, ground_truth_community in enumerate(ground_truth_communities):
        for node in ground_truth_community:
            ground_truth_labels[node] = i  # Assign an integer label to each ground truth community

    total_gini_impurity = 0
    total_nodes = len(graph.nodes)

    # Calculate Gini impurity for each discovered community
    for community in discovered_communities:
        # Get ground truth labels for nodes in this discovered community
        labels = [ground_truth_labels[node] for node in community if node in ground_truth_labels]
        if not labels:
            continue

        # Calculate label frequencies
        label_counts = Counter(labels)
        community_size = len(community)
        proportions = [count / community_size for count in label_counts.values()]

        # Gini impurity for this community
        gini_impurity = 1 - sum(p**2 for p in proportions)

        # Normalize the Gini impurity by its maximum value
        k = len(label_counts)  # Number of unique labels
        max_gini_impurity = 1 - 1 / k if k > 1 else 1  # Avoid division by zero for single-label communities
        normalized_gini_impurity = gini_impurity / max_gini_impurity if max_gini_impurity > 0 else 0

        # Weight by the size of the community
        total_gini_impurity += len(community) * normalized_gini_impurity

    # Average Gini impurity
    average_gini_impurity = total_gini_impurity / total_nodes
    return average_gini_impurity


In [5]:
def get_ground_truth_communities(graph, label_attribute):
    communities = defaultdict(set)

    # Iterate over nodes and group them by the label
    for node, data in graph.nodes(data=True):
        if label_attribute in data:
            label = data[label_attribute]
            communities[label].add(node)
        else:
            raise ValueError(f"Node {node} does not have the attribute '{label_attribute}'.")

    # Convert the grouped nodes into a list of sets
    return list(communities.values())

In [68]:
import networkx as nx
import numpy as np

def assortativity_null_test(graph, attribute=None, num_nulls=100):
    if attribute:
        observed_assortativity = nx.attribute_assortativity_coefficient(graph, attribute)
    else:
        observed_assortativity = nx.degree_assortativity_coefficient(graph)
    
    null_assortativities = []
    for _ in range(num_nulls):
        # Use configuration model to generate a null model preserving degree distribution
        null_graph = nx.configuration_model([d for _, d in graph.degree()])
        
        # Convert back to a simple graph (removes multi-edges and self-loops)
        null_graph = nx.Graph(null_graph)
        null_graph.remove_edges_from(nx.selfloop_edges(null_graph))

        # Assign the attributes from the original graph to the null model
        for node in null_graph.nodes():
            if node in graph.nodes and attribute in graph.nodes[node]:
                null_graph.nodes[node][attribute] = graph.nodes[node][attribute]

        # Calculate assortativity for the null model
        if attribute:
            assortativity = nx.attribute_assortativity_coefficient(null_graph, attribute)
            #print(assortativity)
        else:
            assortativity = nx.degree_assortativity_coefficient(null_graph)
        null_assortativities.append(assortativity)
    
    # Step 3: Calculate p-value
    null_assortativities = np.array(null_assortativities)
    p_value = np.mean(null_assortativities >= observed_assortativity)
    
    return observed_assortativity, null_assortativities, p_value

In [None]:
import networkx as nx
import numpy as np
from networkx.algorithms.community import modularity
from networkx.algorithms.community import greedy_modularity_communities

def modularity_null_test(graph, communities=None, num_nulls=100):
    # Compute the observed modularity
    if communities is None:
        communities = list(greedy_modularity_communities(graph))
    observed_modularity = modularity(graph, communities)
    
    null_modularities = []
    for _ in range(num_nulls):
        # Generate a null model preserving the degree distribution
        null_graph = nx.configuration_model([d for _, d in graph.degree()])
        
        # Convert to a simple graph (remove multi-edges and self-loops)
        null_graph = nx.Graph(null_graph)
        null_graph.remove_edges_from(nx.selfloop_edges(null_graph))
        
        # Compute the modularity for the null model
        null_communities = list(greedy_modularity_communities(null_graph))
        null_modularity = modularity(null_graph, null_communities)
        null_modularities.append(null_modularity)
    
    # Step 3: Calculate p-value
    null_modularities = np.array(null_modularities)
    p_value = np.mean(null_modularities >= observed_modularity)
    
    return observed_modularity, null_modularities, p_value


In [None]:
# Example Usage of assortativity null test
# Create a graph
G = nx.karate_club_graph()

# Add a node attribute for demonstration
for node in G.nodes():
    G.nodes[node]["club_type"] = G.nodes[node]["club"]

# Perform null test for degree assortativity
obs_assort, null_assorts, p_val = modularity_null_test(G, communities=, num_nulls=100)

print(f"Observed Assortativity: {obs_assort}")
print(f"p-value: {p_val}")

TypeError: modularity_null_test() got an unexpected keyword argument 'attribute'

In [76]:
graph_data["1_resource_allocation_naive.pkl"].keys()

dict_keys(['label_propagation', 'greedy_modularity', 'Louvain'])

In [10]:
community_labels = ["most_frequent_locality","most_frequent_parasite_group","animals_group"]
discovery_algorithm_labels = ["greedy_modularity","label_propagation","Louvain"]
for name,graph in graphs.items():
    for label in community_labels:
        for algorithm in discovery_algorithm_labels:
            if name == '2_simple_disparity_filter.pkl':
                largest_component = max(nx.connected_components(graph), key=len)
                largest_component_subgraph = graph.subgraph(largest_component)

                ground_truth_communities = get_ground_truth_communities(largest_component_subgraph,label)
                algorithm_communities = graph_data[name][algorithm]['communities']
                print(f"{name},{label},{algorithm} average gini impurity: {calculate_average_gini_impurity_normal(largest_component_subgraph,algorithm_communities,ground_truth_communities)}")


2_simple_disparity_filter.pkl,most_frequent_locality,greedy_modularity average gini impurity: 0.803014940170085
2_simple_disparity_filter.pkl,most_frequent_locality,label_propagation average gini impurity: 0.5593285718184728
2_simple_disparity_filter.pkl,most_frequent_locality,Louvain average gini impurity: 0.7381820562433361
2_simple_disparity_filter.pkl,most_frequent_parasite_group,greedy_modularity average gini impurity: 0.8052257690311595
2_simple_disparity_filter.pkl,most_frequent_parasite_group,label_propagation average gini impurity: 0.7195596320621135
2_simple_disparity_filter.pkl,most_frequent_parasite_group,Louvain average gini impurity: 0.7630617092190483
2_simple_disparity_filter.pkl,animals_group,greedy_modularity average gini impurity: 0.5835701865807364
2_simple_disparity_filter.pkl,animals_group,label_propagation average gini impurity: 0.32499965548548865
2_simple_disparity_filter.pkl,animals_group,Louvain average gini impurity: 0.3937366166671036


In [50]:
def assign_community_attributes(graph, communities, attribute_name="community"):
    for community_id, community in enumerate(communities):
        for node in community:
            graph.nodes[node][attribute_name] = community_id


In [None]:
assortativity_tests = {}
modularity_tests = {}
discovery_algorithm_labels = ["greedy_modularity","label_propagation","Louvain"]
for name,graph in graphs.items():
    for algo in discovery_algorithm_labels:
        #assortativity_tests[f"{name}_{label}"]=assortativity_null_test(graph,f"{label}")
        communities = graph_data[name][algo]['communities']
        modularity_tests[f"{name}_{label}"]=modularity_null_test(graph,communities)
        print(modularity_tests[f"{name}_{label}"])

In [26]:
print(graph_data['1_resource_allocation_naive.pkl']['greedy_modularity']['communities'])

[frozenset({4097, 3, 4107, 4111, 14356, 4130, 20517, 20519, 58, 72, 4171, 20561, 20568, 89, 20571, 4204, 20592, 4209, 4211, 24693, 20600, 20607, 20609, 4226, 4235, 24718, 4244, 24724, 149, 20647, 24765, 16574, 20671, 14531, 16580, 24775, 24776, 28880, 20695, 2270, 16612, 24804, 20711, 28909, 20718, 24819, 248, 28926, 20743, 20747, 10513, 20754, 10514, 24858, 16668, 14633, 28973, 28974, 24879, 10553, 24897, 321, 24899, 14660, 324, 14664, 29001, 10571, 20816, 10578, 10582, 14679, 10584, 10585, 20829, 29023, 24931, 8549, 10601, 364, 14707, 24949, 10615, 10620, 8573, 383, 10628, 24965, 20883, 29075, 407, 8603, 10654, 20894, 24992, 24995, 24999, 8623, 10672, 8630, 10685, 8639, 10688, 27074, 20931, 8651, 10702, 25043, 10708, 10709, 10717, 29159, 10729, 2539, 20971, 20975, 20979, 10744, 29177, 10756, 4612, 4614, 4616, 2571, 531, 29206, 29207, 21018, 21019, 29214, 29218, 21033, 560, 21049, 14908, 10813, 27200, 10817, 14914, 579, 23117, 21073, 21076, 10838, 21080, 21083, 29277, 29278, 21087, 88

In [5]:
assortativity_null_tests = pickle.load(open("../data/null_tests/assortativity_tests_1.pkl","rb"))
modularity_null_tests = pickle.load(open("../data/null_tests/modularity_tests_1.pkl","rb"))
modularity_null_tests

{'1_resource_allocation_naive.pkl_label_propagation': (0.8737346500459588,
  array([0.60613804, 0.60535385, 0.60426188, 0.60483036, 0.60676335,
         0.60426062, 0.60532236, 0.60487156, 0.60776975, 0.60489641,
         0.60638725, 0.6050135 , 0.60571474, 0.60568094, 0.6051292 ,
         0.60464862, 0.60606704, 0.6072429 , 0.60650308, 0.60553253,
         0.60498317, 0.60537555, 0.60592282, 0.60715161, 0.60689941,
         0.60466828, 0.60546323, 0.60820964, 0.60588959, 0.6059248 ,
         0.60625018, 0.60745765, 0.6051964 , 0.60454642, 0.60719404,
         0.60557402, 0.60334931, 0.60335247, 0.60481   , 0.60697097,
         0.60535303, 0.60544416, 0.60514374, 0.60330657, 0.60481973,
         0.60733227, 0.6056832 , 0.60551041, 0.6061658 , 0.60595574,
         0.60417811, 0.60640297, 0.60598801, 0.60427681, 0.60498911,
         0.6049375 , 0.60496823, 0.6037046 , 0.60492796, 0.60449449,
         0.60669043, 0.60485866, 0.60726663, 0.6063833 , 0.60525496,
         0.60487467, 0.60211

In [None]:
def normalized_cut_size(G, S, T=None, weight=None):
    if T is None:
        T = set(G) - set(S)
    num_cut_edges = nx.cut_size(G, S, T=T, weight=weight)
    volume_S = nx.volume(G, S, weight=weight)
    volume_T = nx.volume(G, T, weight=weight)
    return num_cut_edges * ((1 / volume_S) + (1 / volume_T))