In [78]:
import os
os.chdir('../')

In [107]:
import pandas as pd
from pathlib import Path
from source.lib.helpers import *
from concurrent.futures import ProcessPoolExecutor
import matplotlib.pyplot as plt
import random
import numpy as np
import networkx as nx
from source.derived.contributor_stats.calculate_contributions import *

In [108]:
graph_dict = {}
for filepath in glob("issue/graphs/**_201701.gexf"):
    filename = os.path.basename(filepath)
    repo = filename.replace("_201701.gexf", "").replace("_", "/")
    G = nx.read_gexf(filepath)
    graph_dict[repo] = G
    print(repo)

zengin-code/zengin-py
PhilipGarnero/django-rest-framework-social-oauth2
un1t/django-cleanup
jackparmer/colorlover
jsocol/django-ratelimit
level12/keg-elements
keitheis/alog
crdoconnor/xeger
common-workflow-language/cwltool
jsonpickle/jsonpickle
aio-libs/aiohttp-devtools
vovanec/supervisor/checks
cameronbwhite/Flask-CAS
tuvistavie/python-i18n
thumbor/thumbor
ponyorm/pony
allanlei/python-zipstream
src-d/jgscm
NaturalHistoryMuseum/pylibdmtx
dfm/emcee
NitorCreations/nitor-deploy-tools
go-macaroon-bakery/py-macaroon-bakery
olofk/fusesoc
aajanki/yle-dl
pytest-dev/pytest-repeat
ubernostrum/django-contact-form
django-admin-tools/django-admin-tools
CiwPython/Ciw
jasonish/py-idstools
Amber-MD/pytraj
pennlabs/penn-sdk-python
dave-shawley/sphinx-jsondomain
pytest-dev/pluggy
fedosov/updates
psolin/cleanco
glamp/bashplotlib
dalibo/ldap2pg
Pylons/pylons
jd-boyd/python-lzo
Yelp/service/configuration/lib
gmr/flatdict
gitpython-developers/smmap
mirumee/django-prices-openexchangerates
annoviko/pyclusteri

In [110]:
degree_graph_dict = {}
for repo, G in graph_dict.items():
    print(repo)
    original_G = G.copy()
    # Compute betweenness centrality on the original graph
    dc = nx.betweenness_centrality(original_G)
    
    # Mark nodes as important if their betweenness is >= 0.1
    important_nodes = {node for node, cent in dc.items() if cent >= 0.1}
    
    # Create a modified graph and set the 'important' attribute
    modified_G = original_G.copy()
    nx.set_node_attributes(modified_G, {node: (node in important_nodes) for node in modified_G.nodes()}, 'important')
    
    # Remove all edges connecting two important nodes
    edges_to_remove = [(u, v) for u, v in modified_G.edges() if u in important_nodes and v in important_nodes]
    modified_G.remove_edges_from(edges_to_remove)
    
    # Find important nodes that became isolated (degree 0) in the modified graph
    isolated_nodes = [node for node in important_nodes if modified_G.degree(node) == 0]
    for node in isolated_nodes:
        # Restore edges from the original graph that connect this node to other important nodes
        restored_edges = [(u, v, original_G[u][v]) 
                          for u, v in original_G.edges(node)
                          if u in important_nodes and v in important_nodes]
        modified_G.add_edges_from(restored_edges)
        # Mark the node as not important and remove it from the set
        important_nodes.remove(node)
        modified_G.nodes[node]['important'] = False
    
    degree_graph_dict[repo] = {'original': original_G, 'important_edges_removed': modified_G}


zengin-code/zengin-py
PhilipGarnero/django-rest-framework-social-oauth2
un1t/django-cleanup
jackparmer/colorlover
jsocol/django-ratelimit
level12/keg-elements
keitheis/alog
crdoconnor/xeger
common-workflow-language/cwltool
jsonpickle/jsonpickle
aio-libs/aiohttp-devtools
vovanec/supervisor/checks
cameronbwhite/Flask-CAS
tuvistavie/python-i18n
thumbor/thumbor
ponyorm/pony
allanlei/python-zipstream
src-d/jgscm
NaturalHistoryMuseum/pylibdmtx
dfm/emcee
NitorCreations/nitor-deploy-tools
go-macaroon-bakery/py-macaroon-bakery
olofk/fusesoc
aajanki/yle-dl
pytest-dev/pytest-repeat
ubernostrum/django-contact-form
django-admin-tools/django-admin-tools
CiwPython/Ciw
jasonish/py-idstools
Amber-MD/pytraj
pennlabs/penn-sdk-python
dave-shawley/sphinx-jsondomain
pytest-dev/pluggy
fedosov/updates
psolin/cleanco
glamp/bashplotlib
dalibo/ldap2pg
Pylons/pylons
jd-boyd/python-lzo
Yelp/service/configuration/lib
gmr/flatdict
gitpython-developers/smmap
mirumee/django-prices-openexchangerates
annoviko/pyclusteri

## Mathematical Formulations

Let \( G = (V, E) \) be a graph with \(|V|\) nodes, and let \( I \subseteq V \) be the set of important contributors.

*Note: All importance metrics (normalized degree and betweenness) are computed on the modified graph \( G' \) (i.e. with edges between important contributors removed).*

### Importance Metrics

For each \( i \in I \):
1. **Unnormalized Degree:**
   $$
   d_i = \text{degree}(i) \quad (\text{computed on } G')
   $$
2. **Normalized Degree Centrality:**
   $$
   C_D(i) = \frac{d_i}{|V| - 1}
   $$
3. **Betweenness Centrality:**
   $$
   C_B(i) = \text{betweenness}(i)
   $$

Total number of important contributors:
$$
|I|
$$

### Clustering Metrics

Let \( G' \) be the modified graph, and let \( C(i) \) be the connected component of \( G' \) that contains \( i \).

1. **Individual Coverage:**  
   For each \( i \in I \),
   $$
   \text{individual\_coverage}(i) = \frac{|C(i)|}{|V|} \times 100\%
   $$
   This is the percentage of all nodes in \( G' \) that are in \( i \)'s connected component.
   
2. **Overall Unweighted Overlap:**  
   Let \( N(i) \) be the set of neighbors of \( i \) in \( G' \) and define
   $$
   U_i = \bigcup_{\substack{j \in I \\ j \neq i}} N(j)
   $$
   Then:
   $$
   O(i) = \frac{|N(i) \cap U_i|}{|N(i)|} \times 100\%
   $$
3. **Weighted Overall Overlap:**  
   Let 
   $$
   w_i = \sum_{v \in N(i)} w(i,v)
   $$
   Then:
   $$
   W(i) = \frac{\sum_{v \in N(i) \cap U_i} w(i,v)}{w_i} \times 100\%
   $$
4. **Aggregate Cluster Coverage:**  
   Let \( \mathcal{C} \) be the set of connected components in \( G' \) that contain at least one important contributor.
   $$
   \text{Coverage}_{\text{agg}} = \frac{\left|\bigcup_{C \in \mathcal{C}} C\right|}{|V|} \times 100\%
   $$
5. **Cluster-Level Weighted Averages:**  
   For a cluster \( C \) with important contributors \( I_C \),
   $$
   \overline{O}_C = \frac{\sum_{i \in I_C} O(i) \, d_i}{\sum_{i \in I_C} d_i}, \quad
   \overline{W}_C = \frac{\sum_{i \in I_C} W(i) \, d_i}{\sum_{i \in I_C} d_i}
   $$
6. **Ego-Cluster Overlap Metrics:**  
   For each \( i \in I \), define its ego cluster as:
   $$
   E_i = \{ i \} \cup N(i)
   $$
   Let 
   $$
   U = \bigcup_{i \in I} E_i
   $$
   and for each node \( v \in U \), let \( c(v) \) be the number of ego clusters that contain \( v \). Then:
   - **Average Number of Ego Clusters per Node:**
     $$
     \text{AvgClusters} = \frac{1}{|U|} \sum_{v \in U} c(v)
     $$
   - **Percentage of Nodes in Exactly One Ego Cluster:**
     $$
     \text{PctOneCluster} = \frac{|\{ v \in U : c(v) = 1 \}|}{|U|} \times 100\%
     $$

### Communication Metrics

For each \( i \in I \), define:
- \( E_i \): the set of edges incident on \( i \) in \( G' \) (with the other endpoint in the same connected component).
- \( E^I_i \): the set of edges in the original graph \( G \) between \( i \) and other important contributors.

Then:
1. **Mean Edge Weight:**
   $$
   \bar{w}_i = \frac{1}{|E_i|} \sum_{e \in E_i} w(e), \quad \bar{w}^I_i = \frac{1}{|E^I_i|} \sum_{e \in E^I_i} w(e)
   $$
2. **Standard Error (SE):**
   $$
   \text{SE}_i = \frac{\sigma_i}{\sqrt{|E_i|}}, \quad \sigma_i = \sqrt{\frac{1}{|E_i| - 1} \sum_{e \in E_i} \left(w(e) - \bar{w}_i\right)^2}
   $$
3. **Percentiles:**  
   The 10th, 25th, 50th, 75th, and 90th percentiles are computed from 
   $$
   \{ w(e) : e \in E_i \} \quad \text{(or from } \{ w(e) : e \in E^I_i \}\text{)}
   $$
4. Additionally, for each important contributor, we record the weight of the edges connecting them to each other important contributor (from the original graph \( G \)).

### Graph Size

For the original graph \( G \):
- **Total Nodes:** \(|V|\)
- **Total Edges:** \(|E|\)

In [114]:
def GetImportantContributors(G):
    """
    Return a set of nodes that are marked as important.
    Each node is expected to have an attribute 'important' that is True.
    """
    return {node for node, attr in G.nodes(data=True) if attr.get('important', False)}

def ComputeNeighborhoods(G):
    """
    Precompute and return the neighborhood (set of direct neighbors) for each node in G.
    """
    return {node: set(G.neighbors(node)) for node in G.nodes()}

def ComputeImportanceMetrics(G, important_contributors):
    """
    Compute basic importance statistics for each important contributor in G:
      - Unnormalized degree: d_i
      - Normalized degree centrality: C_D(i) = d_i / (|V| - 1)
      - Betweenness centrality: C_B(i)
    
    Returns a dictionary keyed by each important contributor.
    *Note: These metrics are computed on the modified graph G where edges between important contributors are removed.*
    """
    importance_metrics = {}
    n_nodes = G.number_of_nodes()
    betweenness = nx.betweenness_centrality(G)
    for node in important_contributors:
        degree = G.degree(node)
        normalized_degree = degree / (n_nodes - 1) if n_nodes > 1 else 0
        importance_metrics[node] = {
            'degree': degree,
            'normalized_degree': normalized_degree,
            'betweenness': betweenness.get(node, 0)
        }
    importance_metrics['total_important'] = len(important_contributors)
    return importance_metrics

def ComputeOverlapMetrics(G, important_contributors, neighborhoods, weight='weight'):
    """
    For each important contributor, compute overlap metrics:
      - Pairwise unweighted overlap: for each other important contributor j, 
        the percentage of i's neighbors that are also neighbors of j.
      - Pairwise weighted overlap: the percentage of i's weighted degree contributed by common neighbors.
      - Overall unweighted overlap: percentage of i's neighbors shared with any other important contributor.
      - Overall weighted overlap: as above, but weighted by the sum of weights.
    
    Returns a dictionary keyed by each important contributor with keys:
       "pairwise", "weighted_pairwise", "any_other", and "weighted_any_other".
    """
    metrics_results = {}
    for i in important_contributors:
        neighbors_i = neighborhoods.get(i, set())
        degree_i = len(neighbors_i)
        weighted_degree_i = sum(G[i][nbr].get(weight, 1) for nbr in neighbors_i)
        
        pairwise_overlap = {}
        weighted_pairwise_overlap = {}
        for j in important_contributors:
            if i == j:
                continue
            neighbors_j = neighborhoods.get(j, set())
            common_neighbors = neighbors_i.intersection(neighbors_j)
            overlap = (len(common_neighbors) / degree_i * 100) if degree_i > 0 else 0
            pairwise_overlap[j] = overlap
            weighted_common = sum(G[i][v].get(weight, 1) for v in common_neighbors)
            weighted_overlap = (weighted_common / weighted_degree_i * 100) if weighted_degree_i > 0 else 0
            weighted_pairwise_overlap[j] = weighted_overlap
        
        union_neighbors = set()
        for j in important_contributors:
            if i == j:
                continue
            union_neighbors |= neighborhoods.get(j, set())
        overall_overlap = (len(neighbors_i.intersection(union_neighbors)) / degree_i * 100) if degree_i > 0 else 0
        weighted_common_total = sum(G[i][v].get(weight, 1) for v in neighbors_i.intersection(union_neighbors))
        overall_weighted_overlap = (weighted_common_total / weighted_degree_i * 100) if weighted_degree_i > 0 else 0
        
        metrics_results[i] = {
            "pairwise": pairwise_overlap,
            "weighted_pairwise": weighted_pairwise_overlap,
            "any_other": overall_overlap,
            "weighted_any_other": overall_weighted_overlap
        }
    
    return metrics_results

def ComputeClusteringMetrics(G_modified, important_contributors, overlap_metrics):
    """
    Compute clustering metrics based on the modified graph G_modified.
    
    For each important contributor i, define its individual coverage as the percentage of all nodes 
    in G_modified that are in its connected component:
    
       individual_coverage(i) = (|C(i)| / |V|) * 100,
       
    where C(i) is the connected component containing i.
    
    Additionally, group important contributors by their connected component (using a sorted tuple as key)
    and compute cluster-level weighted averages:
    
       For a cluster C with important contributors I_C:
       mean_overlap_C = (∑_{i ∈ I_C} O(i) · d_i) / (∑_{i ∈ I_C} d_i)
       mean_weighted_overlap_C = (∑_{i ∈ I_C} W(i) · d_i) / (∑_{i ∈ I_C} d_i)
    
    Moreover, compute ego-cluster overlap metrics:
      For each important contributor i, define its ego cluster:
         E_i = {i} ∪ N(i)
      Let U = ∪_{i ∈ I} E_i. For each node v in U, let c(v) be the number of ego clusters containing v.
      
      Then:
         AvgClusters = (1/|U|) * ∑_{v ∈ U} c(v)
         PctOneCluster = (|{v ∈ U : c(v) = 1}| / |U|) * 100%
    
    Returns a tuple:
      (clustering_metrics, cluster_aggregates, cluster_overlap_metrics)
      
      clustering_metrics: dict mapping each important contributor to:
         {'individual_coverage': value, 'overall_overlap': value, 'weighted_overall_overlap': value}
      cluster_aggregates: a dictionary with aggregated cluster-level statistics:
         {'mean_overlap': value, 'mean_weighted_overlap': value, 'total_clusters': value}
      cluster_overlap_metrics: dict with keys:
         {'avg_clusters': value, 'pct_nodes_one_cluster': value}
    """
    clustering_metrics = {}
    total_nodes = G_modified.number_of_nodes()
    components = list(nx.connected_components(G_modified))
    component_map = {}
    for comp in components:
        for node in comp:
            if node in important_contributors:
                component_map[node] = comp

    for i in important_contributors:
        comp = component_map.get(i, {i})
        individual_coverage = (len(comp) / total_nodes * 100) if total_nodes > 0 else 0
        clustering_metrics[i] = {
            'individual_coverage': individual_coverage,
            'overall_overlap': overlap_metrics.get(i, {}).get('any_other', None),
            'weighted_overall_overlap': overlap_metrics.get(i, {}).get('weighted_any_other', None)
        }
    
    # Group important contributors by their connected component.
    clusters = {}
    for i in important_contributors:
        comp = component_map.get(i, {i})
        comp_key = tuple(sorted(comp))
        clusters.setdefault(comp_key, []).append(i)
    
    # Aggregate cluster-level weighted averages.
    total_overlap = 0
    total_weight_overlap = 0
    total_cluster_nodes = 0
    for comp_key, nodes in clusters.items():
        total_weight = sum(G_modified.degree(n) for n in nodes)
        if total_weight > 0:
            mean_overlap = sum(overlap_metrics[n]['any_other'] * G_modified.degree(n) for n in nodes) / total_weight
            mean_weighted_overlap = sum(overlap_metrics[n]['weighted_any_other'] * G_modified.degree(n) for n in nodes) / total_weight
        else:
            mean_overlap = 0
            mean_weighted_overlap = 0
        total_overlap += mean_overlap * len(comp_key)
        total_weight_overlap += mean_weighted_overlap * len(comp_key)
        total_cluster_nodes += len(comp_key)
    if total_cluster_nodes > 0:
        agg_mean_overlap = total_overlap / total_cluster_nodes
        agg_mean_weighted_overlap = total_weight_overlap / total_cluster_nodes
    else:
        agg_mean_overlap = agg_mean_weighted_overlap = 0
    cluster_aggregates = {
        'mean_overlap': agg_mean_overlap,
        'mean_weighted_overlap': agg_mean_weighted_overlap,
        'total_clusters': len(clusters)
    }
    
    # Compute ego-cluster metrics.
    ego_clusters = {}
    for i in important_contributors:
        ego_clusters[i] = {i} | set(G_modified.neighbors(i))
    U = set()
    for cluster in ego_clusters.values():
        U.update(cluster)
    cluster_counts = {}
    for v in U:
        count = sum(1 for cluster in ego_clusters.values() if v in cluster)
        cluster_counts[v] = count
    if U:
        avg_clusters = sum(cluster_counts.values()) / len(U)
        one_cluster = sum(1 for count in cluster_counts.values() if count == 1)
        pct_one_cluster = (one_cluster / len(U)) * 100
    else:
        avg_clusters = 0
        pct_one_cluster = 0

    cluster_aggregates['avg_clusters_per_node'] = avg_clusters
    cluster_aggregates['pct_nodes_one_cluster'] = pct_one_cluster
        
    
    return clustering_metrics, cluster_aggregates

def ComputeAggregateClusterCoverage(G_modified, important_contributors):
    """
    Compute the aggregate cluster coverage: the percentage of all nodes in G_modified that are
    contained in clusters (connected components) that include at least one important contributor.
    
    Returns:
      Coverage_agg = (|∪{ C in components with I ∩ C ≠ ∅ }| / |V|) * 100%
    """
    components = list(nx.connected_components(G_modified))
    nodes_in_important_clusters = set()
    for comp in components:
        if comp.intersection(important_contributors):
            nodes_in_important_clusters.update(comp)
    total_nodes = G_modified.number_of_nodes()
    coverage = (len(nodes_in_important_clusters) / total_nodes * 100) if total_nodes > 0 else 0
    return coverage

def ComputeCommunicationMetrics(G_modified, G_original, important_contributors):
    """
    Compute communication metrics.
    
    Using the modified graph (G_modified) that has important-to-important edges removed, for each important contributor,
    compute:
      - Mean edge weight, standard error (SE), and percentiles (10th, 25th, 50th, 75th, 90th)
        for edges connecting the contributor with others in the same connected component.
    
    Using the original graph (G_original), for the subgraph induced by important contributors, compute similar metrics.
    
    Returns a tuple:
       (comm_metrics, imp_comm_metrics)
       
       comm_metrics: dict keyed by important contributor with:
           {'avg_edge_weight': value, 'se_edge_weight': value, 'percentiles': {'10': ..., '25': ..., '50': ..., '75': ..., '90': ...}}
       imp_comm_metrics: similar dictionary for the subgraph of important contributors.
    """
    import numpy as np
    comm_metrics = {}
    components = list(nx.connected_components(G_modified))
    comp_map = {}
    for comp in components:
        for node in comp:
            comp_map[node] = comp
    for node in important_contributors:
        comp = comp_map.get(node, {node})
        weights = [G_modified[node][nbr]['weight'] for nbr in G_modified.neighbors(node) if nbr in comp]
        if weights:
            arr = np.array(weights)
            mean_val = np.mean(arr)
            se_val = np.std(arr, ddof=1) / np.sqrt(len(arr)) if len(arr) > 1 else 0
            percentiles = np.percentile(arr, [10, 25, 50, 75, 90])
        else:
            mean_val = se_val = 0
            percentiles = [0, 0, 0, 0, 0]
        comm_metrics[node] = {
            'avg_edge_weight': mean_val,
            'se_edge_weight': se_val,
            'percentiles': {
                '10': percentiles[0],
                '25': percentiles[1],
                '50': percentiles[2],
                '75': percentiles[3],
                '90': percentiles[4]
            }
        }
    
    imp_comm_metrics = {}
    subG = G_original.subgraph(important_contributors)
    for node in important_contributors:
        weights = [subG[node][nbr]['weight'] for nbr in subG.neighbors(node)]
        if weights:
            arr = np.array(weights)
            mean_val = np.mean(arr)
            se_val = np.std(arr, ddof=1) / np.sqrt(len(arr)) if len(arr) > 1 else 0
            percentiles = np.percentile(arr, [10, 25, 50, 75, 90])
        else:
            mean_val = se_val = 0
            percentiles = [0, 0, 0, 0, 0]
        imp_comm_metrics[node] = {
            'avg_edge_weight': mean_val,
            'se_edge_weight': se_val,
            'percentiles': {
                '10': percentiles[0],
                '25': percentiles[1],
                '50': percentiles[2],
                '75': percentiles[3],
                '90': percentiles[4]
            }
        }
    
    return comm_metrics, imp_comm_metrics

def GetGraphSize(G):
    """
    Return a tuple (n, m) where n is the number of nodes and m is the number of edges in graph G.
    """
    return G.number_of_nodes(), G.number_of_edges()

def ComputeAllGraphMetrics(degree_graph_dict):
    """
    For each repository in degree_graph_dict, compute and compile all metrics for important contributors.
    
    Metrics include:
      - Importance Metrics (from ComputeImportanceMetrics).
      - Overlap Metrics (from ComputeOverlapMetrics).
      - Clustering Metrics:
            * Individual coverage (percentage of all nodes in the contributor's connected component),
            * Cluster-level weighted averages (aggregated as a dictionary),
            * Aggregate cluster coverage,
            * Ego-cluster overlap metrics: average number of ego clusters per node and percentage of nodes in exactly one ego cluster.
      - Communication Metrics:
            * In-cluster communication (from G_modified),
            * Important-to-important communication (from G_original), including per-node statistics and aggregated overall stats.
      - Graph Size (total nodes and total edges from the original graph).
      - Additionally, for each important contributor, record the weights of edges connecting them to each other important contributor.
    
    Returns a dictionary structured as:
      {
         repo1: {
            important_contributor1: { ... node-level metrics ... },
            important_contributor2: { ... },
            ...,
            'repo_overall': {
                 'total_nodes': <value>,
                 'total_edges': <value>,
                 'aggregate_cluster_coverage': <value>,
                 'cluster_averages': <aggregated cluster-level stats as a dict>,
                 'total_important': <number>,
                 'imp_to_imp_comm_overall': <aggregated communication stats>,
                 'avg_clusters_per_node': <value>,
                 'pct_nodes_one_cluster': <value>
            }
         },
         repo2: { ... },
         ...
      }
    """
    all_metrics = {}
    
    for repo, data in degree_graph_dict.items():
        print(repo)
        repo_metrics = {}
        G_mod = data.get('important_edges_removed')
        G_orig = data.get('original')
        if G_mod is None or G_orig is None:
            continue
        
        important_contributors = GetImportantContributors(G_mod)
        if not important_contributors:
            continue
        
        neighborhoods = ComputeNeighborhoods(G_mod)
        importance = ComputeImportanceMetrics(G_mod, important_contributors)
        overlap_metrics = ComputeOverlapMetrics(G_mod, important_contributors, neighborhoods)
        clustering_metrics, cluster_aggregates = ComputeClusteringMetrics(G_mod, important_contributors, overlap_metrics)
        aggregate_coverage = ComputeAggregateClusterCoverage(G_mod, important_contributors)
        comm_metrics, imp_comm_metrics = ComputeCommunicationMetrics(G_mod, G_orig, important_contributors)
        
        # Compute important-to-important edge weights.
        subG = G_orig.subgraph(important_contributors)
        imp_to_imp_edge_weights = {}
        for node in important_contributors:
            weights_dict = {}
            for nbr in subG.neighbors(node):
                weights_dict[nbr] = subG[node][nbr]['weight']
            imp_to_imp_edge_weights[node] = weights_dict
        
        weights_all = [d['weight'] for u, v, d in subG.edges(data=True)]
        if weights_all:
            mean_all = np.mean(weights_all)
            se_all = np.std(weights_all, ddof=1) / np.sqrt(len(weights_all)) if len(weights_all) > 1 else 0
            perc_all = np.percentile(np.array(weights_all), [10, 25, 50, 75, 90])
            imp_to_imp_overall = {
                'avg_edge_weight': mean_all,
                'se_edge_weight': se_all,
                'percentiles': {
                    '10': perc_all[0],
                    '25': perc_all[1],
                    '50': perc_all[2],
                    '75': perc_all[3],
                    '90': perc_all[4]
                }
            }
        else:
            imp_to_imp_overall = {
                'avg_edge_weight': 0,
                'se_edge_weight': 0,
                'percentiles': {'10': 0, '25': 0, '50': 0, '75': 0, '90': 0}
            }
        
        for important_contributor in important_contributors:
            contributor_dict = {}
            contributor_dict.update(importance.get(important_contributor, {}))
            contributor_dict.update({
                'pairwise_overlap': overlap_metrics.get(important_contributor, {}).get('pairwise', {}),
                'weighted_pairwise_overlap': overlap_metrics.get(important_contributor, {}).get('weighted_pairwise', {}),
                'overall_overlap': overlap_metrics.get(important_contributor, {}).get('any_other', None),
                'weighted_overall_overlap': overlap_metrics.get(important_contributor, {}).get('weighted_any_other', None)
            })
            contributor_dict['individual_coverage'] = clustering_metrics.get(important_contributor, {}).get('individual_coverage', None)
            contributor_dict.update(comm_metrics.get(important_contributor, {}))
            contributor_dict.update({'imp_to_imp_comm': imp_comm_metrics.get(important_contributor, {})})
            # Add edge weight mapping for important-to-important edges.
            contributor_dict['imp_to_imp_edge_weights'] = imp_to_imp_edge_weights.get(important_contributor, {})
            repo_metrics[important_contributor] = contributor_dict
        
        total_nodes, total_edges = GetGraphSize(G_orig)

        repo_metrics['repo_overall'] = {
            'total_nodes': total_nodes,
            'total_edges': total_edges,
            'aggregate_cluster_coverage': aggregate_coverage,
            'cluster_averages': cluster_aggregates,
            'total_important': len(important_contributors),
            'imp_to_imp_comm_overall': imp_to_imp_overall
        }
        
        all_metrics[repo] = repo_metrics
    
    return all_metrics


In [115]:
graph_metrics = ComputeAllGraphMetrics(degree_graph_dict)

zengin-code/zengin-py
PhilipGarnero/django-rest-framework-social-oauth2
un1t/django-cleanup
jackparmer/colorlover
jsocol/django-ratelimit
level12/keg-elements
keitheis/alog
crdoconnor/xeger
common-workflow-language/cwltool
jsonpickle/jsonpickle
aio-libs/aiohttp-devtools
vovanec/supervisor/checks
cameronbwhite/Flask-CAS
tuvistavie/python-i18n
thumbor/thumbor
ponyorm/pony
allanlei/python-zipstream
src-d/jgscm
NaturalHistoryMuseum/pylibdmtx
dfm/emcee
NitorCreations/nitor-deploy-tools
go-macaroon-bakery/py-macaroon-bakery
olofk/fusesoc
aajanki/yle-dl
pytest-dev/pytest-repeat
ubernostrum/django-contact-form
django-admin-tools/django-admin-tools
CiwPython/Ciw
jasonish/py-idstools
Amber-MD/pytraj
pennlabs/penn-sdk-python
dave-shawley/sphinx-jsondomain
pytest-dev/pluggy
fedosov/updates
psolin/cleanco
glamp/bashplotlib
dalibo/ldap2pg
Pylons/pylons
jd-boyd/python-lzo
Yelp/service/configuration/lib
gmr/flatdict
gitpython-developers/smmap
mirumee/django-prices-openexchangerates
annoviko/pyclusteri

In [119]:
graph_metrics['dask/dask']

{'2783717.0': {'degree': 70,
  'normalized_degree': 0.3684210526315789,
  'betweenness': 0.22418856189998077,
  'pairwise_overlap': {'306380.0': 70.0, '6042212.0': 24.285714285714285},
  'weighted_pairwise_overlap': {'306380.0': 87.64044943820225,
   '6042212.0': 31.086142322097377},
  'overall_overlap': 75.71428571428571,
  'weighted_overall_overlap': 90.2621722846442,
  'individual_coverage': 98.95287958115183,
  'avg_edge_weight': 3.8142857142857145,
  'se_edge_weight': 0.9616438774604057,
  'percentiles': {'10': 1.0, '25': 1.0, '50': 2.0, '75': 3.0, '90': 8.0},
  'imp_to_imp_comm': {'avg_edge_weight': 89.5,
   'se_edge_weight': 76.5,
   'percentiles': {'10': 28.3,
    '25': 51.25,
    '50': 89.5,
    '75': 127.75,
    '90': 150.7}},
  'imp_to_imp_edge_weights': {'306380.0': 166.0, '6042212.0': 13.0}},
 '306380.0': {'degree': 147,
  'normalized_degree': 0.7736842105263158,
  'betweenness': 0.7264326888023157,
  'pairwise_overlap': {'2783717.0': 33.33333333333333,
   '6042212.0': 22.