In [1]:
import os
# Step up one directory so the relative 'issue/...' paths used later in the
# notebook resolve from there.
# NOTE(review): a relative chdir is not idempotent -- re-running this cell
# without restarting the kernel moves up one more level each time.
os.chdir('../')

In [21]:
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
from datetime import datetime
import json


In [22]:
# Read the nested metrics mapping (repo -> period -> metrics) from disk.
metrics_path = Path("issue/graph_metrics.json")
with metrics_path.open("r") as metrics_file:
    graph_metrics = json.load(metrics_file)

In [23]:

# String keys under which each 'percentiles' mapping stores its cut-offs.
PERCENTILE_KEYS = ('10', '25', '50', '75', '90')


def _hhi(project_time, actors, metric):
    """Sum of squared shares of `metric` across `actors` (a Herfindahl-Hirschman index).

    Actors that lack `metric` count as 0. Returns None when the total is not
    positive, because the shares are undefined in that case.
    """
    values = [project_time[actor].get(metric, 0) for actor in actors]
    total = sum(values)
    if total > 0:
        return sum((value / total) ** 2 for value in values)
    return None


rows = []

for repo, periods in graph_metrics.items():
    for period_key, project_time in periods.items():
        # Period keys begin with YYYYMM; each period is pinned to the first
        # day of its month.
        time_period = datetime(int(period_key[:4]), int(period_key[4:6]), 1)

        # Project-level metrics are stored under the 'repo_overall' key.
        overall = project_time.get('repo_overall', {})
        cluster_averages = overall.get('cluster_averages', {})

        # Every other key is treated as the id of an important contributor.
        actors = [key for key in project_time if key != 'repo_overall']

        # Fields shared by all rows of this (repo, period); computed once.
        shared = {
            'total_important': overall.get('total_important', None),
            'total_nodes': overall.get('total_nodes', None),
            'mean_cluster_overlap': cluster_averages.get('mean_overlap', None),
            'avg_clusters_per_node': cluster_averages.get('avg_clusters_per_node', None),
            'pct_nodes_one_cluster': cluster_averages.get('pct_nodes_one_cluster', None),
            'HHI_normalized_degree': _hhi(project_time, actors, 'normalized_degree'),
            'HHI_individual_coverage': _hhi(project_time, actors, 'individual_coverage'),
            'HHI_individual_coverage_cluster': _hhi(project_time, actors, 'individual_coverage_cluster'),
        }

        for actor in actors:
            actor_data = project_time[actor]

            # Key insertion order determines the DataFrame column order, so
            # the identifiers go first, then the shared fields, then the
            # actor-level fields.
            row = {'repo_name': repo, 'time_period': time_period, 'actor_id': actor}
            row.update(shared)
            row.update({
                'normalized_degree': actor_data.get('normalized_degree', None),
                'individual_node_coverage': actor_data.get('individual_coverage', None),
                'individual_coverage_cluster': actor_data.get('individual_coverage_cluster', None),
                'overall_overlap': actor_data.get('overall_overlap', None),
                'weighted_overall_overlap': actor_data.get('weighted_overall_overlap', None),
                'imp_to_other_avg_edge_weight': actor_data.get('avg_edge_weight', None),
            })

            # Actor-level edge-weight percentiles.
            other_perc = actor_data.get('percentiles', {})
            for pct in PERCENTILE_KEYS:
                row[f'imp_to_other_perc_{pct}'] = other_perc.get(pct, None)

            # Important-to-important communication metrics.
            imp_comm = actor_data.get('imp_to_imp_comm', {})
            row['imp_to_imp_avg_edge_weight'] = imp_comm.get('avg_edge_weight', None)
            imp_perc = imp_comm.get('percentiles', {})
            for pct in PERCENTILE_KEYS:
                row[f'imp_to_imp_perc_{pct}'] = imp_perc.get(pct, None)

            rows.append(row)

# One row per (repo, period, important contributor).
df = pd.DataFrame(rows)
df['time_period'] = pd.to_datetime(df['time_period'])
# Share of the project's nodes that are flagged as important.
df['prop_important'] = df['total_important'] / df['total_nodes']

In [24]:
def compute_leave_one_out(df):
    """Add a leave-one-out, degree-weighted mean of ``overall_overlap`` to each row.

    For every row, the mean is taken over the *other* rows that share its
    (``repo_name``, ``time_period``), weighting each ``overall_overlap`` by
    that row's ``normalized_degree``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``repo_name``, ``time_period``,
        ``normalized_degree`` and ``overall_overlap``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with two extra columns:

        * ``weighted`` -- ``normalized_degree * overall_overlap``.
        * ``leave_one_out_mean_cluster_overlap`` -- the leave-one-out weighted
          mean. It is NaN when the row's own weight or overlap is missing, or
          when the remaining rows of the group carry no weight (for example a
          group with a single row).
    """
    df = df.copy()
    df['weighted'] = df['normalized_degree'] * df['overall_overlap']

    # The group sum of 'weighted' skips rows whose product is missing, so the
    # weights of those rows must be skipped as well. Otherwise a contributor
    # with a degree but no overlap value inflates the denominator and drags
    # down the mean reported for everyone else in the group.
    weight = df['normalized_degree'].where(df['weighted'].notna())

    # Group on a temporary frame so the masked weights never become a column
    # of the returned DataFrame.
    grouped = df.assign(_loo_weight=weight).groupby(['repo_name', 'time_period'])
    sum_w = grouped['_loo_weight'].transform('sum')
    sum_wx = grouped['weighted'].transform('sum')

    # Leave-one-out weighted mean:
    # (group sum of products - own product) / (group sum of weights - own weight)
    df['leave_one_out_mean_cluster_overlap'] = (sum_wx - df['weighted']) / (sum_w - weight)
    return df

# Rebind `df` to the returned copy, which carries the added `weighted` and
# `leave_one_out_mean_cluster_overlap` columns.
df = compute_leave_one_out(df)

In [26]:
# Persist the flattened per-contributor table.
# NOTE(review): the index is written as well (to_csv default), so the file
# starts with an unnamed index column; the `weighted` helper column is
# included too.
df.to_csv('issue/graph_important.csv')