In [1]:
import os
os.chdir('../')

In [2]:
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
from datetime import datetime
import json


In [3]:
with open("issue/graph_metrics.json", "r") as f:
    graph_metrics = json.load(f)

In [4]:
graph_metrics['pandas-dev/pandas']

{'202107': {'45562402.0': {'degree': 102,
   'normalized_degree': 0.09156193895870736,
   'betweenness': 0.07839638759796906,
   'pairwise_overlap': {'61934744.0': 18.681318681318682,
    '24256554.0': 13.186813186813188,
    '13159005.0': 24.175824175824175,
    '41898282.0': 8.791208791208792,
    '8078968.0': 18.681318681318682,
    '953992.0': 40.65934065934066,
    '47963215.0': 9.89010989010989,
    '24736507.0': 19.78021978021978,
    '37011898.0': 19.78021978021978,
    '33491632.0': 15.384615384615385,
    '10647082.0': 18.681318681318682},
   'weighted_pairwise_overlap': {'61934744.0': 37.95379537953795,
    '24256554.0': 16.5016501650165,
    '13159005.0': 26.072607260726073,
    '41898282.0': 22.772277227722775,
    '8078968.0': 40.26402640264026,
    '953992.0': 65.01650165016501,
    '47963215.0': 16.17161716171617,
    '24736507.0': 46.864686468646866,
    '37011898.0': 34.98349834983499,
    '33491632.0': 41.254125412541256,
    '10647082.0': 48.51485148514851},
   'ove

In [5]:

rows = []

for repo, time_dict in graph_metrics.items():
    for t_str, project_time in time_dict.items():
        year = int(t_str[:4])
        month = int(t_str[4:6])
        time_period = datetime(year, month, 1)
        
        # Extract project-level metrics
        overall = project_time.get('repo_overall', {})
        total_important = overall.get('total_important', None)
        total_nodes = overall.get('total_nodes', None)
        cluster_averages = overall.get('cluster_averages', {})
        mean_overlap = cluster_averages.get('mean_overlap', None)
        avg_clusters_per_node = cluster_averages.get('avg_clusters_per_node', None)
        pct_nodes_one_cluster = cluster_averages.get('pct_nodes_one_cluster', None)
        
        # important contributors
        contributor_keys = [k for k in project_time.keys() if k != 'repo_overall' and project_time[k].get('overall_overlap',False) != False]
        important_contributor_keys = [k for k in project_time.keys() if k != 'repo_overall' and project_time[k].get('overall_overlap',False) == False]

        # Compute HHI values for three metrics across actors in this project_time.
        total_norm = sum(project_time[actor].get('normalized_degree', 0) for actor in important_contributor_keys)
        total_ind_cov = sum(project_time[actor].get('individual_coverage', 0) for actor in important_contributor_keys)
        total_ind_cov_cluster = sum(project_time[actor].get('individual_coverage_cluster', 0) for actor in important_contributor_keys)
        
        hhi_norm = sum((project_time[actor].get('normalized_degree', 0)/total_norm)**2 for actor in important_contributor_keys) if total_norm > 0 else None
        hhi_ind_cov = sum((project_time[actor].get('individual_coverage', 0)/total_ind_cov)**2 for actor in important_contributor_keys) if total_ind_cov > 0 else None
        hhi_ind_cov_cluster = sum((project_time[actor].get('individual_coverage_cluster', 0)/total_ind_cov_cluster)**2 for actor in important_contributor_keys) if total_ind_cov_cluster > 0 else None

        contributor_keys = [k for k in project_time.keys() if k != 'repo_overall']
        for actor in important_contributor_keys + contributor_keys:
            actor_data = project_time[actor]
            row = {
                'repo_name': repo,
                'time_period': time_period,
                'actor_id': actor,
                'total_important': total_important,
                'total_nodes': total_nodes,
                'mean_cluster_overlap': mean_overlap,
                'avg_clusters_per_node': avg_clusters_per_node,
                'pct_nodes_one_cluster': pct_nodes_one_cluster,
                'HHI_normalized_degree': hhi_norm,
                'HHI_individual_coverage': hhi_ind_cov,
                'HHI_individual_coverage_cluster': hhi_ind_cov_cluster,
                'normalized_degree': actor_data.get('normalized_degree', None),
                'imp_to_other_avg_edge_weight': actor_data.get('avg_edge_weight', None)
            }
            if row in important_contributor_keys:
                row['individual_node_coverage'] = actor_data.get('individual_coverage', None)
                row['individual_coverage_cluster'] = actor_data.get('individual_coverage_cluster', None)
                row['overall_overlap'] = actor_data.get('overall_overlap', None)
                row['weighted_overall_overlap'] = actor_data.get('weighted_overall_overlap', None),
            # Add actor-level percentiles
            perc = actor_data.get('percentiles', {})
            row['imp_to_other_perc_10'] = perc.get('10', None)
            row['imp_to_other_perc_25'] = perc.get('25', None)
            row['imp_to_other_perc_50'] = perc.get('50', None)
            row['imp_to_other_perc_75'] = perc.get('75', None)
            row['imp_to_other_perc_90'] = perc.get('90', None)
            
            # Add important-to-important communication metrics
            imp_comm = actor_data.get('imp_to_imp_comm', {})
            row['imp_to_imp_avg_edge_weight'] = imp_comm.get('avg_edge_weight', None)
            imp_comm_perc = imp_comm.get('percentiles', {})
            row['imp_to_imp_perc_10'] = imp_comm_perc.get('10', None)
            row['imp_to_imp_perc_25'] = imp_comm_perc.get('25', None)
            row['imp_to_imp_perc_50'] = imp_comm_perc.get('50', None)
            row['imp_to_imp_perc_75'] = imp_comm_perc.get('75', None)
            row['imp_to_imp_perc_90'] = imp_comm_perc.get('90', None)
            
            rows.append(row)

df = pd.DataFrame(rows)
df['time_period'] = pd.to_datetime(df['time_period'])
df['prop_important'] = df['total_important']/df['total_nodes']

In [6]:
"""def compute_leave_one_out(df):
    # Compute the per-row product
    df = df.copy()
    df['weighted'] = df['normalized_degree'] * df['overall_overlap']
    # Compute group sums using transform (which returns a Series aligned with df)
    sum_w  = df.groupby(['repo_name','time_period'])['normalized_degree'].transform('sum')
    sum_wx = df.groupby(['repo_name','time_period'])['weighted'].transform('sum')
    # Compute leave-one-out weighted mean for each row:
    # (group sum of weighted - row weighted) divided by (group sum of weights - row weight)
    df['leave_one_out_mean_cluster_overlap'] = (sum_wx - df['weighted']) / (sum_w - df['normalized_degree'])
    return df

df = compute_leave_one_out(df)"""

"def compute_leave_one_out(df):\n    # Compute the per-row product\n    df = df.copy()\n    df['weighted'] = df['normalized_degree'] * df['overall_overlap']\n    # Compute group sums using transform (which returns a Series aligned with df)\n    sum_w  = df.groupby(['repo_name','time_period'])['normalized_degree'].transform('sum')\n    sum_wx = df.groupby(['repo_name','time_period'])['weighted'].transform('sum')\n    # Compute leave-one-out weighted mean for each row:\n    # (group sum of weighted - row weighted) divided by (group sum of weights - row weight)\n    df['leave_one_out_mean_cluster_overlap'] = (sum_wx - df['weighted']) / (sum_w - df['normalized_degree'])\n    return df\n\ndf = compute_leave_one_out(df)"

In [9]:
df.to_parquet('issue/graph_important.parquet')

In [8]:
df

Unnamed: 0,repo_name,time_period,actor_id,total_important,total_nodes,mean_cluster_overlap,avg_clusters_per_node,pct_nodes_one_cluster,HHI_normalized_degree,HHI_individual_coverage,...,imp_to_other_perc_50,imp_to_other_perc_75,imp_to_other_perc_90,imp_to_imp_avg_edge_weight,imp_to_imp_perc_10,imp_to_imp_perc_25,imp_to_imp_perc_50,imp_to_imp_perc_75,imp_to_imp_perc_90,prop_important
0,ParallelSSH/ssh2-python,2021-07-01,1391208.0,1,9,0.0,1.0,100.0,0.234375,1.0,...,4.0,4.5,5.4,0.0,0.0,0.0,0.0,0.0,0.0,0.111111
1,ParallelSSH/ssh2-python,2021-07-01,837316.0,1,9,0.0,1.0,100.0,0.234375,1.0,...,3.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111
2,ParallelSSH/ssh2-python,2021-07-01,1861005.0,1,9,0.0,1.0,100.0,0.234375,1.0,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.111111
3,ParallelSSH/ssh2-python,2021-07-01,2694872.0,1,9,0.0,1.0,100.0,0.234375,1.0,...,0.0,0.0,0.0,6.0,6.0,6.0,6.0,6.0,6.0,0.111111
4,ParallelSSH/ssh2-python,2021-07-01,24194635.0,1,9,0.0,1.0,100.0,0.234375,1.0,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.111111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3095979,loomchild/reload,2018-01-01,30437729.0,1,2,0.0,1.0,100.0,0.500000,1.0,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.500000
3095980,openstack/freezer-web-ui,2018-01-01,903479.0,1,2,0.0,1.0,100.0,0.500000,1.0,...,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500000
3095981,openstack/freezer-web-ui,2018-01-01,19940210.0,1,2,0.0,1.0,100.0,0.500000,1.0,...,0.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,0.500000
3095982,openstack/freezer-web-ui,2018-01-01,903479.0,1,2,0.0,1.0,100.0,0.500000,1.0,...,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500000
