In [1]:
import os
# Move the working directory one level up; the relative paths used in later cells
# are presumably meant to resolve from there.
# NOTE(review): not idempotent - re-running this cell moves up one more level.
os.chdir('../')

In [2]:
import pandas as pd
import warnings
from source.lib.helpers import *
from pandarallel import pandarallel
import ast
from itertools import combinations
from collections import defaultdict
from functools import reduce

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Silence all warnings for the rest of the session (this also hides pandas deprecation warnings).
warnings.filterwarnings("ignore")

In [3]:
def WeightedQuantile(values, weights, quantiles):
    """Interpolated weighted quantiles of ``values``.

    Observations are sorted by value and their weights accumulated in that
    order. Each requested level ``q`` is then located by linear interpolation
    of the sorted values at cumulative weight ``q * sum(weights)``.

    Args:
        values: Array-like of observations.
        weights: Array-like of weights aligned with ``values``.
        quantiles: Array-like of quantile levels, e.g. ``[0.33, 0.67]``.

    Returns:
        numpy array holding one interpolated value per entry of ``quantiles``.
    """
    value_arr = np.array(values)
    weight_arr = np.array(weights)

    # Order both arrays by ascending value.
    order = np.argsort(value_arr)
    sorted_values = value_arr[order]
    running_weight = np.cumsum(weight_arr[order])

    # Translate quantile levels into positions on the cumulative-weight axis.
    targets = np.array(quantiles) * running_weight[-1]
    return np.interp(targets, running_weight, sorted_values)

def Assign3Bin(repo):
    """Return the three-way bin (0, 1 or 2) for ``repo``.

    Reads the module-level names ``base_wm`` (value per repo), ``q33`` and
    ``q67`` (cut-offs), which must be defined before this is called.

    Returns:
        0 when the repo's value is <= q33, 1 when it is <= q67, otherwise 2.
    """
    repo_value = base_wm[repo]
    if repo_value <= q33:
        return 0
    return 1 if repo_value <= q67 else 2

def ConvertLogKeysToInt(log_dict):
    """Return a copy of ``log_dict`` whose keys are cast to ``int`` via ``float``.

    Going through ``float`` lets keys such as ``'12.0'`` become ``12``.
    Values are carried over unchanged.
    """
    converted = {}
    for raw_key, payload in log_dict.items():
        converted[int(float(raw_key))] = payload
    return converted


In [4]:
# Load the filtered problem data; later cells read and extend this frame.
df_problems_contr_filtered = pd.read_parquet('issue/filtered_problem_data.parquet')

In [5]:
def ProjectLevelStats(df_problems_contr_filtered):
    """Collapse problem rows into one row per repo, period and contributor counts.

    Helper columns (``departed_involved``, ``key_contributor_count``,
    ``total_contributor_count``, ``departed_opener``, ``departed_author``) are
    written onto the input frame in place before the aggregation.

    Args:
        df_problems_contr_filtered: Problem-level frame providing the columns
            read below (actor lists, collaboration indicators, ``problem_id``).

    Returns:
        DataFrame keyed by repo_name, time_period, treatment_period,
        key_contributor_count and total_contributor_count, with problem counts,
        mean collaboration indicators, and departed-actor counts and shares.
    """
    # controls that I can add down the road
    # % of problems that are unlinked prs/linked
    problems = df_problems_contr_filtered  # alias only; columns are added in place

    def _DepartedIn(list_col, only_if_involved):
        """Build a row function testing whether the departed actor is in ``list_col``."""
        def _Flag(row):
            if only_if_involved and not row['departed_involved']:
                # Undefined when the departed actor took no part in the problem.
                return np.nan
            return row['departed_actor_id'] in row[list_col]
        return _Flag

    problems['departed_involved'] = problems.apply(_DepartedIn('all_actors', False), axis=1)
    problems['key_contributor_count'] = problems['important_actors_rolling'].apply(len)
    problems['total_contributor_count'] = problems['all_actors_period'].apply(len)
    for flag_col, list_col in (('departed_opener', 'pr_opener'), ('departed_author', 'pr_authors')):
        problems[flag_col] = pd.to_numeric(problems.apply(_DepartedIn(list_col, True), axis=1))

    group_keys = ['repo_name', 'time_period', 'treatment_period',
                  'key_contributor_count', 'total_contributor_count']
    aggregations = {
        'problem_count': ('problem_id', 'count'),
        'ind_collab': ('ind_collab_roll', 'mean'),
        'ind_key_collab': ('ind_key_collab_roll', 'mean'),
        'ind_other_collab': ('ind_other_collab_roll', 'mean'),
        'departed_involved_count': ('departed_involved', 'sum'),
        'departed_involved': ('departed_involved', 'mean'),
        'departed_opened_count': ('departed_opener', 'sum'),
        'departed_opened': ('departed_opener', 'mean'),  # conditional on involvement
        'departed_authored_count': ('departed_author', 'sum'),
        'departed_authored': ('departed_author', 'mean'),  # conditional on involvement
    }
    df_project_filtered_group = problems.groupby(group_keys).agg(**aggregations)
    return df_project_filtered_group.reset_index()

# Aggregate the problem data per repo and period. The call also adds helper columns to
# df_problems_contr_filtered in place.
df_project_filtered_group = ProjectLevelStats(df_problems_contr_filtered)


In [6]:
# Distinct repos in the problem data; used below to filter the contributor parquet read.
repo_list = df_problems_contr_filtered['repo_name'].unique().tolist()
def GetCommunicationLogs(df_contributors):
    """Attach each contributor's logged communication with the departed actor.

    The ``communication_log`` column of ``df_contributors`` is parsed and its
    keys cast to int in place. Each contributor row is then matched, on the
    shared columns, to the departed actor of its repo and period, taken from
    the module-level ``df_problems_contr_filtered``.

    Args:
        df_contributors: Frame with ``repo_name``, ``time_period``,
            ``actor_id`` and ``communication_log`` (a dict literal stored as
            text, or an already-parsed dict).

    Returns:
        DataFrame with one row per contributor and departed-actor match, with:
        ``dept_ov_comm``: the communication_log entry for the departed actor,
            set to NaN when the two shared no problem in that period;
        ``problem_count``: number of problems on which both appear in
            ``all_actors`` for that repo and period.
    """
    # Parse only values that are still text, so that calling this again on a frame that was
    # already parsed does not make ast.literal_eval raise.
    df_contributors['communication_log'] = df_contributors['communication_log'].apply(
        lambda log: ast.literal_eval(log) if isinstance(log, str) else log)
    df_contributors['communication_log'] = df_contributors['communication_log'].apply(ConvertLogKeysToInt)

    df_contr_comm = df_contributors[['repo_name','time_period','actor_id','communication_log']]
    df_contr_dept = df_problems_contr_filtered[['repo_name','time_period','departed_actor_id','treatment_period']].drop_duplicates()
    # Merge on the shared columns (repo_name, time_period).
    df_contr_dept_comm = pd.merge(df_contr_comm, df_contr_dept)
    # int(float(...)) matches the casting used for the log keys and the co-occurrence lookup
    # below, so ids stored as e.g. '12.0' resolve to the same key.
    df_contr_dept_comm['dept_ov_comm'] = df_contr_dept_comm.apply(
        lambda x: x['communication_log'].get(int(float(x['departed_actor_id']))), axis = 1)

    cooccur_counts = BuildCooccurrenceCounts(df_problems_contr_filtered)

    df_contr_dept_comm['problem_count'] = df_contr_dept_comm.apply(
        lambda row: cooccur_counts.get(
            (row.repo_name, row.time_period, int(float(row.actor_id)), int(float(row.departed_actor_id))), 0), axis=1)
    # Without a shared problem the communication measure is treated as missing.
    df_contr_dept_comm.loc[df_contr_dept_comm['problem_count'] == 0, 'dept_ov_comm'] = np.nan
    return df_contr_dept_comm

def BuildCooccurrenceCounts(df_probs):
    """Count, per repo and time period, the problems each pair of actors shares.

    Returns a defaultdict(int) mapping
       (repo_name, time_period, actor, other_actor) → count
    where every pair of distinct actors in a row's all_actors list is stored
    in both orders."""
    pair_counts = defaultdict(int)
    for (repo, period), period_problems in df_probs.groupby(['repo_name', 'time_period']):
        for actor_list in period_problems['all_actors']:
            # De-duplicate so an actor listed twice on a problem counts once.
            distinct_actors = {int(float(actor)) for actor in actor_list}
            # Record each unordered pair under both key orders.
            for first, second in combinations(distinct_actors, 2):
                pair_counts[(repo, period, first, second)] += 1
                pair_counts[(repo, period, second, first)] += 1
    return pair_counts

# Read contributor characteristics, restricted at load time to the repos in repo_list.
df_contributors = pd.read_parquet('drive/output/derived/graph_structure/contributor_characteristics.parquet', filters = [('repo_name',"in",repo_list)])
# Note: this call rewrites df_contributors['communication_log'] in place.
df_contr_dept_comm = GetCommunicationLogs(df_contributors)

In [64]:
# Keep contributor rows from periods strictly before the repo's treatment period.
df_pre_contr = df_contr_dept_comm[df_contr_dept_comm['time_period'] < df_contr_dept_comm['treatment_period']]
# For each repo, pick the five most recent of those pre-treatment periods.
df_pre_repo = (
    df_pre_contr[['repo_name','time_period']].drop_duplicates()
    .sort_values(['repo_name','time_period'], ascending=[True,False])
    .groupby(['repo_name'], as_index=False)
    .head(5)
)
# Inner merge on the shared columns restricts the contributor rows to those periods.
df_pre_contr = pd.merge(df_pre_contr, df_pre_repo)

# Per repo and actor: total communication with the departed actor and total shared problems.
actor_metrics = (
    df_pre_contr
    .groupby(['repo_name','actor_id'], as_index=False)
    .agg(
        agg_dept_ov_comm   = ('dept_ov_comm', 'sum'),
        agg_problem_count     = ('problem_count',    'sum'),
    )
)
# A total of 0 is turned into missing. pandas' sum of an all-NaN group is also 0, so actors
# with no recorded communication end up as NaN here.
actor_metrics['agg_dept_ov_comm'] = actor_metrics['agg_dept_ov_comm'].replace(0, np.nan)
actor_metrics['agg_dept_ov_comm_per_problem'] = actor_metrics['agg_dept_ov_comm'] / actor_metrics['agg_problem_count']

# Repo-level means of the two actor measures (NaN actors are skipped by mean).
repo_metrics = (
    actor_metrics.groupby('repo_name', as_index=False)
    .agg(avg_dept_ov_comm_repo_avg = ('agg_dept_ov_comm', 'mean'),
         avg_dept_ov_comm_per_problem_repo_avg = ('agg_dept_ov_comm_per_problem','mean'),
    )
)

# One row per repo and actor, carrying both the actor totals and the repo averages.
df_actor_summary = actor_metrics.merge(repo_metrics, on='repo_name')
# Non-numeric actor ids become NaN rather than raising.
df_actor_summary['actor_id'] = pd.to_numeric(df_actor_summary['actor_id'], errors='coerce')

# Column-name infix -> multiple of the repo average used as the cut-off.
threshold_factors = {'': 1, '05avg_': .5, '2avg_': 2, '3avg_': 3}

for metric in ['dept_ov_comm', 'dept_ov_comm_per_problem']:
    agg_col  = f'agg_{metric}'
    avg_col  = f'avg_{metric}_repo_avg'
    no_communication = df_actor_summary[agg_col].isna()

    # 1 when the actor's value exceeds the scaled repo average, 0 otherwise,
    # and NaN when the actor has no value at all.
    for infix, factor in threshold_factors.items():
        bin_col = f'{metric}_{infix}2bin'
        df_actor_summary[bin_col] = (
            df_actor_summary[agg_col] > factor*df_actor_summary[avg_col]
        ).astype(int)
        df_actor_summary.loc[no_communication, bin_col] = np.nan

    if metric == 'dept_ov_comm_per_problem':
        # "_min_" variants: same flags, blanked for actors with fewer than 5 shared problems.
        too_few_problems = df_actor_summary['agg_problem_count']<5
        for infix in threshold_factors:
            min_col = f'{metric}_min_{infix}2bin'
            df_actor_summary[min_col] = df_actor_summary[f'{metric}_{infix}2bin']
            df_actor_summary.loc[too_few_problems, min_col] = np.nan

In [65]:
# One row per repo and actor, keeping the earliest period in which the actor
# is listed in all_actors_period.
df_contr_all = (
    df_problems_contr_filtered[['repo_name','time_period','all_actors_period']]
    .drop_duplicates(['repo_name','time_period'])
    .explode('all_actors_period')
    .rename(columns={'all_actors_period':'actor_id'})
    .sort_values('time_period')
    .drop_duplicates(['repo_name','actor_id'])
)
df_contr_all['actor_id'] = pd.to_numeric(df_contr_all['actor_id'])

# Outer merge so actors without communication metrics are kept as well.
df_actor_summary = pd.merge(df_contr_all, df_actor_summary, how = 'outer')
df_actor_summary['agg_problem_count'] = df_actor_summary['agg_problem_count'].fillna(0)

# Spread each repo's average onto the rows added by the outer merge.
for repo_avg_col in ['avg_dept_ov_comm_repo_avg', 'avg_dept_ov_comm_per_problem_repo_avg']:
    df_actor_summary[repo_avg_col] = (
        df_actor_summary.groupby('repo_name')[repo_avg_col]
        .transform(lambda x: x.ffill().bfill())
    )

In [66]:
# Attach each repo's treatment_period (inner merge on the shared columns).
df_actor_summary = pd.merge(df_actor_summary, df_contr_dept_comm[['repo_name','treatment_period']].drop_duplicates())

In [67]:
import pandas as pd
from functools import reduce
# Bin columns of df_actor_summary for which actor lists are built below.
overview_metrics = [
    'dept_ov_comm_2bin',
    'dept_ov_comm_05avg_2bin',
    'dept_ov_comm_2avg_2bin',
    'dept_ov_comm_3avg_2bin',
    'dept_ov_comm_per_problem_2bin',
    'dept_ov_comm_per_problem_05avg_2bin',
    'dept_ov_comm_per_problem_2avg_2bin',
    'dept_ov_comm_per_problem_3avg_2bin',
    'dept_ov_comm_per_problem_min_2bin',
    'dept_ov_comm_per_problem_min_05avg_2bin',
    'dept_ov_comm_per_problem_min_2avg_2bin',
    'dept_ov_comm_per_problem_min_3avg_2bin'
]
# Status name -> predicate applied to a bin column to select actors.
# 'never_communicated_predep' has no predicate (placeholder ""); the loop below handles it
# with its own branch.
status_funcs = {
    'high': lambda s: s > 0,
    'low':  lambda s: s == 0,
    'never_communicated': lambda s: s.isna(),
    'communicated':       lambda s: s.notna(),
    'never_communicated_predep': "",
}

# For every metric and status, collect per repo the list of actors with that status.
actor_frames = []
for metric in overview_metrics:
    metric_values = df_actor_summary[metric]
    for status, cond in status_funcs.items():
        col_name = f'{status}_{metric}_actors'
        if status == "never_communicated_predep":
            # No bin value, and first seen before the repo's treatment period.
            selected = df_actor_summary[metric_values.isna()].query('time_period < treatment_period')
        else:
            selected = df_actor_summary[cond(metric_values)]
        df_temp = (
            selected
            .groupby('repo_name')['actor_id']
            .agg(list)
            .reset_index()
            .rename(columns={'actor_id': col_name})
        )
        actor_frames.append(df_temp)

# Outer-merge all per-status frames into one wide frame keyed by repo_name.
df_actor_lists = actor_frames[0]
for next_frame in actor_frames[1:]:
    df_actor_lists = pd.merge(df_actor_lists, next_frame, on='repo_name', how='outer')

# Repos missing from a frame get NaN after the outer merge; normalise those to empty lists.
actor_list_cols = [f'{status}_{metric}_actors' for metric in overview_metrics for status in status_funcs]
for col in actor_list_cols:
    df_actor_lists[col] = df_actor_lists[col].apply(lambda x: x if isinstance(x, list) else [])

# Restrict to PR-type problems and attach the repo's actor lists to each one.
pr_types = ['linked', 'unlinked pr']
df_pr_issues = df_problems_contr_filtered[df_problems_contr_filtered['type'].isin(pr_types)]
df_pr_with_actors = pd.merge(df_pr_issues, df_actor_lists, on='repo_name', how='left')


In [68]:
# Status name -> suffix used in the flattened output column names.
status_map = {'high': 'above',
 'low': 'below',
 'never_communicated': 'never_comm',
 'communicated': 'comm',
             'never_communicated_predep':'never_comm_predep'}
# Bin column name -> middle part of the flattened output column names.
metric_map = {
    'dept_ov_comm_2bin': 'dept_comm_avg',
    'dept_ov_comm_05avg_2bin': 'dept_comm_05avg',
    'dept_ov_comm_2avg_2bin': 'dept_comm_2avg',
    'dept_ov_comm_3avg_2bin': 'dept_comm_3avg',
    'dept_ov_comm_per_problem_2bin': 'dept_comm_per_problem_avg',
    'dept_ov_comm_per_problem_05avg_2bin': 'dept_comm_per_problem_05avg',
    'dept_ov_comm_per_problem_2avg_2bin': 'dept_comm_per_problem_2avg',
    'dept_ov_comm_per_problem_3avg_2bin': 'dept_comm_per_problem_3avg',
    'dept_ov_comm_per_problem_min_2bin': 'dept_comm_per_problem_min_avg',
    'dept_ov_comm_per_problem_min_05avg_2bin': 'dept_comm_per_problem_min_05avg',
    'dept_ov_comm_per_problem_min_2avg_2bin': 'dept_comm_per_problem_min_2avg',
    'dept_ov_comm_per_problem_min_3avg_2bin': 'dept_comm_per_problem_min_3avg',
}

In [69]:
def SummarizeMetricStatus(df, metric, status, match_col):
    """Count, per repo and period, the PR problems tied to actors of one status.

    A row qualifies when the departed actor is not among its ``all_actors`` and
    at least one entry of ``row[match_col]`` is in the row's
    ``{status}_{metric}_actors`` list. Qualifying rows are de-duplicated on
    (repo_name, problem_id) and rows of type "unlinked issue" are dropped.

    For the 'low' and 'high' statuses a diagnostic is printed: the share of the
    qualifying problems that also involve an actor of the opposite status
    (``nan`` when nothing qualifies), followed by the number of qualifying
    problems.

    Args:
        df: Problem-level frame that already carries the actor-list columns.
            The ``{status}_{metric}_actors`` column is normalised in place so
            that non-list entries become empty lists.
        metric: Bin column name used to build the actor-list column name.
        status: Status name used to build the actor-list column name.
        match_col: Column holding the actors to match against the status list.

    Returns:
        DataFrame with columns repo_name, time_period, prs_opened_count,
        contributor_count, overview_metric, status. It has no rows when no
        problem qualifies.
    """
    actor_col = f'{status}_{metric}_actors'
    df[actor_col] = df[actor_col].apply(lambda x: x if isinstance(x, list) else [])

    # Row-wise test built with zip so it also works on a frame without rows.
    mask = pd.Series(
        [
            departed not in involved and bool(set(matched) & set(status_actors))
            for departed, involved, matched, status_actors in zip(
                df['departed_actor_id'], df['all_actors'], df[match_col], df[actor_col])
        ],
        index=df.index,
        dtype=bool,
    )
    df_filtered = df[mask].drop_duplicates(
        ['repo_name','problem_id']).query('type != "unlinked issue"')

    if df_filtered.empty:
        # groupby.apply on a frame without rows is not guaranteed to yield the two count
        # columns, so the empty summary is built explicitly with the same layout.
        summary = df_filtered[['repo_name', 'time_period']].reset_index(drop=True)
        summary['prs_opened_count'] = pd.Series(dtype='int64')
        summary['contributor_count'] = pd.Series(dtype='int64')
    else:
        summary = (
            df_filtered
            .groupby(['repo_name', 'time_period'])
            .apply(lambda grp: pd.Series({
                'prs_opened_count': len(grp),
                # Distinct actors across the status list and the period's actor lists.
                'contributor_count': len(set().union(*grp[actor_col].tolist(), *grp['all_actors_period'].tolist()))
            }))
            .reset_index()
        )

    if status in ['low','high']:
        opp_status = 'high' if status == 'low' else 'low'
        opp_actor_col = f'{opp_status}_{metric}_actors'
        # Every row of df_filtered already excludes the departed actor, so only the overlap
        # with the opposite-status actors needs checking.
        opp_involved_count = sum(
            bool(set(involved) & set(opp_actors))
            for involved, opp_actors in zip(df_filtered['all_actors'], df_filtered[opp_actor_col])
        )
        filtered_count = df_filtered.shape[0]
        # Avoid ZeroDivisionError when nothing qualified.
        opp_share = opp_involved_count / filtered_count if filtered_count else float('nan')
        print(metric, status)
        print(opp_share, filtered_count)

    summary['overview_metric'] = metric
    summary['status'] = status
    return summary


def PivotAndFlattenCommSummary(df_pr_with_actors, match_col, overview_metrics, base_status, 
                               special_status, metric_map, status_map):
    """Summarise every (metric, status) pair and return one wide frame.

    ``SummarizeMetricStatus`` is run for each metric in ``overview_metrics``
    with each status in ``base_status``, and for the first metric only with
    each status in ``special_status``. The stacked summaries are pivoted to one
    row per (repo_name, time_period), with missing cells filled with 0, and the
    column index is flattened to ``{prs_opened|contributors}_{metric}_{status}``
    using ``metric_map`` and ``status_map``.

    Returns:
        Wide DataFrame. Columns for the special statuses are renamed to drop
        the ``_avg`` metric part (e.g. ``prs_opened_dept_comm``).
    """
    metric_status_pairs = [(m, s) for m in overview_metrics for s in base_status]
    metric_status_pairs += [(overview_metrics[0], s) for s in special_status]

    df_comm_summary = pd.concat(
        [SummarizeMetricStatus(df_pr_with_actors, m, s, match_col) for m, s in metric_status_pairs],
        ignore_index=True,
    )

    df_wide = df_comm_summary.pivot_table(
        index=['repo_name', 'time_period'],
        columns=['overview_metric', 'status'],
        values=['prs_opened_count', 'contributor_count'],
        fill_value=0
    ).reset_index()

    def _FlattenColumn(col):
        """Turn one (value, metric, status) column key into a flat name."""
        if not isinstance(col, tuple):
            return col
        if len(col) == 3 and all(col):
            value_name, metric_name, status = col
            flat_prefix = 'prs_opened' if value_name == 'prs_opened_count' else 'contributors'
            return f'{flat_prefix}_{metric_map[metric_name]}_{status_map[status]}'
        # Index columns come through as ('repo_name', '', ''): keep the first level.
        return col[0]

    df_wide.columns = [_FlattenColumn(col) for col in df_wide.columns]

    # Shorter names for the special-status columns.
    short_names = {
        f'{flat_prefix}_dept_comm_avg_{suffix}': f'{flat_prefix}_dept_{suffix}'
        for flat_prefix in ('prs_opened', 'contributors')
        for suffix in ('comm', 'never_comm', 'never_comm_predep')
    }
    return df_wide.rename(columns=short_names)

# Build the wide per-repo/per-period summary, matching status actors against the PR opener.
df_comm_wide = PivotAndFlattenCommSummary(
    df_pr_with_actors, match_col='pr_opener', overview_metrics=overview_metrics, base_status=['high', 'low'], 
    special_status=['never_communicated', 'communicated', 'never_communicated_predep'], metric_map=metric_map, status_map=status_map)
# The string literal below is a disabled variant of the call (match_col='all_actors'). As the
# cell's last expression it is only echoed as output, so df_comm_wide_ov is never created.
"""df_comm_wide_ov = PivotAndFlattenCommSummary(
    df_pr_with_actors, match_col='all_actors', overview_metrics=overview_metrics, base_status=['high', 'low'], 
    special_status=['never_communicated', 'communicated'], metric_map=metric_map, status_map=status_map)"""

dept_ov_comm_2bin high
0.14045547319798046 37236
dept_ov_comm_2bin low
0.5181714471968709 12272
dept_ov_comm_05avg_2bin high
0.10529123623842088 41238
dept_ov_comm_05avg_2bin low
0.5944800871565186 8261
dept_ov_comm_2avg_2bin high
0.2126938775510204 24500
dept_ov_comm_2avg_2bin low
0.3505278310940499 25008
dept_ov_comm_3avg_2bin high
0.2506403750422889 20691
dept_ov_comm_3avg_2bin low
0.2919211549139367 28816
dept_ov_comm_per_problem_2bin high
0.28794642857142855 17920
dept_ov_comm_per_problem_2bin low
0.17089469517022962 31575
dept_ov_comm_per_problem_05avg_2bin high
0.09707805569527642 43738
dept_ov_comm_per_problem_05avg_2bin low
0.5317336115458181 5751
dept_ov_comm_per_problem_2avg_2bin high
0.4561911658218682 1381
dept_ov_comm_per_problem_2avg_2bin low
0.022497141074955818 48095
dept_ov_comm_per_problem_3avg_2bin high
0.6814814814814815 135
dept_ov_comm_per_problem_3avg_2bin low
0.0002837799489196092 49334
dept_ov_comm_per_problem_min_2bin high
0.25478175576262874 16312
dept_ov_co

"df_comm_wide_ov = PivotAndFlattenCommSummary(\n    df_pr_with_actors, match_col='all_actors', overview_metrics=overview_metrics, base_status=['high', 'low'], \n    special_status=['never_communicated', 'communicated'], metric_map=metric_map, status_map=status_map)"

In [70]:
# Disabled check kept as a string literal: it is not executed, only echoed as the cell output.
# It reads df_comm_wide_ov, which is only assigned inside the disabled call in the previous cell.
"""# people are involved int wice as much activity as they're opening 
for avg in ['avg','2avg','3avg']:
    sel_cols = [f'prs_opened_dept_comm_{avg}_above',f'prs_opened_dept_comm_{avg}_below']
    print(df_comm_wide_ov[sel_cols].sum().sum()/df_comm_wide[sel_cols].sum().sum())"""

"# people are involved int wice as much activity as they're opening \nfor avg in ['avg','2avg','3avg']:\n    sel_cols = [f'prs_opened_dept_comm_{avg}_above',f'prs_opened_dept_comm_{avg}_below']\n    print(df_comm_wide_ov[sel_cols].sum().sum()/df_comm_wide[sel_cols].sum().sum())"

In [71]:
# One row per (problem row, actor active in that period), ordered by repo and period.
# NOTE(review): this rebinds df_contributors, which earlier held the contributor parquet data.
df_contributors = df_problems_contr_filtered[['repo_name','time_period','treatment_period','all_actors_period','departed_actor_id']].explode('all_actors_period').sort_values(['repo_name','time_period'])
# Actors other than the departed one who appear in a period before the treatment period,
# de-duplicated per repo and collected into one list per repo.
df_project_predeparture_contributors = df_contributors.query('time_period < treatment_period & departed_actor_id != all_actors_period').drop_duplicates(['repo_name','all_actors_period'])
df_project_predeparture_contributors = df_project_predeparture_contributors.groupby(['repo_name'])['all_actors_period'].agg(list).reset_index().rename(columns={'all_actors_period':'all_actors_pre_departure'})
# Same construction without the period restriction: every actor other than the departed one.
df_project_nondeparture_contributors = df_contributors.query('departed_actor_id != all_actors_period').drop_duplicates(['repo_name','all_actors_period'])
df_project_nondeparture_contributors = df_project_nondeparture_contributors.groupby(['repo_name'])['all_actors_period'].agg(list).reset_index().rename(columns={'all_actors_period':'all_actors_non_departure'})


In [72]:
# Keep only problems whose actors ALL belong to the repo's pre-departure contributor list.
# np.intersect1d returns unique values, so it is compared with the number of distinct actors
# on the problem. Comparing with the raw length would drop any problem whose all_actors
# contains a repeated id.
df_problems_contr_filtered_predep = pd.merge(df_problems_contr_filtered, df_project_predeparture_contributors)
df_problems_contr_filtered_predep = df_problems_contr_filtered_predep.loc[
    df_problems_contr_filtered_predep.apply(lambda row: np.unique(row['all_actors']).size == np.intersect1d(row['all_actors'], row['all_actors_pre_departure']).size, axis=1)
]
# Same filter against the list of all non-departed contributors of the repo.
df_problems_contr_filtered_nondep = pd.merge(df_problems_contr_filtered, df_project_nondeparture_contributors)
df_problems_contr_filtered_nondep = df_problems_contr_filtered_nondep.loc[
    df_problems_contr_filtered_nondep.apply(lambda row: np.unique(row['all_actors']).size == np.intersect1d(row['all_actors'], row['all_actors_non_departure']).size, axis=1)
]

In [73]:
def _CountOpenedPrs(df_problems, count_col):
    """Count distinct non-"unlinked issue" problems per repo and period.

    Each problem is attributed to the first row it has after sorting by
    repo_name, problem_id_num and time_period.
    """
    first_rows = (
        df_problems
        .sort_values(['repo_name','problem_id_num','time_period'])
        .drop_duplicates(['repo_name','problem_id'])
        .query('type != "unlinked issue"')
    )
    per_period = first_rows.groupby(['repo_name','time_period'])['problem_id'].count()
    return per_period.reset_index().rename(columns={'problem_id': count_col})


df_agg_predep = _CountOpenedPrs(df_problems_contr_filtered_predep, 'prs_opened_predep')
df_agg_nondep = _CountOpenedPrs(df_problems_contr_filtered_nondep, 'prs_opened_nondep')
df_agg_prob = _CountOpenedPrs(df_problems_contr_filtered, 'prs_opened_prob')

# Combine all counts; repo/period pairs missing from a source get 0.
df_agg_prs = (
    pd.merge(df_agg_predep, df_agg_nondep, how = 'outer')
    .merge(df_comm_wide, how = 'outer')
    .fillna(0)
    .merge(df_agg_prob, how = 'outer')
    .fillna(0)
)

In [75]:
# Preview the main PR-count columns for the first 30 repo/period rows.
df_agg_prs[['repo_name','time_period','prs_opened_prob','prs_opened_predep','prs_opened_nondep',
            'prs_opened_dept_comm','prs_opened_dept_never_comm','prs_opened_dept_never_comm_predep']].head(30)

Unnamed: 0,repo_name,time_period,prs_opened_prob,prs_opened_predep,prs_opened_nondep,prs_opened_dept_comm,prs_opened_dept_never_comm,prs_opened_dept_never_comm_predep
0,AnalogJ/lexicon,2016-01-01,38.0,8.0,8.0,5.0,3.0,3.0
1,AnalogJ/lexicon,2016-07-01,27.0,5.0,5.0,1.0,4.0,4.0
2,AnalogJ/lexicon,2017-01-01,43.0,20.0,20.0,13.0,7.0,7.0
3,AnalogJ/lexicon,2017-07-01,17.0,3.0,3.0,2.0,1.0,1.0
4,AnalogJ/lexicon,2018-01-01,58.0,9.0,9.0,4.0,2.0,2.0
5,AnalogJ/lexicon,2018-07-01,55.0,3.0,3.0,1.0,0.0,0.0
6,AnalogJ/lexicon,2019-01-01,57.0,11.0,11.0,7.0,0.0,0.0
7,AnalogJ/lexicon,2019-07-01,40.0,9.0,9.0,6.0,3.0,2.0
8,AnalogJ/lexicon,2020-01-01,37.0,21.0,40.0,23.0,17.0,3.0
9,AnalogJ/lexicon,2020-07-01,21.0,10.0,21.0,6.0,15.0,1.0


In [76]:
# Last five pre-treatment rows per repo (in the frame's existing row order). The .copy()
# makes the helper columns below land on an independent frame rather than on a slice.
preperiod_recent = (
    df_project_filtered_group
    .query('time_period < treatment_period')
    .groupby('repo_name')
    .tail(5)
    .copy()
)
# Problems in which the departed actor was NOT involved. departed_involved_count can never
# exceed problem_count, so the subtraction must run in this direction to give a non-negative
# weight; the reverse order makes the cumulative weights in WeightedQuantile decrease.
preperiod_recent['other_involved_count'] = preperiod_recent['problem_count'] - preperiod_recent['departed_involved_count']
preperiod_recent['uniform_weight'] = 1

# Measure to bin -> column used as its weight.
count_dict = {
    'ind_collab': 'problem_count',
    'ind_key_collab': 'departed_involved_count',
    'ind_other_collab': 'other_involved_count',
    'departed_involved': 'problem_count',
    'departed_involved_count': 'uniform_weight',
    'key_contributor_count': 'uniform_weight',
    'total_contributor_count': 'uniform_weight',
    'problem_count': 'uniform_weight',
    'departed_opened': 'departed_opened_count',
    'departed_authored': 'departed_authored_count'
}

for collab_type, count_col in count_dict.items():
    # WeightedMean comes from the star import of source.lib.helpers.
    avg_collab = WeightedMean(preperiod_recent[collab_type], preperiod_recent[count_col])
    base_wm = preperiod_recent.groupby('repo_name').apply(
        lambda df: WeightedMean(df[collab_type], df[count_col], zero_weight_return = 0)
    )

    # 2-bin: 1 when the repo's weighted mean is above the pooled weighted mean.
    above_set = set(base_wm[base_wm > avg_collab].index)
    df_project_filtered_group[f"{collab_type}_2bin"] = df_project_filtered_group['repo_name'].apply(lambda x: int(x in above_set))

    # 3-bin: weighted quantiles
    q33, q67 = WeightedQuantile(preperiod_recent[collab_type], preperiod_recent[count_col], [0.33, 0.67])

    # Assign3Bin reads base_wm, q33 and q67 from module scope, so it must be applied inside
    # this loop iteration, right after they are set.
    df_project_filtered_group[f"{collab_type}_3bin"] = df_project_filtered_group['repo_name'].apply(Assign3Bin)

# Attach the PR counts; repo/period rows without a match get 0 for the two count columns below.
df_project_filtered_group = df_project_filtered_group.merge(df_agg_prs, how = 'left')
df_project_filtered_group[['prs_opened_predep','prs_opened_nondep']] = df_project_filtered_group[['prs_opened_predep','prs_opened_nondep']].fillna(0)
df_project_filtered_group.to_parquet('issue/project_collaboration.parquet')