In [1]:
import os
# Move the working directory one level up so that the relative paths used
# later in this notebook (e.g. 'issue/filtered_problem_data.parquet') resolve
# from the parent directory (presumably the project root - confirm).
# NOTE: the path is relative to the *current* directory, so running this cell
# a second time in the same kernel moves up one more level.
os.chdir('../')

In [2]:
import warnings

import numpy as np
import pandas as pd

# The wildcard import presumably supplies WeightedMean, which is called later
# in this notebook but not defined in it (confirm against source.lib.helpers).
# numpy is imported explicitly above so that `np`, which the cells below use
# directly, does not depend on whatever the wildcard happens to re-export.
from source.lib.helpers import *

# Show every column when a wide frame is displayed.
pd.set_option('display.max_columns', None)
# NOTE(review): this silences *all* warnings for the whole kernel, including
# pandas warnings about chained assignment and deprecated behaviour. Consider
# narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [3]:
def WeightedQuantile(values, weights, quantiles):
    """Return weighted quantiles of ``values`` using linear interpolation.

    The observations are sorted by value and each one is placed at the
    cumulative weight reached once its own weight has been added. Each
    requested quantile ``q`` is turned into the target ``q * total_weight``
    and the result is linearly interpolated between those points. Targets
    below the first cumulative weight return the smallest value.

    Parameters
    ----------
    values : array-like of numbers
        Observations.
    weights : array-like of numbers
        One weight per observation. Weights are expected to be non-negative
        so that the cumulative weights are non-decreasing, as ``np.interp``
        requires.
    quantiles : float or array-like of floats
        Quantile levels expressed as fractions of the total weight.

    Returns
    -------
    float or numpy.ndarray
        The interpolated quantiles, with the same shape as ``quantiles``.
        All-NaN if no observation has both a value and a weight.
    """
    values = np.asarray(values, dtype=float)
    weights = np.asarray(weights, dtype=float)
    quantiles = np.asarray(quantiles, dtype=float)

    # Drop observations with a missing value or weight. A NaN value sorts to
    # the end and can turn an upper quantile into NaN; a NaN weight makes
    # every cumulative weight after it NaN.
    valid = ~(np.isnan(values) | np.isnan(weights))
    values = values[valid]
    weights = weights[valid]
    if values.size == 0:
        # Nothing to interpolate over; avoid indexing an empty cumulative sum.
        return np.full(quantiles.shape, np.nan)

    sorter = np.argsort(values)
    values = values[sorter]
    cumulative_weight = np.cumsum(weights[sorter])
    total_weight = cumulative_weight[-1]
    return np.interp(quantiles * total_weight, cumulative_weight, values)

def Assign3Bin(repo):
    """Map a repository to one of three bins: 0 (low), 1 (middle) or 2 (high).

    This function takes only the repository key. Everything else is read from
    notebook-level globals at call time, so they must be assigned before the
    function is called:

    * ``base_wm`` - indexed by ``repo`` to get the value being binned;
    * ``q33`` and ``q67`` - the lower and upper cut points, both treated as
      inclusive upper bounds of their bin.

    A value that compares false against both cut points (anything above
    ``q67``, and also NaN) lands in bin 2.
    """
    repo_value = base_wm[repo]
    if repo_value <= q33:
        return 0
    if repo_value <= q67:
        return 1
    return 2


In [4]:
# TODO (controls that could be added down the road):
#   - % of problems that are unlinked PRs / linked


def _DepartedIn(row, actor_col):
    """Whether the row's departed actor appears in the collection stored in ``actor_col``."""
    return row['departed_actor_id'] in row[actor_col]


def _DepartedInIfInvolved(row, actor_col):
    """Same membership check, but NaN for rows where the departed actor was not involved."""
    if row['departed_involved']:
        return row['departed_actor_id'] in row[actor_col]
    return np.nan


# Problem-level data, one row per problem.
df_problems_contr_filtered = pd.read_parquet('issue/filtered_problem_data.parquet')

# Per-problem flags and per-row contributor counts.
df_problems_contr_filtered['departed_involved'] = df_problems_contr_filtered.apply(
    _DepartedIn, axis=1, args=('all_actors',)
)
df_problems_contr_filtered['key_contributor_count'] = (
    df_problems_contr_filtered['important_actors_rolling'].apply(len)
)
df_problems_contr_filtered['total_contributor_count'] = (
    df_problems_contr_filtered['all_actors_period'].apply(len)
)
# These two stay NaN when the departed actor was not involved, so that the
# means taken below are conditional on involvement.
df_problems_contr_filtered['departed_opener'] = pd.to_numeric(
    df_problems_contr_filtered.apply(_DepartedInIfInvolved, axis=1, args=('pr_opener',))
)
df_problems_contr_filtered['departed_author'] = pd.to_numeric(
    df_problems_contr_filtered.apply(_DepartedInIfInvolved, axis=1, args=('pr_authors',))
)

# Collapse to one row per combination of the grouping keys.
_project_period_keys = [
    'repo_name',
    'time_period',
    'treatment_period',
    'key_contributor_count',
    'total_contributor_count',
]
_project_period_aggs = {
    'problem_count': ('problem_id', 'count'),
    'ind_collab': ('ind_collab_roll', 'mean'),
    'ind_key_collab': ('ind_key_collab_roll', 'mean'),
    'ind_other_collab': ('ind_other_collab_roll', 'mean'),
    'departed_involved_count': ('departed_involved', 'sum'),
    'departed_involved': ('departed_involved', 'mean'),
    'departed_opened_count': ('departed_opener', 'sum'),
    'departed_opened': ('departed_opener', 'mean'),  # conditional on involvement
    'departed_authored_count': ('departed_author', 'sum'),
    'departed_authored': ('departed_author', 'mean'),  # conditional on involvement
}
df_project_filtered_group = (
    df_problems_contr_filtered
    .groupby(_project_period_keys)
    .agg(**_project_period_aggs)
    .reset_index()
)

In [5]:
def _ContributorsPerRepo(contributor_rows, list_col):
    """Collect each repo's distinct contributors into one list-valued column.

    ``contributor_rows`` has one contributor per row in ``all_actors_period``.
    The first row of every (repo_name, contributor) pair is kept, and the
    contributors are then gathered into a list per repo, stored under
    ``list_col``.
    """
    distinct_rows = contributor_rows.drop_duplicates(['repo_name', 'all_actors_period'])
    per_repo = distinct_rows.groupby(['repo_name'])['all_actors_period'].agg(list)
    return per_repo.reset_index().rename(columns={'all_actors_period': list_col})


# One row per (problem, contributor listed in all_actors_period), ordered by
# repo and time period.
df_contributors = (
    df_problems_contr_filtered[
        ['repo_name', 'time_period', 'treatment_period', 'all_actors_period', 'departed_actor_id']
    ]
    .explode('all_actors_period')
    .sort_values(['repo_name', 'time_period'])
)

# Contributors other than the departed actor seen before the treatment period.
df_project_predeparture_contributors = _ContributorsPerRepo(
    df_contributors.query('time_period < treatment_period & departed_actor_id != all_actors_period'),
    'all_actors_pre_departure',
)
# Contributors other than the departed actor seen in any period.
df_project_nondeparture_contributors = _ContributorsPerRepo(
    df_contributors.query('departed_actor_id != all_actors_period'),
    'all_actors_non_departure',
)

In [None]:
def _KeepProblemsWithin(df_problems, df_allowed, allowed_col):
    """Keep only problems whose actors all belong to the repo's allowed list.

    Parameters
    ----------
    df_problems : pandas.DataFrame
        Problem-level rows with ``repo_name`` and an array-like ``all_actors``.
    df_allowed : pandas.DataFrame
        One row per ``repo_name`` with a list of contributors in ``allowed_col``.
    allowed_col : str
        Name of the list-valued column in ``df_allowed``.

    Returns
    -------
    pandas.DataFrame
        The inner join of both frames on ``repo_name``, restricted to rows in
        which every element of ``all_actors`` appears in ``allowed_col``. Rows
        with no actors are kept.
    """
    merged = pd.merge(df_problems, df_allowed, on='repo_name')
    # Test membership element by element. Comparing the number of actors with
    # the size of np.intersect1d() would wrongly reject a problem whose actor
    # array repeats an id, because intersect1d returns unique values only.
    all_allowed = np.array(
        [
            bool(np.isin(actors, allowed).all())
            for actors, allowed in zip(merged['all_actors'], merged[allowed_col])
        ],
        dtype=bool,
    )
    return merged.loc[all_allowed]


# Problems handled exclusively by contributors already seen before the departure.
df_problems_contr_filtered_predep = _KeepProblemsWithin(
    df_problems_contr_filtered, df_project_predeparture_contributors, 'all_actors_pre_departure'
)
# Problems handled exclusively by contributors other than the departed actor.
df_problems_contr_filtered_nondep = _KeepProblemsWithin(
    df_problems_contr_filtered, df_project_nondeparture_contributors, 'all_actors_non_departure'
)

In [23]:
def _CountPrsByPeriod(df_problems, count_col):
    """Count problems per (repo_name, time_period), excluding unlinked issues.

    Rows are first reduced to one per (repo_name, problem_id), then rows whose
    ``type`` is "unlinked issue" are removed. The remaining ``problem_id``
    values are counted per repo and period and returned under ``count_col``.
    """
    one_per_problem = df_problems.drop_duplicates(['repo_name', 'problem_id'])
    without_unlinked_issues = one_per_problem.query('type != "unlinked issue"')
    counts = without_unlinked_issues.groupby(['repo_name', 'time_period'])['problem_id'].count()
    return counts.reset_index().rename(columns={'problem_id': count_col})


df_agg_predep = _CountPrsByPeriod(df_problems_contr_filtered_predep, 'prs_opened_predep')
df_agg_nondep = _CountPrsByPeriod(df_problems_contr_filtered_nondep, 'prs_opened_nondep')

In [24]:
# Full outer join of the two per-period counts on the key columns they share.
# A (repo, period) present in only one table gets 0 for the other count.
df_agg_prs = df_agg_predep.merge(
    df_agg_nondep,
    how='outer',
    on=['repo_name', 'time_period'],
).fillna(0)

In [29]:

# Baseline window: the last N_BASELINE_PERIODS pre-treatment rows of every
# repo, in the row order of df_project_filtered_group (assumes that frame is
# ordered by period within repo and has one row per repo and period - confirm).
N_BASELINE_PERIODS = 5
preperiod_recent = (
    df_project_filtered_group
    .query('time_period < treatment_period')
    .groupby('repo_name')
    .tail(N_BASELINE_PERIODS)
    .copy()  # own the data: new columns are assigned below
)
# Number of problems in which the departed actor was NOT involved. It is used
# as a weight, so it must be non-negative: total problems minus the problems
# the departed actor took part in. (With the operands the other way round the
# weights are <= 0, and WeightedQuantile's cumulative weights then decrease,
# which np.interp does not support.)
preperiod_recent['other_involved_count'] = (
    preperiod_recent['problem_count'] - preperiod_recent['departed_involved_count']
)
preperiod_recent['uniform_weight'] = 1

# Measure to bin -> column used to weight it across periods.
# NOTE(review): 'departed_opened' and 'departed_authored' are means taken over
# problems the departed actor was involved in, yet they are weighted by their
# own counts, not by 'departed_involved_count' - confirm this is intended.
count_dict = {
    'ind_collab': 'problem_count',
    'ind_key_collab': 'departed_involved_count',
    'ind_other_collab': 'other_involved_count',
    'departed_involved': 'problem_count',
    'departed_involved_count': 'uniform_weight',
    'key_contributor_count': 'uniform_weight',
    'total_contributor_count': 'uniform_weight',
    'problem_count': 'uniform_weight',
    'departed_opened': 'departed_opened_count',
    'departed_authored': 'departed_authored_count'
}

for collab_type, count_col in count_dict.items():
    # Pooled weighted mean over all baseline rows, and one weighted mean per repo.
    avg_collab = WeightedMean(preperiod_recent[collab_type], preperiod_recent[count_col])
    base_wm = preperiod_recent.groupby('repo_name').apply(
        lambda df: WeightedMean(df[collab_type], df[count_col], zero_weight_return = 0)
    )

    # 2-bin: 1 if the repo's baseline mean is above the pooled mean, else 0.
    # Repos without baseline rows are not in above_set and therefore get 0.
    above_set = set(base_wm[base_wm > avg_collab].index)
    df_project_filtered_group[f"{collab_type}_2bin"] = (
        df_project_filtered_group['repo_name'].isin(above_set).astype('int64')
    )

    # 3-bin: weighted quantiles
    # Assign3Bin reads the globals base_wm, q33 and q67 assigned in this
    # iteration, so these names must not be changed.
    q33, q67 = WeightedQuantile(preperiod_recent[collab_type], preperiod_recent[count_col], [0.33, 0.67])

    df_project_filtered_group[f"{collab_type}_3bin"] = df_project_filtered_group['repo_name'].apply(Assign3Bin)

# Attach the per-period PR counts. Any PR-count columns left from an earlier
# run of this cell are dropped first so that the cell can be re-run safely.
pr_count_cols = ['prs_opened_predep', 'prs_opened_nondep']
df_project_filtered_group = (
    df_project_filtered_group
    .drop(columns=pr_count_cols, errors='ignore')
    .merge(df_agg_prs, how = 'left', on = ['repo_name', 'time_period'])
)
# Periods with no matching row in df_agg_prs count as zero PRs.
df_project_filtered_group[pr_count_cols] = df_project_filtered_group[pr_count_cols].fillna(0)
df_project_filtered_group.to_parquet('issue/project_collaboration.parquet')