
Departing individual characteristics 
- What areas of problem solving does this person touch (indicator variables)
    - A priori: increased coverage should negatively affect the project
    - Initial metrics category: Indicator for problem identification, problem solving, solution incorporation
    - Later on (deep-dive): Indicators for all activities within each category
- What % of problems in each area is this person solving
    - Initial metrics category:  % solved in last 6 months, last year for opened issues, other issue comments/PRs/commits, PR merging+issue closing (just use individual)
    - Later on (deep-dive): More time periods, more activities in each category


In [1]:
import os
# Move the working directory up one level; presumably so that the relative
# 'drive/...' paths used later resolve from the project root — confirm.
# NOTE: the path is relative, so re-running this cell in the same kernel moves
# up one more level each time. Restart the kernel before re-running.
os.chdir('../')

In [2]:
import pandas as pd
from pathlib import Path
import numpy as np
import sys
import glob
import warnings
import random
from glob import glob 
import datetime
import itertools
import time
from multiprocessing import pool
from source.lib.helpers import *

In [39]:
# Silence all warnings and show every column when a DataFrame is displayed.
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
# NOTE(review): `pandarallel` is not imported by name in the import cell;
# presumably it arrives through `from source.lib.helpers import *` — confirm.
pandarallel.initialize(progress_bar = True)

# Input and output directories, relative to the current working directory.
indir_data = Path('drive/output/derived/contributor_stats/contributor_data')
outdir_data = Path('drive/output/derived/project_outcomes')

# Value substituted into the input file name below; hard-coded here, with the
# command-line variant left commented out.
time_period = 6#int(sys.argv[1])
df_contributor_panel = pd.read_parquet(indir_data / f"major_contributors_major_months{time_period}_window732D_samplefull.parquet")

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [40]:
# Activity counts for which within-repo shares and concentration are computed.
agg_cols = [
    'issues_opened', 'helping_issue_comments', 'commits',
    'prs_opened', 'prs_merged', 'issues_closed',
]
# Every column with 'pct' in its name is dropped from the loaded panel.
val_cols = [name for name in df_contributor_panel.columns if 'pct' in name]
df_contributor_panel = (
    df_contributor_panel
    .drop(columns = val_cols)
    .reset_index(drop = True)
    .rename(columns = {'issue_number': 'issues_opened', 'pr': 'prs_opened'})
)


In [43]:
# AddActivityCategories
# A contributor row is flagged for a category when at least one of the
# category's activity counts is strictly positive. The column-wise comparison
# yields the same booleans as a row-wise `a > 0 or b > 0` lambda, without a
# Python call per row and without needing pandarallel workers.
activity_categories = {
    'problem_identification': ['issues_opened', 'own_issue_comments'],
    'problem_discussion': ['helping_issue_comments', 'pr_comments'],
    'problem_solving': ['prs_opened', 'commits'],
    'solution_incorporation': ['pr_reviews', 'pr_review_comments', 'prs_merged', 'issues_closed'],
}
for category, count_cols in activity_categories.items():
    df_contributor_panel[category] = df_contributor_panel[count_cols].gt(0).any(axis = 1)

# Each contributor's share of the repo-period total for every count in agg_cols.
# The totals are computed once for all columns; a total of zero gives a NaN
# share (0/0), which downstream code treats as "missing".
repo_period_totals = df_contributor_panel.groupby(['repo_name','time_period'])[agg_cols].transform('sum')
for col in agg_cols:
    df_contributor_panel[f"{col}_share"] = df_contributor_panel[col] / repo_period_totals[col]

df_contributor_selected_panel = df_contributor_panel[['repo_name','time_period','actor_id',

In [73]:
def CalculateSize(df_contributor_panel):
    """Count contributors per repo and time period, overall and by activity category.

    Parameters
    ----------
    df_contributor_panel : pd.DataFrame
        One row per contributor observation, with columns 'repo_name',
        'time_period', 'actor_id' and the four category flags
        ('problem_identification', 'problem_discussion', 'problem_solving',
        'solution_incorporation').

    Returns
    -------
    pd.DataFrame
        One row per ('repo_name', 'time_period') with 'contributor_count'
        (number of non-null 'actor_id' rows) and one '*_count' column per
        category holding the sum of that category's flag.
    """
    by_repo_period = df_contributor_panel.groupby(['repo_name','time_period'])
    df_repo_size = by_repo_period.agg(
        contributor_count = ('actor_id', 'count'),
        problem_identifier_count = ('problem_identification', 'sum'),
        problem_discusser_count = ('problem_discussion', 'sum'),
        problem_solver_count = ('problem_solving', 'sum'),
        solution_incorporator_count = ('solution_incorporation', 'sum'),
    )
    return df_repo_size.reset_index()
    
# TODO: CalculateSpan — not implemented yet; the approach is still to be decided.

def CalculateHHI(df_contributor_panel, agg_cols):
    """Compute a Herfindahl-Hirschman index per repo and time period for each activity.

    For every name in ``agg_cols`` the index is the sum, within each
    ('repo_name', 'time_period') group, of the squared ``{col}_share`` values.

    The input frame is not modified. The index is computed once per group, so
    the result does not depend on which contributor row comes first: it is
    NaN only when every share in the group is NaN (e.g. the group total was
    zero), and otherwise the sum over the non-missing shares.

    Parameters
    ----------
    df_contributor_panel : pd.DataFrame
        Must contain 'repo_name', 'time_period' and a ``{col}_share`` column
        for every entry of ``agg_cols``.
    agg_cols : list of str
        Activity names whose share columns are summarised.

    Returns
    -------
    pd.DataFrame
        One row per ('repo_name', 'time_period'), in order of first appearance,
        with a fresh integer index. Columns are the two keys, then
        ``{col}_hhi`` for every activity, then ``{col}_hhi_missing`` (True
        where the corresponding index is NaN) for every activity.
    """
    group_keys = ['repo_name', 'time_period']
    hhi_cols = [f"{col}_hhi" for col in agg_cols]

    # Squared shares live in a separate frame so the caller's panel is untouched.
    df_squared = df_contributor_panel[group_keys].copy()
    for col in agg_cols:
        df_squared[f"{col}_hhi"] = df_contributor_panel[f"{col}_share"] ** 2

    # min_count=1 keeps an all-NaN group as NaN instead of collapsing it to 0.
    df_repo_hhi = df_squared.groupby(group_keys, sort = False)[hhi_cols]\
        .sum(min_count = 1).reset_index()

    for col in agg_cols:
        df_repo_hhi[f"{col}_hhi_missing"] = df_repo_hhi[f"{col}_hhi"].isna()

    return df_repo_hhi

def CalculateOverlap(df_contributor_panel):
    """Share of contributors in each repo-period who are active in several categories at once.

    The combined flags are built with vectorized boolean operations on a
    separate frame, so the input panel is not modified and an empty panel is
    handled without error.

    Parameters
    ----------
    df_contributor_panel : pd.DataFrame
        Must contain 'repo_name', 'time_period' and the flags
        'problem_solving', 'solution_incorporation' and 'problem_discussion'.

    Returns
    -------
    pd.DataFrame
        One row per ('repo_name', 'time_period') with the mean (i.e. fraction
        of contributor rows) of 'solve_and_incorporate',
        'solve_and_incorporate_and_discuss' and 'solve_and_discuss'.
    """
    # astype(bool) applies the same truthiness the row-wise `and` relied on.
    solves = df_contributor_panel['problem_solving'].astype(bool)
    incorporates = df_contributor_panel['solution_incorporation'].astype(bool)
    discusses = df_contributor_panel['problem_discussion'].astype(bool)

    overlap_cols = ['solve_and_incorporate','solve_and_incorporate_and_discuss','solve_and_discuss']
    df_flags = df_contributor_panel[['repo_name','time_period']].assign(
        solve_and_incorporate = incorporates & solves,
        solve_and_incorporate_and_discuss = incorporates & solves & discusses,
        solve_and_discuss = solves & discusses)

    df_repo_overlap = df_flags.groupby(['repo_name','time_period'])[overlap_cols].mean().reset_index()
    return df_repo_overlap

In [48]:
# Build the three repo-by-period summary tables from the contributor panel:
# contributor counts, concentration (HHI) of each activity in agg_cols, and
# the share of contributors active in several categories at once.
df_repo_size = CalculateSize(df_contributor_panel)
df_repo_hhi = CalculateHHI(df_contributor_panel, agg_cols)
df_repo_overlap = CalculateOverlap(df_contributor_panel)

In [74]:
# Combine the three repo-by-period tables, keeping every (repo, period) pair
# that appears in any of them. The join keys are spelled out so the merge does
# not silently depend on which column names the tables happen to share.
panel_keys = ['repo_name', 'time_period']
df_repo_panel = (
    df_repo_size
    .merge(df_repo_hhi, how = 'outer', on = panel_keys)
    .merge(df_repo_overlap, how = 'outer', on = panel_keys)
)
df_repo_panel

Unnamed: 0,repo_name,time_period,contributor_count,problem_identifier_count,problem_discusser_count,problem_solver_count,solution_incorporator_count,issues_opened_hhi,helping_issue_comments_hhi,commits_hhi,prs_opened_hhi,prs_merged_hhi,issues_closed_hhi,issues_opened_hhi_missing,helping_issue_comments_hhi_missing,commits_hhi_missing,prs_opened_hhi_missing,prs_merged_hhi_missing,issues_closed_hhi_missing,solve_and_incorporate,solve_and_incorporate_and_discuss,solve_and_discuss
0,007gzs/django_restframework_apiview,2017-01-01,2,0,0,2,0,,,0.882812,,,,True,True,False,True,True,True,0.000000,0.000000,0.000000
1,007gzs/django_restframework_apiview,2017-07-01,1,0,0,1,0,,,1.000000,,,,True,True,False,True,True,True,0.000000,0.000000,0.000000
2,007gzs/django_restframework_apiview,2018-01-01,1,0,0,1,0,,,1.000000,,,,True,True,False,True,True,True,0.000000,0.000000,0.000000
3,007gzs/django_restframework_apiview,2019-01-01,2,2,1,2,2,0.510204,1.0,0.950033,1.0,1.0,0.520000,False,False,False,False,False,False,1.000000,0.500000,0.500000
4,007gzs/django_restframework_apiview,2019-07-01,1,0,0,1,0,,,1.000000,,,,True,True,False,True,True,True,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87905,zzzsochi/Flask-Gravatar,2018-01-01,2,1,1,1,1,1.000000,,1.000000,1.0,1.0,1.000000,False,True,False,False,False,False,0.500000,0.000000,0.000000
87906,zzzsochi/Flask-Gravatar,2022-07-01,2,0,0,1,1,,,1.000000,1.0,1.0,,True,True,False,False,False,True,0.000000,0.000000,0.000000
87907,zzzsochi/trans,2015-07-01,2,1,1,1,2,1.000000,1.0,1.000000,,,0.555556,False,False,False,True,True,False,0.500000,0.500000,0.500000
87908,zzzsochi/trans,2016-01-01,1,0,0,1,0,,,1.000000,,,,True,True,False,True,True,True,0.000000,0.000000,0.000000


In [None]:
# Display the selected contributor-level panel.
df_contributor_selected_panel

In [45]:
# Build a balanced panel: one row per repo for every entry of `time_periods`.
# NOTE(review): `time_periods` is not defined in any cell shown in this
# notebook excerpt — confirm where it comes from before a fresh-kernel run.
df_balanced = df_repo_panel[['repo_name']].drop_duplicates()
df_balanced['time_period'] = [time_periods for i in range(df_balanced.shape[0])]
df_balanced = df_balanced.explode('time_period')
# Left merge on the shared columns: repo-periods with no observed row get
# missing values in every df_repo_panel column.
df_repo_panel_full = pd.merge(df_balanced, df_repo_panel, how = 'left')
# Carry first_period / final_period forward within each repo. Rows before a
# repo's first observed row stay missing and are removed by the query below,
# because a comparison against a missing value is False.
# NOTE(review): 'first_period' and 'final_period' are not among the
# df_repo_panel columns in the output displayed earlier in this notebook;
# this cell may rely on a different version of df_repo_panel — confirm.
df_repo_panel_full[['first_period','final_period']] = df_repo_panel_full.groupby(['repo_name'])[['first_period','final_period']].ffill()
df_repo_panel_full = df_repo_panel_full.query('time_period >= first_period')
# NOTE(review): fillna(0) replaces every remaining missing value with 0. If
# the *_hhi columns are present, an undefined HHI becomes 0 and is then
# indistinguishable from a computed value — confirm this is intended.
df_repo_panel_full = df_repo_panel_full.fillna(0)