In [22]:
import pandas as pd
import os
import glob
import numpy as np
from pathlib import Path
pd.set_option('display.max_columns', None)


In [2]:
#os.chdir('../')

In [23]:
def ContributorCount(df):
    """Count the distinct (actor_id, repo_name) pairs present in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``actor_id`` and ``repo_name`` columns.

    Returns
    -------
    int
        Number of rows left after de-duplicating on those two columns.
    """
    pair_columns = ['actor_id', 'repo_name']
    distinct_pairs = df.loc[:, pair_columns].drop_duplicates()
    return len(distinct_pairs.index)

In [24]:
def GetConsecutiveSum(df):
    """Add a running count of consecutive periods in which ``issue_exceed`` is set.

    Rows are processed in the order they appear in ``df``; the function does not
    sort, so the caller is responsible for passing rows in chronological order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``issue_exceed`` column (built upstream as a 0/1 integer flag).

    Returns
    -------
    pandas.DataFrame
        A new frame equal to ``df`` plus a ``consecutive_periods`` column: within
        each unbroken run of equal ``issue_exceed`` values it holds the cumulative
        sum of the flag, and it is 0 wherever ``issue_exceed`` is 0. The input
        frame is left unmodified.
    """
    exceed = df['issue_exceed']
    # A new run starts every time the flag differs from the previous row.
    run_id = (exceed != exceed.shift()).cumsum()
    streak = exceed.groupby(run_id).cumsum()
    # Rows where the flag is 0 are never part of a streak.
    streak = streak.where(exceed != 0, 0)
    # assign() returns a copy, so the caller's frame (or groupby slice) is not mutated.
    return df.assign(consecutive_periods=streak)

In [35]:
# Directory that GetContributorData() searches for chunked parquet files.
# The path is relative, so it resolves against the current working directory.
indir = Path('drive/output/derived/major_contributor_prospects')

In [26]:
# Both values are interpolated into the chunk filename pattern matched by
# GetContributorData(): "..._major_months{major_months}_window{window}D_...".
# NOTE(review): `window` is presumably a number of days, given the "D" suffix
# in the filename -- confirm against the script that writes the chunks.
major_months = 6
window = 732

In [41]:
def GetContributorData(indir, major_months, window):
    """Load and stack every contributor chunk for one parameter combination.

    Parameters
    ----------
    indir : pathlib.Path
        Directory searched (non-recursively) for chunk files.
    major_months, window
        Values interpolated into the filename pattern
        ``major_contributors_major_months{major_months}_window{window}D_samplefull_chunk*.parquet``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all matching chunks; each chunk keeps its own index.

    Raises
    ------
    ValueError
        If no file in ``indir`` matches the pattern.
    """
    pattern = f'major_contributors_major_months{major_months}_window{window}D_samplefull_chunk*.parquet'
    # glob() yields paths in arbitrary filesystem order; sorting makes the row
    # order of the result reproducible (the sort is lexicographic, so e.g.
    # "chunk10" precedes "chunk2").
    chunk_files = sorted(indir.glob(pattern))
    if not chunk_files:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(f"No contributor chunks matching '{pattern}' found in '{indir}'")
    df_contributors = pd.concat([pd.read_parquet(chunk_file) for chunk_file in chunk_files])
    return df_contributors

In [42]:
# Load every chunk for the configured (major_months, window) pair, then record
# the number of distinct (actor_id, repo_name) pairs in the loaded data.
df_contributors = GetContributorData(indir, major_months, window)
num_contributors = ContributorCount(df_contributors)

In [None]:
# Candidate values for each tunable. These are kept as lists for parameter
# sweeps; the analysis cells interpolate ONE value into column names such as
# 'issue_comments_75th_pct' and into query strings, so they need scalars.
issue_pct_options = [75, 90]
pr_pct_options = [75, 90]
general_pct_options = [25, 50]
consecutive_periods_options = [2, 3, 4, 5, 6]
post_period_length_list = [2, 3, 4, 5]
# Alias kept so that cells using either name for the post-period sweep resolve.
post_period_length_options = post_period_length_list

# Active settings. These reproduce the recorded outputs in this notebook
# (columns 'issue_comments_75th_pct', 'pr_90th_pct', 'general_*_25th_pct' and
# the "3 or more consecutive periods" printout).
issue_pct = 75
pr_pct = 90
general_pct = 25
consecutive_periods = 3

# Names of the raw activity columns the thresholds are applied to.
issue_col = 'issue_comments'
pr_col = 'pr'

# Fraction of the pre-period mean that the post-period mean must fall below
# for a contributor to be kept by GetCandidates().
decline_threshold = .2

In [6]:
# Columns identifying one contributor / repo / period observation.
identifiers = ['repo_name','actor_id','time_period', 'earliest_appearance']

# Pull-request columns: raw count, its percentile threshold, and the "general" threshold.
pr_analysis_col = f'{pr_col}_{pr_pct}th_pct'
pr_general_col = f'general_{pr_col}_{general_pct}th_pct'
pr_cols = [pr_col, pr_analysis_col, pr_general_col]

# Issue-comment columns, built the same way.
issue_analysis_col = f'{issue_col}_{issue_pct}th_pct'
issue_general_col = f'general_{issue_col}_{general_pct}th_pct'
issue_cols = [issue_col, issue_analysis_col, issue_general_col]

# Restrict the contributor data to the identifier and threshold columns,
# in the order identifiers -> issue columns -> PR columns.
analysis_cols = [*identifiers, *issue_cols, *pr_cols]
df_analysis = df_contributors.loc[:, analysis_cols]

In [7]:
# Contributor/repo pairs that, in at least one period, exceed both the
# percentile threshold and the general threshold for the issue column.
potential_major_contributors = (
    df_analysis
    .query(f'{issue_col}>{issue_analysis_col} & {issue_col}>{issue_general_col}')
    .loc[:, ['actor_id','repo_name']]
    .drop_duplicates()
)
# Keep every period row belonging to those pairs.
df_potential = df_analysis.merge(potential_major_contributors, on = ['actor_id','repo_name'])
num_potential_contributors = ContributorCount(df_potential)
print("total contributors: {}, potential (exceed thresholds) contributors: {}".format(num_contributors, num_potential_contributors))
# 0/1 flag: did this period exceed the percentile threshold?
df_potential = df_potential.assign(
    issue_exceed = lambda frame: frame[issue_col].gt(frame[issue_analysis_col]).astype(int)
)
# Per contributor/repo, count how many periods in a row the flag has been set.
df_potential = (
    df_potential
    .groupby(['actor_id','repo_name'])
    .apply(GetConsecutiveSum)
    .reset_index(drop = True)
)

total contributors: 21033, potential (exceed thresholds) contributors: 16227


  df_potential = df_potential.groupby(['actor_id','repo_name']).apply(GetConsecutiveSum).reset_index(drop = True)


In [8]:
# Contributor/repo pairs whose streak reached the configured length at least once.
has_consecutive_periods = (
    df_potential
    .query(f'consecutive_periods>={consecutive_periods}')
    .loc[:, ['actor_id','repo_name']]
    .drop_duplicates()
)
# No `on=` is given, so the merge joins on the columns both frames share
# (has_consecutive_periods only carries actor_id and repo_name).
df_potential_consecutive = df_potential.merge(has_consecutive_periods)
num_consecutive_periods = ContributorCount(df_potential_consecutive)
print("total contributors: {}, potential (exceed thresholds) contributors: {}, \
    {} or more consecutive periods: {}".format(
    num_contributors,
    num_potential_contributors,
    consecutive_periods,
    num_consecutive_periods,
))

total contributors: 21033, potential (exceed thresholds) contributors: 16227,     3 or more consecutive periods: 7203


In [9]:
# not allowing for repeat exits: keep a single row per contributor/repo, namely
# the first one after sorting by streak length (longest first); its time_period
# becomes `final_period`.
# NOTE(review): when a contributor has several rows tied for the longest streak,
# which one survives depends on the sort's tie order -- confirm that is acceptable.
departure_candidates = (
    df_potential_consecutive[['actor_id','repo_name', 'consecutive_periods', 'time_period']]
    .sort_values('consecutive_periods', ascending = False)
    .drop_duplicates(['actor_id','repo_name'])
    .reset_index(drop = True)
    .rename(columns = {'time_period':'final_period'})
)
num_departure_candidates = ContributorCount(departure_candidates)
print("total contributors: {}, potential (exceed thresholds) contributors: {}, \
    {} or more consecutive periods: {}, after removing duplicate exits: {}".format(
    num_contributors,
    num_potential_contributors,
    consecutive_periods,
    num_consecutive_periods,
    num_departure_candidates,
))

total contributors: 21033, potential (exceed thresholds) contributors: 16227,     3 or more consecutive periods: 7203, after removing duplicate exits: 7203


In [10]:
def GetCandidates(post_period_length):
    """Select departure candidates whose activity drops sharply after their streak.

    Reads the notebook-level frames ``departure_candidates`` and
    ``df_potential_consecutive`` and the settings ``issue_col``,
    ``issue_analysis_col`` and ``decline_threshold``.

    For each row of ``departure_candidates`` the contributor is kept when:
      1. more than ``post_period_length`` rows exist after ``final_period``;
      2. the threshold column is not NaN in all of the first
         ``post_period_length`` post rows;
      3. the post-period mean of ``issue_col`` is below ``decline_threshold``
         times the mean over the last ``consecutive_periods`` rows up to and
         including ``final_period``.

    Parameters
    ----------
    post_period_length : int
        Number of periods after ``final_period`` used to measure the decline.

    Returns
    -------
    pandas.DataFrame
        All period rows (with ``final_period`` attached) of every kept
        contributor, concatenated in ``departure_candidates`` order; an empty
        frame when nobody qualifies.
    """
    # Collect per-contributor frames and concatenate once at the end; growing a
    # DataFrame with concat inside the loop copies all prior rows each time.
    kept_frames = []
    for i in departure_candidates.index:
        total_consecutive_periods = departure_candidates.loc[i, 'consecutive_periods']
        # Joining on the shared key columns selects this contributor's rows and
        # attaches their `final_period` as a column.
        df_candidate = pd.merge(df_potential_consecutive, departure_candidates.loc[[i]].drop('consecutive_periods', axis = 1))
        # Inside query(), `final_period` refers to that merged-in column.
        # Sort so head() really takes the earliest post periods, mirroring the
        # explicit sort used for the pre-period window below.
        post_rows = df_candidate.query('time_period>final_period').sort_values('time_period')
        if post_rows.shape[0] <= post_period_length: # note that 2023-07-01 is incomplete
            continue
        post_window = post_rows.head(post_period_length)
        # project is still active - other people are still making issue comments
        if post_window[issue_analysis_col].isna().sum() == post_period_length:
            continue
        pre_period_mean = (
            df_candidate
            .query('time_period<=final_period')
            .sort_values('time_period')
            .tail(total_consecutive_periods)[issue_col]
            .mean()
        )
        post_period_mean = post_window[issue_col].mean()
        if pre_period_mean * decline_threshold > post_period_mean:
            kept_frames.append(df_candidate)
    if not kept_frames:
        return pd.DataFrame()
    df_candidates = pd.concat(kept_frames)
    return df_candidates

In [None]:
# Evaluate only the first configured post-period length; widen the slice to
# sweep more values. `post_period_length_list` is the name defined in the
# configuration cell.
for post_period_length in post_period_length_list[:1]:
    df_candidates = GetCandidates(post_period_length)
    num_candidates = ContributorCount(df_candidates)
    print("total contributors: {}, potential (exceed thresholds) contributors: {}, \
          {} or more consecutive periods: {}, after removing duplicate exits: {}, \
          final candidate count, using {} post periods: {}".format(
            num_contributors, num_potential_contributors, consecutive_periods, num_consecutive_periods,
        num_departure_candidates, post_period_length, num_candidates))

In [None]:
#df_candidates = GetCandidates(2)

In [21]:
# Shapes of the de-duplicated candidate keys: first per contributor/repo pair,
# then per repo.
for key_columns in (['actor_id','repo_name'], ['repo_name']):
    print(df_candidates[key_columns].drop_duplicates().shape)

(3037, 2)
(1664, 1)


In [20]:
# Number of repos that appear with exactly one distinct candidate contributor.
print(
    df_candidates[['actor_id','repo_name']]
    .drop_duplicates()['repo_name']
    .value_counts()
    .eq(1)
    .sum()
)

1164


In [None]:
df_candidates

In [None]:
# Persist the selected candidates. Create the target directory first so the
# write does not fail when 'issue/' does not exist yet.
candidates_path = Path('issue/candidates.parquet')
candidates_path.parent.mkdir(parents = True, exist_ok = True)
df_candidates.to_parquet(candidates_path)

In [None]:
# Read every truck-factor CSV in the scrape output directory and stack them.
truckfactor_csv_paths = glob.glob('drive/output/scrape/get_weekly_truck_factor/*.csv')
truckfactor_frames = [pd.read_csv(csv_path) for csv_path in truckfactor_csv_paths]
df_truckfactor = pd.concat(truckfactor_frames)

In [None]:
# Remove the 'Unnamed: 0' column (presumably a saved index column -- confirm).
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df_truckfactor = df_truckfactor.drop(columns = 'Unnamed: 0', errors = 'ignore')

In [43]:
df_candidates

Unnamed: 0,repo_name,actor_id,time_period,earliest_appearance,issue_comments,issue_comments_75th_pct,general_issue_comments_25th_pct,pr,pr_90th_pct,general_pr_25th_pct,issue_exceed,consecutive_periods,final_period
0,takluyver/pynsist,327925.0,2015-01-01,2015-01-01,35.0,6.750,1.0,2.0,2.000000,1.0,1,1,2022-01-01
1,takluyver/pynsist,327925.0,2015-07-01,2015-01-01,43.0,12.375,1.0,2.0,2.000000,1.0,1,2,2022-01-01
2,takluyver/pynsist,327925.0,2016-01-01,2015-01-01,26.0,9.250,1.0,3.0,2.333333,1.0,1,3,2022-01-01
3,takluyver/pynsist,327925.0,2016-07-01,2015-01-01,92.0,9.625,1.0,10.0,4.250000,1.0,1,4,2022-01-01
4,takluyver/pynsist,327925.0,2017-01-01,2015-01-01,68.0,8.700,1.0,5.0,4.400000,1.0,1,5,2022-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11,astropy/regions,314716.0,2021-07-01,2016-01-01,1.0,8.900,1.0,0.0,12.320000,1.4,0,0,2017-01-01
12,astropy/regions,314716.0,2022-01-01,2016-01-01,0.0,9.400,1.0,0.0,17.640000,1.4,0,0,2017-01-01
13,astropy/regions,314716.0,2022-07-01,2016-01-01,0.0,7.000,1.0,0.0,19.080000,1.2,0,0,2017-01-01
14,astropy/regions,314716.0,2023-01-01,2016-01-01,0.0,6.400,1.0,0.0,19.440000,1.2,0,0,2017-01-01
