In [1]:
import os
# Work from the parent directory so that the relative paths and package
# imports used below ('drive/...', 'issue/...', 'source.lib...') resolve from there.
# NOTE(review): this is not idempotent - re-running the cell moves up one more level.
os.chdir('../')

In [2]:

import pandas as pd
from pathlib import Path
import numpy as np
import sys
import glob
import warnings
import random
from pandarallel import pandarallel
from source.lib.JMSLab import autofill
from source.lib.helpers import *
from ast import literal_eval
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from glob import glob 
import datetime
import itertools
import time
from multiprocessing import pool
from source.lib.helpers import *

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning)
# that could be relevant for the filtered frames built further down.
warnings.filterwarnings("ignore")
# Show all columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Start pandarallel so that `parallel_apply` is available, with a progress bar.
pandarallel.initialize(progress_bar = True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [61]:
def GetConsecutiveSum(df, col='periods_all', out_col='consecutive_periods'):
    """Running sum of `col` within each run of consecutive equal values.

    Rows are processed in their existing order. A new run starts whenever the
    value of `col` differs from the previous row; inside a run the values are
    cumulatively summed, and rows where `col` equals 0 get 0. For a 0/1
    indicator this yields the length of the current streak of ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `col`. It is NOT modified; a copy is returned. (The
        previous implementation wrote into the frame it was given, which is a
        group slice when called through `groupby(...).apply`.)
    col : str, default 'periods_all'
        Column whose runs are accumulated.
    out_col : str, default 'consecutive_periods'
        Name of the column that receives the result.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with `out_col` added (or overwritten).

    NOTE(review): elsewhere in this notebook 'periods_all' is built as a
    per-repo total of 'active_all', i.e. constant within a repo, in which case
    the result is simply k * periods_all for the k-th row. If a streak of
    active periods is what is wanted, call with col='active_all' - confirm.
    """
    result = df.copy()
    values = result[col]
    # Label each run of identical consecutive values with an increasing id.
    run_id = values.ne(values.shift()).cumsum()
    result[out_col] = values.groupby(run_id).cumsum()
    # Zero-valued rows never contribute to a streak.
    result.loc[result[col] == 0, out_col] = 0
    return result

In [None]:
# --- Input locations (relative to the current working directory) ---
indir_committers_info = Path('drive/output/scrape/link_committers_profile')
indir_data = Path('drive/output/derived/data_export')

# --- Parameters ---
# commit_cols, author_thresh and rolling_window are not referenced by the
# cells visible in this part of the notebook.
commit_cols = [
    'commits',
    'commit additions',
    'commit deletions',
    'commit changes total',
    'commit files changed count',
]
author_thresh = 1/3
# Passed to ImputeTimePeriod and used as the month step of the panel date range.
time_period = 6
rolling_window = '1828D'
SECONDS_IN_DAY = 86400
# Day thresholds for the "closed/merged within N days" indicators.
closing_day_options = [30, 60, 90, 180, 360]

# --- Load the exported event tables ---
df_issue, df_pr, df_pr_commits = (
    pd.read_parquet(indir_data / parquet_name)
    for parquet_name in ('df_issue.parquet', 'df_pr.parquet', 'df_pr_commits.parquet')
)
df_linked_issues = ReadFileList(
    glob('drive/output/scrape/link_issue_pull_request/linked_issue/*.csv')
)

# Event timestamps are parsed to datetimes so they can be compared and subtracted.
for _events in (df_issue, df_pr):
    _events['created_at'] = pd.to_datetime(_events['created_at'])

In [None]:
# Use every repository that appears in the issue data, in order of first appearance.
# To work on a random subset instead, uncomment the two lines below.
#sample_size = 100
#selected_repos = df_issue[['repo_name']].drop_duplicates().sample(sample_size, random_state = 1235)['repo_name'].tolist()
selected_repos = df_issue['repo_name'].drop_duplicates().tolist()

In [None]:
# Restrict every table to the selected repositories. Issue and PR events are
# additionally limited to those created on or after 2015-01-01.
df_issue_selected = df_issue.loc[
    df_issue['repo_name'].isin(selected_repos) & (df_issue['created_at']>='2015-01-01')
]
df_pr_selected = df_pr.loc[
    df_pr['repo_name'].isin(selected_repos) & (df_pr['created_at']>='2015-01-01')
]
df_pr_commits_selected = df_pr_commits.loc[df_pr_commits['repo_name'].isin(selected_repos)]
# This table is filtered in place (same name); re-running the cell is harmless.
df_linked_issues = df_linked_issues.loc[df_linked_issues['repo_name'].isin(selected_repos)]

In [None]:
# ImputeTimePeriod is a project helper (star-imported from source.lib.helpers).
# Presumably it adds the 'time_period' column used by the following cells,
# bucketing events into `time_period`-month periods - TODO confirm.
df_issue_selected, df_pr_selected = (
    ImputeTimePeriod(df_issue_selected, time_period),
    ImputeTimePeriod(df_pr_selected, time_period),
)

In [None]:
# Build the repo x period skeleton: one row per repository for every
# `time_period`-month step between its first and last observed period.
observed_periods = pd.concat([
    df_issue_selected[['repo_name','time_period']],
    df_pr_selected[['repo_name','time_period']],
]).drop_duplicates()

# Named aggregation produces flat, correctly spelled column names directly.
# (The previous version renamed MultiIndex tuples - with a typo'd
# 'earliest-date' target - and then overwrote the columns by hand anyway.)
df_repo_panel = (
    observed_periods
    .groupby('repo_name')['time_period']
    .agg(earliest_date='min', latest_date='max')
    .reset_index()
)

# Enumerate every period start between the first and last observation.
# NOTE(review): assumes 'time_period' values fall on the same
# f'{time_period}MS' grid that pd.date_range generates from the earliest
# date - verify against ImputeTimePeriod.
df_repo_panel['time_period'] = df_repo_panel.apply(
    lambda x: pd.date_range(x['earliest_date'], x['latest_date'], freq=f'{time_period}MS').tolist(),
    axis = 1)
df_repo_panel = df_repo_panel.drop(['earliest_date', 'latest_date'], axis = 1).explode('time_period')

In [None]:
# Columns carried for every issue event, and the key that identifies an issue.
issue_event_cols = ['repo_name','issue_number','time_period', 'created_at']

# Earliest "opened" event per issue.
df_opened_issues = (
    df_issue_selected.query('issue_action == "opened"')[issue_event_cols]
    .assign(opened_issue=1)
    .sort_values('created_at',ascending = True)
    .drop_duplicates(['repo_name','issue_number'])
)

# Earliest "closed" event per issue, with its period/timestamp renamed so they
# do not collide with the opening ones after the merge.
df_closed_issues = (
    df_issue_selected.query('issue_action == "closed"')[issue_event_cols]
    .sort_values('created_at',ascending = True)
    .drop_duplicates(['repo_name','issue_number'])
    .assign(closed_issue = 1)
    .rename(columns = {'time_period':'closed_time_period','created_at':'closed_at'})
)

# One row per opened issue; closing information is missing when never closed.
df_issues_sans_comments = df_opened_issues.merge(
    df_closed_issues, how = 'left', on = ['repo_name','issue_number'])
## TODO: how many closed issues are unlinked
## TODO: how many closed issues are unlinked

In [None]:
# Number of distinct comments per issue.
df_issue_comments = df_issue_selected.query('type == "IssueCommentEvent"')\
    [['issue_number','issue_comment_id','repo_name']]\
    .drop_duplicates()\
    .assign(issue_comments=1)\
    .groupby(['repo_name','issue_number'])['issue_comments'].sum()\
    .reset_index()

# TODO: how many unlinked issues by issue comments
df_issues = pd.merge(df_issues_sans_comments, df_issue_comments,
                     how = 'left', on = ['repo_name','issue_number'])
# Issues that were never closed / never commented on count as 0.
for col in ['closed_issue','issue_comments']:
    df_issues[col] = df_issues[col].fillna(0)

# Time from opening to first close, in days (NaN while the issue is still open).
df_issues['days_to_close'] = (
    (df_issues['closed_at'] - df_issues['created_at']).dt.total_seconds()/SECONDS_IN_DAY
)
# 1 if closed within `day` days, else 0. Never-closed issues compare as False -> 0.
for day in closing_day_options:
    df_issues[f'closed_in_{day}_days'] = (df_issues['days_to_close']<day).astype(int)

# Output column -> (input column, aggregation). The per-threshold shares are
# generated from closing_day_options so the list is maintained in one place.
# 'avg_issue_commments' (sic) is kept as-is because the column name is part
# of the exported panel.
issue_aggs = {
    'opened_issues': ('opened_issue', 'sum'),
    'closed_issues': ('closed_issue', 'sum'),
    'p_issues_closed': ('closed_issue', 'mean'),
    'issue_comments': ('issue_comments', 'sum'),
    'avg_issue_commments': ('issue_comments', 'mean'),
}
issue_aggs.update({
    f'p_issues_closed_{day}d': (f'closed_in_{day}_days', 'mean')
    for day in closing_day_options
})

# Aggregate to repo x period, keyed by the period in which the issue was opened.
df_issues_stats = df_issues.groupby(['repo_name','time_period'])\
    .agg(**issue_aggs)\
    .reset_index()

In [None]:
# Columns carried for every PR event, and the key that identifies a PR.
pr_event_cols = ['repo_name','pr_number','time_period', 'created_at']
pr_key = ['repo_name','pr_number']

# Earliest "opened" event per PR.
df_opened_prs = (
    df_pr_selected.query('pr_action == "opened"')[pr_event_cols]
    .drop_duplicates()
    .assign(opened_pr=1)
    .sort_values('created_at',ascending = True)
    .drop_duplicates(pr_key)
)

# Earliest "closed" event without a merger, i.e. closed but not merged.
df_closed_prs = (
    df_pr_selected.query('pr_action=="closed" & pr_merged_by_id.isna()')[pr_event_cols]
    .drop_duplicates()
    .sort_values('created_at',ascending = True)
    .drop_duplicates(pr_key)
    .assign(closed_unmerged_pr = 1)
    .rename(columns = {'time_period':'closed_unmerged_time_period',
                       'created_at':'closed_unmerged_at'})
)

# Earliest "closed" event that does have a merger, i.e. merged.
df_merged_prs = (
    df_pr_selected.query('pr_action=="closed" & ~pr_merged_by_id.isna()')[
        pr_event_cols + ['pr_merged_by_type']]
    .drop_duplicates()
    .sort_values('created_at',ascending = True)
    .drop_duplicates(pr_key)
    .assign(merged_pr = 1)
    .rename(columns = {'time_period':'merged_time_period','created_at':'merged_at'})
)

# One row per opened PR, with close/merge details attached where they exist.
df_prs_sans_reviews = (
    df_opened_prs
    .merge(df_closed_prs, how = 'left', on = pr_key)
    .merge(df_merged_prs, how = 'left', on = pr_key)
)

In [None]:
# One row per distinct review (earliest event for each review id).
df_pr_reviews = (
    df_pr_selected.query('type == "PullRequestReviewEvent"')[
        ['repo_name','pr_number','time_period', 'created_at','pr_review_id','pr_review_state']]
    .drop_duplicates()
    .sort_values('created_at',ascending = True)
    .drop_duplicates(['repo_name','pr_number','pr_review_id'])
    .assign(pr_review = 1)
)

# 0/1 indicator per review state.
review_states = ['commented','approved','changes_requested']
for state in review_states:
    df_pr_reviews[f'review_state_{state}'] = (df_pr_reviews['pr_review_state']==state).astype(int)

# Per-PR totals: number of reviews and number of reviews in each state.
review_count_cols = ['pr_review'] + [f'review_state_{state}' for state in review_states]
df_pr_review_stats = (
    df_pr_reviews.groupby(['repo_name','pr_number'])[review_count_cols]
    .sum()
    .reset_index()
)

In [None]:
# One row per distinct review-comment body on a PR (earliest event kept).
# NOTE(review): de-duplicating on the comment text collapses identical
# comments on the same PR (e.g. repeated short remarks) into one - confirm
# this is intended, or switch to a comment id if the data has one.
df_pr_review_comments = (
    df_pr_selected.query('type == "PullRequestReviewCommentEvent"')[
        ['repo_name','pr_number','time_period', 'created_at','pr_review_comment_body']]
    .drop_duplicates()
    .sort_values('created_at',ascending = True)
    .drop_duplicates(['repo_name','pr_number','pr_review_comment_body'])
    .assign(pr_review_comment = 1)
)

# Number of review comments per PR.
df_pr_review_comments_stats = (
    df_pr_review_comments.groupby(['repo_name','pr_number'])[['pr_review_comment']]
    .sum()
    .reset_index()
)

In [None]:
# Attach per-PR review and review-comment counts to the opened-PR table.
df_prs_complete = pd.merge(df_prs_sans_reviews, df_pr_review_stats,
                           how = 'left', on = ['repo_name','pr_number'])\
    .merge(df_pr_review_comments_stats, how = 'left', on = ['repo_name','pr_number'])
# PRs without a matching close/merge/review row count as 0.
for col in ['closed_unmerged_pr', 'merged_pr', 'pr_review','review_state_commented',
            'review_state_approved','review_state_changes_requested','pr_review_comment']:
    df_prs_complete[col] = df_prs_complete[col].fillna(0)
df_prs_complete['pr_review_comments_total'] = df_prs_complete['pr_review']+df_prs_complete['pr_review_comment']

# Time from opening to merge, in days (NaN when the PR was never merged).
df_prs_complete['days_to_merge'] = (
    (df_prs_complete['merged_at'] - df_prs_complete['created_at']).dt.total_seconds()/SECONDS_IN_DAY
)
# 1 if merged within `day` days, else 0. Never-merged PRs compare as False -> 0.
for day in closing_day_options:
    df_prs_complete[f'merged_in_{day}_days'] = (df_prs_complete['days_to_merge']<day).astype(int)

# Output column -> (input column, aggregation). The per-threshold shares are
# generated from closing_day_options. The stray ('closed_issue', ...) rename
# keys copied from the issue cell matched no column and have been dropped.
pr_aggs = {
    'opened_prs': ('opened_pr', 'sum'),
    'merged_prs': ('merged_pr', 'sum'),
    'p_prs_merged': ('merged_pr', 'mean'),
    'pr_reviews': ('pr_review', 'sum'),
    'mean_reviews_per_pr': ('pr_review', 'mean'),
    'pr_review_comments': ('pr_review_comment', 'sum'),
    'mean_review_comments_per_pr': ('pr_review_comment', 'mean'),
    'p_review_state_commented': ('review_state_commented', 'mean'),
    'p_review_state_approved': ('review_state_approved', 'mean'),
    'p_review_state_changes_requested': ('review_state_changes_requested', 'mean'),
}
pr_aggs.update({
    f'p_prs_merged_{day}d': (f'merged_in_{day}_days', 'mean')
    for day in closing_day_options
})

# Aggregate to repo x period, keyed by the period in which the PR was opened.
df_prs_stats = df_prs_complete.groupby(['repo_name','time_period'])\
    .agg(**pr_aggs)\
    .reset_index()

In [None]:
# Combine issue and PR statistics (keeping periods that have only one of the
# two), then lay them onto the full repo x period skeleton so that periods
# without any activity are present with missing values.
panel_keys = ['repo_name','time_period']
df_stats = df_issues_stats.merge(df_prs_stats, how = 'outer', on = panel_keys)
df_repo_panel_stats = df_repo_panel.merge(df_stats, how = 'left', on = panel_keys)
#df_repo_panel_stats.to_parquet('issue/direct_outcomes.parquet')

In [None]:
#df_stats = pd.read_parquet('issue/direct_outcomes.parquet')

In [None]:
# Load the departure candidates. The cells below read its 'repo_name',
# 'actor_id' and 'final_period' columns.
# NOTE(review): 'issue/candidates.parquet' is not produced by any cell visible
# here - presumably written by another notebook; confirm it exists before running.
departure_candidates = pd.read_parquet('issue/candidates.parquet')

In [None]:
# Number of distinct candidate actors per repository.
actors_per_repo = departure_candidates[['repo_name','actor_id']]\
    .drop_duplicates()\
    ['repo_name'].value_counts()

# Repositories with exactly one candidate. Filtering the counts Series
# directly avoids relying on the column name that
# value_counts().reset_index() produces, which differs across pandas versions.
repo_appears_once = actors_per_repo[actors_per_repo == 1].index.tolist()

departure_repos = departure_candidates['repo_name'].unique().tolist()
all_repos = df_repo_panel_stats['repo_name'].unique().tolist()

# Repositories in the panel that have no candidate at all. A set makes each
# membership test O(1) instead of scanning the whole list per repository;
# the order of all_repos is preserved.
departure_repo_set = set(departure_repos)
repo_never_appears = [repo for repo in all_repos if repo not in departure_repo_set]

In [None]:
# Panel rows for repositories with at most one departure candidate.
repos_zero_or_one_candidate = repo_never_appears + repo_appears_once
repo_one_treatment = df_repo_panel_stats.loc[
    df_repo_panel_stats['repo_name'].isin(repos_zero_or_one_candidate)]

# The candidate's final period for each single-candidate repository.
treated_date = departure_candidates.loc[
    departure_candidates['repo_name'].isin(repo_appears_once),
    ['repo_name','final_period']
].drop_duplicates()

In [27]:
# Attach 'final_period' to every panel row of a treated repository;
# repositories without a candidate get a missing value.
# NOTE(review): if a repository has more than one distinct 'final_period' in
# treated_date, this merge duplicates its panel rows - verify uniqueness.
df_repo_one_treatment_panel = repo_one_treatment.merge(
    treated_date, how = 'left', on = 'repo_name')

In [28]:
# Periods with no recorded activity count as zero opened/closed items.
count_cols = ['opened_issues','closed_issues','opened_prs']
df_repo_one_treatment_panel[count_cols] = df_repo_one_treatment_panel[count_cols].fillna(0)


def _post_departure_flag(row):
    """Return 1 for periods strictly after the repo's final_period, else 0.

    Rows without a final_period (no departure candidate) always get 0.
    """
    if pd.isnull(row['final_period']):
        return 0
    return int(row['time_period']>row['final_period'])


df_repo_one_treatment_panel['treatment'] = df_repo_one_treatment_panel.parallel_apply(
    _post_departure_flag, axis = 1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=11247), Label(value='0 / 11247')))…

In [29]:
# Map each distinct period to its 0-based rank in chronological order.
ordered_periods = df_repo_one_treatment_panel['time_period'].sort_values().drop_duplicates()
time_period_dict = {period: position for position, period in enumerate(ordered_periods)}
df_repo_one_treatment_panel['time_index'] = df_repo_one_treatment_panel['time_period'].apply(
    lambda period: time_period_dict[period])

In [30]:
# A period is "active" (1) when the repo had at least one opened issue AND at
# least one opened PR in it, otherwise 0. Computed column-wise rather than
# with a row-by-row Python apply; the result is the same.
df_repo_one_treatment_panel['active_all'] = (
    (df_repo_one_treatment_panel['opened_issues']>0)
    & (df_repo_one_treatment_panel['opened_prs']>0)
).astype(int)

In [37]:
# Per-repository summaries of the activity indicator, broadcast to every row:
# share of active periods and total number of active periods.
activity_by_repo = df_repo_one_treatment_panel.groupby('repo_name')['active_all']
df_repo_one_treatment_panel['mean_activity_all'] = activity_by_repo.transform('mean')
df_repo_one_treatment_panel['periods_all'] = activity_by_repo.transform('sum')

In [39]:
# Save the panel as CSV. With pandas defaults the DataFrame index is written
# as an unnamed first column.
# NOTE(review): this cell sits before the one that adds 'consecutive_periods',
# so on a top-to-bottom run the saved file will not contain that column -
# confirm whether the save should move to the end.
df_repo_one_treatment_panel.to_csv('issue/one_treatment_panel.csv')

In [44]:
# Display the panel for inspection.
# NOTE(review): the saved output below lists columns 'Unnamed: 0', 'active',
# 'mean_activity' and 'periods', which no cell visible here creates -
# presumably left over from an earlier kernel state or a CSV reload. Verify
# with Restart Kernel -> Run All.
df_repo_one_treatment_panel

Unnamed: 0,repo_name,time_period,opened_issues,closed_issues,p_issues_closed,issue_comments,avg_issue_commments,p_issues_closed_30d,p_issues_closed_60d,p_issues_closed_90d,p_issues_closed_180d,p_issues_closed_360d,opened_prs,merged_prs,p_prs_merged,pr_reviews,mean_reviews_per_pr,pr_review_comments,mean_review_comments_per_pr,p_review_state_commented,p_review_state_approved,p_review_state_changes_requested,p_prs_merged_30d,p_prs_merged_60d,p_prs_merged_90d,p_prs_merged_180d,p_prs_merged_360d,final_period,treatment,time_index,active,mean_activity,active_all,mean_activity_all,periods,periods_all
0,007gzs/django_restframework_apiview,2019-01-01,5.0,5.0,1.0,17.0,3.4,0.8,0.8,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,NaT,0,8,1,0.500000,1,0.250000,2,1
1,007gzs/django_restframework_apiview,2019-07-01,0.0,0.0,,,,,,,,,0.0,,,,,,,,,,,,,,,NaT,0,9,0,0.500000,0,0.250000,2,1
2,007gzs/django_restframework_apiview,2020-01-01,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,NaT,0,10,1,0.500000,0,0.250000,2,1
3,007gzs/django_restframework_apiview,2020-07-01,0.0,0.0,,,,,,,,,0.0,,,,,,,,,,,,,,,NaT,0,11,0,0.500000,0,0.250000,2,1
4,02strich/pykerberos,2015-01-01,0.0,0.0,,,,,,,,,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,NaT,0,0,1,0.666667,0,0.333333,12,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89968,zzzsochi/trans,2017-07-01,0.0,0.0,,,,,,,,,0.0,,,,,,,,,,,,,,,NaT,0,5,0,0.333333,0,0.000000,3,0
89969,zzzsochi/trans,2018-01-01,0.0,0.0,,,,,,,,,0.0,,,,,,,,,,,,,,,NaT,0,6,0,0.333333,0,0.000000,3,0
89970,zzzsochi/trans,2018-07-01,0.0,0.0,,,,,,,,,0.0,,,,,,,,,,,,,,,NaT,0,7,0,0.333333,0,0.000000,3,0
89971,zzzsochi/trans,2019-01-01,0.0,0.0,,,,,,,,,0.0,,,,,,,,,,,,,,,NaT,0,8,0,0.333333,0,0.000000,3,0


In [66]:
# Run GetConsecutiveSum on each repository's rows in parallel and drop the
# group index from the result.
# NOTE(review): GetConsecutiveSum accumulates 'periods_all', which an earlier
# cell defines as a per-repo total (constant within a repo). If a streak of
# active periods is intended, the indicator 'active_all' is presumably the
# column to accumulate - confirm. The rows of each repo are also assumed to be
# in chronological order here; nothing in this cell sorts them.
df_repo_one_treatment_panel = df_repo_one_treatment_panel.groupby(['repo_name']).parallel_apply(GetConsecutiveSum).reset_index(drop = True)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=905), Label(value='0 / 905'))), HB…