In [1]:
import os
os.chdir('../')

In [2]:
import pandas as pd
from pathlib import Path
import numpy as np
import sys
import glob
import warnings
import random
from pandarallel import pandarallel
from source.lib.JMSLab import autofill
from source.lib.helpers import *
from ast import literal_eval
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from glob import glob 
import datetime
import itertools
import time

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pandarallel.initialize(progress_bar = True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
# Directory holding the scraped committer-profile output; CleanCommittersInfo
# reads 'committers_info_pr.csv' from here. Relative to the working directory
# set by the os.chdir('../') call in the first cell.
indir_committers_info = Path('drive/output/scrape/link_committers_profile')

In [4]:
# Event-level issue and pull-request tables (both carry 'repo_name' and
# 'created_at', which later cells filter on).
df_issue = pd.read_parquet('issue/df_issue.parquet')
df_pr = pd.read_parquet('issue/df_pr.parquet')

In [5]:
# Commit-level rows for pull requests.
df_pr_commits = pd.read_parquet('issue/df_pr_commits.parquet')
# One CSV per repository linking issues to pull requests. ReadFileList is not
# defined in this notebook (presumably provided by the star import from
# source.lib.helpers) and appears to echo each path it reads — TODO confirm.
df_linked_issues = ReadFileList(glob('drive/output/scrape/link_issue_pull_request/linked_issue/*.csv'))

drive/output/scrape/link_issue_pull_request/linked_issue/modin-project_modin_linked_issue_to_pull_request.csv
drive/output/scrape/link_issue_pull_request/linked_issue/PyMySQL_PyMySQL_linked_issue_to_pull_request.csv
drive/output/scrape/link_issue_pull_request/linked_issue/google_grr_linked_issue_to_pull_request.csv
drive/output/scrape/link_issue_pull_request/linked_issue/celiao_tmdbsimple_linked_issue_to_pull_request.csv
drive/output/scrape/link_issue_pull_request/linked_issue/aaugustin_websockets_linked_issue_to_pull_request.csv


In [6]:
# Parse the event timestamps once so later cells can compare them to dates.
for events in (df_issue, df_pr):
    events['created_at'] = pd.to_datetime(events['created_at'])

In [7]:
# Number of repositories to sample; in the visible notebook it is only
# referenced from commented-out code.
sample_num = 100
# Per-commit measures aggregated by LinkPRCommits. 'commits' is not a raw
# column: LinkPRCommits adds it as a constant 1 per commit row.
commit_cols = ['commits','commit additions','commit deletions','commit changes total','commit files changed count']
# Minimum share of a PR (on any commit_cols measure) a committer must exceed
# to be kept by AssignPRAuthorship.
author_thresh = 1/3

In [8]:
def CleanCommittersInfo(indir_committers_info):
    """Build the (name, email) -> committer account lookup table.

    Reads ``committers_info_pr.csv`` from ``indir_committers_info``, drops rows
    with any missing value, parses the ``committer_info`` column with
    ``literal_eval`` and keeps only entries that have exactly four elements.
    The first element becomes ``actor_name`` and the second becomes
    ``commit_author_id``.

    Returns a de-duplicated frame with columns
    ``name, email, user_type, actor_name, commit_author_id``.
    """
    # TODO: edit file so it can handle pushes
    raw_info = pd.read_csv(indir_committers_info / 'committers_info_pr.csv', index_col = 0).dropna()
    raw_info['committer_info'] = raw_info['committer_info'].apply(literal_eval)

    # TODO: handle cleaning so that it can handle the other cases
    has_four_fields = raw_info['committer_info'].apply(lambda info: len(info) == 4)
    complete_info = raw_info[has_four_fields]
    complete_info = complete_info.assign(
        actor_name = complete_info['committer_info'].apply(lambda info: info[0]),
        actor_id = complete_info['committer_info'].apply(lambda info: info[1]),
    )

    lookup_cols = ['name','email','user_type','actor_name','actor_id']
    lookup = complete_info[lookup_cols].drop_duplicates()
    return lookup.rename(columns = {'actor_id':'commit_author_id'})

In [9]:
def LinkPRCommits(df_pr_selected, df_pr_commits_selected, committers_match, commit_cols):
    """Attach per-committer commit statistics to merged pull requests.

    Commits are matched to committers on (author name, author email); commits
    without a match are dropped by the inner join. For every
    (repo, PR, commit author) the function sums each measure in
    ``commit_cols`` and the author's share of the PR total for that measure
    ("<measure> share"). The result is inner-joined onto the PR rows whose
    ``pr_action`` is "closed" and whose ``pr_merged_by_id`` is not missing.

    ``commit_cols`` may contain 'commits'; that column is created here as a
    constant 1 per matched commit row.
    """
    pr_keys = ['repo_name','pr_number']

    # TODO: what % of commits were dropped because nobody could be found
    identified = df_pr_commits_selected.merge(
        committers_match,
        how = 'inner',
        left_on = ['commit author name','commit author email'],
        right_on = ['name','email'],
    )
    identified['commits'] = 1

    # PR-level totals, suffixed so they can sit next to the per-commit values.
    pr_totals = identified.groupby(pr_keys)[commit_cols].sum()
    pr_totals.columns = [f"{col} total" for col in commit_cols]
    with_totals = identified.merge(pr_totals.reset_index(), on = pr_keys)

    share_cols = []
    for col in commit_cols:
        share_name = f"{col} share"
        with_totals[share_name] = with_totals[col] / with_totals[f"{col} total"]
        share_cols.append(share_name)

    # 'commits' is still 1 on every row here, so summing it counts commits.
    commit_stats = (
        with_totals
        .groupby(pr_keys + ['commit_author_id'])[commit_cols + share_cols]
        .sum()
        .reset_index()
    )

    merged_prs = df_pr_selected.query('pr_action == "closed" & ~pr_merged_by_id.isna()')
    # TODO: what % of commits had truncated information bc 250 max - also, is that push or PR?
    # TODO: what % of commits could we not get information for
    return merged_prs.merge(commit_stats, on = pr_keys)

In [10]:
def LinkIssuePR(df_issue_selected, df_linked_issues):
    """Left-join each issue row to the pull request number(s) linked to it.

    Link rows whose ``linked_pull_request`` equals the scraper's
    "list index out of range" marker are discarded. The PR number is the last
    "/"-separated piece of the link and is kept as a string in
    ``linked_pr_number``. Issues without a link get a missing value.
    """
    resolved_links = df_linked_issues.query('linked_pull_request != "list index out of range"')
    pr_numbers = resolved_links['linked_pull_request'].apply(lambda url: url.split("/")[-1])

    issue_to_pr = resolved_links.assign(linked_pr_number = pr_numbers)
    issue_to_pr = issue_to_pr[['repo_name','issue_number', 'linked_pr_number']].drop_duplicates()

    return df_issue_selected.merge(issue_to_pr, how = 'left', on = ['repo_name','issue_number'])

In [11]:
def FilterDuplicateIssues(df, query):
    """Select the rows matching ``query`` and reduce them to unique issue records.

    The matching rows are sorted by repo, issue number and creation time, cut
    down to the identifying columns, stripped of rows without an issue number
    and of columns that are entirely missing, and finally de-duplicated.
    """
    id_cols = ['repo_name','actor_id', 'issue_user_id','issue_number',
               'issue_comment_id', 'created_at', 'linked_pr_number']

    matching = df.query(query).sort_values(['repo_name','issue_number','created_at'])
    records = matching[id_cols].dropna(subset = ['issue_number'])
    # Drop columns that carry no information for this event type.
    records = records.dropna(axis=1, how='all')

    return records.drop_duplicates()

In [12]:
def ImputeTimePeriod(df, time_period_months):
    """Assign every row to a calendar bucket of ``time_period_months`` months.

    Parameters
    ----------
    df : DataFrame with a ``created_at`` column (anything ``pd.to_datetime``
        accepts). The input frame is not modified.
    time_period_months : positive whole number of months per bucket. Buckets
        start in January of each year (e.g. 3 -> Jan/Apr/Jul/Oct).

    Returns
    -------
    A copy of ``df`` with a fresh RangeIndex, ``created_at`` parsed to
    datetime, and two added columns:

    * ``time_period`` - timezone-naive timestamp of the first day of the
      bucket the row falls in (NaT when ``created_at`` is missing).
    * ``time_period_index`` - 0-based rank of ``time_period`` among the
      distinct buckets present in ``df`` (-1 for NaT). For 6-month buckets
      this equals the half-year index the previous implementation intended.

    Raises
    ------
    ValueError if ``time_period_months`` is not a positive whole number.
    """
    if time_period_months < 1 or int(time_period_months) != time_period_months:
        raise ValueError(f"time_period_months must be a positive whole number, got {time_period_months!r}")
    bucket_months = int(time_period_months)

    created_at = pd.to_datetime(df['created_at'])

    def _BucketStart(timestamp):
        # First day of the bucket containing `timestamp`. Months are shifted
        # to 0-based before the integer division so that the last month of a
        # bucket (e.g. June for 6-month buckets) stays in that bucket, and the
        # result is a plain int month as the date constructor requires.
        if pd.isna(timestamp):
            return pd.NaT
        first_month = ((timestamp.month - 1) // bucket_months) * bucket_months + 1
        return pd.Timestamp(year = timestamp.year, month = first_month, day = 1)

    time_period = pd.to_datetime(created_at.apply(_BucketStart))
    # sort=True makes the codes follow chronological order of the buckets.
    time_period_index = pd.factorize(time_period, sort = True)[0]

    return df.assign(
        created_at = created_at,
        time_period = time_period,
        time_period_index = time_period_index,
    ).reset_index(drop = True)

In [13]:
def AssignPRAuthorship(df_pr_commit_stats, author_thresh, commit_cols):
    """Keep the committer rows that count as authors of their pull request.

    A row is kept when at least one of its "<measure> share" columns (one per
    entry of ``commit_cols``) is strictly greater than ``author_thresh``.
    Missing shares never satisfy the comparison.

    Returns the filtered rows of ``df_pr_commit_stats`` with all columns.
    """
    share_cols = [f"{col} share" for col in commit_cols]
    # Column-wise comparison replaces the previous row-by-row apply: same
    # result, one pass, and it yields an empty boolean mask on empty input.
    is_author = df_pr_commit_stats[share_cols].gt(author_thresh).any(axis = 1)
    return df_pr_commit_stats[is_author]

In [14]:
def CalculateColumnPercentile(df, repo, window, col, pct):
    """Add a smoothed within-repo percentile of ``col`` to one repo's rows.

    For the rows of ``repo``: the ``pct`` quantile of ``col`` is taken per
    calendar day of ``time_period``, then averaged over a trailing rolling
    ``window`` (at least one observation required). The result is joined back
    on ``time_period`` as column ``"<col>_<pct*100>th_pct"``.

    Returns only the rows belonging to ``repo``.
    """
    repo_rows = df.query(f'repo_name == "{repo}"')

    daily_quantile = repo_rows.set_index('time_period')[col].resample("1d").quantile(pct)
    smoothed = daily_quantile.rolling(window = window, min_periods = 1).mean()
    pct_by_period = smoothed.rename(f'{col}_{int(pct*100)}th_pct').reset_index()

    return pd.merge(repo_rows, pct_by_period, on = ['time_period'])

In [15]:
def CalculateColumnPercentileDF(df, window, col, pct, general_pct):
    """Add per-repo and all-repo smoothed percentiles of ``col``.

    ``CalculateColumnPercentile`` is run for every repo in ``df`` on a pool of
    8 threads and the per-repo frames are concatenated. On that combined
    frame the ``general_pct`` quantile of ``col`` is computed per calendar day
    across all repos, averaged over the trailing rolling ``window`` and joined
    back on ``time_period`` as ``"general_<col>_<general_pct*100>th_pct"``.
    """
    repos = df['repo_name'].unique().tolist()

    with ThreadPoolExecutor(8) as pool:
        per_repo_frames = pool.map(
            lambda repo: CalculateColumnPercentile(df, repo, window, col, pct),
            repos,
        )
        with_repo_pct = pd.concat(per_repo_frames)

    overall_daily = with_repo_pct.set_index('time_period')[col].resample("1d").quantile(general_pct)
    overall_smoothed = overall_daily.rolling(window = window, min_periods = 1).mean()
    overall_by_period = overall_smoothed.rename(f'general_{col}_{int(general_pct*100)}th_pct').reset_index()

    return pd.merge(with_repo_pct, overall_by_period, on = ['time_period'])

In [16]:
# All repositories that appear in the issue data. To work on a sample instead,
# draw from the de-duplicated repo list with
# .sample(sample_num, random_state = 123) before calling .tolist().
selected_repos = df_issue['repo_name'].drop_duplicates().tolist()

# Restrict every table to the selected repositories; issue and PR events are
# additionally limited to 2015 onwards.
df_issue_selected = df_issue.loc[
    df_issue['repo_name'].isin(selected_repos) & (df_issue['created_at'] >= '2015-01-01')
]
df_pr_selected = df_pr.loc[
    df_pr['repo_name'].isin(selected_repos) & (df_pr['created_at'] >= '2015-01-01')
]
df_pr_commits_selected = df_pr_commits.loc[df_pr_commits['repo_name'].isin(selected_repos)]
df_linked_issues = df_linked_issues.loc[df_linked_issues['repo_name'].isin(selected_repos)]

In [34]:
# Every combination of [repo-level percentile, all-repo percentile, months per
# time period] to evaluate; the first element varies slowest.
options_list = [
    [major_pct_options, general_pct_options, time_period_months_options]
    for major_pct_options, general_pct_options, time_period_months_options
    in itertools.product([0.75, 0.9, 0.95], [0.25, 0.5, 0.75], [2, 3, 6, 12])
]

In [18]:
# (name, email) -> committer account lookup.
committers_match = CleanCommittersInfo(indir_committers_info)
# Per-committer commit statistics for closed PRs that have a merger recorded.
df_pr_commit_stats = LinkPRCommits(df_pr_selected, df_pr_commits_selected, committers_match, commit_cols)
# NOTE: rebinds df_issue_selected; re-running this cell without re-running the
# selection cell merges 'linked_pr_number' a second time.
df_issue_selected = LinkIssuePR(df_issue_selected, df_linked_issues)
# De-duplicated issue-comment events.
issue_comments = FilterDuplicateIssues(df_issue_selected, 'type == "IssueCommentEvent"')

In [32]:
def GetMajorContributorPostpercentile(ts_data, rolling_window, major_col, major_pct, general_pct):
    """Return the contributor-period rows that exceed both percentile cut-offs.

    Parameters
    ----------
    ts_data : Series or DataFrame indexed by
        (time_period, time_period_index, repo_name, actor_id) that holds
        ``major_col``.
    rolling_window : rolling window passed to ``CalculateColumnPercentileDF``
        (e.g. ``'1828D'``).
    major_col : activity measure to threshold.
    major_pct : quantile used for the within-repo cut-off.
    general_pct : quantile used for the all-repo cut-off.

    Returns
    -------
    DataFrame with columns time_period, time_period_index, repo_name,
    actor_id, ``major_col`` and the two percentile columns, restricted to rows
    where ``major_col`` is strictly above both percentiles.
    """
    ts_data = ts_data.reset_index()\
        .sort_values(['repo_name','time_period_index','actor_id'])

    repo_pct_col = f'{major_col}_{int(major_pct*100)}th_pct'
    general_pct_col = f'general_{major_col}_{int(general_pct*100)}th_pct'
    major_cols = ['time_period','time_period_index', 'repo_name','actor_id', major_col,
                  repo_pct_col, general_pct_col]

    # Honour the caller's window; it used to be ignored in favour of a
    # hard-coded '1828D'.
    ts_data_pct = CalculateColumnPercentileDF(ts_data, rolling_window, major_col, major_pct, general_pct)

    # Boolean masks instead of an f-string query so that measure names
    # containing spaces (e.g. 'commit additions') work as well.
    above_repo_pct = ts_data_pct[major_col] > ts_data_pct[repo_pct_col]
    above_general_pct = ts_data_pct[major_col] > ts_data_pct[general_pct_col]
    major_contributor_data = ts_data_pct.loc[above_repo_pct & above_general_pct, major_cols]

    return major_contributor_data

def GroupedFill(df, group, fill_cols):
    """Fill missing values in ``fill_cols`` from other rows of the same group.

    Within each ``group`` the columns are forward-filled and then
    backward-filled, so a group with at least one observed value ends up with
    no gaps and a group without any stays missing. Values never cross group
    boundaries.

    ``df`` is modified in place and also returned.
    """
    # Operate on the frame that was passed in (not a notebook global), and run
    # the backward fill on the forward-filled result rather than overwriting it.
    df[fill_cols] = df.groupby(group)[fill_cols].ffill()
    df[fill_cols] = df.groupby(group)[fill_cols].bfill()

    return df

def GenerateBalancedContributorsPanel(ic_major_contributor_data, pr_major_contributor_data):
    """Build a contributor x time-period panel carrying both activity measures.

    Every (repo_name, actor_id) pair found in either input is crossed with
    every distinct ``time_period`` of the issue-comment input (periods that
    occur only in the PR input are not added). Both inputs are then
    left-joined onto that grid on (repo_name, actor_id, time_period).

    Columns present in both inputs other than the join keys (such as
    ``time_period_index``) appear once in the result: the issue-comment value
    where available, otherwise the pull-request value.

    Returns the panel with one row per contributor and period; measures are
    missing where the contributor has no row in the corresponding input.
    """
    contributor_keys = ['repo_name','actor_id']
    panel_keys = contributor_keys + ['time_period']

    contributors = pd.concat([
        ic_major_contributor_data[contributor_keys].drop_duplicates(),
        pr_major_contributor_data[contributor_keys].drop_duplicates(),
    ]).drop_duplicates().reset_index(drop = True)

    # Keep the periods in a column of their own so the datetime dtype survives;
    # going through .unique().tolist() and explode() turned them into plain
    # objects that no longer match the datetime join key.
    periods = ic_major_contributor_data[['time_period']]\
        .drop_duplicates()\
        .sort_values('time_period')\
        .reset_index(drop = True)

    # Cross join through a constant helper key (works on every pandas version).
    panel = contributors.assign(_cross_key = 1)\
        .merge(periods.assign(_cross_key = 1), on = '_cross_key')\
        .drop(columns = '_cross_key')

    panel = panel.merge(ic_major_contributor_data, how = 'left', on = panel_keys)

    # Join on the panel keys only. Letting pandas pick every shared column as
    # a key would include columns that are already missing for rows without
    # issue-comment data, which silently drops the matching PR rows.
    shared_cols = [col for col in pr_major_contributor_data.columns
                   if col in panel.columns and col not in panel_keys]
    pr_side_names = {col: f'{col}__pr' for col in shared_cols}
    panel = panel.merge(pr_major_contributor_data.rename(columns = pr_side_names),
                        how = 'left', on = panel_keys)
    for col, pr_side_col in pr_side_names.items():
        panel[col] = panel[col].combine_first(panel[pr_side_col])
    panel = panel.drop(columns = list(pr_side_names.values()))

    return panel

def RemovePeriodsPriorToJoining(major_contributors_data):
    """Drop panel rows that precede a contributor's first complete observation.

    A contributor's ``earliest_appearance`` is the smallest ``time_period``
    among their rows that have no missing value in any column. Contributors
    without such a row are removed entirely (inner join). The remaining rows
    are those with ``time_period >= earliest_appearance``; the
    ``earliest_appearance`` column is kept in the output.

    NOTE(review): because rows with *any* missing column are ignored, a
    contributor needs a period with every column populated to be retained -
    confirm that this is the intended definition of "joining".
    """
    contributor_keys = ['repo_name','actor_id']

    complete_rows = major_contributors_data.dropna().sort_values('time_period')
    first_seen = complete_rows[contributor_keys + ['time_period']]\
        .drop_duplicates(contributor_keys)\
        .rename(columns = {'time_period':'earliest_appearance'})

    with_first_seen = pd.merge(major_contributors_data, first_seen,
                               how = 'inner', on = contributor_keys)

    return with_first_seen.query('time_period>=earliest_appearance')


def OutputMajorContributors(committers_match, df_pr_commit_stats, df_issue_selected, issue_comments, options):
    """Identify major contributors for one parameter combination and save them.

    Parameters
    ----------
    committers_match, df_issue_selected : accepted for call compatibility;
        not used in the body.
    df_pr_commit_stats : output of ``LinkPRCommits``.
    issue_comments : output of ``FilterDuplicateIssues`` for comment events.
    options : sequence ``[major_pct, general_pct, time_period]`` - the
        within-repo quantile, the all-repo quantile and the number of months
        per time period.

    Also reads the notebook-level ``author_thresh`` and ``commit_cols``.

    Side effects
    ------------
    Prints the parameters and the number of distinct (repo, actor) pairs and
    writes the panel to
    ``issue/major_contributors_major{major_pct}_general{general_pct}_months{time_period}.csv``.
    Returns ``None``.
    """
    major_pct = options[0]
    general_pct = options[1]
    time_period = options[2]
    rolling_window = '1828D'

    # --- pull-request authorship per contributor and period ---
    df_pr_commit_stats = ImputeTimePeriod(df_pr_commit_stats, time_period)
    df_pr_commit_author_stats = AssignPRAuthorship(df_pr_commit_stats, author_thresh, commit_cols)
    # NOTE(review): this groups on 'actor_id' rather than 'commit_author_id';
    # presumably 'actor_id' comes from the PR event rows - confirm it is the
    # intended identity for PR authorship.
    ts_pr_authorship = df_pr_commit_author_stats.assign(pr = 1)\
        .groupby(['time_period', 'time_period_index', 'repo_name','actor_id'])\
        [['pr'] + commit_cols + [f"{col} share" for col in commit_cols]].sum()

    major_pr_col = 'pr'
    pr_major_contributor_data = GetMajorContributorPostpercentile(ts_pr_authorship, rolling_window, major_pr_col, major_pct, general_pct)

    # --- issue comments per contributor and period ---
    issue_comments = ImputeTimePeriod(issue_comments, time_period)
    ts_issue_comments = issue_comments.assign(issue_comments=1)\
        .groupby(['time_period','time_period_index', 'repo_name','actor_id'])\
        ['issue_comments'].sum()

    major_ic_col = 'issue_comments'
    ic_major_contributor_data = GetMajorContributorPostpercentile(ts_issue_comments, rolling_window, major_ic_col, major_pct, general_pct)

    # --- balanced panel over both measures ---
    major_contributors_data = GenerateBalancedContributorsPanel(ic_major_contributor_data, pr_major_contributor_data)
    major_contributors_data = RemovePeriodsPriorToJoining(major_contributors_data)

    # Percentile column names, built with the same pattern that
    # GetMajorContributorPostpercentile uses. These four names were previously
    # referenced without ever being defined.
    repo_pct_pr_col = f'{major_pr_col}_{int(major_pct*100)}th_pct'
    general_pct_pr_col = f'general_{major_pr_col}_{int(general_pct*100)}th_pct'
    repo_pct_ic_col = f'{major_ic_col}_{int(major_pct*100)}th_pct'
    general_pct_ic_col = f'general_{major_ic_col}_{int(general_pct*100)}th_pct'

    pct_cols = [repo_pct_pr_col, general_pct_pr_col, repo_pct_ic_col, general_pct_ic_col]
    major_cols = [major_pr_col, major_ic_col]

    # Percentiles and the period index are shared by everyone in the same repo
    # and period, so gaps are filled from the other rows of that group; a
    # missing activity count means no qualifying activity and becomes 0.
    major_contributors_data = GroupedFill(major_contributors_data, ['repo_name','time_period'], pct_cols)
    major_contributors_data = GroupedFill(major_contributors_data, ['repo_name','time_period'], ['time_period_index'])
    major_contributors_data[major_cols] = major_contributors_data[major_cols].fillna(0)

    print(f"Major PCT: {major_pct}, General PCT: {general_pct}, Time Period: {time_period} months")
    print(major_contributors_data[['repo_name','actor_id']].drop_duplicates().shape)
    major_contributors_data.to_csv(f'issue/major_contributors_major{major_pct}_general{general_pct}_months{time_period}.csv')

In [None]:
# Single run for the first parameter combination in options_list. The
# commented-out call below is the parallel variant over all combinations.
#pool.map(OutputMajorContributors, itertools.repeat(committers_match), itertools.repeat(df_pr_commit_stats), itertools.repeat(df_issue_selected), 
#         itertools.repeat(issue_comments), options_list)
OutputMajorContributors(committers_match, df_pr_commit_stats, df_issue_selected, issue_comments, options_list[0])

In [None]:
# Run every parameter combination in options_list on 4 worker processes.
# The previous version of this cell used `multiprocessing` (never imported),
# iterated over an undefined `github_repos`, and passed a single argument to
# the five-argument OutputMajorContributors. ProcessPoolExecutor is already
# imported at the top of the notebook.
# NOTE: the input frames are sent to the workers for each task, and worker
# processes need a fork-based start method to see functions defined in the
# notebook; fall back to a plain `for options in options_list:` loop otherwise.
with ProcessPoolExecutor(4) as pool:
    results = pool.map(
        OutputMajorContributors,
        itertools.repeat(committers_match),
        itertools.repeat(df_pr_commit_stats),
        itertools.repeat(df_issue_selected),
        itertools.repeat(issue_comments),
        options_list,
    )
    for result in results:
        print(result)

In [24]:
# TODO: What % was dropped at each stage
# NOTE(review): `major_contributors_data` is not assigned by any visible cell
# (OutputMajorContributors keeps it local and returns None), so this cell
# relies on leftover kernel state and fails after Restart & Run All. Either
# load the CSV written by OutputMajorContributors or have that function return
# the frame.
major_contributors_data[['repo_name','actor_id']].drop_duplicates().shape
#major_contributors_data.to_csv(f'major_contributors_{sample_num}.csv')

(4138, 2)

In [25]:
# TODO: autofill about ignoring reopenings
# Share of issue events whose action is 'reopened'.
(df_issue['issue_action'] == 'reopened').mean()

0.003611777045728788