In [1]:
import os
# Move the working directory up one level so that the relative paths used in
# later cells ('drive/...', 'issue/...') and the `source.lib` imports resolve
# against the parent directory (presumably the project root; confirm).
# NOTE(review): this is a relative chdir, so it is not idempotent. Re-running
# this cell without restarting the kernel moves up one more level.
os.chdir('../')

In [2]:
import glob
import random
import sys
import warnings
from ast import literal_eval
from pathlib import Path

import numpy as np
import pandas as pd
from pandarallel import pandarallel

from source.lib.JMSLab import autofill
from source.lib.helpers import ExportTable, AddToTableList

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Set up pandarallel (which provides the `parallel_apply` DataFrame method)
# with a progress bar; it reports its worker count when initialised.
pandarallel.initialize(progress_bar = True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
def ReadPrIssueData(file_dirs, data_cols):
    """Load the requested columns from several CSV files into one DataFrame.

    Parameters
    ----------
    file_dirs : iterable of str
        Paths of the CSV files to read.
    data_cols : list of str
        Column names to keep. A file that lacks some of them contributes only
        the ones it has; the missing ones are filled with NaN for its rows.

    Returns
    -------
    pandas.DataFrame
        All rows from all files with exact duplicate rows removed, passed
        through ``AddDates`` (which parses 'created_at' and adds 'date').
        The original per-file row index is kept, so index labels may repeat.
    """
    wanted_cols = set(data_cols)

    # The leading empty frame pins the output to the full `data_cols` layout
    # (and column order) even when no file carries a given column.
    frames = [pd.DataFrame(columns = data_cols)]
    for file in file_dirs:
        # A callable `usecols` selects the wanted columns that actually exist
        # in this file, so the header does not need a separate first read.
        frames.append(pd.read_csv(file, usecols = lambda col: col in wanted_cols))

    # Concatenate and de-duplicate once instead of on every iteration; the
    # surviving rows are the same (first occurrence wins) without repeatedly
    # copying the accumulated data.
    df_final = pd.concat(frames).drop_duplicates()

    df_final = AddDates(df_final)

    return df_final

def AddDates(df):
    """Parse 'created_at' and add a 'date' column of the form "<year>-<month>".

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'created_at' column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same object, with 'created_at' converted to datetimes and a new
        string column 'date'. The month is not zero-padded (e.g. "2020-1");
        rows whose timestamp is missing get "nan-nan".
    """
    df['created_at'] = pd.to_datetime(df['created_at'])
    # Only one column is needed, so walk that column directly rather than
    # applying a function to every full row. The resulting strings are the
    # same, and the function no longer depends on pandarallel being initialised.
    df['date'] = [f"{stamp.year}-{stamp.month}" for stamp in df['created_at']]

    return df


In [4]:
def read_parquet(filename, commit_cols):
    """Best-effort load of one commit parquet file.

    Parameters
    ----------
    filename : str
        Path of the parquet file.
    commit_cols : list of str
        Columns to return.

    Returns
    -------
    pandas.DataFrame or None
        The `commit_cols` columns with one row per distinct 'commit sha'
        (first occurrence kept), or None when the file cannot be read or
        lacks a required column.
    """
    try:
        # De-duplicate before selecting columns: 'commit sha' does not have
        # to be one of `commit_cols`.
        return pd.read_parquet(filename).drop_duplicates('commit sha')[commit_cols]
    except Exception:
        # Skip unreadable or malformed files, but let KeyboardInterrupt and
        # SystemExit propagate so a long loading loop can still be stopped.
        return None

In [5]:
def read_csv(filename, commit_cols):
    """Best-effort load of one commit CSV file.

    Parameters
    ----------
    filename : str
        Path of the CSV file; its first column is used as the index.
    commit_cols : list of str
        Columns to return.

    Returns
    -------
    pandas.DataFrame or None
        The `commit_cols` columns with one row per distinct 'commit sha'
        (first occurrence kept), or None when the file cannot be read or
        lacks a required column.
    """
    try:
        # De-duplicate before selecting columns: 'commit sha' does not have
        # to be one of `commit_cols`.
        return pd.read_csv(filename, index_col = 0).drop_duplicates('commit sha')[commit_cols]
    except Exception:
        # Skip unreadable or malformed files, but let KeyboardInterrupt and
        # SystemExit propagate so a long loading loop can still be stopped.
        return None

In [None]:
# Pull-request events: gather the CSV exports of the three PR-related folders
# (PRs, PR reviews, PR review comments), in that order.
pr_data_indir = [
    csv_path
    for pattern in (
        'drive/output/scrape/extract_github_data/pull_request_data/*.csv',
        'drive/output/scrape/extract_github_data/pull_request_review_data/*.csv',
        'drive/output/scrape/extract_github_data/pull_request_review_comment_data/*.csv',
    )
    for csv_path in glob.glob(pattern)
]
pr_cols = [
    'type', 'created_at', 'repo_id', 'repo_name', 'actor_id', 'actor_login',
    'pr_number', 'pr_title', 'pr_body', 'pr_action', 'pr_merged_by_id',
    'pr_merged_by_type', 'pr_label', 'pr_review_action', 'pr_review_id',
    'pr_review_state', 'pr_review_body', 'pr_review_comment_body',
]
df_pr = ReadPrIssueData(pr_data_indir, pr_cols)

# Issue events: same procedure for issues and issue comments.
issue_data_indir = [
    csv_path
    for pattern in (
        'drive/output/scrape/extract_github_data/issue_data/*.csv',
        'drive/output/scrape/extract_github_data/issue_comment_data/*.csv',
    )
    for csv_path in glob.glob(pattern)
]
issue_cols = [
    'type', 'created_at', 'repo_id', 'repo_name', 'actor_id', 'actor_login',
    'issue_number', 'issue_body', 'issue_title', 'issue_action', 'issue_state',
    'issue_comment_id', 'issue_user_id', 'issue_comment_body',
]
df_issue = ReadPrIssueData(issue_data_indir, issue_cols)

In [None]:
# No filtering happens here: the *_selected names are bound to the very same
# DataFrame objects as df_pr / df_issue (aliases, not copies).
df_pr_selected, df_issue_selected = df_pr, df_issue

In [None]:
# Committer profile table; the first CSV column is used as the index.
df_commiters_raw = pd.read_csv('drive/output/scrape/link_committers_profile/committers_info.csv', index_col = 0)

# 'commit_repo' and 'committer_info' are stored as Python-literal strings.
# Parse them back into Python objects and leave missing cells untouched.
# `literal_eval` is the standard-library ast.literal_eval from the import cell;
# it was previously used here without ever being imported.
for col in ['commit_repo','committer_info']:
    df_commiters_raw[col] = df_commiters_raw[col].apply(lambda x: literal_eval(x) if not pd.isnull(x) else x)

# Repo name = text after the last underscore of the first 'commit_repo' entry.
# Non-list values (e.g. missing cells) pass through and are removed by dropna().
# NOTE(review): an empty list would raise IndexError on x[0]; confirm the data never has one.
df_commiters_raw['repo_name'] = df_commiters_raw['commit_repo'].apply(
    lambda x: x[0].split("_")[-1] if isinstance(x, list) else x)

# One row per (name, email, repo_name), keeping only fully populated rows.
df_commiters = df_commiters_raw[['name','email','repo_name','committer_info']].drop_duplicates(
    ['name','email','repo_name']).dropna()

In [None]:
# Columns kept from every commit file.
commit_cols = [
    'repo_name',
    'commit author name',
    'commit author email',
    'commit additions',
    'commit deletions',
    'commit changes total',
    'commit files changed count',
    'commit file changes',
]

# Stack the commits of every file under collect_commits/pr. read_parquet
# yields None for files it cannot load, and those entries are passed to
# pd.concat as-is.
df_pr_commits = pd.concat(
    [
        read_parquet(commit_file, commit_cols)
        for commit_file in glob.glob('drive/output/scrape/collect_commits/pr/*')
    ]
)
df_pr_commits.to_csv('issue/df_pr_commits.csv')

In [None]:
# Stack the commits of every file under collect_commits/push.
# This cell previously reused the name `df_pr_commits` and wrote to
# 'issue/df_pr_commits.csv', which discarded the PR commits built from
# collect_commits/pr and overwrote their CSV with push data. Push commits now
# get their own variable and output file.
# NOTE(review): if a later cell reads `df_pr_commits` expecting push commits,
# point it at `df_push_commits` instead.
df_push_commits = pd.concat([read_parquet(filename, commit_cols) for filename in glob.glob('drive/output/scrape/collect_commits/push/*')])
df_push_commits.to_csv('issue/df_push_commits.csv')