In [1]:
import pandas as pd
import numpy as np
import glob
from pandarallel import pandarallel
from bs4 import BeautifulSoup, SoupStrainer
import requests
import os

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
pandarallel.initialize(progress_bar = True)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## OLD WORK PROBABLY NOT USEFUL

In [4]:
def getints(field):
    integers = []
    for char in field:
        try:        
            value = int(char)
            integers.append(str(value))
        except ValueError:
            break
    return "".join(integers)

In [5]:
def cleanData(df, type, body_col, extract_cols):
    df['issue_referenced_body'] = df.fillna('').apply(lambda x: '#' in x[body_col], axis = 1)
    df['potential_issue_body'] = df.apply(
        lambda x: [] if not x['issue_referenced_body'] else [getints(ele) for ele in x[body_col].split("#")[1:] if getints(ele) != ''], axis = 1)

    if type == 'pr': 
        df['issue_referenced_title'] = df.fillna('').apply(lambda x: '#' in x['pr_title'], axis = 1)
        df['potential_issue_title'] = df.apply(
            lambda x: [] if not x['issue_referenced_title'] 
            else [getints(ele) for ele in x['pr_title'].split("#")[1:] if getints(ele) != '' ], axis = 1)
        df['potential_issues'] = df.apply(lambda x: x['potential_issue_title'] + x['potential_issue_body'], axis = 1)
    df['potential_issues'] = df.apply(lambda x: x['potential_issue_body'], axis = 1)
    
    return df[extract_cols]  



## PR Event

In [6]:
df_lst = glob.glob('data/github_clean/filtered_github_data/prEvent0*.parquet')
df_lst.extend(glob.glob('data/github_clean/github_data_pre_18/prEvent0*.parquet'))
df_lst.extend(glob.glob('data/github_clean/github_data_2324/pullRequestEvent0*.parquet'))

In [7]:
%%time
pr_data = pd.concat([pd.read_parquet(file, columns = 
                                ['repo_id', 'repo_name', 'pr_id', 'pr_number', 'created_at','type','pr_issue_url', 'pr_title','pr_body']) for file in df_lst])

CPU times: user 58 s, sys: 17.5 s, total: 1min 15s
Wall time: 1min 34s


In [8]:
linked_prs = cleanData(pr_data, 'pr', 'pr_body',
                       ['repo_id', 'repo_name', 'pr_id', 'pr_number', 'potential_issues', 'created_at','type','pr_issue_url'])
linked_prs = linked_prs[linked_prs['potential_issues'].apply(lambda x: len(x) != 0)]

## PR Review Event

In [9]:
df_lst_review = glob.glob('data/github_clean/filtered_github_data/prReviewEvent*.parquet')
df_lst_review.extend(glob.glob('data/github_clean/github_data_pre_18/prReviewEvent*.parquet'))
df_lst_review.extend(glob.glob('data/github_clean/github_data_2324/pullRequestReviewEvent*.parquet'))

In [10]:
%%time
pr_review_data = pd.concat([pd.read_parquet(file, columns =
                                       ['repo_id', 'repo_name', 'pr_review_id', 'pr_number', 'created_at','type', 'pr_review_body']) for file in df_lst_review])

CPU times: user 31.1 s, sys: 6.88 s, total: 38 s
Wall time: 49.7 s


In [11]:
%%time
df_issues_linked_review = cleanData(pr_review_data, 'pr_review', 'pr_review_body',
                                    ['repo_id', 'repo_name', 'pr_review_id', 'pr_number', 'potential_issues', 'created_at','type'])
df_issues_linked_review = df_issues_linked_review[df_issues_linked_review['potential_issues'].apply(lambda x: len(x)>0)]

CPU times: user 31.1 s, sys: 1.12 s, total: 32.2 s
Wall time: 32.2 s


# PR Review Comment Event

In [12]:
df_lst_review_comment = glob.glob('data/github_clean/filtered_github_data/prReviewCommentEvent*.parquet')
df_lst_review_comment.extend(glob.glob('data/github_clean/github_data_pre_18/prReviewCommentEvent*.parquet'))
df_lst_review_comment.extend(glob.glob('data/github_clean/github_data_2324/pullRequestReviewCommentEvent*.parquet'))

In [13]:
%%time
pr_review_comment_data = pd.concat([pd.read_parquet(file,
                                               columns = ['repo_id', 'repo_name', 'pr_review_comment_id', 'pr_number', 'created_at','type', 'pr_review_comment_body']) for file in df_lst_review_comment])

CPU times: user 38.1 s, sys: 9.35 s, total: 47.4 s
Wall time: 1min 8s


In [14]:
%%time
df_issues_linked_review_comment = cleanData(pr_review_comment_data, 'pr_review_comment', 'pr_review_comment_body',
                                            ['repo_id', 'repo_name', 'pr_review_comment_id', 'pr_number', 'potential_issues', 'created_at','type'])
df_issues_linked_review_comment = df_issues_linked_review_comment[df_issues_linked_review_comment['potential_issues'].apply(lambda x: len(x) != 0)]

CPU times: user 49.2 s, sys: 1.63 s, total: 50.8 s
Wall time: 50.8 s


## Ensure we only have valid issues

In [15]:
def read_parquet(fpath, colnames):
    try:
        return pd.read_parquet(fpath, columns = colnames)
    except:
        return pd.DataFrame(columns = colnames)

In [16]:
%%time
# Read data on issue comments, issues
issue_com = glob.glob('data/github_clean/filtered_github_data/issueCo*')
issue_com.extend(glob.glob('data/github_clean/github_data_pre_18/issueCo*'))
issue_com.extend(glob.glob('data/github_clean/github_data_2324/issueCo*'))
df_issue_comments = pd.concat([read_parquet(ele, ['repo_id', 'repo_name', 'issue_number', 'created_at']) for ele in issue_com]).reset_index(drop = True)

issues = glob.glob('data/github_clean/filtered_github_data/issues*')
issues.extend(glob.glob('data/github_clean/github_data_pre_18/issues*'))
issues.extend(glob.glob('data/github_clean/github_data_2324/issues*'))
df_issue = pd.concat([read_parquet(ele, ['repo_id', 'repo_name', 'issue_number', 'created_at']) for ele in issues]).reset_index(drop = True)

CPU times: user 59.7 s, sys: 10.6 s, total: 1min 10s
Wall time: 1min 27s


In [17]:
all_issue_info = pd.concat([df_issue, df_issue_comments]).groupby(
    ['repo_id','repo_name','issue_number'])['created_at'].min().reset_index()

In [18]:
# pr data
linked_pr_data = linked_prs[['repo_id', 'repo_name', 'pr_number', 'potential_issues','created_at']].explode('potential_issues')
linked_pr_data['potential_issues'] = linked_pr_data['potential_issues'].astype(int)
# pr review
linked_pr_review_data = df_issues_linked_review_comment[['repo_id','repo_name', 'pr_number', 'potential_issues', 'created_at']].explode('potential_issues')
linked_pr_review_data['potential_issues'] = pd.to_numeric(linked_pr_review_data['potential_issues'], errors = 'coerce')
linked_pr_review_data = linked_pr_review_data[~linked_pr_review_data['potential_issues'].isna()]
# pr review comment
linked_pr_review_comment_data = df_issues_linked_review_comment[['repo_id', 'repo_name','pr_number', 'potential_issues', 'created_at']].explode('potential_issues')
linked_pr_review_comment_data['potential_issues'] = pd.to_numeric(linked_pr_review_comment_data['potential_issues'], errors = 'coerce')
linked_pr_review_comment_data = linked_pr_review_comment_data[~linked_pr_review_comment_data['potential_issues'].isna()]

In [22]:
pr_issue_data = pd.concat([linked_pr_review_comment_data,linked_pr_review_data,linked_pr_data])
pr_issue_data['created_at'] = pd.to_datetime(pr_issue_data['created_at'], utc = True)
pr_issue_data = pr_issue_data.sort_values('created_at').drop_duplicates(
    ['repo_id','repo_name','pr_number','potential_issues'])

In [23]:
all_issue_info['key'] = all_issue_info['repo_id'].apply(lambda x: str(int(x)))+"_"+all_issue_info['issue_number'].apply(lambda x: str(int(x)))
all_issue_info_dict = all_issue_info[['key', 'created_at']].set_index('key').to_dict()['created_at']
pr_issue_data['issue_first_date'] = pr_issue_data.apply(lambda x: all_issue_info_dict.get(str(int(x['repo_id']))+"_"+str(int(x['potential_issues']))), axis = 1)


In [24]:
pr_issue_data['created_at'] = pd.to_datetime(pr_issue_data['created_at'], utc= True)
pr_issue_data['issue_first_date'] = pd.to_datetime(pr_issue_data['issue_first_date'], utc= True)

In [25]:
pr_issue_data = pr_issue_data[pr_issue_data.apply(lambda x: not pd.isnull(x['issue_first_date']) and x['created_at'] >= x['issue_first_date'], axis = 1)]

In [26]:
pr_issue_data[['repo_id','repo_name', 'potential_issues']].drop_duplicates()

Unnamed: 0,repo_id,repo_name,potential_issues
845,921367,cobrateam/splinter,110.0
1739,1062237,libgit2/pygit2,50.0
317,1060073,omab/django-social-auth,156.0
165,1357152,erikrose/nose-progressive,23.0
1705,1446474,pypa/virtualenv,186.0
...,...,...,...
8436,12888993,home-assistant/core,93498.0
13047,2081289,astropy/astropy,16080.0
16806,47203045,DataDog/integrations-core,15308.0
3564,17165658,apache/spark,4686.0


## Ensure we only have valid PRs

In [33]:
all_pr_info = pd.concat([pr_data[['repo_id','repo_name', 'pr_number', 'created_at']],
           pr_review_data[['repo_id','repo_name', 'pr_number', 'created_at']],
           pr_review_comment_data[['repo_id','repo_name', 'pr_number', 'created_at']]])
all_pr_info['created_at'] = pd.to_datetime(all_pr_info['created_at'], utc = True, format = 'mixed', errors = 'coerce')
all_pr_info = all_pr_info.sort_values('created_at').drop_duplicates(
    ['repo_id','pr_number'])

## Issue Comment Event

In [34]:
%%time
# Read data on issue comments, issues
issue_com = glob.glob('data/github_clean/filtered_github_data/issueCo*')
issue_com.extend(glob.glob('data/github_clean/github_data_pre_18/issueCo*'))
df_issue_comments = pd.concat([pd.read_parquet(ele) for ele in issue_com]).reset_index(drop = True)

CPU times: user 1min 58s, sys: 39.7 s, total: 2min 38s
Wall time: 1min 50s


In [35]:
%%time
df_issue_comments_linked = cleanData(df_issue_comments, 'issue_comment', 'issue_comment_body',
                                            ['repo_id', 'repo_name', 'issue_id', 'issue_number', 
                                             'potential_issues', 'created_at','type' , 'issue_pull_request'])
df_issue_comments_linked = df_issue_comments_linked[df_issue_comments_linked['potential_issues'].apply(
    lambda x: len(x) != 0)]
df_issue_comments_linked.rename({'potential_issues':'potential_prs'}, axis = 1, inplace = True)
df_issue_comments_linked = df_issue_comments_linked[~df_issue_comments_linked['issue_number'].isna()]

CPU times: user 1min 21s, sys: 12.3 s, total: 1min 34s
Wall time: 1min 33s


In [36]:
all_pr_info = all_pr_info[all_pr_info['pr_number'].apply(lambda x: type(x) != str and type(x) != float and type(x) != type(None))]
all_pr_info['key'] = all_pr_info['repo_id'].apply(lambda x: str(int(x)))+"_"+all_pr_info['pr_number'].apply(lambda x: str(int(x)))
pr_info_dict = all_pr_info.set_index('key')['created_at'].to_dict()

In [37]:
linked_issue_comments = df_issue_comments_linked[['repo_id','repo_name', 'issue_number', 'potential_prs', 'created_at']].explode('potential_prs')
linked_issue_comments['potential_prs'] = linked_issue_comments['potential_prs'].apply(lambda x: int(x))
linked_issue_comments = linked_issue_comments.drop_duplicates()
linked_issue_comments['key'] = linked_issue_comments['repo_id'].apply(lambda x: str(int(x)))+"_"+linked_issue_comments['potential_prs'].apply(lambda x: str(int(x)))
linked_issue_comments['pr_created_date'] = linked_issue_comments['key'].apply(lambda x: pr_info_dict.get(x, np.nan))

In [38]:
linked_issue_comments['key'] = linked_issue_comments['repo_id'].apply(lambda x: str(int(x)))+"_"+linked_issue_comments['potential_prs'].apply(lambda x: str(int(x)))

In [39]:
linked_issue_comments = linked_issue_comments[(~linked_issue_comments['pr_created_date'].isna()) & \
    (linked_issue_comments['created_at']>=linked_issue_comments['pr_created_date'])]

## Make sure the "issues" that are references are not PRs lmfao 

In [40]:
check_issues = pd.concat([
    pr_issue_data[['repo_id', 'repo_name','potential_issues']].drop_duplicates(),
    linked_issue_comments[['repo_id','repo_name', 'issue_number']].drop_duplicates().rename(
        {'issue_number':'potential_issues'}, axis = 1)]).drop_duplicates()

In [41]:
check_issues['linked_pr'] = np.nan
check_issues['potential_issues'] = check_issues['potential_issues'].astype(int)
check_issues = check_issues.reset_index(drop = True)

In [42]:
os.makedirs("data/inputs/linked_issues", exist_ok=True)

In [53]:
check_issues['repo_id'] = pd.to_numeric(check_issues['repo_id'])

In [54]:
inds = np.array_split(check_issues.index, 1000)
i = 0
for ind in inds:
    i+=1
    check_issues.loc[ind].to_parquet(f'data/inputs/linked_issues/linked_issue_{i}.parquet')