In [1]:
from io import BytesIO
import requests
import pandas as pd
from utils import *
import os

In [2]:
def cache_import_csv_from_url(url, fname, header='infer'):
    """Load a CSV from ``url``, using ``fname`` as an on-disk cache.

    If ``fname`` already exists it is read with ``pd.read_csv`` and no download
    happens. Otherwise the CSV is fetched with ``import_csv_from_url``, written
    to ``fname`` and returned.

    Parameters
    ----------
    url : str
        Location passed to ``import_csv_from_url`` on a cache miss.
    fname : str
        Path of the cache file.
    header : int, list of int or 'infer'
        Forwarded unchanged to both ``pd.read_csv`` and ``import_csv_from_url``.

    Returns
    -------
    pandas.DataFrame
    """
    if os.path.isfile(fname):
        # NOTE(review): a cache written below always has a single header row, so a
        # non-default ``header`` (e.g. [1]) may parse it differently from the fresh
        # download -- confirm for sheets that need ``header=[1]``.
        return pd.read_csv(fname, header=header)

    df = import_csv_from_url(url, header=header)
    # index=False: otherwise the row index is stored as an extra unnamed column and
    # a cache hit returns a frame with an 'Unnamed: 0' column the download lacked.
    df.to_csv(fname, index=False)
    return df

In [3]:
%%time
# URLs to the Google Docs are kept secret until publication in order to protect identity.
# NOTE(review): pr50 is cached under 'pr150.csv' -- possibly a typo for 'pr50.csv';
# left unchanged so that an existing cache file keeps being picked up.
pr50 = cache_import_csv_from_url(
    'https://docs.google.com/spreadsheets/d/anonymised/exportFormat=csv', 'pr150.csv')
pr130 = cache_import_csv_from_url(
    'https://docs.google.com/spreadsheets/d/anonymised/exportFormat=csv', 'pr130.csv')
pr11 = cache_import_csv_from_url(
    'https://docs.google.com/spreadsheets/d/anonymised/exportFormat=csv', 'pr11.csv')

# Tag every row with the sheet it came from.
pr11['source'] = 'pr11'
pr11['title_mask'] = 'testability'
pr50['source'] = 'pr50'
pr130['source'] = 'pr130'

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
pr180 = pd.concat([pr50, pr130], sort=False)
# Set before pr11 is added, so pr11 rows keep their own 'testability' mask.
pr180['title_mask'] = 'testability_body'

pr180 = pd.concat([pr180, pr11], sort=False)
pr180['reviewer'] = 'Reviewer1'
# Rows whose ref_pattern is 'irrelevant' get pr_group 'irrelevant' as well.
pr180.loc[pr180['ref_pattern'] == 'irrelevant', 'pr_group'] = 'irrelevant'


CPU times: user 19.8 ms, sys: 3 ms, total: 22.8 ms
Wall time: 21.8 ms


In [4]:
# First batch of manually reviewed PRs, tagged with its reviewer and origin file.
manually_reviewed1 = pd.read_csv('manually_reviewed.csv').assign(
    reviewer='Reviewer1',
    source='manually_reviewed.csv',
)
# Number of distinct PR URLs in this batch.
print(len(set(manually_reviewed1.url)))

606


In [5]:
manually_reviewed3 = cache_import_csv_from_url(
    'https://docs.google.com/spreadsheets/d/anonymised/exportFormat=csv', 'manually_reviewed3.csv')
# Drop the rows whose reviewer is 'example'. The explicit .copy() makes the
# column assignment below write to an independent frame instead of a filtered
# slice (which raises SettingWithCopyWarning).
manually_reviewed3 = manually_reviewed3[manually_reviewed3.reviewer != 'example'].copy()
manually_reviewed3['source'] = 'externally_reviewed_PRs.csv'
# Number of distinct PR URLs in this batch.
print(len(set(manually_reviewed3.url)))


320


In [6]:
%%time
all_prs_with_files = pd.read_csv('all_prs_with_files.csv')
all_prs = generate_all_prs(all_prs_with_files)

# Bucket every PR along two axes: has test pairs (test_pairs > 0) and
# small (< 10 changed files). The four buckets overwrite the 'Other' default.
has_tpairs = all_prs['test_pairs'] > 0
is_small = all_prs['changed_files'] < 10
all_prs['selected'] = 'Other'
for bucket_rows, bucket_label in (
    (has_tpairs & is_small, 'small_with_tpairs'),
    (~has_tpairs & is_small, 'small_without_tpairs'),
    (has_tpairs & ~is_small, 'large_with_tpairs'),
    (~has_tpairs & ~is_small, 'large_without_tpairs'),
):
    all_prs.loc[bucket_rows, 'selected'] = bucket_label

# Churn = all added and deleted lines, production and test code together.
all_prs['churn'] = (
    all_prs['prod_additions'] + all_prs['prod_deletions']
    + all_prs['test_additions'] + all_prs['test_deletions']
)

# Quantiles for PRs with test pairs; not the cell's last expression, so the
# result is computed but not displayed.
with_tpairs = all_prs[all_prs.selected.isin(['small_with_tpairs', 'large_with_tpairs'])]
with_tpairs[['churn', 'changed_files', 'test_pairs']].quantile([0.1, 0.5, 0.6, 0.62, 0.9])

# Keep element [1] of what is_testability_relevant returns for each title.
all_prs['title_mask'] = all_prs.title.apply(lambda title: is_testability_relevant(title)[1])


CPU times: user 43.5 s, sys: 3.97 s, total: 47.4 s
Wall time: 47.9 s


In [7]:
manually_reviewed_cols = ['url', 'prod_file', 'pr_group', 'ref_pattern', 'reviewer', 'test_location', 'source']

# Rows of the first batch whose URL also occurs in manually_reviewed3 are dropped,
# so the manually_reviewed3 version of those PRs is the one that is kept.
# NOTE(review): this comparison runs before normalise_url is applied -- confirm both
# sheets already use the same URL form.
manually_reviewed1 = manually_reviewed1[
    ~manually_reviewed1.url.isin(manually_reviewed3.url)
][manually_reviewed_cols]

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
manually_reviewed = pd.concat([manually_reviewed3[manually_reviewed_cols], manually_reviewed1])
manually_reviewed['url'] = manually_reviewed['url'].apply(normalise_url)
print(len(set(manually_reviewed.url)))

# Inner merge on the shared column(s); the two printed counts show whether any
# reviewed URL is lost because it is missing from all_prs.
manually_reviewed = manually_reviewed.merge(
    all_prs[['url', 'title_mask', 'selected']].drop_duplicates())
print(len(set(manually_reviewed.url)))
manually_reviewed['testability_meaning'] = None

764
764


In [8]:
# Propagate an 'irrelevant' pr_group verdict to the ref_pattern column.
irrelevant_rows = manually_reviewed['pr_group'] == 'irrelevant'
manually_reviewed.loc[irrelevant_rows, 'ref_pattern'] = 'irrelevant'

In [9]:
cols = ['url', 'title_mask', 'pr_group', 'ref_pattern', 'prod_file', 'test_location', 'source', 'reviewer']
# Stack both review sets on the shared columns and drop exact duplicate rows.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
reviewed = pd.concat([pr180[cols], manually_reviewed[cols]]).drop_duplicates()
# Number of distinct reviewed PR URLs (displayed as the cell output).
len(set(reviewed.url))

955

In [10]:
%%time
# Three review sheets, each tagged with its reviewer id and its origin.
# hiwi1 is the only one read with header=[1].
hiwi1 = cache_import_csv_from_url(
    'https://docs.google.com/spreadsheets/d/anonymised/exportFormat=csv',
    'hiwi1.csv',
    header=[1],
).assign(reviewer='Reviewer2', source='hiwi1')

hiwi2 = cache_import_csv_from_url(
    'https://docs.google.com/spreadsheets/d/anonymised/exportFormat=csv',
    'hiwi2.csv',
).assign(reviewer='Reviewer3', source='hiwi2')

hiwi3 = cache_import_csv_from_url(
    'https://docs.google.com/spreadsheets/d/anonymised/exportFormat=csv',
    'hiwi3.csv',
).assign(reviewer='Reviewer3', source='hiwi3')

CPU times: user 11 ms, sys: 1.63 ms, total: 12.7 ms
Wall time: 13.5 ms


In [11]:
# Sheet URL -> local cache file for the August 2022 review sheets.
urls = {
    'https://docs.google.com/spreadsheets/d/anonymised/edit#gid=1231948719': 'aug2022_test.csv',
    'https://docs.google.com/spreadsheets/d/anonymised/edit#gid=1472634510': 'aug2022_Other.csv',
    'https://docs.google.com/spreadsheets/d/anonymised/edit#gid=780422865': '16aug2022_Other.csv',
    'https://docs.google.com/spreadsheets/d/anonymised/edit#gid=1913656730': '16aug2022_test.csv',
    'https://docs.google.com/spreadsheets/d/anonymised/edit#gid=0': '17aug2022_mixed.csv',
}

# The '/edit' segment is swapped for the CSV export endpoint before downloading.
# NOTE(review): the sheet id stays in the '#gid=...' fragment after the replace;
# fragments are normally not sent to the server -- confirm that
# import_csv_from_url really fetches the intended tab for each URL.
#
# All frames are collected first and concatenated once: DataFrame.append was
# removed in pandas 2.0, and appending inside a loop copies the data every time.
hiwi_aug2022 = pd.concat([
    cache_import_csv_from_url(url.replace('/edit', '/export?exportFormat=csv'), fname)
    for url, fname in urls.items()
])

In [12]:
# Keep only the rows that have a value in the 'Coder' column.
hiwi_aug2022 = hiwi_aug2022.dropna(subset=['Coder'])

In [13]:
# Derive the reviewer id from 'Coder' ('TPu' in the value -> Reviewer3, anything
# else -> Reviewer4), tag the origin, then drop the raw 'Coder' column.
# assign() builds a new frame, so nothing is written into a filtered slice
# (plain column assignment here raises SettingWithCopyWarning).
hiwi_aug2022 = (
    hiwi_aug2022
    .assign(
        reviewer=hiwi_aug2022['Coder'].apply(
            lambda coder: 'Reviewer3' if 'TPu' in coder else 'Reviewer4'),
        source='aug2022',
    )
    .drop(columns='Coder')
)

In [14]:

# Stack the three review sheets. DataFrame.append was removed in pandas 2.0;
# pd.concat keeps the same row order and first-seen column order.
hiwi = pd.concat([hiwi1, hiwi2, hiwi3], sort=False)
hiwi['url'] = hiwi['url'].apply(derive_url)
# One specific PR is excluded explicitly.
hiwi = hiwi[hiwi['url'] != 'https://github.com/azkaban/azkaban/pull/1765']
# Keep only rows that have a pr_group value.
hiwi = hiwi[hiwi.pr_group.notnull()]
# The August 2022 rows are added after the filters above, so those filters and
# derive_url are not applied to them in this cell.
hiwi = pd.concat([hiwi, hiwi_aug2022])

In [15]:
# Sheet cached as 'secondary_aug2022.csv', tagged with its period and reviewer.
secondary_aug2022 = cache_import_csv_from_url(
    'https://docs.google.com/spreadsheets/d/anonymised/exportFormat=csv',
    'secondary_aug2022.csv',
).assign(period='August 2022', reviewer='Reviewer1')


In [16]:
# Build two label sets over the same PRs and write them to
# primary_reviewed.csv and secondary_reviewed.csv.
label_cols = ['url', 'pr_group', 'ref_pattern', 'test_location', 'reviewer']

# --- "January" part: manually_reviewed3 rows from reviewers other than Reviewer1 ---
external_rows = manually_reviewed3[manually_reviewed3.reviewer != 'Reviewer1']

# primary: the orig_* label columns, renamed to the common column names.
primary = external_rows[
    ['url', 'orig_pr_group', 'orig_ref_pattern', 'test_location', 'reviewer']
].rename(columns={'orig_pr_group': 'pr_group', 'orig_ref_pattern': 'ref_pattern'})
primary['period'] = 'January'

# secondary: the pr_group/ref_pattern columns of the same rows, attributed to Reviewer1.
# .copy() so the assignments below do not write into a slice of manually_reviewed3.
secondary = external_rows[label_cols].copy()
secondary['period'] = 'January'
secondary['reviewer'] = 'Reviewer1'
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
secondary = pd.concat([secondary, secondary_aug2022])

primary['url'] = primary['url'].apply(derive_url)
secondary['url'] = secondary['url'].apply(derive_url)

# --- "August" part: Reviewer1 rows of `reviewed` whose URL also occurs in `hiwi` ---
august = reviewed[
    (reviewed.reviewer == 'Reviewer1') & (reviewed.url.isin(hiwi.url))
][label_cols].copy()
august['period'] = 'August'
primary = pd.concat([primary, august], sort=True)

s_august = hiwi[hiwi.url.isin(august.url)][
    ['url', 'orig_pr_group', 'orig_ref_pattern', 'pr_group', 'ref_pattern',
     'test_location', 'reviewer', 'comment', 'fauthor_comment']
].copy()
# Take orig_pr_group / orig_ref_pattern where present, else keep pr_group / ref_pattern.
has_orig_group = s_august.orig_pr_group.notnull()
s_august.loc[has_orig_group, 'pr_group'] = s_august.loc[has_orig_group, 'orig_pr_group']
has_orig_pattern = s_august.orig_ref_pattern.notnull()
s_august.loc[has_orig_pattern, 'ref_pattern'] = s_august.loc[has_orig_pattern, 'orig_ref_pattern']
s_august['period'] = 'August'
s_august = s_august.drop(columns=['orig_pr_group', 'orig_ref_pattern'])
secondary = pd.concat([secondary, s_august], sort=True)

# Propagate an 'irrelevant' pr_group verdict to ref_pattern in both sets.
primary.loc[primary.pr_group == 'irrelevant', 'ref_pattern'] = 'irrelevant'
secondary.loc[secondary.pr_group == 'irrelevant', 'ref_pattern'] = 'irrelevant'

primary['reviewer'] = primary['reviewer'].apply(derive_reviewer_id)
secondary['reviewer'] = secondary['reviewer'].apply(derive_reviewer_id)

primary.to_csv('primary_reviewed.csv', index=False)
secondary.to_csv('secondary_reviewed.csv', index=False)

In [17]:
# Build the final review table and write it to reviewed.csv.
sel = hiwi[hiwi.pr_group.notnull()]
# Drop rows that have an fauthor_comment and whose 'pavels_comment' contains
# 'take-mine'. na=False makes the handling of missing comments explicit
# (a missing comment never matches).
# NOTE(review): 'pavels_comment' is the only column name here that is not
# anonymised -- confirm it is the intended column.
taken_over = sel.fauthor_comment.notnull() & sel['pavels_comment'].str.contains('take-mine', na=False)
# .copy() so the url assignment below does not write into a slice of hiwi.
sel = sel[~taken_over].copy()
sel['url'] = sel['url'].apply(derive_url)

# Rows of `sel` replace the rows of `reviewed` that share their URL.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
freviewed = reviewed[~reviewed.url.isin(sel.url)]
freviewed = pd.concat([freviewed, sel[cols]], sort=False)
freviewed['reviewer'] = freviewed['reviewer'].apply(derive_reviewer_id)

# Fill missing title_mask values from any other row with the same URL.
# The merge is an inner join, so URLs with no title_mask on any row are dropped.
mask_by_url = freviewed[freviewed.title_mask.notnull()][['url', 'title_mask']].drop_duplicates()
freviewed = freviewed.merge(mask_by_url, on='url', suffixes=('_orig', '_derived'))
freviewed['title_mask_orig'] = freviewed['title_mask_orig'].fillna(freviewed['title_mask_derived'])
freviewed = freviewed.rename(columns={'title_mask_orig': 'title_mask'}).drop(columns=['title_mask_derived'])

# Discard rows with ref_pattern 'irrelevant' whose pr_group is not 'irrelevant'.
freviewed = freviewed[~((freviewed.pr_group != 'irrelevant') & (freviewed.ref_pattern == 'irrelevant'))]
freviewed.to_csv('reviewed.csv', index=False)