In [1]:
from collections import Counter
import networkx as nx
import pandas as pd
import pickle
import pprint

In [2]:
# Load the raw export. Later cells read these columns from it:
# pr_id, commenter_id, head_commit_committer_id, head_commit_author_id, head_repo_id.
data = pd.read_csv('./raw/kubernetes.csv')

In [3]:
# Distinct pull-request ids, in order of first appearance in `data`.
pr_id_list = data['pr_id'].unique()

### Get Number of Reviews For Pull Request:

In [4]:
def get_num_reviews(pr_id, data):
    """Count the rows each non-owner commenter has on one pull request.

    A commenter is treated as an owner of the pull request, and excluded from
    the result, when their id appears as a head-commit author or head-commit
    committer on any row of that pull request.

    Args:
        pr_id: Value compared against ``data.pr_id`` to select the rows.
        data: DataFrame with ``pr_id``, ``commenter_id``,
            ``head_commit_committer_id`` and ``head_commit_author_id`` columns.

    Returns:
        dict mapping commenter id -> number of selected rows carrying that
        commenter id (presumably one row per comment -- confirm against the
        export). Keys are in order of first appearance. The dict is empty when
        no row matches ``pr_id`` or when only owners commented.
    """
    pr_df = data.loc[data.pr_id == pr_id, :]

    # Intersecting the owners with the commenter set is unnecessary: only
    # commenter ids are ever tested for membership below.
    owner_ids = (
        set(pr_df.head_commit_author_id.values)
        | set(pr_df.head_commit_committer_id.values)
    )

    reviewer_rows = (c for c in pr_df.commenter_id.values if c not in owner_ids)

    # Convert back to a plain dict so callers (and printed output) see the
    # same type as before rather than a Counter.
    return dict(Counter(reviewer_rows))

In [5]:
# Spot-check: print the per-reviewer counts for the first five pull requests.
for pr_id in pr_id_list[0:5]: 
    res = get_num_reviews(pr_id, data)
    print('PR_ID: {}, REVIEWERS: {}'.format(pr_id, res))

PR_ID: 16143045, REVIEWERS: {3119227: 10, 13194: 8, 3035095: 5}
PR_ID: 16074582, REVIEWERS: {3665822: 2, 2628808: 21}
PR_ID: 16073995, REVIEWERS: {10155399: 2, 2628808: 10, 12669353: 3}
PR_ID: 18119032, REVIEWERS: {7780816: 4, 452114: 7, 3204869: 7}
PR_ID: 16078235, REVIEWERS: {10456340: 3, 145523: 6, 13194: 6, 3035095: 1}


### Number of comments made by Reviewer r on Pull Request p:

In [6]:
def ncomments_on_pr_by_reviewer(pr_id, reviewer_id, data):
    """Return one reviewer's count for one pull request.

    Looks ``reviewer_id`` up in the mapping produced by
    ``get_num_reviews(pr_id, data)``.

    Args:
        pr_id: Pull-request id forwarded to ``get_num_reviews``.
        reviewer_id: Commenter id to look up.
        data: DataFrame forwarded to ``get_num_reviews``.

    Returns:
        The count stored for ``reviewer_id``, or 0 when that id is not a key
        of the mapping.
    """
    return get_num_reviews(pr_id, data).get(reviewer_id, 0)

In [7]:
# Spot-check ncomments_on_pr_by_reviewer for every reviewer of the first five
# pull requests. The helper calls get_num_reviews itself, so each pull
# request's counts are recomputed once per reviewer here.
for pr_id in pr_id_list[0:5]:
    for reviewer in get_num_reviews(pr_id, data).keys():
        res = ncomments_on_pr_by_reviewer(pr_id, reviewer, data)
        print('PR_ID: {}, REVIEWER: {}, NUM_COMMENTS: {}'.format(pr_id, reviewer, res))

PR_ID: 16143045, REVIEWER: 3119227, NUM_COMMENTS: 10
PR_ID: 16143045, REVIEWER: 13194, NUM_COMMENTS: 8
PR_ID: 16143045, REVIEWER: 3035095, NUM_COMMENTS: 5
PR_ID: 16074582, REVIEWER: 3665822, NUM_COMMENTS: 2
PR_ID: 16074582, REVIEWER: 2628808, NUM_COMMENTS: 21
PR_ID: 16073995, REVIEWER: 10155399, NUM_COMMENTS: 2
PR_ID: 16073995, REVIEWER: 2628808, NUM_COMMENTS: 10
PR_ID: 16073995, REVIEWER: 12669353, NUM_COMMENTS: 3
PR_ID: 18119032, REVIEWER: 7780816, NUM_COMMENTS: 4
PR_ID: 18119032, REVIEWER: 452114, NUM_COMMENTS: 7
PR_ID: 18119032, REVIEWER: 3204869, NUM_COMMENTS: 7
PR_ID: 16078235, REVIEWER: 10456340, NUM_COMMENTS: 3
PR_ID: 16078235, REVIEWER: 145523, NUM_COMMENTS: 6
PR_ID: 16078235, REVIEWER: 13194, NUM_COMMENTS: 6
PR_ID: 16078235, REVIEWER: 3035095, NUM_COMMENTS: 1


### Pull-Requests submitted by Reviewer r to the repository:

In [8]:
def npullreq_on_repo_by_reviewer(reviewer, data):
    df = set(data[data.commenter_id == reviewer]['head_repo_id'])
    return len(df)

In [9]:
# Load cached objects from earlier pipeline stages.
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# cache files that this project produced itself.
entities = None
with open('./cache/arr/entities.pkl', 'rb') as file:
    entities = pickle.load(file)
    
# entity_metadata is loaded here but not referenced by the cells shown in this notebook section.
entity_metadata = None
with open('./cache/dicts/entity_metadata.pkl', 'rb') as file:
    entity_metadata = pickle.load(file)

In [10]:
# Spot-check npullreq_on_repo_by_reviewer on the first ten entities.
for e in entities[0:10]:
    print('ENTITY: {}, NUM_PULL_REQ: {}'.format(e, npullreq_on_repo_by_reviewer(e, data)))

ENTITY: 10932224, NUM_PULL_REQ: 1
ENTITY: 661508, NUM_PULL_REQ: 1
ENTITY: 3782661, NUM_PULL_REQ: 1
ENTITY: 6123527, NUM_PULL_REQ: 2
ENTITY: 3653642, NUM_PULL_REQ: 4
ENTITY: 1073162, NUM_PULL_REQ: 1
ENTITY: 61452, NUM_PULL_REQ: 3
ENTITY: 706571, NUM_PULL_REQ: 1
ENTITY: 2773008, NUM_PULL_REQ: 1
ENTITY: 6123537, NUM_PULL_REQ: 7


### Reviewers Count:

In [11]:
# pr_id -> {reviewer id: count}, computed for every pull request in pr_id_list.
reviewer_count = {
    pr_id: get_num_reviews(pr_id, data)
    for pr_id in pr_id_list
}

### List of Pull Requests Reviewed by Reviewer r in the repository:

#### Load Entities:

In [12]:
# Reloads the same entities pickle that an earlier cell already loaded
# (same path); kept so this section can be run on its own.
# NOTE: pickle.load can execute arbitrary code; only load trusted cache files.
entities = None
with open('./cache/arr/entities.pkl', 'rb') as file:
    entities = pickle.load(file)

#### Finding the List of Pull Requests:

In [13]:
# Invert reviewer_count (pr_id -> {reviewer: count}) into reviewer -> [pr_id, ...]
# in one pass, instead of scanning every pull request once per entity.
_prs_by_reviewer = {}
for pr_id in set(pr_id_list):
    for reviewer in reviewer_count.get(pr_id, {}):
        _prs_by_reviewer.setdefault(reviewer, []).append(pr_id)

# Keep only ids present in `entities`, in `entities` order. Entities that
# reviewed nothing are omitted. Building the result by key also means an id
# listed twice in `entities` is no longer counted twice.
total_pr_reviewed = {
    reviewer: {
        'pr_list': _prs_by_reviewer[reviewer],
        'total_counts': len(_prs_by_reviewer[reviewer]),
    }
    for reviewer in entities
    if reviewer in _prs_by_reviewer
}

In [14]:
# reviewer id -> number of pull requests reviewed.
# Each value of total_pr_reviewed is a {'pr_list', 'total_counts'} dict, so
# taking len() of it always gave 2; read the stored count instead.
total_pr_count = {
    rev: info['total_counts']
    for rev, info in total_pr_reviewed.items()
}

#### Reviewers List:

In [15]:
# Ids of every entity that reviewed at least one pull request.
reviewers = list(total_pr_count)

### Export Reviewers:

In [16]:
# Set to True to write the reviewers list to the cache directory.
export_list = False

if export_list:
    with open('./cache/arr/reviewers.pkl', 'wb') as file:
        pickle.dump(reviewers, file)

### Total Reviewed Items by Reviewer:

In [17]:
# Print the reviewed pull-request list and count for the first five reviewers.
for reviewer in reviewers[0:5]:
    
    res = total_pr_reviewed[reviewer]
    
    print('======================================================')
    print('REVIEWER_ID:', reviewer)
    print('REVIEWED_PR_LIST:', res['pr_list'])
    print('REVIEWED_PR_COUNT:', res['total_counts'])

REVIEWER_ID: 3782661
REVIEWED_PR_LIST: [9115444, 9855523, 12511590]
REVIEWED_PR_COUNT: 3
REVIEWER_ID: 6123527
REVIEWED_PR_LIST: [11212613, 10713886]
REVIEWED_PR_COUNT: 2
REVIEWER_ID: 3653642
REVIEWED_PR_LIST: [27730309, 26105726, 18542150]
REVIEWED_PR_COUNT: 3
REVIEWER_ID: 61452
REVIEWED_PR_LIST: [10257553, 18772932]
REVIEWED_PR_COUNT: 2
REVIEWER_ID: 6123537
REVIEWED_PR_LIST: [8981843, 16005488, 11682049, 11980907, 10899860, 13201116, 9829448]
REVIEWED_PR_COUNT: 7
