In [1]:
import pandas as pd
import pickle

In [2]:
data = pd.read_csv('./raw/kubernetes.csv')

In [3]:
pr_id_list = data['pr_id'].unique()

In [4]:
def get_num_reviews(pr_id, data):
    """
    Input : pull request id (pr_id) and the dataset(data)
    Return : a dictionary with each entry as (reviewer_id, no. of comments on pr_id)
    """
    
    reviewer_list={}
    
    pr_df = data.loc[data.pr_id == pr_id, :]
    
    commenter_list = list(pr_df.commenter_id.values)
    commenter_set = set(commenter_list)
    
    head_commiter_set = set(pr_df.head_commit_committer_id.values)
    head_author_set = set(pr_df.head_commit_author_id.values)
    pr_owner_set = (head_author_set | head_commiter_set) & commenter_set
    reviewer_set = commenter_set - pr_owner_set
    
    buf_list = []
    for c in commenter_list:
        if c not in pr_owner_set:
            buf_list += [c]
    
    commenter_freq = {}
    for reviewer in buf_list:
        if reviewer not in commenter_freq:
            commenter_freq.update({reviewer: 1})
        else:
            commenter_freq[reviewer] += 1

    return commenter_freq

In [5]:
pr_id_list[0:5]

array([16143045, 16074582, 16073995, 18119032, 16078235], dtype=int64)

### Feature 1: Number of comments made by Reviewer r on PullRequest p

In [15]:
def ncomments_on_pr_by_reviewer(pr_id, reviewer_id, data):

    res = get_num_reviews(pr_id, data)
    
    if reviewer_id in res.keys():
        return res[reviewer_id]
    else:
        return 0

In [16]:
print(get_num_reviews(16078235, data))

{10456340: 3, 145523: 6, 13194: 6, 3035095: 1}


In [18]:
print(ncomments_on_pr_by_reviewer(16078235, 145523, data))

6


### Feature 2: Number of total Pull-Requests submitted by Reviewer r to the repository

In [51]:
def npullreq_on_repo_by_reviewer(reviewer, data):
    
    df = set(data[data.commenter_id == reviewer]['head_repo_id'])
    
    return len(df)

In [54]:
for e in entities:
    print(npullreq_on_repo_by_reviewer(e, data))

1
1
1
2
4
1
3
1
1
7
2
2
1
121
30
10
1
1
2
2
1
6
1
1
76
4
21
1
1
1
1
1
2
1
3
1
2
1
9
7
1
1
1
1
1
4
32
1
2
1
23
4
1
77
25
4
1
2
2
1
1
1
1
1
2
12
1
35
2
7
1
1
1
43
10
46
1
138
16
1
1
6
11
1
3
4
1
2
1
1
1
1
1
1
2
1
1
1
2
1
2
18
1
1
17
3
1
6
2
142
1
66
1
1
3
2
197
1
2
3
1
1
1
14
1
1
2
1
3
1
1
1
2
3
29
1
72
5
1
1
1
1
5
2
1
1
1
2
3
1
2
5
14
1
1
1
11
2
37
1
1
1
4
133
1
2
1
2
1
9
2
32
2
2
1
1
1
1
1
1
1
1
1
3
3
1
1
6
1
14
3
1
1
1
8
27
50
12
1
1
1
1
1
2
3
1
4
3
1
13
5
1
1
3
3
1
1
1
3
1
1
1
3
1
2
64
10
6
1
16
1
23
6
2
1
18
1
32
1
1
29
3
1
2
1
1
3
23
3
9
1
1
2
9
54
1
1
62
10
1
7
17
1
2
5
1
2
1
2
14
132
29
26
1
23
1
7
24
1
2
1
18
2
1
1
1
94
3
1
7
2
13
2
1
1
2
3
1
1
1
3
1
1
2
55
39
2
1
4
23
2
7
1
48
1
1
1
1
1
4
1
45
1
1
191
1
90
1
1
7
14
83
6
1
1
8
1
3
1
4
15
1
23
7
1
2
1
3
2
1
305
3
1
4
15
1
1
1
4
1
2
3
1
2
2
1
10
4
8
2
1
2
2
1
11
5
1
1
1
1
10
1
4
2
13
1
2
1
1
5
4
3
5
1
7
1
1
1
1
1
1
1
1
1
1
1
28
1
1
1
1
1
5
1
1
1
26
40
1
1
1
1
7
1
56
1
1
2
3
2
1
1
1
78
2
22
1
1
12
1
3
10
2
1
1
5
1
1
1
1
168
3
2
1
3

In [53]:
entities = None
with open('./cache/arr/entities.pkl', 'rb') as file:
    entities = pickle.load(file)
    
entity_metadata = None
with open('./cache/dicts/entity_metadata.pkl', 'rb') as file:
    entity_metadata = pickle.load(file)

In [8]:
"""
Getting a dictionary having elements as another dictionary reviewer_list, The aim is to get the number of comments 
made by reviewer r on pull request p
"""

reviewer_count ={}
for pr_id in pr_id_list:
    reviewer_count[pr_id] = get_num_reviews(pr_id,data)

In [None]:
def build_graph(graph,data):
    pr_id_list = data['pr_id'].unique()
    for pr_id in pr_id_list:
        pr_owner_set, reviewer_set = get_node_ids(pr_id, data)

        for pr_id in pr_id_list:
            if not graph.has_node(pr_id):
                user_info = node_info(pr_id, data)
                graph.add_node(pr_id, **user_info)

            for reviewer_id in reviewer_set:
                if not graph.has_node(reviewer_id):
                    user_info = node_info(reviewer_id, data)
                    graph.add_node(reviewer_id, **user_info)
                
                if not graph.has_edge(pr_id, reviewer_id):
                    edge_feat = get_edge_features(pr_id, reviewer_id, data)
                    graph.add_edge(pr_id, reviewer_id, feat=edge_feat)

    return graph


def get_node_ids(pr_id, data):
    
    pr_df = data.loc[data.pr_id == pr_id, :]
    
    commenter_set = set(pr_df.commenter_id.values)
    head_commiter_set = set(pr_df.head_commit_committer_id.values)
    head_author_set = set(pr_df.head_commit_author_id.values)
    pr_owner_set = (head_author_set | head_commiter_set) & commenter_set
    reviewer_set = commenter_set - pr_owner_set
    
    return pr_owner_set, reviewer_set

 def node_info(target,data):
    
    info_cols = [
        'commenter_username_enc', 
        'commenter_follower_count',
        'commenter_total_github_commit_count', 
        'commenter_base_repo_commit_count'
    ]
    
    info_dict = data.loc[(data.commenter_id == target), info_cols].iloc[0].to_dict()
    info_dict = create_features(info_dict)
    
    return info_dict

def create_features(info_dict):
    
    features = {}
    features["features"] = list(info_dict.values())
    
    return features      

In [12]:
reviewer_count[25020530]

{3119227: 2, 10965870: 3}

### Load Entities:

In [14]:
import pickle

In [15]:
entities = None
with open('./cache/entities.pkl', 'rb') as file:
    entities = pickle.load(file)

In [21]:
total_pr_reviewed = {}

for reviewer in entities:
    
    for pr_id in set(pr_id_list):
        
        if reviewer in list(reviewer_count[pr_id].keys()):
            if reviewer not in total_pr_reviewed.keys():
                total_pr_reviewed.update({reviewer: {'pr_list': [pr_id], 'total_counts': 1}})
            else:
                total_pr_reviewed[reviewer]['pr_list'] += [pr_id]
                total_pr_reviewed[reviewer]['total_counts'] += 1
                
tot_pr_count = {}
count = 0 
for (rev, pr) in tot_pr_reviewed.items():
    count = len(pr)
    tot_pr_count[rev] = count

In [25]:
list(total_pr_reviewed.keys())[0]

3782661

In [24]:
total_pr_reviewed[list(total_pr_reviewed.keys())[0]]

{'pr_list': [9115444, 9855523, 12511590], 'total_counts': 3}

In [20]:
total_pr_reviewed

{3782661: [9115444, 9855523, 12511590],
 6123527: [11212613, 10713886],
 3653642: [27730309, 26105726, 18542150],
 61452: [10257553, 18772932],
 6123537: [8981843, 16005488, 11682049, 11980907, 10899860, 13201116, 9829448],
 569360: [16234170],
 5931027: [11109531],
 9154581: [24599226],
 751638: [14221519,
  21201136,
  17170974,
  33358387,
  27918920,
  24871517,
  25985650,
  25035380,
  10584775,
  11141885,
  33653675,
  18285626,
  31458392,
  22217929,
  24970484,
  34014459,
  27690304,
  21628271,
  24806922,
  17008157,
  14255739,
  12683015,
  25102096,
  34867019,
  16943092,
  11831574,
  10914072,
  25233701,
  17729962,
  21891526,
  22350293,
  16648694,
  21432830,
  9112157,
  24185468,
  33196708,
  19303186,
  16190228,
  19434296,
  26446675,
  34835326,
  13536194,
  17370068,
  13077619,
  29822259,
  15994432,
  11636502,
  26480493,
  9080687,
  27234371,
  9801824,
  22647145,
  16159141,
  13472334,
  31953662,
  31789997,
  13735056,
  20255926,
  13833580

In [16]:
#Total pull requests submitted by reviewer r
tot_pr_reviewed = {}

for reviewer in entities:
    
    
    for pr_id in set(pr_id_list):
        
        
        if reviewer in list(reviewer_count[pr_id].keys()):
            tot_pr_reviewed[reviewer].append(pr_id)

tot_pr_count = {}
count = 0 
for (rev, pr) in tot_pr_reviewed.items():
    count = len(pr)
    tot_pr_count[rev] = count

KeyError: 3782661

In [None]:
tot_pr_diff_pr={}
for (rev,pr) in tot_pr_reviewed.items():
  pr_set_count = len(list(set(pr)))
  tot_pr_diff_pr[rev] = pr_set_count

  

In [None]:
def transition_matrix(a_uv,A_u):
    Q = a_uv/np.sum(A_u)