In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
data = pd.read_csv('./raw/kubernetes.csv')

In [3]:
pr_id_list = data['pr_id'].unique()

In [4]:
def get_num_reviews(pr_id, data):
    """
    Input : pull request id (pr_id) and the dataset(data)
    Return : a dictionary with each entry as (reviewer_id, no. of comments on pr_id)
    """
    
    reviewer_list={}
    
    pr_df = data.loc[data.pr_id == pr_id, :]
    
    commenter_list = list(pr_df.commenter_id.values)
    commenter_set = set(commenter_list)
    
    head_commiter_set = set(pr_df.head_commit_committer_id.values)
    head_author_set = set(pr_df.head_commit_author_id.values)
    pr_owner_set = (head_author_set | head_commiter_set) & commenter_set
    reviewer_set = commenter_set - pr_owner_set
    
    buf_list = []
    for c in commenter_list:
        if c not in pr_owner_set:
            buf_list += [c]
    
    commenter_freq = {}
    for reviewer in buf_list:
        if reviewer not in commenter_freq:
            commenter_freq.update({reviewer: 1})
        else:
            commenter_freq[reviewer] += 1

    return commenter_freq

### Feature 1: Number of comments made by Reviewer r on PullRequest p

In [5]:
def ncomments_on_pr_by_reviewer(pr_id, reviewer_id, data):

    res = get_num_reviews(pr_id, data)
    
    if reviewer_id in res.keys():
        return res[reviewer_id]
    else:
        return 0

In [6]:
print(get_num_reviews(16078235, data))

{10456340: 3, 145523: 6, 13194: 6, 3035095: 1}


In [7]:
print(ncomments_on_pr_by_reviewer(16078235, 145523, data))

6


### All Reviewers:

In [8]:
all_reviewers = []
for pr_id in pr_id_list:
    all_reviewers += list()

### Feature 2: Number of total Pull-Requests submitted by Reviewer r to the repository

In [9]:
def npullreq_on_repo_by_reviewer(reviewer, data):
    
    df = set(data[data.commenter_id == reviewer]['head_repo_id'])
    
    return len(df)

In [10]:
entities = None
with open('./cache/arr/entities.pkl', 'rb') as file:
    entities = pickle.load(file)
    
entity_metadata = None
with open('./cache/dicts/entity_metadata.pkl', 'rb') as file:
    entity_metadata = pickle.load(file)

In [11]:
for e in entities:
    print(npullreq_on_repo_by_reviewer(e, data))

1
1
1
2
4
1
3
1
1
7
2
2
1
121
30
10
1
1
2
2
1
6
1
1
76
4
21
1
1
1
1
1
2
1
3
1
2
1
9
7
1
1
1
1
1
4
32
1
2
1
23
4
1
77
25
4
1
2
2
1
1
1
1
1
2
12
1
35
2
7
1
1
1
43
10
46
1
138
16
1
1
6
11
1
3
4
1
2
1
1
1
1
1
1
2
1
1
1
2
1
2
18
1
1
17
3
1
6
2
142
1
66
1
1
3
2
197
1
2
3
1
1
1
14
1
1
2
1
3
1
1
1
2
3
29
1
72
5
1
1
1
1
5
2
1
1
1
2
3
1
2
5
14
1
1
1
11
2
37
1
1
1
4
133
1
2
1
2
1
9
2
32
2
2
1
1
1
1
1
1
1
1
1
3
3
1
1
6
1
14
3
1
1
1
8
27
50
12
1
1
1
1
1
2
3
1
4
3
1
13
5
1
1
3
3
1
1
1
3
1
1
1
3
1
2
64
10
6
1
16
1
23
6
2
1
18
1
32
1
1
29
3
1
2
1
1
3
23
3
9
1
1
2
9
54
1
1
62
10
1
7
17
1
2
5
1
2
1
2
14
132
29
26
1
23
1
7
24
1
2
1
18
2
1
1
1
94
3
1
7
2
13
2
1
1
2
3
1
1
1
3
1
1
2
55
39
2
1
4
23
2
7
1
48
1
1
1
1
1
4
1
45
1
1
191
1
90
1
1
7
14
83
6
1
1
8
1
3
1
4
15
1
23
7
1
2
1
3
2
1
305
3
1
4
15
1
1
1
4
1
2
3
1
2
2
1
10
4
8
2
1
2
2
1
11
5
1
1
1
1
10
1
4
2
13
1
2
1
1
5
4
3
5
1
7
1
1
1
1
1
1
1
1
1
1
1
28
1
1
1
1
1
5
1
1
1
26
40
1
1
1
1
7
1
56
1
1
2
3
2
1
1
1
78
2
22
1
1
12
1
3
10
2
1
1
5
1
1
1
1
168
3
2
1
3

In [12]:
"""
Getting a dictionary having elements as another dictionary reviewer_list, The aim is to get the number of comments 
made by reviewer r on pull request p
"""

reviewer_count ={}
for pr_id in pr_id_list:
    reviewer_count[pr_id] = get_num_reviews(pr_id,data)

In [13]:
def build_graph(graph,data):
    pr_id_list = data['pr_id'].unique()
    for pr_id in pr_id_list:
        pr_owner_set, reviewer_set = get_node_ids(pr_id, data)

        for pr_id in pr_id_list:
            if not graph.has_node(pr_id):
                user_info = node_info(pr_id, data)
                graph.add_node(pr_id, **user_info)

            for reviewer_id in reviewer_set:
                if not graph.has_node(reviewer_id):
                    user_info = node_info(reviewer_id, data)
                    graph.add_node(reviewer_id, **user_info)
                
                if not graph.has_edge(pr_id, reviewer_id):
                    edge_feat = get_edge_features(pr_id, reviewer_id, data)
                    graph.add_edge(pr_id, reviewer_id, feat=edge_feat)

    return graph


def get_node_ids(pr_id, data):
    
    pr_df = data.loc[data.pr_id == pr_id, :]
    
    commenter_set = set(pr_df.commenter_id.values)
    head_commiter_set = set(pr_df.head_commit_committer_id.values)
    head_author_set = set(pr_df.head_commit_author_id.values)
    pr_owner_set = (head_author_set | head_commiter_set) & commenter_set
    reviewer_set = commenter_set - pr_owner_set
    
    return pr_owner_set, reviewer_set

def node_info(target,data):
    
    info_cols = [
        'commenter_username_enc', 
        'commenter_follower_count',
        'commenter_total_github_commit_count', 
        'commenter_base_repo_commit_count'
    ]
    
    info_dict = data.loc[(data.commenter_id == target), info_cols].iloc[0].to_dict()
    info_dict = create_features(info_dict)
    
    return info_dict

def create_features(info_dict):
    
    features = {}
    features["features"] = list(info_dict.values())
    
    return features

In [14]:
reviewer_count[25020530]

{3119227: 2, 10965870: 3}

### Load Entities:

In [15]:
import pickle

In [16]:
entities = None
with open('./cache/arr/entities.pkl', 'rb') as file:
    entities = pickle.load(file)

In [17]:
total_pr_reviewed = {}

for reviewer in entities:
    
    for pr_id in set(pr_id_list):
        
        if reviewer in list(reviewer_count[pr_id].keys()):
            if reviewer not in total_pr_reviewed.keys():
                total_pr_reviewed.update({reviewer: {'pr_list': [pr_id], 'total_counts': 1}})
            else:
                total_pr_reviewed[reviewer]['pr_list'] += [pr_id]
                total_pr_reviewed[reviewer]['total_counts'] += 1
                
tot_pr_count = {}
count = 0 
for (rev, pr) in tot_pr_reviewed.items():
    count = len(pr)
    tot_pr_count[rev] = count

NameError: name 'tot_pr_reviewed' is not defined

In [18]:
list(total_pr_reviewed.keys())[0]

3782661

In [19]:
total_pr_reviewed[list(total_pr_reviewed.keys())[0]]

{'pr_list': [9115444, 9855523, 12511590], 'total_counts': 3}

In [20]:
#Total pull requests submitted by reviewer r
tot_pr_reviewed = {}

for reviewer in entities:
    
    
    for pr_id in set(pr_id_list):
        
        
        if reviewer in list(reviewer_count[pr_id].keys()):
            tot_pr_reviewed[reviewer].append(pr_id)

tot_pr_count = {}
count = 0 
for (rev, pr) in tot_pr_reviewed.items():
    count = len(pr)
    tot_pr_count[rev] = count

KeyError: 3782661

In [21]:
tot_pr_diff_pr = {}

for (rev, pr) in tot_pr_reviewed.items():
    pr_set_count = len(list(set(pr)))
    tot_pr_diff_pr[rev] = pr_set_count

  

In [22]:
def transition_matrix(a_uv,A_u):
    Q = a_uv/np.sum(A_u)

In [23]:
data = pd.read_csv('raw/kubernetes.csv')

In [24]:
len(set(data['pr_id']))

11473

# Graph Matrix Creation:

In [25]:
reviewers = None
with open('./cache/arr/reviewers.pkl', 'rb') as file:
    reviewers = pickle.load(file)

In [26]:
pr_id_2_matrix_row_id = {}
matrix_row_id_2_pr_id = {}

counter = 0
for pr_id in sorted(pr_id_list):
    pr_id_2_matrix_row_id.update({pr_id: counter})
    matrix_row_id_2_pr_id.update({counter: pr_id})
    counter += 1

In [27]:
reviewer_id_2_matrix_col_id = {}
matrix_col_id_2_reviewer_id = {}

counter = 0
for reviewer in sorted(reviewers):
    reviewer_id_2_matrix_col_id.update({reviewer: counter})
    matrix_col_id_2_reviewer_id.update({counter: reviewer})
    counter += 1

### SRW Graph:

In [32]:
srw_psi_matrix = np.zeros(
    shape=(
        len(pr_id_2_matrix_row_id.keys()), 
        len(reviewer_id_2_matrix_col_id.keys()), 
        1# + 1 # len(feat_vec) + 1
    ), 
    dtype=np.object
)

In [33]:
for pr_id in pr_id_list:
    res = get_num_reviews(pr_id, data)
    for reviewer in reviewers:
        
        if reviewer in res.keys():
            # Feature present or not:
            #srw_psi_matrix[pr_id_2_matrix_row_id[pr_id]][reviewer_id_2_matrix_col_id[reviewer]][0] = 1
            
            # Actual Features:
            srw_psi_matrix[pr_id_2_matrix_row_id[pr_id]][reviewer_id_2_matrix_col_id[reviewer]][0] = res[reviewer]
            
        else:
            srw_psi_matrix[pr_id_2_matrix_row_id[pr_id]][reviewer_id_2_matrix_col_id[reviewer]][0] = 0

In [35]:
import scipy.io

scipy.io.savemat('matrix.mat', {'srw_mat': srw_psi_matrix})

In [34]:
for i in range(500):
    for j in range(500):
        if srw_psi_matrix[i][j][0] != 0:
            print(srw_psi_matrix[i][j])

[4]
[6]
[3]
[6]
[4]
[1]
[6]
[1]
[11]
[1]
[1]
[6]
[1]
[2]
[2]
[1]
[7]
[3]
[1]
[2]
[1]
[4]
[1]
[9]
[1]
[3]
[3]
[1]
[3]
[9]
[1]
[3]
[3]
[8]
[4]
[3]
[2]
[1]
[4]
[1]
[2]
[1]
[9]
[3]
[5]
[1]
[1]
[4]
[1]
[1]
[4]
[10]
[1]
[1]
[1]
[4]
[9]
[1]
[3]
[3]
[1]
[15]
[8]
[1]
[2]
[3]
[2]
[3]
[1]
[4]
[2]
[2]
[5]
[4]
[5]
[1]
[1]
[1]
[6]
[1]
[6]
[1]
[2]
[1]
[3]
[1]
[25]
[46]
[15]
[1]
[2]
[1]
[5]
[2]
[3]
[2]
[2]
[1]
[2]
[8]
[1]
[7]
[8]
[1]
[4]
[1]
[2]
[2]
[4]
[4]
[2]
[3]
[3]
[2]
[2]
[1]
[1]
[3]
[1]
[1]
[26]
[9]
[2]
[4]
[12]
[1]
[20]
[2]
[4]
[2]
[4]
[13]
[26]
[7]
[13]
[2]
[3]
[4]
[1]
[2]
[2]
[3]
[3]
[2]
[1]
[1]
[15]
[5]
[1]
[2]
[7]
[16]
[1]
[5]
[1]
[4]
[1]
[18]
[5]
[2]
[13]
[24]
[7]
[4]
[2]
[7]
[1]
[2]
[3]
[6]
[12]
[23]
[2]
[3]
[66]
[7]
[1]
[1]
[1]
[2]
[11]
[2]
[7]
[47]
[10]
[1]
[2]
[17]
[5]
[1]
[14]
[2]
[2]
[14]
[3]
[16]
[8]
[1]
[2]
[13]
[4]
[13]
[2]
[2]
[2]
[2]
[1]
[2]
[1]
[2]
[7]
[5]
[2]
[1]
[3]
[2]
[3]
[11]
[6]
[9]
[8]
[1]
[3]
[1]
[4]
[6]
[14]
[6]
[7]
[5]
[1]
[1]
[3]
[1]
[1]
[1]
[10]
[15]
[4]
[1]
[15]
[1