In [1]:
from collections import Counter
from datetime import datetime
import pandas as pd
import numpy as np
import threading
import requests
import pickle
import json
import time

### CSV to JSON:

In [2]:
# Read the raw CSV export and keep only its values as a NumPy array;
# the JSON export cell below addresses the columns by position.
df = (
    pd.read_csv('./raw/kubernetes.csv')
    .to_numpy()
)

In [3]:
export_json = False

if export_json:
    # JSON keys listed in the positional order of the CSV columns:
    # json_keys[k] is the name given to row[k].
    json_keys = [
        'pr_id',
        'github_pr_id',
        'commenter_id',
        'commenter_username',
        'commenter_follower_count',
        'commenter_total_github_commit_count',
        'commenter_base_repo_commit_count',
        'head_commit_author_id',
        'head_commit_author_username',
        'head_commit_committer_id',
        'head_commit_committer_username',
        'base_commit_author_id',
        'base_commit_author_username',
        'base_commit_committer_id',
        'base_commit_committer_username',
        'head_repo_id',
        'base_repo_id',
        'base_repo_owner',
        'base_repo_owner_username',
        'head_repo_owner_id',
        'head_repo_owner_username',
        'head_commit_id',
        'base_commit_id',
        'pullreq_id',
        'comment_created_at',
        'comment_position',
        'comment_id',
        'comment_body',
    ]

    # One dictionary per CSV row.
    data = [
        {key: row[position] for position, key in enumerate(json_keys)}
        for row in df
    ]

    with open('./kubernetes.json', 'w') as file:
        json.dump(data, file)

In [4]:
# Load the row dictionaries written by the CSV-to-JSON export cell.
data = None
with open('./kubernetes.json') as json_file:
    data = json.load(json_file)

### List of Pull Request IDs:

In [5]:
# Distinct GitHub pull request ids that appear in the comment rows.
pull_req_ids = list(set(row['github_pr_id'] for row in data))

In [6]:
# Report how many distinct pull requests were found.
print('Total Pull Requests: {}'.format(len(pull_req_ids)))

Total Pull Requests: 11473


### Scraping Pull Request File Changes From Github API:

This step requires an internet connection. The GitHub API allows **5000 requests/token/hour**, so the requests are split into chunks of at most 4096 pull requests.

In [7]:
# Maps a GitHub pull request id to the HTTP response fetched for its file list.
# Re-running this cell discards everything collected so far.
pull_request_files = dict()

In [8]:
def fetch_pr_file_changes(pr_id, token=None, timeout=30):
    """Fetch the changed-files listing of one kubernetes/kubernetes pull request.

    The raw ``requests`` response is stored in the module-level
    ``pull_request_files`` dict under ``pr_id``. Nothing is returned, so the
    function can be used directly as a ``threading.Thread`` target.

    Args:
        pr_id: GitHub pull request number.
        token: GitHub API token. When omitted, the ``GITHUB_TOKEN``
            environment variable is used; if that is unset as well, the
            request is sent without an ``Authorization`` header.
        timeout: Seconds to wait for the server before the request is
            aborted, so that a stalled connection cannot block a worker
            thread forever.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection error, timeout, ...). In that case no entry
            is stored for ``pr_id``.
    """
    import os  # local import: only needed to look up the token

    # Keep credentials out of the notebook source.
    if token is None:
        token = os.environ.get('GITHUB_TOKEN')
    headers = {'Authorization': 'token {}'.format(token)} if token else {}

    # The endpoint is paginated (30 entries per page by default, 100 at most).
    # Asking for the maximum page size captures far more of each PR in the
    # single response that is stored; PRs touching more than 100 files are
    # still truncated to their first page.
    response = requests.get(
        url='https://api.github.com/repos/kubernetes/kubernetes/pulls/{}/files'.format(pr_id),
        headers=headers,
        params={'per_page': 100},
        timeout=timeout
    )

    # The dict is mutated, not rebound, so no `global` statement is needed.
    pull_request_files[pr_id] = response

### $\checkmark$ - pull_requests(0, 4096):
### $\checkmark$ - pull_requests(4096, 8192):
### $\checkmark$ - pull_requests(8192, 11473):

In [9]:
# 
# This code is functional, but it is commented out because the responses
# are cached on disk, so the scraping does not have to be repeated on every run.
# 
#
# =============================================================
# 0 to 4096:
# =============================================================
# n_threads = 64
# counter = 0
# for i in range(0, 4096, n_threads):
#     thread_executor = {}
#     
#     # define threads:
#     for j in range(n_threads):
#         thread_executor.update({
#             't_{}'.format(str(j)): threading.Thread(target=fetch_pr_file_changes, args=(pull_req_ids[i+j],))
#         })
#     
#     # start threads:
#     for j in range(n_threads):
#         thread_executor['t_{}'.format(str(j))].start()
#     
#     # join threads:
#     for j in range(n_threads):
#         thread_executor['t_{}'.format(str(j))].join()
#     
#     counter += n_threads
#     print('Pull Requests Processed:', counter)
#     time.sleep(4)
# 
# ==============================================================
# 4096 to 8192:
# ==============================================================
# 
# n_threads = 64
# 
# counter = 4096
# for i in range(4096, 8192, n_threads):
#     thread_executor = {}
#     
#     # define threads:
#     for j in range(n_threads):
#         thread_executor.update({
#             't_{}'.format(str(j)): threading.Thread(target=fetch_pr_file_changes, args=(pull_req_ids[i+j],))
#         })
#     
#     # start threads:
#     for j in range(n_threads):
#         thread_executor['t_{}'.format(str(j))].start()
#     
#     # join threads:
#     for j in range(n_threads):
#         thread_executor['t_{}'.format(str(j))].join()
#     
#     counter += n_threads
#     print('Pull Requests Processed:', counter)
#     time.sleep(4)
#
# ==============================================================
# 8192 to 11456:
# ==============================================================
# 
# n_threads = 64
# 
# counter = 8192
# for i in range(8192, 11456, n_threads):
#     thread_executor = {}
#     
#     # define threads:
#     for j in range(n_threads):
#         thread_executor.update({
#             't_{}'.format(str(j)): threading.Thread(target=fetch_pr_file_changes, args=(pull_req_ids[i+j],))
#         })
#     
#     # start threads:
#     for j in range(n_threads):
#         thread_executor['t_{}'.format(str(j))].start()
#     
#     # join threads:
#     for j in range(n_threads):
#         thread_executor['t_{}'.format(str(j))].join()
#     
#     counter += n_threads
#     print('Pull Requests Processed:', counter)
#     time.sleep(4)
# 
# for i in range(11456, 11473, 1):
#     fetch_pr_file_changes(pull_req_ids[i])
#     print(i)
#     time.sleep(1)

### $\checkmark$ - save filepath(0, 4096)
### $\checkmark$ - save filepath(4096, 8192)
### $\checkmark$ - save filepath(8192, 11473)

In [10]:
export_filepaths = False

if export_filepaths:
    # Each cache file is named after the index range of `pull_req_ids` it
    # covers, so it receives only the responses of the ids in that range
    # (previously the complete dict was written to all three files).
    # NOTE: the loading cell reads these files from
    # ./cache/dicts/req_dicts/filepath/, so they have to be moved there.
    filepath_chunks = [
        ((0, 4096), './filepath[0-4096].pkl'),
        ((4096, 8192), './filepath[4096-8192].pkl'),
        ((8192, 11473), './filepath[8192-11473].pkl'),
    ]

    for (start, stop), chunk_path in filepath_chunks:
        chunk = {
            pr_id: pull_request_files[pr_id]
            for pr_id in pull_req_ids[start:stop]
            if pr_id in pull_request_files
        }

        # A range that has not been scraped in this session must not
        # overwrite a cache file written by an earlier session.
        if not chunk:
            continue

        with open(chunk_path, 'wb') as file:
            pickle.dump(chunk, file)

### Check Request Failures:

In [11]:
# Count the stored responses whose HTTP status is anything other than 200.
# NOTE: this only inspects what is in pull_request_files at this point; with
# the scraping cell commented out, the cached responses are loaded in a later cell.
count = sum(
    1
    for response in pull_request_files.values()
    if response.status_code != 200
)

print('Unsuccessful Hits:', count)

Unsuccessful Hits: 0


### Loading filepaths data:

In [12]:
# Merge the three cached response dictionaries back into pull_request_files.
# NOTE: unpickling can execute arbitrary code - only load cache files that
# were produced by this notebook.
for cache_path in (
    './cache/dicts/req_dicts/filepath/filepath[0-4096].pkl',
    './cache/dicts/req_dicts/filepath/filepath[4096-8192].pkl',
    './cache/dicts/req_dicts/filepath/filepath[8192-11473].pkl',
):
    with open(cache_path, 'rb') as file:
        cached_chunk = pickle.load(file)
    pull_request_files.update(cached_chunk)

In [13]:
# Number of pull requests for which a response is available.
len(pull_request_files)

11473

### Github Pull Request to BigQuery Pull Request:

In [14]:
# Lookup tables between the two pull request identifiers carried by every row.
# When an id occurs in several rows, the last row wins.
git_pr_2_bigquery_pr = {row['github_pr_id']: row['pr_id'] for row in data}
bigquery_pr_2_git_pr = {row['pr_id']: row['github_pr_id'] for row in data}

### Extract Owners' and Reviewers' Sets:

In [15]:
import pprint

In [16]:
def extract_owner_and_reviewer_set(pr_id, data):
    """Split the commenters of one pull request into owners and reviewers.

    Args:
        pr_id: Value matched against each row's ``'pr_id'``.
        data: Iterable of row dictionaries providing the keys ``'pr_id'``,
            ``'commenter_id'``, ``'head_commit_author_id'`` and
            ``'head_commit_committer_id'``.

    Returns:
        A two-element list ``[owners, reviewers]``:

        * owners: commenters who are also a head-commit author or a
          head-commit committer of the pull request;
        * reviewers: all remaining commenters.

        Both lists are empty when no row matches ``pr_id``. The order of the
        ids inside each list is unspecified (they come from sets).
    """
    commenters_set = set()
    head_authors_set = set()
    head_committers_set = set()

    # Single pass over the rows: collect the ids of the matching pull request
    # directly, without building intermediate row and id lists.
    for row in data:
        if row['pr_id'] == pr_id:
            commenters_set.add(row['commenter_id'])
            head_authors_set.add(row['head_commit_author_id'])
            head_committers_set.add(row['head_commit_committer_id'])

    # pr_owners_set = AND(commenters_set, OR(head_committers_set, head_authors_set))
    pr_owners_set = (head_committers_set.union(head_authors_set)).intersection(commenters_set)

    # pr_reviewers_set = commenters_set - pr_owners_set
    pr_reviewers_set = commenters_set - pr_owners_set

    return [list(pr_owners_set), list(pr_reviewers_set)]

### Blacklisting Pull Requests Whose Owner Set or Reviewer Set is Empty.

In [17]:
cached = True

blacklists = []
if cached:
    # Reuse the blacklist computed by an earlier run (trusted cache file only).
    with open('./cache/arr/blacklists.pkl', 'rb') as file:
        blacklists = pickle.load(file)
else:
    # A pull request is blacklisted when it has no owner or no reviewer.
    for pr_id in pull_req_ids:
        owners, reviewers = extract_owner_and_reviewer_set(git_pr_2_bigquery_pr[pr_id], data)
        if not owners or not reviewers:
            blacklists.append(pr_id)
    with open('./cache/arr/blacklists.pkl', 'wb') as file:
        pickle.dump(blacklists, file)

In [18]:
# Number of blacklisted pull requests (shown as the cell's output).
len(blacklists)

2634

### Extract User's Metadata:

In [19]:
def extract_user_metadata(user_id, data):
    """Return the commenter attributes recorded for ``user_id``.

    Every row whose ``'commenter_id'`` equals ``user_id`` overwrites the
    result, so the values of the last matching row are returned. The result
    has the keys ``'username'``, ``'follower_count'``,
    ``'total_github_commit_count'`` and ``'base_repo_commit_count'``, or is an
    empty dict when the user never appears as a commenter.
    """
    # (result key, row key) pairs, in the order the result is filled.
    field_map = (
        ('username', 'commenter_username'),
        ('follower_count', 'commenter_follower_count'),
        ('total_github_commit_count', 'commenter_total_github_commit_count'),
        ('base_repo_commit_count', 'commenter_base_repo_commit_count'),
    )

    profile = {}
    for record in data:
        if record['commenter_id'] == user_id:
            for target_key, source_key in field_map:
                profile[target_key] = record[source_key]
    return profile

### Entities:

In [20]:
cached = True
entities = []

if cached:
    # Reuse the entity list computed by an earlier run (trusted cache file only).
    with open('./cache/arr/entities.pkl', 'rb') as file:
        entities = pickle.load(file)
else:
    # Gather the owners and reviewers of every pull request that is not
    # blacklisted, then drop the duplicates.
    eligible_pr_ids = list(set(pull_req_ids) - set(blacklists))
    for pr_id in eligible_pr_ids:
        owners, reviewers = extract_owner_and_reviewer_set(git_pr_2_bigquery_pr[pr_id], data)
        entities.extend(owners)
        entities.extend(reviewers)
    entities = list(set(entities))
    with open('./cache/arr/entities.pkl', 'wb') as file:
        pickle.dump(entities, file)

In [21]:
# Number of distinct owner/reviewer ids (shown as the cell's output).
len(entities)

1187

### Entity Metadata:

In [22]:
cached = True
entity_metadata = dict()

if cached:
    # Reuse the metadata computed by an earlier run (trusted cache file only).
    with open('./cache/dicts/entity_metadata.pkl', 'rb') as file:
        entity_metadata = pickle.load(file)
else:
    # One metadata dictionary per entity id.
    entity_metadata = {
        entity: extract_user_metadata(entity, data)
        for entity in entities
    }
    with open('./cache/dicts/entity_metadata.pkl', 'wb') as file:
        pickle.dump(entity_metadata, file)

In [23]:
# Number of entities for which metadata is available.
len(entity_metadata)

1187

### Get Number of Comments:

In [24]:
def get_num_comments(owner, commenter, data):
    """Count the comments ``commenter`` left on pull requests owned by ``owner``.

    A row counts when its ``'commenter_id'`` equals ``commenter`` and
    ``owner`` is the row's head-commit author or head-commit committer. This
    is the same notion of ownership that ``extract_owner_and_reviewer_set``
    uses; the previous check against ``'base_commit_committer_id'`` did not
    match it.

    Args:
        owner: Id of the pull request owner.
        commenter: Id of the commenting user.
        data: Iterable of row dictionaries providing ``'commenter_id'``,
            ``'head_commit_author_id'`` and ``'head_commit_committer_id'``.

    Returns:
        The number of matching rows (0 for empty ``data``).
    """
    num_comments = 0
    for row in data:
        is_owner = (row['head_commit_author_id'] == owner) or (row['head_commit_committer_id'] == owner)
        if (row['commenter_id'] == commenter) and is_owner:
            num_comments += 1
    return num_comments

In [25]:
def temporal_min_max(timestamp):
    """Scale a time tuple linearly onto the 2014-01-01 .. 2019-01-01 window.

    ``timestamp`` is passed to ``time.mktime`` (local time). The start of the
    window maps to 0.0 and its end to 1.0; values outside the window are not
    clipped.
    """
    window_start = time.mktime(datetime(2014, 1, 1).timetuple())
    window_end = time.mktime(datetime(2019, 1, 1).timetuple())
    window_length = window_end - window_start

    elapsed = float(time.mktime(timestamp) - window_start)
    return elapsed / window_length

In [26]:
def get_edge_weight(owner_id, reviewer_id, data):
    """Print the scaled timestamps of ``reviewer_id``'s comments on ``owner_id``'s pull requests.

    A row is selected when its ``'commenter_id'`` equals ``reviewer_id`` and
    ``owner_id`` is the row's head-commit author or head-commit committer.
    (The committer was previously compared against ``reviewer_id``, which
    selected the reviewer's comments on their own commits instead.)

    Each selected row's ``'comment_created_at'`` is parsed with
    ``pd.to_datetime`` and scaled by ``temporal_min_max``.

    The function only prints the list of scaled times and returns ``None``;
    no edge weight is derived from them here.

    Args:
        owner_id: Id of the pull request owner.
        reviewer_id: Id of the commenting reviewer.
        data: Iterable of row dictionaries.
    """
    comment_scaled_times = []

    for row in data:
        is_owner = (row['head_commit_author_id'] == owner_id) or (row['head_commit_committer_id'] == owner_id)
        if (row['commenter_id'] == reviewer_id) and is_owner:
            created_at = pd.to_datetime(row['comment_created_at'])
            comment_scaled_times.append(temporal_min_max(created_at.timetuple()))

    print(comment_scaled_times)

### Export Filepath Changes to Pandas Dataframe:

In [27]:
# Keep the rows of pull requests that are not blacklisted and attach the
# fetched file-changes response to each of them.
# A set makes the membership test constant-time per row instead of scanning
# the whole blacklist list for every row.
blacklisted_pr_ids = set(blacklists)

new_data = []
for row in data:
    github_pr_id = row['github_pr_id']
    if github_pr_id in blacklisted_pr_ids:
        continue
    # As before, the row dictionary inside `data` is extended in place.
    row.update({'commit_file_changes': pull_request_files[github_pr_id]})
    new_data.append(row)

In [28]:
# Tabulate the retained rows. This rebinds `df`, which earlier in the
# notebook held the NumPy array read from the raw CSV.
df = pd.DataFrame(new_data)

In [29]:
export_df = False

if export_df:
    # NOTE(review): the frame is pickled, not written as CSV, despite the
    # '.csv' extension - it must be read back with pickle, not pd.read_csv.
    with open('dataset_with_file_changes.csv', 'wb') as out_file:
        pickle.dump(df, out_file)