In [1]:
from collections import Counter
from datetime import datetime
import chars2vec as c2v
import pandas as pd
import numpy as np
import threading
import requests
import pickle
import json
import time

### CSV to Dictionary:

In [2]:
data_dict = []
df = pd.read_csv('./raw/kubernetes.csv', index_col=0)

In [3]:
cached = True

if cached == True:
    with open('./raw/kubernetes_json.pkl', 'rb') as file:
        data = pickle.load(file)
else:
    data_dict = dict(df.to_dict())
    data_cols = list(data_dict.keys())

    for index in range(df.shape[0]):
        temp = {}
        for col_no in range(df.shape[1]):
            temp.update({data_cols[col_no]: data_dict[data_cols[col_no]][index]})

        data += [temp]
    
    with open('./raw/kubernetes_json.pkl', 'wb') as file:
        pickle.dump(data, file)

### List of Pull Request IDs:

In [4]:
pull_req_ids = list(set(df['github_pr_id']))

### Scraping PR File Changes:

In [5]:
pull_request_files = {}

#### Callback:

In [6]:
def fetch_pr_file_changes(pr_id):

    global pull_request_files
    
    # ===============================================
    # Note: 5000 requests/hour for a user:
    # ===============================================

    url     = 'https://api.github.com/repos/kubernetes/kubernetes/pulls/{}/files'.format(pr_id)
    headers = {'Authorization': 'token <gh-auth-token>'}

    response = requests.get(url=url, headers=headers)
    pull_request_files.update({pr_id: response})

In [7]:
url     = 'https://api.github.com/rate_limit'
headers = {'Authorization': 'token <gh-auth-token>'}

response = requests.get(url=url, headers=headers)

#### PR[0:4096] - Batch 0:

In [8]:
cached = True

if cached == False:

    num_threads = 64
    
    counter = 0
    for i in range(0, 4096, num_threads):
        
        thread_executor = {}

        # define threads:
        for j in range(num_threads):
            target = fetch_pr_file_changes
            args   = tuple({pull_req_ids[i+j]})
            t_obj  = threading.Thread(target=target, args=args)
            thread_executor.update({'t_{}'.format(str(j)): t_obj})

        # start threads:
        for j in range(num_threads):
            thread_executor['t_{}'.format(str(j))].start()

        # join threads:
        for j in range(num_threads):
            thread_executor['t_{}'.format(str(j))].join()

        counter += num_threads
        print('Pull Requests Processed:', counter)
        time.sleep(4)

#### PR[4096-8192] Batch 1:

In [9]:
cached = True

if cached == False:

    num_threads = 64
    
    counter = 4096
    for i in range(4096, 8192, num_threads):
        
        thread_executor = {}

        # define threads:
        for j in range(num_threads):
            target = fetch_pr_file_changes
            args   = tuple({pull_req_ids[i+j]})
            t_obj  = threading.Thread(target=target, args=args)
            thread_executor.update({'t_{}'.format(str(j)): t_obj})

        # start threads:
        for j in range(num_threads):
            thread_executor['t_{}'.format(str(j))].start()

        # join threads:
        for j in range(num_threads):
            thread_executor['t_{}'.format(str(j))].join()

        counter += num_threads
        print('Pull Requests Processed:', counter)
        time.sleep(4)

#### PR[8192-11473] Batch 2:

In [10]:
cached = True

if cached == False:
    
    num_threads = 64
    
    counter = 8192
    for i in range(8192, 11456, num_threads):
        
        thread_executor = {}

        # define threads:
        for j in range(num_threads):
            target = fetch_pr_file_changes
            args   = tuple({pull_req_ids[i+j]})
            t_obj  = threading.Thread(target=target, args=args)
            thread_executor.update({'t_{}'.format(str(j)): t_obj})

        # start threads:
        for j in range(num_threads):
            thread_executor['t_{}'.format(str(j))].start()

        # join threads:
        for j in range(num_threads):
            thread_executor['t_{}'.format(str(j))].join()

        counter += num_threads
        print('Pull Requests Processed:', counter)
        time.sleep(4)
        
    for i in range(11456, 11473, 1):
        fetch_pr_file_changes(pull_req_ids[i])
        print('Pull Requests Processed:', i)
        time.sleep(1)

### Load Cached PR Filepath Changes:

In [11]:
cached = True

if cached == True:
    with open('./cache/dicts/requests/pr_filepath_changes.pkl', 'rb') as file:
        pull_request_files = pickle.load(file)

### Check Request Failures and Re-Request:

In [12]:
len(pull_request_files)

11473

#### Detect Failures:

In [13]:
count = 0
unsuccessful = []
for pr_id in list(pull_request_files.keys()):
    if pull_request_files[pr_id].status_code != 200:
        unsuccessful += [pr_id]
        count += 1
        
print('Unsuccessful Hits:', count)

Unsuccessful Hits: 0


#### Re-Request:

In [14]:
for u in unsuccessful:
    fetch_pr_file_changes(u)

### Update Cache:

In [15]:
update = False

if update == True:
    with open('./cache/dicts/req_dicts/filepath/filepath[0-4096].pkl', 'wb') as file:
        pickle.dump(pull_request_files, file)

    with open('./cache/dicts/req_dicts/filepath/filepath[4096-8192].pkl', 'wb') as file:
        pickle.dump(pull_request_files, file)

    with open('./cache/dicts/req_dicts/filepath/filepath[8192-11473].pkl', 'wb') as file:
        pickle.dump(pull_request_files, file)

### Github Pull Request to BigQuery Pull Request:

In [16]:
git_pr_2_bigquery_pr = {}
bigquery_pr_2_git_pr = {}

for row in data:
    git_pr_2_bigquery_pr.update({row['github_pr_id']: row['pr_id']})
    bigquery_pr_2_git_pr.update({row['pr_id']: row['github_pr_id']})

### Extract Owners' and Reviewers' Sets:

In [17]:
import pprint

In [18]:
def extract_owner_and_reviewer_set(pr_id, data):
    
    pr_data = []
    for row in data:
        pr_data += [row] if (row['pr_id'] == pr_id) else {}
    
    commenters = []
    head_authors = []
    head_committers = []
    
    for row in pr_data:
        commenters += [row['commenter_id']]
        head_authors += [row['head_commit_author_id']]
        head_committers += [row['head_commit_committer_id']]
        
    commenters_set = set(commenters)
    head_authors_set = set(head_authors)
    head_committers_set = set(head_committers)
    
    # pr_owners_set = commenter_set & (head_committers_set | head_authors_set)
    pr_owners_set = (head_committers_set.union(head_authors_set)).intersection(commenters_set)
    
    # pr_reviewers_set = commenters_set - pr_owners_set
    pr_reviewers_set = (commenters_set - pr_owners_set)
    
    return [list(pr_owners_set), list(pr_reviewers_set)]

In [19]:
def extract_owner_and_reviewer_set(pr_id, data):
    
    pr_data = []
    for row in data:
        pr_data += [row] if (row['pr_id'] == pr_id) else {}
    
    commenters = []
    head_authors = []
    head_committers = []
    
    for row in pr_data:
        commenters += [row['commenter_id']]
        head_authors += [row['head_commit_author_id']]
        head_committers += [row['head_commit_committer_id']]
        
    commenters_set = set(commenters)
    head_authors_set = set(head_authors)
    head_committers_set = set(head_committers)
    
    # pr_owners_set = commenter_set & (head_committers_set | head_authors_set)
    pr_owners_set = (head_committers_set.union(head_authors_set)).intersection(commenters_set)
    
    # pr_reviewers_set = commenters_set - pr_owners_set
    pr_reviewers_set = (commenters_set - pr_owners_set)
    
    return [list(pr_owners_set), list(pr_reviewers_set)]

### Blacklisting Pull Requests Whose Owner Set or Reviewer Set is Empty.

In [20]:
cached = True

blacklists = []
if cached == True:
    with open('./cache/arr/blacklists.pkl', 'rb') as file:
        blacklists = pickle.load(file)
else:
    for pr_id in pull_req_ids:
        res = extract_owner_and_reviewer_set(git_pr_2_bigquery_pr[pr_id], data)
        if len(res[0]) == 0 or len(res[1]) == 0:
            blacklists += [pr_id]
    with open('./cache/arr/blacklists.pkl', 'wb') as file:
        pickle.dump(blacklists, file)

In [21]:
len(blacklists)

2634

### Extract User's Metadata:

In [22]:
def extract_user_metadata(user_id, data):
    metadata = dict()
    for row in data:
        if row['commenter_id'] == user_id:
            metadata.update({'username': row['commenter_username']})
            metadata.update({'follower_count': row['commenter_follower_count']})
            metadata.update({'total_github_commit_count': row['commenter_total_github_commit_count']})
            metadata.update({'base_repo_commit_count': row['commenter_base_repo_commit_count']})
    return metadata

### Entities:

In [23]:
cached = True
entities = []

if cached == True:
    with open('./cache/arr/entities.pkl', 'rb') as file:
        entities = pickle.load(file)
else:
    for pr_id in list(set(pull_req_ids) - set(blacklists)):
        t = extract_owner_and_reviewer_set(git_pr_2_bigquery_pr[pr_id], data)
        entities += t[0]
        entities += t[1]
    entities = list(set(entities))   
    with open('./cache/arr/entities.pkl', 'wb') as file:
        pickle.dump(entities, file)

In [24]:
len(entities)

1187

### Entity Metadata:

In [25]:
cached = True
entity_metadata = dict()

if cached == True:
    with open('./cache/dicts/entity_metadata.pkl', 'rb') as file:
        entity_metadata = pickle.load(file)
else:
    for entity in entities:
        entity_metadata.update({entity: extract_user_metadata(entity, data)})
    with open('./cache/dicts/entity_metadata.pkl', 'wb') as file:
        pickle.dump(entity_metadata, file)

In [26]:
len(entity_metadata.keys())

1187

### Get Number of Comments:

In [27]:
def get_num_comments(owner, commenter, data):
    comments = []
    for row in data:
        is_owner = (row['head_commit_author_id'] == owner) or (row['base_commit_committer_id'] == owner)
        if (row['commenter_id'] == commenter) and is_owner:
            comments += [row]
    return len(comments)

### Exporting Filepath Data to a  CSV:

In [28]:
filechanges = dict()

for pr_id in pull_request_files.keys():
    
    preq_json = pull_request_files[pr_id].json()
    
    filechanges.update({pr_id: []})
    for f_no in range(len(preq_json)):
        filechanges[pr_id] += [{
            'filename': preq_json[f_no]['filename'],
            'status': preq_json[f_no]['status'],
            'additions': preq_json[f_no]['additions'],
            'deletions': preq_json[f_no]['deletions'],
            'changes': preq_json[f_no]['changes']
        }]

In [29]:
df = []

for pr_id in filechanges.keys():
    for i in range(len(filechanges[pr_id])):
        df += [[
            pr_id, 
            filechanges[pr_id][i]['filename'],
            filechanges[pr_id][i]['status'],
            filechanges[pr_id][i]['additions'], 
            filechanges[pr_id][i]['deletions'], 
            filechanges[pr_id][i]['changes']
        ]]

In [30]:
df = pd.DataFrame(df)

In [31]:
export_df = False

if export_df == True:
    df.to_csv(
        path_or_buf='filepath.csv', 
        header=['pr_id', 'filename', 'status', 'additions', 'deletions', 'changes'], 
        index=False
    )

### Scrapping Pull Request Titles:

In [32]:
pull_request_titles = {}

In [33]:
def fetch_pr_titles(pr_id):

    global pull_request_files
    
    # 5000 requests/hour/token
    headers = {'Authorization': 'token <gh-auth-token>'}
    
    response = requests.get(
        url='https://api.github.com/repos/kubernetes/kubernetes/pulls/{}'.format(pr_id), 
        headers=headers
    )

    pull_request_titles.update({pr_id: response})

### Check Remaining Requests:

In [34]:
headers = {'Authorization': 'token <gh-auth-token>'}

In [35]:
requests.get('https://api.github.com/rate_limit', params=headers).json()

{'resources': {'core': {'limit': 60,
   'remaining': 60,
   'reset': 1607130791,
   'used': 0},
  'graphql': {'limit': 0, 'remaining': 0, 'reset': 1607130791, 'used': 0},
  'integration_manifest': {'limit': 5000,
   'remaining': 5000,
   'reset': 1607130791,
   'used': 0},
  'search': {'limit': 10, 'remaining': 10, 'reset': 1607127251, 'used': 0}},
 'rate': {'limit': 60, 'remaining': 60, 'reset': 1607130791, 'used': 0}}

### Scrapping Pull Request Titles:

In [36]:
# 
# This code is functional. But commented because we cached the data
# so that no boring work everytime.
# 
#
# =============================================================
# 0 to 4096:
# =============================================================
# n_threads = 64
# counter = 0
# for i in range(0, 4096, n_threads):
#     thread_executor = {}
#     
#     # define threads:
#     for j in range(n_threads):
#         thread_executor.update({
#             't_{}'.format(str(j)): threading.Thread(target=fetch_pr_titles, args=(pull_req_ids[i+j],))
#         })
#     
#     # start threads:
#     for j in range(n_threads):
#         thread_executor['t_{}'.format(str(j))].start()
#     
#     # join threads:
#     for j in range(n_threads):
#         thread_executor['t_{}'.format(str(j))].join()
#     
#     counter += n_threads
#     print('Pull Requests Processed:', counter)
#     time.sleep(5)
# ==============================================================
# 4096 to 8192:
# ==============================================================
# 
# n_threads = 64
# 
# counter = 4096
# for i in range(4096, 8192, n_threads):
#     thread_executor = {}
#     
#     # define threads:
#     for j in range(n_threads):
#         thread_executor.update({
#             't_{}'.format(str(j)): threading.Thread(target=fetch_pr_titles, args=(pull_req_ids[i+j],))
#         })
#     
#     # start threads:
#     for j in range(n_threads):
#         thread_executor['t_{}'.format(str(j))].start()
#     
#     # join threads:
#     for j in range(n_threads):
#         thread_executor['t_{}'.format(str(j))].join()
#     
#     counter += n_threads
#     print('Pull Requests Processed:', counter)
#     time.sleep(4)
# 
# ==============================================================
# 8192 to 11456:
# ==============================================================
#
# n_threads = 64
# 
# counter = 8192
# for i in range(8192, 11456, n_threads):
#     thread_executor = {}
#     
#     # define threads:
#     for j in range(n_threads):
#         thread_executor.update({
#             't_{}'.format(str(j)): threading.Thread(target=fetch_pr_titles, args=(pull_req_ids[i+j],))
#         })
#     
#     # start threads:
#     for j in range(n_threads):
#         thread_executor['t_{}'.format(str(j))].start()
#     
#     # join threads:
#     for j in range(n_threads):
#         thread_executor['t_{}'.format(str(j))].join()
#     
#     counter += n_threads
#     print('Pull Requests Processed:', counter)
#     time.sleep(4)
# 
# for i in range(11456, 11473, 1):
#     fetch_pr_titles(pull_req_ids[i])
#     print(i)
#     time.sleep(1)

### Load PR Titles and Body:

In [37]:
with open('./cache/dicts/req_dicts/titles/titles[0-4096].pkl', 'rb') as file:
    temp = pickle.load(file)
    for t in list(temp.keys()):
        pull_request_titles.update({t: temp[t]})

with open('./cache/dicts/req_dicts/titles/titles[4096-8192].pkl', 'rb') as file:
    temp = pickle.load(file)
    for t in list(temp.keys()):
        pull_request_titles.update({t: temp[t]})
    
with open('./cache/dicts/req_dicts/titles/titles[8192-11473].pkl', 'rb') as file:
    temp = pickle.load(file)
    for t in list(temp.keys()):
        pull_request_titles.update({t: temp[t]})

In [38]:
len(pull_request_titles.keys())

11473

### Sanity Check:

In [39]:
count = 0
unsuccessful = []
for pr_id in list(pull_request_titles.keys()):
    if pull_request_titles[pr_id].status_code != 200:
        unsuccessful += [pr_id]
        count += 1

print('Unsuccessful Hits:', count)

Unsuccessful Hits: 0


### Get Failed Requests:

In [40]:
# for u in unsuccessful:
#     fetch_pr_titles(u)

### Extracting Titles from Request Files:

In [41]:
pr_titles = []

for pr_id in pull_req_ids:
    t = pull_request_titles[pr_id].json()
    pr_titles += [{
        'pr_id': pr_id, 
        'state': t['state'],
        'title': t['title'],
        'body': t['body'],
        'created_at': t['created_at'],
        'updated_at': t['updated_at'],
        'closed_at': t['closed_at'],
        'merged_at': t['merged_at'],
    }]

### Exporting the Pull Request Titles and Body:

In [42]:
export_df = False

if export_df == True:
    df = pd.DataFrame(data=pr_titles)
    df.to_csv('./pull_req_titles.csv', index=False)

### Reviewers List:

In [43]:
cached = True

reviewers = []
if cached == True:
    with open('./cache/arr/reviewers.pkl', 'rb') as file:
        reviewers = pickle.load(file)
else:
    for pr_id in pull_req_ids:
        reviewers += extract_owner_and_reviewer_set(git_pr_2_bigquery_pr[pr_id], data)[1]
    reviewers = list(set(reviewers))
    with open('./cache/arr/reviewers.pkl', 'wb') as file:
        pickle.dump(reviewers, file)