In [27]:
import pandas as pd
import numpy as np
import os
import requests
from tqdm import tqdm
import re

# Label GitHub Issues

In [2]:
gh_bq_issues = pd.read_csv('./data/gokube_phase1_jun19/GH_unlabeled_issues.csv')
gh_bq_issues.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195866 entries, 0 to 195865
Data columns (total 18 columns):
repo_name                195866 non-null object
event_type               195866 non-null object
actor_id                 195866 non-null int64
actor_name               195866 non-null object
issue_status             195866 non-null object
issue_api_url            195866 non-null object
issue_url                195866 non-null object
issue_creator_name       195866 non-null object
issue_creator_api_url    195866 non-null object
issue_creator_url        195866 non-null object
comment_count            195866 non-null int64
issue_id                 195866 non-null int64
issue_number             195866 non-null int64
issue_created_at         195866 non-null object
issue_updated_at         195866 non-null object
issue_closed_at          89574 non-null object
issue_title              195863 non-null object
issue_body               194279 non-null object
dtypes: int64(4), object(14)
m

In [14]:
gh_bq_issues = gh_bq_issues[~pd.isnull(gh_bq_issues.issue_id)]
gh_bq_issues.issue_created_at = pd.to_datetime(gh_bq_issues.issue_created_at)
gh_bq_issues.issue_updated_at = pd.to_datetime(gh_bq_issues.issue_updated_at)
gh_bq_issues.issue_closed_at = pd.to_datetime(gh_bq_issues.issue_closed_at)

gh_bq_issues = gh_bq_issues[gh_bq_issues.issue_created_at.dt.year <= 2019]
gh_bq_issues = gh_bq_issues[gh_bq_issues.issue_updated_at.dt.year <= 2019]
gh_bq_issues = gh_bq_issues[gh_bq_issues.issue_closed_at.dt.year <= 2019]

gh_bq_issues = gh_bq_issues.loc[gh_bq_issues.groupby('issue_id').issue_updated_at.idxmax(skipna=False)]

gh_bq_issues.shape

(86356, 18)

In [24]:
gh_cve_issue_links = pd.read_csv('./data/gokube_phase1_jun19/gh_cve_issue_links.csv')
cve_issue_links = gh_cve_issue_links.issue.tolist()
cve_issue_links.extend(['https://github.com/golang/go/issues/30642', 
                        'https://github.com/golang/go/issues/30794', 
                        'https://github.com/hashicorp/consul/issues/5423'])
print('Total CVE issues:', len(cve_issue_links))

Total CVE issues: 289


In [25]:
cve_issues = gh_bq_issues[gh_bq_issues.issue_url.isin(cve_issue_links)]
cve_issues.shape

(22, 18)

In [26]:
found_issue_urls = cve_issues.issue_url.tolist()
not_found_issue_urls = list(set(cve_issue_links) - set(found_issue_urls))
print('Issues not found in unlabeled data: ', len(not_found_issue_urls))

Issues not found in unlabeled data:  267


In [28]:
not_found_api_urls = ['https://api.github.com/repos/'+re.search(r'.*github.com/(.*)', link, re.I).groups()[0]
                          for link in not_found_issue_urls]

data = []
for link in tqdm(not_found_api_urls):
    response = requests.get(link,
                            auth=('dipanjanS', ''))
    if not response.status_code == 200:
        print('Failed for link: '+link)
        # log this later
    else:
        content = response.json()
        issue_dict = {
            'repo_name': re.search(r'.*github.com/repos/(.*?)/issues', 
                                   link, re.I).groups()[0],
            'event_type': 'IssuesEvent',
            'actor_id': content.get('user').get('id'),
            'actor_name': content.get('user').get('login'),
            'issue_status': content.get('state'),
            'issue_api_url': content.get('url'),
            'issue_url': content.get('html_url'),
            'issue_creator_name': content.get('user').get('login'),
            'issue_creator_api_url': content.get('user').get('url'),
            'issue_creator_url': content.get('user').get('html_url'),
            'comment_count': content.get('comments'),
            'issue_id': content.get('id'),
            'issue_number': content.get('number'),
            'issue_created_at': content.get('created_at'),
            'issue_updated_at': content.get('updated_at'),
            'issue_closed_at': content.get('closed_at'),
            'issue_title': content.get('title'),
            'issue_body': content.get('body')
        }
        if issue_dict:
            data.append(issue_dict)

print('Found missing issues:', len(data))

100%|██████████| 267/267 [01:09<00:00,  4.41it/s]

Found missing issues: 267





In [30]:
cve_issues_nf = pd.DataFrame(data)
cve_issues_nf = cve_issues_nf[cve_issues.columns.tolist()]

gh_bq_issues_cve = pd.concat([cve_issues, cve_issues_nf], axis=0).reset_index(drop=True)
gh_bq_issues_negative = gh_bq_issues.drop(cve_issues.index.tolist()).reset_index(drop=True)
gh_bq_issues_cve.shape, gh_bq_issues_negative.shape

((289, 18), (86334, 18))

In [34]:
gh_bq_issues_cve['class_label'] = 2
gh_bq_issues_negative['class_label'] = 0

gh_bq_issues_processed = pd.concat([gh_bq_issues_negative, gh_bq_issues_cve], axis=0).reset_index(drop=True)
gh_bq_issues_processed.shape

(86623, 19)

In [38]:
gh_bq_issues_processed.to_csv('./data/gokube_phase1_jun19/GH_cve_labeled_issues.csv', index=False)

# Label GitHub Pull Requests

In [39]:
gh_bq_prs = pd.read_csv('./data/gokube_phase1_jun19/GH_unlabeled_prs.csv')
gh_bq_prs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168759 entries, 0 to 168758
Data columns (total 27 columns):
repo_name                   168759 non-null object
event_type                  168759 non-null object
actor_id                    168759 non-null int64
actor_name                  168759 non-null object
pr_status                   168759 non-null object
pr_id                       168759 non-null int64
pr_number                   168759 non-null int64
pr_api_url                  168759 non-null object
pr_url                      168759 non-null object
pr_diff_url                 168759 non-null object
pr_patch_url                168759 non-null object
pr_creator_name             168759 non-null object
pr_creator_api_url          168759 non-null object
pr_creator_url              168759 non-null object
pr_created_at               168759 non-null object
pr_updated_at               168759 non-null object
pr_closed_at                83684 non-null object
pr_merged_at              

In [40]:
gh_bq_prs = gh_bq_prs[~pd.isnull(gh_bq_prs.pr_title)]

def fill_missing_links(record):
    repo_name = record['repo_name']
    pr_number = str(record['pr_number'])
    if pd.isnull(record['pr_api_url']):
        record['pr_api_url'] = 'https://api.github.com/repos/'+repo_name+'/pulls/'+pr_number
    if pd.isnull(record['pr_url']):
        record['pr_url'] = 'https://github.com/'+repo_name+'/pull/'+pr_number 
    if pd.isnull(record['pr_diff_url']):
        record['pr_diff_url'] = 'https://github.com/'+repo_name+'/pull/'+pr_number+'.diff' 
    if pd.isnull(record['pr_patch_url']):
        record['pr_patch_url'] = 'https://github.com/'+repo_name+'/pull/'+pr_number+'.patch'
    return record

gh_bq_prs = gh_bq_prs.apply(lambda row: fill_missing_links(row), axis=1)

In [41]:
gh_bq_prs_missing_info = gh_bq_prs[pd.isnull(gh_bq_prs.pr_updated_at)]
gh_bq_prs_full_info = gh_bq_prs[~pd.isnull(gh_bq_prs.pr_updated_at)]
gh_bq_prs_missing_info.shape, gh_bq_prs_full_info.shape

((0, 27), (168759, 27))

In [42]:
gh_bq_prs_missing_info = gh_bq_prs_missing_info.drop_duplicates(subset=['pr_url'], keep="last")
gh_bq_prs_missing_info.shape

(0, 27)

In [43]:
gh_bq_prs_full_info.pr_created_at = pd.to_datetime(gh_bq_prs_full_info.pr_created_at)
gh_bq_prs_full_info.pr_updated_at = pd.to_datetime(gh_bq_prs_full_info.pr_updated_at)
gh_bq_prs_full_info.pr_closed_at = pd.to_datetime(gh_bq_prs_full_info.pr_closed_at)
gh_bq_prs_full_info.pr_merged_at = pd.to_datetime(gh_bq_prs_full_info.pr_merged_at)

gh_bq_prs_full_info = gh_bq_prs_full_info[gh_bq_prs_full_info.pr_created_at.dt.year <= 2019]
gh_bq_prs_full_info = gh_bq_prs_full_info[gh_bq_prs_full_info.pr_updated_at.dt.year <= 2019]
gh_bq_prs_full_info = gh_bq_prs_full_info[gh_bq_prs_full_info.pr_closed_at.dt.year <= 2019]
gh_bq_prs_full_info = gh_bq_prs_full_info[gh_bq_prs_full_info.pr_merged_at.dt.year <= 2019]

gh_bq_prs_full_info = gh_bq_prs_full_info.loc[gh_bq_prs_full_info.groupby('pr_url').pr_updated_at.idxmax(skipna=False)]
gh_bq_prs_full_info.shape

(65176, 27)

In [44]:
gh_bq_prs = pd.concat([gh_bq_prs_full_info, gh_bq_prs_missing_info], axis=0).reset_index(drop=True)
gh_bq_prs.shape

(65176, 27)

In [48]:
gh_cve_pr_links = pd.read_csv('./data/gokube_phase1_jun19/gh_cve_pr_links.csv')
cve_pr_links = gh_cve_pr_links.pull_request.tolist()
print('Total CVE PRs:', len(cve_pr_links))

Total CVE PRs: 382


In [49]:
cve_prs = gh_bq_prs[gh_bq_prs.pr_url.isin(cve_pr_links)]
cve_prs.shape

(30, 27)

In [51]:
found_pr_urls = cve_prs.pr_url.tolist()
not_found_pr_urls = list(set(cve_pr_links) - set(found_pr_urls))
print('PRs not found in unlabeled data:', len(not_found_pr_urls))

PRs not found in unlabeled data: 352


In [52]:
not_found_api_urls = [re.sub('/pull/', '/pulls/', 
                             ('https://api.github.com/repos/'+re.search(r'.*github.com/(.*)', link, re.I)
                              .groups()[0]), re.I)
                          for link in not_found_pr_urls]

data = []
for link in tqdm(not_found_api_urls):
    response = requests.get(link,
                            auth=('dipanjanS', ''))
    if not response.status_code == 200:
        print('Failed for link: '+link)
        # log this later
    else:
        content = response.json()
        pr_dict = {
            'repo_name': re.search(r'.*github.com/repos/(.*?)/pulls', 
                                   link, re.I).groups()[0],
            'event_type': 'PullRequestEvent',
            'actor_id': content.get('user').get('id'),
            'actor_name': content.get('user').get('login'),
            'pr_status': content.get('state'),
            'pr_id': content.get('id'),
            'pr_number': content.get('number'),
            'pr_api_url': content.get('url'),
            'pr_url': content.get('html_url'),
            'pr_diff_url': content.get('diff_url'),
            'pr_patch_url': content.get('patch_url'),
            'pr_creator_name': content.get('user').get('login'),
            'pr_creator_api_url': content.get('user').get('url'),
            'pr_creator_url': content.get('user').get('html_url'),         
            'pr_created_at': content.get('created_at'),
            'pr_updated_at': content.get('updated_at'),
            'pr_closed_at': content.get('closed_at'),
            'pr_merged_at': content.get('merged_at'),
            'pr_merged_status': content.get('merged'),
            'pr_comments_count': content.get('comments'),
            'pr_review_comments_count': content.get('review_comments'),
            'pr_commits_count': content.get('commits'),
            'pr_additions_count': content.get('additions'),
            'pr_deletions_count': content.get('deletions'),
            'pr_changed_files_count': content.get('changed_files'),
            'pr_title': content.get('title'),
            'pr_body': content.get('body')
        }
        if pr_dict:
            data.append(pr_dict)

print(len(data))

100%|██████████| 352/352 [02:50<00:00,  2.55it/s]

352





In [53]:
cve_prs_nf = pd.DataFrame(data)
cve_prs_nf = cve_prs_nf[cve_prs.columns.tolist()]

gh_bq_prs_cve = pd.concat([cve_prs, cve_prs_nf], axis=0).reset_index(drop=True)
gh_bq_prs_negative = gh_bq_prs.drop(cve_prs.index.tolist()).reset_index(drop=True)
gh_bq_prs_cve.shape, gh_bq_prs_negative.shape

((382, 27), (65146, 27))

In [54]:
gh_bq_prs_cve['class_label'] = 2
gh_bq_prs_negative['class_label'] = 0

gh_bq_prs_processed = pd.concat([gh_bq_prs_negative, gh_bq_prs_cve], axis=0).reset_index(drop=True)
gh_bq_prs_processed.shape

(65528, 28)

In [55]:
gh_bq_prs_processed.to_csv('./data/gokube_phase1_jun19/GH_cve_labeled_prs.csv', index=False)