In [1]:
import pandas as pd
import numpy as np
import re
import gc

# Get GoKube Repos which had past CVEs

In [2]:
# Load the CVE-to-GitHub-links CSV and print its column summary.
cve_df = pd.read_csv(
    filepath_or_buffer='./data/positive_github_links/go_cves_positive_links-may2019.csv'
)
cve_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1881 entries, 0 to 1880
Data columns (total 12 columns):
CVE ID               1328 non-null object
Package name         1804 non-null object
Ecosystem            1874 non-null object
GH issue             715 non-null object
GH PR                373 non-null object
GH Commit            948 non-null object
Bugzilla             259 non-null object
ML                   482 non-null object
Other sources        65 non-null object
Issue Reported on    51 non-null object
CVE reported on      58 non-null object
Fixed on             76 non-null object
dtypes: object(12)
memory usage: 176.4+ KB


In [3]:
# Keep only the rows whose Ecosystem column is exactly 'go'.
go_df = cve_df.loc[cve_df['Ecosystem'] == 'go']
go_df.shape

(153, 12)

In [4]:
def split_links(column, label):
    """Flatten a column of newline-separated links into a 1-D string array.

    Parameters
    ----------
    column : pandas.Series
        Column whose non-null cells each hold one or more links, one per line.
    label : str
        Prefix used in the before/after shape lines that are printed.

    Returns
    -------
    numpy.ndarray
        One whitespace-stripped link per element. The array has a string
        dtype even when the column contains no links at all.
    """
    cells = column.dropna().values
    print('{} Before:'.format(label), cells.shape)
    links = np.array(
        [str(link.strip()) for cell in cells for link in cell.strip().split('\n')],
        dtype=str,  # without this an empty result would silently become float64
    )
    print('{} After:'.format(label), links.shape)
    return links


# Same three columns, same order and same printed lines as before; the
# previously copy-pasted recipe now lives in one place.
gh_issues = split_links(go_df['GH issue'], 'Issues')
gh_prs = split_links(go_df['GH PR'], 'PRs')
gh_commits = split_links(go_df['GH Commit'], 'Commits')

Issues Before: (36,)
Issues After: (39,)
PRs Before: (40,)
PRs After: (45,)
Commits Before: (73,)
Commits After: (78,)


In [5]:
# Pool the three link arrays into one (order: issues, commits, PRs).
gh_links = np.concatenate([gh_issues, gh_commits, gh_prs])
gh_links.shape

(162,)

In [6]:
# Drop duplicate links (np.unique also returns them sorted), then show how many remain.
gh_links = np.unique(gh_links)
gh_links.shape

(155,)

In [7]:
# Keep only the links that contain the substring 'github'.
gh_links = np.array(list(filter(lambda link: 'github' in link, gh_links)))
gh_links.shape

(155,)

In [8]:
# Peek at the first ten links.
gh_links[:10]

array(['https://github.com/apache/thrift/commit/2007783e874d524a46b818598a45078448ecc53e',
       'https://github.com/apache/thrift/pull/1061',
       'https://github.com/astaxie/beego/commit/9865779f149669777ee33aae71cd29c8db8ffd66',
       'https://github.com/astaxie/beego/pull/3383',
       'https://github.com/brancz/kube-rbac-proxy/commit/c41c4dee92abc0859d952559e37cf9ad8a442789',
       'https://github.com/brancz/kube-rbac-proxy/pull/27',
       'https://github.com/cisco/node-jose/pull/88',
       'https://github.com/cloudflare/cfssl/commit/f74c74db7f22df0051d7f872b5161dfa2a797ace',
       'https://github.com/cloudflare/cfssl/pull/776',
       'https://github.com/cloudfoundry-incubator/bits-service/commit/9e4010e42a4b462fef69889a453b5c32d56e3100'],
      dtype='<U130')

In [9]:
# Capture "<owner>/<repo>" right after "github.com/".
# - The dot is escaped so only a literal "github.com" matches.
# - No trailing "/" is required, so a bare "github.com/owner/repo" URL is
#   kept instead of being silently dropped.
# - "#" and "?" end the repo segment so fragments/query strings are excluded.
pattern = re.compile(r'github\.com/([^/\s#?]+/[^/\s#?]+)', re.I)

# Search each link once; links without a match are skipped.
repo_names = np.array(
    [found.group(1) for found in map(pattern.search, gh_links) if found],
    dtype=str,  # stay a string array even if nothing matched
)
repo_names.shape

(155,)

In [10]:
# Collapse to the distinct repository names (np.unique also sorts them) and show the count.
repo_names = np.unique(repo_names)
repo_names.shape

(47,)

# Get Unlabeled GitHub Data for Repos having past CVEs 
## (Go-Kube ecosystem only) 

In [11]:
import bq_utils as bqu
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize

In [12]:
import os
# Point GOOGLE_APPLICATION_CREDENTIALS at the local key file before the
# BigQuery helper is constructed, then display the helper object.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS='bq_key.json')
gh_archive = bqu.BigQueryHelper(active_project="githubarchive", dataset_name="year")
gh_archive

<bq_utils.BigQueryHelper at 0x7fb07e91f390>

In [13]:
# Show the table names the helper reports for its dataset.
gh_archive.list_tables()

['2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018']

In [14]:
# Render the repository names as an explicit SQL string list.
# Relying on str(tuple(repo_names)) is fragile: NumPy >= 2 prints each element
# as np.str_('...'), and a single-element tuple prints a trailing comma --
# both produce invalid SQL. Backslashes and single quotes are escaped so an
# unexpected name cannot break out of its literal.
quoted_repos = (
    "'{}'".format(str(name).replace('\\', '\\\\').replace("'", "\\'"))
    for name in repo_names
)
repos_sql = '({})'.format(', '.join(quoted_repos))

# Count events per event type for the selected repos across the 2016-2018 tables.
query = """
SELECT  type, count(*)
        FROM `githubarchive.year.20*`
        WHERE _TABLE_SUFFIX IN ('16', '17', '18')
        AND repo.name in {repos}
        GROUP BY type
""".format(repos=repos_sql)
gh_archive.estimate_query_size(query)

41.9256986239925

In [15]:
# Run the per-type count query and display only the five event types listed below.
df = gh_archive.query_to_pandas(query)
df.loc[df['type'].isin(['PushEvent', 'PullRequestEvent', 'IssuesEvent',
                        'PullRequestReviewCommentEvent', 'IssueCommentEvent'])]

Unnamed: 0,type,f0_
2,PullRequestEvent,168759
3,IssueCommentEvent,1623715
6,PullRequestReviewCommentEvent,236683
10,PushEvent,117640
12,IssuesEvent,195866


In [16]:
# Render the repository names as an explicit SQL string list.
# Relying on str(tuple(repo_names)) is fragile: NumPy >= 2 prints each element
# as np.str_('...'), and a single-element tuple prints a trailing comma --
# both produce invalid SQL. Backslashes and single quotes are escaped so an
# unexpected name cannot break out of its literal.
quoted_repos = (
    "'{}'".format(str(name).replace('\\', '\\\\').replace("'", "\\'"))
    for name in repo_names
)
repos_sql = '({})'.format(', '.join(quoted_repos))

# IssuesEvent rows for the selected repos (2016-2018 tables). Title and body
# have line breaks replaced by a space and whitespace runs collapsed.
# Every backslash meant for BigQuery is doubled here because this is a normal
# (non-raw) Python string; a lone "\s" is an invalid Python escape sequence.
# {repos} is substituted with str.replace rather than str.format because the
# SQL itself contains literal braces ("{2,}").
query = """
SELECT 
    repo.name as repo_name, 
    type as event_type, 
    actor.id as actor_id,
    actor.login as actor_name,
    JSON_EXTRACT_SCALAR(payload, '$.action') as issue_status,
    JSON_EXTRACT_SCALAR(payload, '$.issue.url') as issue_api_url,
    JSON_EXTRACT_SCALAR(payload, '$.issue.html_url') as issue_url,
    JSON_EXTRACT_SCALAR(payload, '$.issue.user.login') as issue_creator_name,
    JSON_EXTRACT_SCALAR(payload, '$.issue.user.url') as issue_creator_api_url,
    JSON_EXTRACT_SCALAR(payload, '$.issue.user.html_url') as issue_creator_url,
    JSON_EXTRACT_SCALAR(payload, '$.issue.comments') as comment_count,
    JSON_EXTRACT_SCALAR(payload, '$.issue.id') as issue_id,
    JSON_EXTRACT_SCALAR(payload, '$.issue.number') as issue_number,
    JSON_EXTRACT_SCALAR(payload, '$.issue.created_at') as issue_created_at,
    JSON_EXTRACT_SCALAR(payload, '$.issue.updated_at') as issue_updated_at,
    JSON_EXTRACT_SCALAR(payload, '$.issue.closed_at') as issue_closed_at,
    TRIM(REGEXP_REPLACE(
             REGEXP_REPLACE(
                 JSON_EXTRACT_SCALAR(payload, '$.issue.title'), 
                 r'\\r\\n|\\r|\\n', 
                 ' '),
             r'\\s{2,}', 
             ' ')) as issue_title,
    TRIM(REGEXP_REPLACE(
             REGEXP_REPLACE(
                 JSON_EXTRACT_SCALAR(payload, '$.issue.body'), 
                 r'\\r\\n|\\r|\\n', 
                 ' '),
             r'\\s{2,}', 
             ' ')) as issue_body
        
FROM `githubarchive.year.20*`
    WHERE _TABLE_SUFFIX IN ('16', '17', '18')
    AND repo.name in {repos}
    AND type = 'IssuesEvent'
    """.replace('{repos}', repos_sql)
gh_archive.estimate_query_size(query)

2583.8375390227884

In [17]:
# Execute the issues query and write the result to CSV without the row index.
issues_df = gh_archive.query_to_pandas(query)
issues_df.to_csv(
    path_or_buf='./data/gokube_phase1_jun19/GH_unlabeled_issues.csv',
    index=False,
)

In [18]:
# Render the repository names as an explicit SQL string list.
# Relying on str(tuple(repo_names)) is fragile: NumPy >= 2 prints each element
# as np.str_('...'), and a single-element tuple prints a trailing comma --
# both produce invalid SQL. Backslashes and single quotes are escaped so an
# unexpected name cannot break out of its literal.
quoted_repos = (
    "'{}'".format(str(name).replace('\\', '\\\\').replace("'", "\\'"))
    for name in repo_names
)
repos_sql = '({})'.format(', '.join(quoted_repos))

# PullRequestEvent rows for the selected repos (2016-2018 tables). Title and
# body have line breaks replaced by a space and whitespace runs collapsed.
# Every backslash meant for BigQuery is doubled here because this is a normal
# (non-raw) Python string; a lone "\s" is an invalid Python escape sequence.
# {repos} is substituted with str.replace rather than str.format because the
# SQL itself contains literal braces ("{2,}").
query = """
SELECT 
    repo.name as repo_name, 
    type as event_type, 
    actor.id as actor_id,
    actor.login as actor_name,
    JSON_EXTRACT_SCALAR(payload, '$.action') as pr_status,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.id') as pr_id,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.number') as pr_number,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.url') as pr_api_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') as pr_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.diff_url') as pr_diff_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.patch_url') as pr_patch_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.user.login') as pr_creator_name,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.user.url') as pr_creator_api_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.user.html_url') as pr_creator_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.created_at') as pr_created_at,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.updated_at') as pr_updated_at,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.closed_at') as pr_closed_at,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged_at') as pr_merged_at,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged') as pr_merged_status,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.comments') as pr_comments_count,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.review_comments') as pr_review_comments_count,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.commits') as pr_commits_count,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.additions') as pr_additions_count,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.deletions') as pr_deletions_count,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.changed_files') as pr_changed_files_count,    
    TRIM(REGEXP_REPLACE(
             REGEXP_REPLACE(
                 JSON_EXTRACT_SCALAR(payload, '$.pull_request.title'), 
                 r'\\r\\n|\\r|\\n', 
                 ' '),
             r'\\s{2,}', 
             ' ')) as pr_title,
    TRIM(REGEXP_REPLACE(
             REGEXP_REPLACE(
                 JSON_EXTRACT_SCALAR(payload, '$.pull_request.body'), 
                 r'\\r\\n|\\r|\\n', 
                 ' '),
             r'\\s{2,}', 
             ' ')) as pr_body
        
FROM `githubarchive.year.20*`
    WHERE _TABLE_SUFFIX IN ('16', '17', '18')
    AND repo.name in {repos}
    AND type = 'PullRequestEvent'
""".replace('{repos}', repos_sql)
gh_archive.estimate_query_size(query)

2583.8375390227884

In [19]:
# Execute the pull-request query and write the result to CSV without the row index.
prs_df = gh_archive.query_to_pandas(query)
prs_df.to_csv(
    path_or_buf='./data/gokube_phase1_jun19/GH_unlabeled_prs.csv',
    index=False,
)

In [20]:
# Save the repository names as a one-column CSV (column 'repo_names', no row index).
repos = pd.DataFrame({'repo_names': repo_names})
repos.to_csv(
    path_or_buf='./data/gokube_phase1_jun19/Go_GH_repo_names.csv',
    index=False,
)