In [1]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import requests
import gc

In [3]:
cve_df = pd.read_csv('./data/go_cves_positive_links-may2019.csv')
cve_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1881 entries, 0 to 1880
Data columns (total 12 columns):
CVE ID               1328 non-null object
Package name         1804 non-null object
Ecosystem            1874 non-null object
GH issue             715 non-null object
GH PR                373 non-null object
GH Commit            948 non-null object
Bugzilla             259 non-null object
ML                   482 non-null object
Other sources        65 non-null object
Issue Reported on    51 non-null object
CVE reported on      58 non-null object
Fixed on             76 non-null object
dtypes: object(12)
memory usage: 176.4+ KB


In [4]:
cve_df.Ecosystem.value_counts()

maven     1272
python     259
ruby       182
go         153
nuget        6
npm          1
php          1
Name: Ecosystem, dtype: int64

In [5]:
gh_issues = cve_df['GH issue'].dropna().values
print('Issues Before:', gh_issues.shape)
gh_issues = [item.strip().split('\n') for item in gh_issues]
gh_issues = np.array([str(item.strip()) for sublist in gh_issues for item in sublist])
print('Issues After:', gh_issues.shape)

gh_prs = cve_df['GH PR'].dropna().values
print('PRs Before:', gh_prs.shape)
gh_prs = [item.strip().split('\n') for item in gh_prs]
gh_prs = np.array([str(item.strip()) for sublist in gh_prs for item in sublist])
print('PRs After:', gh_prs.shape)

gh_commits = cve_df['GH Commit'].dropna().values
print('Commits Before:', gh_commits.shape)
gh_commits = [item.strip().split('\n') for item in gh_commits]
gh_commits = np.array([str(item.strip()) for sublist in gh_commits for item in sublist])
print('Commits After:', gh_commits.shape)

Issues Before: (715,)
Issues After: (759,)
PRs Before: (373,)
PRs After: (415,)
Commits Before: (948,)
Commits After: (1153,)


In [6]:
gh_links = np.concatenate((gh_issues, gh_commits, gh_prs))
gh_links.shape

(2327,)

In [7]:
gh_links = np.unique(gh_links)
gh_links.shape

(2218,)

In [8]:
def get_github_cve_links(links):
    
    issues = []
    prs = []
    commits = []
    
    gh_pr_commit_pattern = r'(.*github.com/.*?)/pull/(.*?)/commits/(.*)'
    gh_issue_pattern = r'(.*github.com/.*?)/issues/(.*)'    
    gh_pr_pattern = r'(.*github.com/.*?)/pull/([0-9]*)'
    gh_commit_pattern = r'(.*github.com/.*?)/commit/(.*)'
    gh_compare_pattern = r'(.*github.com/.*?)/compare/.*'
    
    for link in tqdm(links):
        
        if re.search(gh_pr_commit_pattern, link, re.I):
            matches = re.search(gh_pr_commit_pattern, link, re.I).groups()
            pr = matches[0]+'/pull/'+matches[1].rstrip('/')
            prs.append(pr)
            
            cm = matches[0]+'/commit/'+matches[2].rstrip('/')
            commits.append(cm)
            
        elif re.search(gh_issue_pattern, link, re.I):
            issues.append(link.rstrip('/'))   
            
        elif re.search(gh_pr_pattern, link, re.I):
            matches = re.search(gh_pr_pattern, link, re.I).groups()
            repo_name = matches[0]
            pr = repo_name+'/pull/'+matches[1].rstrip('/')
            prs.append(pr)
            
            patch_link = pr+'.patch'
            response = requests.get(patch_link)
            if response.status_code != 200:
                print('Failed for link:'+patch_link)
                # log failure here not print
            else:
                data = response.text
                commit_hashes = re.findall(r'(?:\n|^)from (.*?)\s', data, re.I)
                commit_hashes = [item for item in commit_hashes if item.isalnum()]
                commit_links = [repo_name+'/commit/'+item.rstrip('/') for item in commit_hashes]
                commits.extend(commit_links)                  
            
        elif re.search(gh_commit_pattern, link, re.I):
            link = link.split('#')[0].strip().rstrip('/')
            commits.append(link)
            
        elif re.search(gh_compare_pattern, link, re.I):
            match = re.search(gh_compare_pattern, link, re.I).groups()
            repo_name = match[0]
            patch_link = link.rstrip('/')+'.patch'
            response = requests.get(patch_link)
            if response.status_code != 200:
                print('Failed for link:'+patch_link)
                # log failure here not print
            else:
                data = response.text
                commit_hashes = re.findall(r'(?:\n|^)from (.*?)\s', data, re.I)
                commit_hashes = [item for item in commit_hashes if item.isalnum()]
                commit_links = [repo_name+'/commit/'+item.rstrip('/') for item in commit_hashes]
                commits.extend(commit_links)
                
    
    issues = np.array(list(set(issues)))
    prs = np.array(list(set(prs)))
    commits = np.array(list(set(commits)))
            
    return issues, prs, commits

In [9]:
issues, prs, commits = get_github_cve_links(gh_links)

 24%|██▎       | 523/2218 [00:58<04:02,  7.00it/s]

Failed for link:https://github.com/apache/tomcat70/compare/6b41fb05c0f6af5e6cc103ac8e5ae9da5f128606...e519f4e86bf3447934f1c399ecaff8a222e38241.patch


 24%|██▍       | 528/2218 [00:58<03:11,  8.84it/s]

Failed for link:https://github.com/apache/tomcat70/compare/72a8692370b4323f4d05b166f48a0913801fbe4f...a27df4fd31b1cd85f100b8b94e3b33dde92a3c0a.patch


100%|██████████| 2218/2218 [03:30<00:00, 10.55it/s]


In [10]:
issues.shape, prs.shape, commits.shape

((286,), (382,), (3201,))

In [11]:
issue_df = pd.DataFrame(issues, columns=['issue'])
pr_df = pd.DataFrame(prs, columns=['pull_request'])
commits_df = pd.DataFrame(commits, columns=['commit'])

In [12]:
issue_df.to_csv('./data/gh_cve_issue_links.csv', index=False)
pr_df.to_csv('./data/gh_cve_pr_links.csv', index=False)

In [13]:
go_df = cve_df[cve_df.Ecosystem == 'go']
go_df.shape

(153, 12)

In [14]:
gh_issues = go_df['GH issue'].dropna().values
print('Issues Before:', gh_issues.shape)
gh_issues = [item.strip().split('\n') for item in gh_issues]
gh_issues = np.array([str(item.strip()) for sublist in gh_issues for item in sublist])
print('Issues After:', gh_issues.shape)

gh_prs = go_df['GH PR'].dropna().values
print('PRs Before:', gh_prs.shape)
gh_prs = [item.strip().split('\n') for item in gh_prs]
gh_prs = np.array([str(item.strip()) for sublist in gh_prs for item in sublist])
print('PRs After:', gh_prs.shape)

gh_commits = go_df['GH Commit'].dropna().values
print('Commits Before:', gh_commits.shape)
gh_commits = [item.strip().split('\n') for item in gh_commits]
gh_commits = np.array([str(item.strip()) for sublist in gh_commits for item in sublist])
print('Commits After:', gh_commits.shape)

Issues Before: (36,)
Issues After: (39,)
PRs Before: (40,)
PRs After: (45,)
Commits Before: (73,)
Commits After: (78,)


In [15]:
gh_links = np.concatenate((gh_issues, gh_commits, gh_prs))
gh_links.shape

(162,)

In [16]:
gh_links = np.unique(gh_links)
gh_links.shape

(155,)

In [17]:
gh_links = np.array([item for item in gh_links if 'github' in item])
gh_links.shape

(155,)

In [18]:
gh_links[:10]

array(['https://github.com/apache/thrift/commit/2007783e874d524a46b818598a45078448ecc53e',
       'https://github.com/apache/thrift/pull/1061',
       'https://github.com/astaxie/beego/commit/9865779f149669777ee33aae71cd29c8db8ffd66',
       'https://github.com/astaxie/beego/pull/3383',
       'https://github.com/brancz/kube-rbac-proxy/commit/c41c4dee92abc0859d952559e37cf9ad8a442789',
       'https://github.com/brancz/kube-rbac-proxy/pull/27',
       'https://github.com/cisco/node-jose/pull/88',
       'https://github.com/cloudflare/cfssl/commit/f74c74db7f22df0051d7f872b5161dfa2a797ace',
       'https://github.com/cloudflare/cfssl/pull/776',
       'https://github.com/cloudfoundry-incubator/bits-service/commit/9e4010e42a4b462fef69889a453b5c32d56e3100'],
      dtype='<U130')

In [19]:
pattern = re.compile(r'.*?github.com/(.*?/.*?)/', re.I)
repo_names = np.array(list(filter(None,[pattern.search(item).group(1) 
                                            if pattern.search(item) else None 
                                               for item in gh_links])))
repo_names.shape

(155,)

In [20]:
repo_names = np.unique(repo_names)
repo_names.shape

(47,)

In [22]:
!pip install google-cloud-bigquery

Collecting google-cloud-bigquery
[?25l  Downloading https://files.pythonhosted.org/packages/2e/95/64e92560983db41ff1de7c08839f38ae7c5326a8aad71f5e893098cd1c85/google_cloud_bigquery-1.15.0-py2.py3-none-any.whl (133kB)
[K     |████████████████████████████████| 133kB 3.4MB/s eta 0:00:01
[?25hCollecting google-cloud-core<2.0dev,>=1.0.0 (from google-cloud-bigquery)
  Downloading https://files.pythonhosted.org/packages/98/7f/ff56aec313787577e262d5a2e306c04aef61c5c274699ff9fb40095e6691/google_cloud_core-1.0.2-py2.py3-none-any.whl
Collecting google-resumable-media>=0.3.1 (from google-cloud-bigquery)
  Downloading https://files.pythonhosted.org/packages/e2/5d/4bc5c28c252a62efe69ed1a1561da92bd5af8eca0cdcdf8e60354fae9b29/google_resumable_media-0.3.2-py2.py3-none-any.whl
Collecting google-api-core<2.0.0dev,>=1.11.0 (from google-cloud-core<2.0dev,>=1.0.0->google-cloud-bigquery)
[?25l  Downloading https://files.pythonhosted.org/packages/a2/78/bbd685dda48a291b4cc81568ed3e1a89af7a61958dc88a3d52a81

In [23]:
import bq_utils as bqu
import json
from pandas.io.json import json_normalize

In [24]:
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'bq_key.json'
gh_archive = bqu.BigQueryHelper(active_project= "githubarchive", 
                                dataset_name = "year")
gh_archive

<bq_utils.BigQueryHelper at 0x7fdfa2d3d080>

In [25]:
gh_archive.list_tables()

['2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018']

In [26]:
query = """
SELECT  type, count(*)
        FROM `githubarchive.year.20*`
        WHERE _TABLE_SUFFIX IN ('16', '17', '18')
        AND repo.name in {repos}
        GROUP BY type
""".format(repos=tuple(repo_names))
gh_archive.estimate_query_size(query)

41.9256986239925

In [27]:
df = gh_archive.query_to_pandas(query)
df[df.type.isin(['PushEvent', 'PullRequestEvent', 'IssuesEvent', 
                'PullRequestReviewCommentEvent', 'IssueCommentEvent'])]

Unnamed: 0,type,f0_
0,PullRequestEvent,168759
2,PullRequestReviewCommentEvent,236683
6,PushEvent,117640
8,IssueCommentEvent,1623715
13,IssuesEvent,195866


In [28]:
query = """
SELECT 
    repo.name as repo_name, 
    type as event_type, 
    actor.id as actor_id,
    actor.login as actor_name,
    JSON_EXTRACT_SCALAR(payload, '$.action') as issue_status,
    JSON_EXTRACT_SCALAR(payload, '$.issue.url') as issue_api_url,
    JSON_EXTRACT_SCALAR(payload, '$.issue.html_url') as issue_url,
    JSON_EXTRACT_SCALAR(payload, '$.issue.user.login') as issue_creator_name,
    JSON_EXTRACT_SCALAR(payload, '$.issue.user.url') as issue_creator_api_url,
    JSON_EXTRACT_SCALAR(payload, '$.issue.user.html_url') as issue_creator_url,
    JSON_EXTRACT_SCALAR(payload, '$.issue.comments') as comment_count,
    JSON_EXTRACT_SCALAR(payload, '$.issue.id') as issue_id,
    JSON_EXTRACT_SCALAR(payload, '$.issue.number') as issue_number,
    JSON_EXTRACT_SCALAR(payload, '$.issue.created_at') as issue_created_at,
    JSON_EXTRACT_SCALAR(payload, '$.issue.updated_at') as issue_updated_at,
    JSON_EXTRACT_SCALAR(payload, '$.issue.closed_at') as issue_closed_at,
    TRIM(REGEXP_REPLACE(
             REGEXP_REPLACE(
                 JSON_EXTRACT_SCALAR(payload, '$.issue.title'), 
                 r'\\r\\n|\\r|\\n', 
                 ' '),
             r'\s{2,}', 
             ' ')) as issue_title,
    TRIM(REGEXP_REPLACE(
             REGEXP_REPLACE(
                 JSON_EXTRACT_SCALAR(payload, '$.issue.body'), 
                 r'\\r\\n|\\r|\\n', 
                 ' '),
             r'\s{2,}', 
             ' ')) as issue_body
        
FROM `githubarchive.year.20*`
    WHERE _TABLE_SUFFIX IN ('16', '17', '18')
    AND repo.name in {repos}
    AND type = 'IssuesEvent'
    """.replace('{repos}', str(tuple(repo_names)))
gh_archive.estimate_query_size(query)

2583.8375390227884

In [29]:
issues_df = gh_archive.query_to_pandas(query)
issues_df.to_csv('./data/GH_unlabeled_issues.csv', index=False)

In [30]:
query = """
SELECT 
    repo.name as repo_name, 
    type as event_type, 
    actor.id as actor_id,
    actor.login as actor_name,
    JSON_EXTRACT_SCALAR(payload, '$.action') as pr_status,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.id') as pr_id,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.number') as pr_number,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.url') as pr_api_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') as pr_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.diff_url') as pr_diff_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.patch_url') as pr_patch_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.user.login') as pr_creator_name,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.user.url') as pr_creator_api_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.user.html_url') as pr_creator_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.created_at') as pr_created_at,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.updated_at') as pr_updated_at,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.closed_at') as pr_closed_at,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged_at') as pr_merged_at,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged') as pr_merged_status,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.comments') as pr_comments_count,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.review_comments') as pr_review_comments_count,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.commits') as pr_commits_count,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.additions') as pr_additions_count,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.deletions') as pr_deletions_count,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.changed_files') as pr_changed_files_count,    
    TRIM(REGEXP_REPLACE(
             REGEXP_REPLACE(
                 JSON_EXTRACT_SCALAR(payload, '$.pull_request.title'), 
                 r'\\r\\n|\\r|\\n', 
                 ' '),
             r'\s{2,}', 
             ' ')) as pr_title,
    TRIM(REGEXP_REPLACE(
             REGEXP_REPLACE(
                 JSON_EXTRACT_SCALAR(payload, '$.pull_request.body'), 
                 r'\\r\\n|\\r|\\n', 
                 ' '),
             r'\s{2,}', 
             ' ')) as pr_body
        
FROM `githubarchive.year.20*`
    WHERE _TABLE_SUFFIX IN ('16', '17', '18')
    AND repo.name in {repos}
    AND type = 'PullRequestEvent'
""".replace('{repos}', str(tuple(repo_names)))
gh_archive.estimate_query_size(query)

2583.8375390227884

In [31]:
prs_df = gh_archive.query_to_pandas(query)
prs_df.to_csv('./data/GH_unlabeled_prs.csv', index=False)

In [32]:
repos = pd.DataFrame(repo_names, columns=['repo_names'])
repos.to_csv('./data/Go_GH_repo_names.csv', index=False)

In [33]:
import os

In [34]:
gh_bq_issues = pd.read_csv('./data/GH_unlabeled_issues.csv')
gh_bq_issues.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195866 entries, 0 to 195865
Data columns (total 18 columns):
repo_name                195866 non-null object
event_type               195866 non-null object
actor_id                 195866 non-null int64
actor_name               195866 non-null object
issue_status             195866 non-null object
issue_api_url            195866 non-null object
issue_url                195866 non-null object
issue_creator_name       195866 non-null object
issue_creator_api_url    195866 non-null object
issue_creator_url        195866 non-null object
comment_count            195866 non-null int64
issue_id                 195866 non-null int64
issue_number             195866 non-null int64
issue_created_at         195866 non-null object
issue_updated_at         195866 non-null object
issue_closed_at          89574 non-null object
issue_title              195863 non-null object
issue_body               194279 non-null object
dtypes: int64(4), object(14)
m

In [35]:
gh_bq_issues = gh_bq_issues[~pd.isnull(gh_bq_issues.issue_id)]
gh_bq_issues.issue_created_at = pd.to_datetime(gh_bq_issues.issue_created_at)
gh_bq_issues.issue_updated_at = pd.to_datetime(gh_bq_issues.issue_updated_at)
gh_bq_issues.issue_closed_at = pd.to_datetime(gh_bq_issues.issue_closed_at)

gh_bq_issues = gh_bq_issues[gh_bq_issues.issue_created_at.dt.year <= 2019]
gh_bq_issues = gh_bq_issues[gh_bq_issues.issue_updated_at.dt.year <= 2019]
gh_bq_issues = gh_bq_issues[gh_bq_issues.issue_closed_at.dt.year <= 2019]

gh_bq_issues = gh_bq_issues.loc[gh_bq_issues.groupby('issue_id').issue_updated_at.idxmax(skipna=False)]

gh_bq_issues.shape

(86356, 18)

In [38]:
gh_cve_issue_links = pd.read_csv('./data/gh_cve_issue_links.csv')
cve_issue_links = gh_cve_issue_links.issue.tolist()
cve_issue_links.extend(['https://github.com/golang/go/issues/30642', 
                        'https://github.com/golang/go/issues/30794', 
                        'https://github.com/hashicorp/consul/issues/5423'])
print('Total CVE issues:', len(cve_issue_links))

Total CVE issues: 289


In [39]:
cve_issues = gh_bq_issues[gh_bq_issues.issue_url.isin(cve_issue_links)]
cve_issues.shape

(22, 18)

In [40]:
found_issue_urls = cve_issues.issue_url.tolist()
not_found_issue_urls = list(set(cve_issue_links) - set(found_issue_urls))
print('Issues not found in unlabeled data: ', len(not_found_issue_urls))

Issues not found in unlabeled data:  267


In [41]:
not_found_api_urls = ['https://api.github.com/repos/'+re.search(r'.*github.com/(.*)', link, re.I).groups()[0]
                          for link in not_found_issue_urls]

data = []
for link in tqdm(not_found_api_urls):
    response = requests.get(link,
                            auth=('dipanjanS', ''))
    if not response.status_code == 200:
        print('Failed for link: '+link)
        # log this later
    else:
        content = response.json()
        issue_dict = {
            'repo_name': re.search(r'.*github.com/repos/(.*?)/issues', 
                                   link, re.I).groups()[0],
            'event_type': 'IssuesEvent',
            'actor_id': content.get('user').get('id'),
            'actor_name': content.get('user').get('login'),
            'issue_status': content.get('state'),
            'issue_api_url': content.get('url'),
            'issue_url': content.get('html_url'),
            'issue_creator_name': content.get('user').get('login'),
            'issue_creator_api_url': content.get('user').get('url'),
            'issue_creator_url': content.get('user').get('html_url'),
            'comment_count': content.get('comments'),
            'issue_id': content.get('id'),
            'issue_number': content.get('number'),
            'issue_created_at': content.get('created_at'),
            'issue_updated_at': content.get('updated_at'),
            'issue_closed_at': content.get('closed_at'),
            'issue_title': content.get('title'),
            'issue_body': content.get('body')
        }
        if issue_dict:
            data.append(issue_dict)

print('Found missing issues:', len(data))

100%|██████████| 267/267 [00:57<00:00,  4.80it/s]

Found missing issues: 267





In [42]:
cve_issues_nf = pd.DataFrame(data)
cve_issues_nf = cve_issues_nf[cve_issues.columns.tolist()]

gh_bq_issues_cve = pd.concat([cve_issues, cve_issues_nf], axis=0).reset_index(drop=True)
gh_bq_issues_negative = gh_bq_issues.drop(cve_issues.index.tolist()).reset_index(drop=True)
gh_bq_issues_cve.shape, gh_bq_issues_negative.shape

((289, 18), (86334, 18))

In [43]:
gh_bq_issues_cve['class_label'] = 2
gh_bq_issues_negative['class_label'] = 0

gh_bq_issues_processed = pd.concat([gh_bq_issues_negative, gh_bq_issues_cve], axis=0).reset_index(drop=True)
gh_bq_issues_processed.shape

(86623, 19)

In [44]:
gh_bq_issues_processed.to_csv('./data/GH_cve_labeled_issues.csv', index=False)

In [45]:
gh_bq_prs = pd.read_csv('./data/GH_unlabeled_prs.csv')
gh_bq_prs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168759 entries, 0 to 168758
Data columns (total 27 columns):
repo_name                   168759 non-null object
event_type                  168759 non-null object
actor_id                    168759 non-null int64
actor_name                  168759 non-null object
pr_status                   168759 non-null object
pr_id                       168759 non-null int64
pr_number                   168759 non-null int64
pr_api_url                  168759 non-null object
pr_url                      168759 non-null object
pr_diff_url                 168759 non-null object
pr_patch_url                168759 non-null object
pr_creator_name             168759 non-null object
pr_creator_api_url          168759 non-null object
pr_creator_url              168759 non-null object
pr_created_at               168759 non-null object
pr_updated_at               168759 non-null object
pr_closed_at                83684 non-null object
pr_merged_at              

In [46]:
gh_bq_prs = gh_bq_prs[~pd.isnull(gh_bq_prs.pr_title)]

def fill_missing_links(record):
    repo_name = record['repo_name']
    pr_number = str(record['pr_number'])
    if pd.isnull(record['pr_api_url']):
        record['pr_api_url'] = 'https://api.github.com/repos/'+repo_name+'/pulls/'+pr_number
    if pd.isnull(record['pr_url']):
        record['pr_url'] = 'https://github.com/'+repo_name+'/pull/'+pr_number 
    if pd.isnull(record['pr_diff_url']):
        record['pr_diff_url'] = 'https://github.com/'+repo_name+'/pull/'+pr_number+'.diff' 
    if pd.isnull(record['pr_patch_url']):
        record['pr_patch_url'] = 'https://github.com/'+repo_name+'/pull/'+pr_number+'.patch'
    return record

gh_bq_prs = gh_bq_prs.apply(lambda row: fill_missing_links(row), axis=1)

In [47]:
gh_bq_prs_missing_info = gh_bq_prs[pd.isnull(gh_bq_prs.pr_updated_at)]
gh_bq_prs_full_info = gh_bq_prs[~pd.isnull(gh_bq_prs.pr_updated_at)]
gh_bq_prs_missing_info.shape, gh_bq_prs_full_info.shape

((0, 27), (168759, 27))

In [48]:
gh_bq_prs_missing_info = gh_bq_prs_missing_info.drop_duplicates(subset=['pr_url'], keep="last")
gh_bq_prs_missing_info.shape

(0, 27)

In [49]:
gh_bq_prs_full_info.pr_created_at = pd.to_datetime(gh_bq_prs_full_info.pr_created_at)
gh_bq_prs_full_info.pr_updated_at = pd.to_datetime(gh_bq_prs_full_info.pr_updated_at)
gh_bq_prs_full_info.pr_closed_at = pd.to_datetime(gh_bq_prs_full_info.pr_closed_at)
gh_bq_prs_full_info.pr_merged_at = pd.to_datetime(gh_bq_prs_full_info.pr_merged_at)

gh_bq_prs_full_info = gh_bq_prs_full_info[gh_bq_prs_full_info.pr_created_at.dt.year <= 2019]
gh_bq_prs_full_info = gh_bq_prs_full_info[gh_bq_prs_full_info.pr_updated_at.dt.year <= 2019]
gh_bq_prs_full_info = gh_bq_prs_full_info[gh_bq_prs_full_info.pr_closed_at.dt.year <= 2019]
gh_bq_prs_full_info = gh_bq_prs_full_info[gh_bq_prs_full_info.pr_merged_at.dt.year <= 2019]

gh_bq_prs_full_info = gh_bq_prs_full_info.loc[gh_bq_prs_full_info.groupby('pr_url').pr_updated_at.idxmax(skipna=False)]
gh_bq_prs_full_info.shape

(65176, 27)

In [50]:
gh_bq_prs = pd.concat([gh_bq_prs_full_info, gh_bq_prs_missing_info], axis=0).reset_index(drop=True)
gh_bq_prs.shape

(65176, 27)

In [51]:
gh_cve_pr_links = pd.read_csv('./data/gh_cve_pr_links.csv')
cve_pr_links = gh_cve_pr_links.pull_request.tolist()
print('Total CVE PRs:', len(cve_pr_links))

Total CVE PRs: 382


In [52]:
cve_prs = gh_bq_prs[gh_bq_prs.pr_url.isin(cve_pr_links)]
cve_prs.shape

(30, 27)

In [53]:
found_pr_urls = cve_prs.pr_url.tolist()
not_found_pr_urls = list(set(cve_pr_links) - set(found_pr_urls))
print('PRs not found in unlabeled data:', len(not_found_pr_urls))

PRs not found in unlabeled data: 352


In [54]:
not_found_api_urls = [re.sub('/pull/', '/pulls/', 
                             ('https://api.github.com/repos/'+re.search(r'.*github.com/(.*)', link, re.I)
                              .groups()[0]), re.I)
                          for link in not_found_pr_urls]

data = []
for link in tqdm(not_found_api_urls):
    response = requests.get(link,
                            auth=('dipanjanS', ''))
    if not response.status_code == 200:
        print('Failed for link: '+link)
        # log this later
    else:
        content = response.json()
        pr_dict = {
            'repo_name': re.search(r'.*github.com/repos/(.*?)/pulls', 
                                   link, re.I).groups()[0],
            'event_type': 'PullRequestEvent',
            'actor_id': content.get('user').get('id'),
            'actor_name': content.get('user').get('login'),
            'pr_status': content.get('state'),
            'pr_id': content.get('id'),
            'pr_number': content.get('number'),
            'pr_api_url': content.get('url'),
            'pr_url': content.get('html_url'),
            'pr_diff_url': content.get('diff_url'),
            'pr_patch_url': content.get('patch_url'),
            'pr_creator_name': content.get('user').get('login'),
            'pr_creator_api_url': content.get('user').get('url'),
            'pr_creator_url': content.get('user').get('html_url'),         
            'pr_created_at': content.get('created_at'),
            'pr_updated_at': content.get('updated_at'),
            'pr_closed_at': content.get('closed_at'),
            'pr_merged_at': content.get('merged_at'),
            'pr_merged_status': content.get('merged'),
            'pr_comments_count': content.get('comments'),
            'pr_review_comments_count': content.get('review_comments'),
            'pr_commits_count': content.get('commits'),
            'pr_additions_count': content.get('additions'),
            'pr_deletions_count': content.get('deletions'),
            'pr_changed_files_count': content.get('changed_files'),
            'pr_title': content.get('title'),
            'pr_body': content.get('body')
        }
        if pr_dict:
            data.append(pr_dict)

print(len(data))

100%|██████████| 352/352 [02:21<00:00,  2.54it/s]

352





In [55]:
cve_prs_nf = pd.DataFrame(data)
cve_prs_nf = cve_prs_nf[cve_prs.columns.tolist()]

gh_bq_prs_cve = pd.concat([cve_prs, cve_prs_nf], axis=0).reset_index(drop=True)
gh_bq_prs_negative = gh_bq_prs.drop(cve_prs.index.tolist()).reset_index(drop=True)
gh_bq_prs_cve.shape, gh_bq_prs_negative.shape

((382, 27), (65146, 27))

In [56]:
gh_bq_prs_cve['class_label'] = 2
gh_bq_prs_negative['class_label'] = 0

gh_bq_prs_processed = pd.concat([gh_bq_prs_negative, gh_bq_prs_cve], axis=0).reset_index(drop=True)
gh_bq_prs_processed.shape

(65528, 28)

In [57]:
gh_bq_prs_processed.to_csv('./data/GH_cve_labeled_prs.csv', index=False)

In [59]:
issues_df = pd.read_csv('./data/GH_cve_labeled_issues.csv')
prs_df = pd.read_csv('./data/GH_cve_labeled_prs.csv')

issues_df['issue_title'] = issues_df.issue_title.fillna('')
issues_df['issue_body'] = issues_df.issue_body.fillna('')
issues_df['description'] = issues_df['issue_title'].map(str) + ' ' + issues_df['issue_body']

prs_df['pr_title'] = prs_df.pr_title.fillna('')
prs_df['pr_body'] = prs_df.pr_body.fillna('')
prs_df['description'] = prs_df['pr_title'].map(str) + ' ' + prs_df['pr_body']

df = pd.concat([issues_df[['description', 'class_label']], 
                prs_df[['description', 'class_label']]], axis=0).reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152151 entries, 0 to 152150
Data columns (total 2 columns):
description    152151 non-null object
class_label    152151 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.3+ MB


In [60]:
df.class_label.value_counts()

0    151480
2       671
Name: class_label, dtype: int64

In [61]:
def security_vulnerability_labeler(issue_description):
    strong_vuln_pattern = ('(?i)(advisory|attack|(un)?authoriz(e|ation)|'
                           'clickjack|cross.site|csrf|\\bCVE.*?\\b|deadlock|'
                           'denial.of.service|\\bEOP\\b|exploit|hijack|'
                           'infinite.loop|malicious|\\bNVD\\b|OSVDB|'
                           '\\bRCE\\b|\\bReDoS\\b|\\bDDoS\\b|remote.code.execution|'
                           'security|victim|\\bvuln|\\bXEE\\b|\\bXSRF\\b|'
                           '\\bXSS\\b|\\bXXE\\b)')

    medium_vuln_pattern = ('(?i)(authenticat(e|ion)|brute force|bypass|'
                           'constant.time|crack|credential|\\bDoS\\b|'
                           'expos(e|ing)|hack|harden|injection|lockout|'
                           'overflow|password|\\bPoC\\b|proof.of.concept|'
                           'poison|priv(ilege|elege|elage|lage)|\\b(in)?secur(e|ity)|'
                           '(de)?serializ|spoof|timing|traversal)')

    low_vuln_pattern = ('(?i)(abuse|compliant|constant.time|credential|\\bcrypto|'
                        'escalate|exhaustion|forced|infinite|RFC\\d{4,5})')
    
    if (re.findall(low_vuln_pattern, issue_description, re.I) or
        re.findall(medium_vuln_pattern, issue_description, re.I) or
        re.findall(strong_vuln_pattern, issue_description, re.I)):
        return 1
    else:
        return 0

In [62]:
negative_df = df[df.class_label == 0]
positive_df = df[df.class_label == 2]

In [63]:
negative_df['label'] = negative_df.apply(lambda row: security_vulnerability_labeler(row.description), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [64]:
positive_df['label'] = positive_df['class_label']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [65]:
df = pd.concat([positive_df[['description', 'label']],
                negative_df[['description', 'label']]], axis=0).reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152151 entries, 0 to 152150
Data columns (total 2 columns):
description    152151 non-null object
label          152151 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.3+ MB


In [66]:
df['label'].value_counts()

0    128908
1     22572
2       671
Name: label, dtype: int64

In [67]:
df.to_csv('./data/GH_complete_labeled_issues_prs.csv', index=False)