In [8]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import requests
import gc

# Get CVE GitHub Links

In [9]:
# Load the CVE sheet from CSV and print a column-by-column summary.
cve_links_csv = './data/positive_github_links/go_cves_positive_links-may2019.csv'
cve_df = pd.read_csv(cve_links_csv)
cve_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1881 entries, 0 to 1880
Data columns (total 12 columns):
CVE ID               1328 non-null object
Package name         1804 non-null object
Ecosystem            1874 non-null object
GH issue             715 non-null object
GH PR                373 non-null object
GH Commit            948 non-null object
Bugzilla             259 non-null object
ML                   482 non-null object
Other sources        65 non-null object
Issue Reported on    51 non-null object
CVE reported on      58 non-null object
Fixed on             76 non-null object
dtypes: object(12)
memory usage: 176.4+ KB


In [10]:
# Number of rows per value of the Ecosystem column.
cve_df['Ecosystem'].value_counts()

maven     1272
python     259
ruby       182
go         153
nuget        6
php          1
npm          1
Name: Ecosystem, dtype: int64

In [11]:
def _split_link_column(frame, column, label):
    """Flatten one link column of ``frame`` into a 1-D array of single links.

    A cell may hold several links separated by newlines. Each cell is split on
    newlines, every piece is stripped of surrounding whitespace, and empty
    pieces (e.g. from a blank line between two links) are discarded.

    Args:
        frame: DataFrame holding the link columns.
        column: Name of the column to flatten.
        label: Prefix used in the printed before/after shape report.

    Returns:
        numpy array with one link per element.
    """
    cells = frame[column].dropna().values
    print(label + ' Before:', cells.shape)
    links = np.array([
        part.strip()
        for cell in cells
        for part in cell.strip().split('\n')
        if part.strip()  # skip blank lines inside a multi-link cell
    ])
    print(label + ' After:', links.shape)
    return links


gh_issues = _split_link_column(cve_df, 'GH issue', 'Issues')
gh_prs = _split_link_column(cve_df, 'GH PR', 'PRs')
gh_commits = _split_link_column(cve_df, 'GH Commit', 'Commits')

Issues Before: (715,)
Issues After: (759,)
PRs Before: (373,)
PRs After: (415,)
Commits Before: (948,)
Commits After: (1153,)


In [12]:
# Pool all three link arrays into one flat array: issues, then commits, then PRs.
gh_links = np.concatenate([gh_issues, gh_commits, gh_prs])
gh_links.shape

(2327,)

In [13]:
# np.unique drops duplicate links and returns the remaining ones sorted.
gh_links = np.unique(gh_links)
gh_links.shape

(2218,)

In [14]:
# URL patterns, compiled once so every link is matched a single time per
# pattern. The dot in "github.com" is escaped so it only matches a literal dot.
_GH_PR_COMMIT_RE = re.compile(r'(.*github\.com/.*?)/pull/(.*?)/commits/(.*)', re.I)
_GH_ISSUE_RE = re.compile(r'(.*github\.com/.*?)/issues/(.*)', re.I)
# At least one digit is required so ".../pull/" without a number is not a PR.
_GH_PR_RE = re.compile(r'(.*github\.com/.*?)/pull/([0-9]+)', re.I)
_GH_COMMIT_RE = re.compile(r'(.*github\.com/.*?)/commit/(.*)', re.I)
_GH_COMPARE_RE = re.compile(r'(.*github\.com/.*?)/compare/.*', re.I)

# In a ".patch" download (git format-patch output) every commit starts with a
# line of the form "From <40-hex sha> <date>". Matching only that shape keeps
# ordinary commit-message lines such as "From now on ..." or "from os import"
# from being mistaken for commit hashes.
_PATCH_COMMIT_RE = re.compile(r'^From ([0-9a-f]{40}) ', re.M)

# Seconds to wait for GitHub before giving up on a single patch download.
_REQUEST_TIMEOUT = 30


def _commit_links_from_patch(repo_name, patch_link):
    """Download ``patch_link`` and return a commit link for each commit in it.

    Returns an empty list (after printing the failing URL) when the request
    raises, times out, or answers with a non-200 status.
    """
    try:
        response = requests.get(patch_link, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException:
        # A single unreachable URL must not abort the whole run.
        print('Failed for link:'+patch_link)
        return []

    if response.status_code != 200:
        print('Failed for link:'+patch_link)
        # log failure here not print
        return []

    return [repo_name + '/commit/' + sha
            for sha in _PATCH_COMMIT_RE.findall(response.text)]


def get_github_cve_links(links):
    """Sort GitHub links into issue, pull-request and commit links.

    Each link is handled by the first rule that matches:

    * ``/pull/<n>/commits/<sha>`` -> the PR link and the commit link.
    * ``/issues/...``             -> the issue link (trailing ``/`` removed).
    * ``/pull/<n>``               -> the PR link, plus one commit link for
      every commit found in the PR's ``.patch`` download.
    * ``/commit/<sha>``           -> the commit link.
    * ``/compare/...``            -> one commit link for every commit found in
      the comparison's ``.patch`` download.

    Links matching none of the rules are ignored. For every rule except the
    issue rule, a ``#...`` fragment and trailing ``/`` are removed first, so
    anchors never end up inside a commit link or a patch URL.

    Args:
        links: Iterable of URL strings.

    Returns:
        Tuple ``(issues, prs, commits)`` of numpy arrays, each holding the
        unique links of that kind in sorted order.
    """
    issues = []
    prs = []
    commits = []

    for link in tqdm(links):
        # Anchor-free form of the link, used by every rule except issues.
        clean = link.split('#')[0].strip().rstrip('/')

        pr_commit = _GH_PR_COMMIT_RE.search(clean)
        if pr_commit:
            repo_name, pr_number, sha = pr_commit.groups()
            prs.append(repo_name + '/pull/' + pr_number.rstrip('/'))
            commits.append(repo_name + '/commit/' + sha)
            continue

        if _GH_ISSUE_RE.search(link):
            # Issue links are recorded as given, minus any trailing slash.
            issues.append(link.rstrip('/'))
            continue

        pr_match = _GH_PR_RE.search(clean)
        if pr_match:
            repo_name, pr_number = pr_match.groups()
            pr = repo_name + '/pull/' + pr_number
            prs.append(pr)
            commits.extend(_commit_links_from_patch(repo_name, pr + '.patch'))
            continue

        if _GH_COMMIT_RE.search(clean):
            commits.append(clean)
            continue

        compare_match = _GH_COMPARE_RE.search(clean)
        if compare_match:
            repo_name = compare_match.group(1)
            commits.extend(_commit_links_from_patch(repo_name, clean + '.patch'))

    # De-duplicate; sorting makes the output order reproducible between runs.
    issues = np.array(sorted(set(issues)))
    prs = np.array(sorted(set(prs)))
    commits = np.array(sorted(set(commits)))

    return issues, prs, commits

In [15]:
# Classify every unique link. Pull-request and compare links are resolved by
# downloading their ".patch" from GitHub, so this cell needs network access;
# URLs that cannot be fetched are reported as "Failed for link:...".
issues, prs, commits = get_github_cve_links(gh_links)

 24%|██▎       | 523/2218 [00:55<03:46,  7.48it/s]

Failed for link:https://github.com/apache/tomcat70/compare/6b41fb05c0f6af5e6cc103ac8e5ae9da5f128606...e519f4e86bf3447934f1c399ecaff8a222e38241.patch


 24%|██▍       | 529/2218 [00:55<02:58,  9.46it/s]

Failed for link:https://github.com/apache/tomcat70/compare/72a8692370b4323f4d05b166f48a0913801fbe4f...a27df4fd31b1cd85f100b8b94e3b33dde92a3c0a.patch


100%|██████████| 2218/2218 [03:24<00:00, 10.87it/s]


In [16]:
# Shapes of the issue, PR and commit link arrays, in that order.
tuple(link_array.shape for link_array in (issues, prs, commits))

((286,), (382,), (3190,))

In [17]:
# Wrap each link array in a single-column DataFrame.
issue_df = pd.DataFrame({'issue': issues})
pr_df = pd.DataFrame({'pull_request': prs})
commits_df = pd.DataFrame({'commit': commits})

In [19]:
issue_df.to_csv('./data/gokube_phase1_jun19/gh_cve_issue_links.csv', index=False)
pr_df.to_csv('./data/gokube_phase1_jun19/gh_cve_pr_links.csv', index=False)