# Extract Potential CVE-Related GitHub Issue/PR/Commit Links

In [88]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import requests

In [2]:
# Load the CVE spreadsheet export, then summarise its columns and non-null counts.
df = pd.read_csv(filepath_or_buffer='./GO CVEs Data - Data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1697 entries, 0 to 1696
Data columns (total 9 columns):
CVE ID           1222 non-null object
Package name     1683 non-null object
Ecosystem        1683 non-null object
GH issue         679 non-null object
GH PR            322 non-null object
GH Commit        856 non-null object
Bugzilla         245 non-null object
ML               465 non-null object
Other sources    34 non-null object
dtypes: object(9)
memory usage: 119.4+ KB


In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,CVE ID,Package name,Ecosystem,GH issue,GH PR,GH Commit,Bugzilla,ML,Other sources
0,CVE-2017-9805,struts2-rest-plugin,maven,,,https://github.com/apache/struts/commit/194947...,https://bugzilla.redhat.com/show_bug.cgi?id=14...,,https://blogs.apache.org/foundation/entry/apac...
1,CVE-2018-8617,Microsoft.ChakraCore,nuget,,https://github.com/Microsoft/ChakraCore/pull/5869,,,,
2,CVE-2018-8629,Microsoft.ChakraCore,nuget,,https://github.com/Microsoft/ChakraCore/pull/5869,,,,
3,CVE-2018-6824,Microsoft.ChakraCore,nuget,,https://github.com/Microsoft/ChakraCore/pull/5869,,,,
4,CVE-2018-8618,Microsoft.ChakraCore,nuget,,https://github.com/Microsoft/ChakraCore/pull/5869,,,,


In [4]:
# Flatten the 'GH issue' column: one cell may hold several newline-separated
# links, so split every non-null cell and strip whitespace around each link.
gh_issues = df['GH issue'].dropna().values
print('Before:', gh_issues.shape)
gh_issues = np.array([
    link.strip()
    for cell in gh_issues
    for link in cell.strip().split('\n')
])
print('After:', gh_issues.shape)

Before: (679,)
After: (720,)


In [5]:
# Flatten the 'GH PR' column: one cell may hold several newline-separated
# links, so split every non-null cell and strip whitespace around each link.
gh_prs = df['GH PR'].dropna().values
print('Before:', gh_prs.shape)
gh_prs = np.array([
    link.strip()
    for cell in gh_prs
    for link in cell.strip().split('\n')
])
print('After:', gh_prs.shape)

Before: (322,)
After: (357,)


In [6]:
# Flatten the 'GH Commit' column: one cell may hold several newline-separated
# links, so split every non-null cell and strip whitespace around each link.
gh_commits = df['GH Commit'].dropna().values
print('Before:', gh_commits.shape)
gh_commits = np.array([
    link.strip()
    for cell in gh_commits
    for link in cell.strip().split('\n')
])
print('After:', gh_commits.shape)

Before: (856,)
After: (1056,)


In [7]:
# Pool all link types into one array (issues first, then commits, then PRs).
gh_links = np.concatenate([gh_issues, gh_commits, gh_prs])
gh_links.shape

(2133,)

In [44]:
# Collapse duplicate links; np.unique returns the distinct values sorted.
gh_links = np.unique(ar=gh_links)
gh_links.shape

(2033,)

# Extract and Save CVE-Related GitHub Links

In [92]:
def _patch_commit_links(repo_name, patch_link, timeout=30):
    """Download a GitHub ``.patch`` file and build a commit link per commit in it.

    Parameters
    ----------
    repo_name : str
        Repository URL prefix (e.g. ``https://github.com/owner/repo``) that the
        extracted hashes are appended to as ``<repo_name>/commit/<sha>``.
    patch_link : str
        URL of the ``.patch`` rendering of a pull request or compare view.
    timeout : float
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    list of str
        Commit links found in the patch; empty when the download fails.
    """
    try:
        response = requests.get(patch_link, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable link must not abort a crawl over thousands of links.
        print('Failed for link:'+patch_link+' ('+str(exc)+')')
        return []

    if response.status_code != 200:
        # TODO: log failure here instead of printing.
        print('Failed for link:'+patch_link)
        return []

    # Every commit in the mbox-style patch opens with a line of the form
    # "From <40-hex sha> Mon Sep 17 00:00:00 2001". Matching the hash exactly
    # (case-sensitive, anchored per line) keeps commit-message lines such as
    # "from the parser ..." from being mistaken for commit hashes.
    commit_hashes = re.findall(r'^From ([0-9a-f]{40}) ', response.text, re.M)
    return [repo_name+'/commit/'+sha for sha in commit_hashes]


def get_github_cve_links(links, timeout=30):
    """Classify GitHub links into issue, pull-request and commit links.

    Each link is tested against the patterns below in order; the first match wins:

    1. ``.../pull/<n>/commits/<sha>`` -> records the PR and the commit.
    2. ``.../issues/...``             -> records the issue link.
    3. ``.../pull/<n>``               -> records the PR and every commit listed
       in the PR's ``.patch`` download.
    4. ``.../commit/<sha>``           -> records the commit (``#fragment`` removed).
    5. ``.../compare/...``            -> records every commit listed in the
       comparison's ``.patch`` download.

    Links matching none of the patterns are ignored.

    Parameters
    ----------
    links : iterable of str
        Candidate GitHub URLs.
    timeout : float, optional
        Seconds to wait for each ``.patch`` download (default 30).

    Returns
    -------
    tuple of numpy.ndarray
        ``(issues, prs, commits)``; each array holds unique links in sorted
        order, so repeated runs over the same input give the same output.
    """
    pr_commit_re = re.compile(r'(.*github\.com/.*?)/pull/(.*?)/commits/(.*)', re.I)
    issue_re = re.compile(r'(.*github\.com/.*?)/issues/(.*)', re.I)
    # Require at least one digit so a bare ".../pull/" is not treated as a PR.
    pr_re = re.compile(r'(.*github\.com/.*?)/pull/([0-9]+)', re.I)
    commit_re = re.compile(r'(.*github\.com/.*?)/commit/(.*)', re.I)
    compare_re = re.compile(r'(.*github\.com/.*?)/compare/.*', re.I)

    issues = set()
    prs = set()
    commits = set()

    for link in tqdm(links):

        match = pr_commit_re.search(link)
        if match:
            repo_name, pr_number, commit_ref = match.groups()
            prs.add(repo_name+'/pull/'+pr_number.rstrip('/'))
            # Drop any "#diff-..." fragment, as is done for plain commit links.
            commit_ref = commit_ref.split('#')[0].strip().rstrip('/')
            commits.add(repo_name+'/commit/'+commit_ref)
            continue

        if issue_re.search(link):
            issues.add(link.rstrip('/'))
            continue

        match = pr_re.search(link)
        if match:
            repo_name, pr_number = match.groups()
            pr = repo_name+'/pull/'+pr_number
            prs.add(pr)
            commits.update(_patch_commit_links(repo_name, pr+'.patch', timeout))
            continue

        if commit_re.search(link):
            commits.add(link.split('#')[0].strip().rstrip('/'))
            continue

        match = compare_re.search(link)
        if match:
            repo_name = match.group(1)
            patch_link = link.rstrip('/')+'.patch'
            commits.update(_patch_commit_links(repo_name, patch_link, timeout))

    # Sort so the returned arrays (and the CSVs built from them) are reproducible.
    return np.array(sorted(issues)), np.array(sorted(prs)), np.array(sorted(commits))

In [93]:
# Network-bound step: the recorded run took about 15 minutes for 2033 links.
issues, prs, commits = get_github_cve_links(links=gh_links)

100%|██████████| 2033/2033 [14:55<00:00,  2.27it/s] 


In [94]:
# Number of unique issue, pull-request and commit links that were extracted.
tuple(found.shape for found in (issues, prs, commits))

((249,), (329,), (2980,))

In [97]:
# Wrap each link array in a single-column frame ready for export.
issue_df = pd.DataFrame({'issue': issues})
pr_df = pd.DataFrame({'pull_request': prs})
commits_df = pd.DataFrame({'commit': commits})

In [98]:
# Persist each link table as its own CSV, without the index column.
for frame, csv_name in (
    (issue_df, 'gh_sec_issue_links.csv'),
    (pr_df, 'gh_sec_pr_links.csv'),
    (commits_df, 'gh_sec_commit_links.csv'),
):
    frame.to_csv(csv_name, index=False)