# Processing GitHub Issues

In [1]:
import pandas as pd
import numpy as np
import os

In [131]:
gh_bq_issues = pd.read_csv('GH_issues.csv')
gh_bq_issues.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434016 entries, 0 to 434015
Data columns (total 18 columns):
repo_name                434016 non-null object
event_type               434016 non-null object
actor_id                 434016 non-null int64
actor_name               434016 non-null object
issue_status             434016 non-null object
issue_api_url            427611 non-null object
issue_url                427611 non-null object
issue_creator_name       427611 non-null object
issue_creator_api_url    427611 non-null object
issue_creator_url        422971 non-null object
comment_count            427611 non-null float64
issue_id                 427611 non-null float64
issue_number             427611 non-null float64
issue_created_at         427611 non-null object
issue_updated_at         427611 non-null object
issue_closed_at          198629 non-null object
issue_title              427609 non-null object
issue_body               414324 non-null object
dtypes: float64(3), int

In [132]:
gh_bq_issues[pd.isnull(gh_bq_issues.issue_id)].shape

(6405, 18)

In [133]:
gh_bq_issues = gh_bq_issues[~pd.isnull(gh_bq_issues.issue_id)]
gh_bq_issues.shape

(427611, 18)

In [134]:
gh_bq_issues.issue_id.value_counts().iloc[:10]

179150396.0    18
316179685.0    10
230355205.0    10
196166690.0    10
266556716.0    10
126604565.0     9
313417242.0     9
197549453.0     8
201596242.0     8
95072991.0      8
Name: issue_id, dtype: int64

# Checking and Removing Issues with wrong timestamp

In [135]:
[item for item in gh_bq_issues.issue_created_at.tolist() if item.startswith('3')]

['3012-03-02T14:10:15Z', '3012-03-02T14:10:15Z', '3012-03-02T14:10:15Z']

In [136]:
[item for item in gh_bq_issues.issue_updated_at.tolist() if item.startswith('3')]

[]

In [137]:
[item for item in gh_bq_issues.issue_closed_at.tolist() if not pd.isnull(item) and item.startswith('3')]

[]

In [138]:
gh_bq_issues = gh_bq_issues[~(gh_bq_issues.issue_created_at == '3012-03-02T14:10:15Z')]

In [139]:
gh_bq_issues.issue_created_at = pd.to_datetime(gh_bq_issues.issue_created_at)
gh_bq_issues.issue_updated_at = pd.to_datetime(gh_bq_issues.issue_updated_at)
gh_bq_issues.issue_closed_at = pd.to_datetime(gh_bq_issues.issue_closed_at)
gh_bq_issues.shape

(427608, 18)

# Removing Duplicate Issues 
### Keeping each issue record based on last updated timestamp

In [140]:
gh_bq_issues = gh_bq_issues.loc[gh_bq_issues.groupby('issue_id').issue_updated_at.idxmax(skipna=False)]
gh_bq_issues.shape

(241886, 18)

In [141]:
gh_bq_issues.issue_id.value_counts().iloc[:10]

268435422.0    1
348539014.0    1
174274733.0    1
348548638.0    1
87137133.0     1
174273911.0    1
2723023.0      1
348546546.0    1
84961244.0     1
174272484.0    1
Name: issue_id, dtype: int64

# Reading in Positive CVE related GitHub Issue Links

In [142]:
gh_pos_issue_links = pd.read_csv('gh_sec_issue_links.csv')
gh_pos_issue_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249 entries, 0 to 248
Data columns (total 1 columns):
issue    249 non-null object
dtypes: object(1)
memory usage: 2.0+ KB


In [143]:
cve_issue_links = gh_pos_issue_links.issue.tolist()
len(cve_issue_links)

249

# Extracting CVE issues from the larger issues dataset

In [144]:
cve_issues = gh_bq_issues[gh_bq_issues.issue_url.isin(cve_issue_links)]
cve_issues.shape

(201, 18)

In [145]:
cve_issues.issue_url.value_counts().iloc[:10]

https://github.com/pippo-java/pippo/issues/466          1
https://github.com/bram85/topydo/issues/240             1
https://github.com/dlitz/pycrypto/issues/253            1
https://github.com/primefaces/primefaces/issues/3468    1
https://github.com/spray/spray-json/issues/278          1
https://github.com/coveo/saml-client/issues/7           1
https://github.com/geminabox/geminabox/issues/278       1
https://github.com/nahsra/antisamy/issues/2             1
https://github.com/twbs/bootstrap-sass/issues/1157      1
https://github.com/ansible/ansible/issues/42388         1
Name: issue_url, dtype: int64

In [146]:
cve_issues.head()

Unnamed: 0,repo_name,event_type,actor_id,actor_name,issue_status,issue_api_url,issue_url,issue_creator_name,issue_creator_api_url,issue_creator_url,comment_count,issue_id,issue_number,issue_created_at,issue_updated_at,issue_closed_at,issue_title,issue_body
108557,pypa/pip,IssuesEvent,716529,glyph,opened,https://api.github.com/repos/pypa/pip/issues/425,https://github.com/pypa/pip/issues/425,glyph,https://api.github.com/users/glyph,,0.0,2728444.0,425.0,2012-01-04 21:15:03,2012-01-04 21:15:03,NaT,pip should not execute arbitrary code from the...,"When you 'pip install' something, it fetches t..."
307488,yaml/pyyaml,IssuesEvent,240830,sigmavirus24,closed,https://api.github.com/repos/yaml/pyyaml/issues/5,https://github.com/yaml/pyyaml/issues/5,sigmavirus24,https://api.github.com/users/sigmavirus24,https://github.com/sigmavirus24,8.0,23552298.0,5.0,2013-12-02 04:56:41,2017-08-26 15:26:02,2017-08-26 15:26:02,Make load safe_load,- [ ] Make `yaml.load` default to safe - [ ] A...
213339,neo4j/neo4j,IssuesEvent,79651,cleishm,closed,https://api.github.com/repos/neo4j/neo4j/issue...,https://github.com/neo4j/neo4j/issues/2826,DinisCruz,https://api.github.com/users/DinisCruz,https://github.com/DinisCruz,2.0,39909610.0,2826.0,2014-08-10 15:09:12,2015-04-20 01:27:27,2015-04-20 01:27:27,CSRF vulnerability CVE-2013-7259,"Hi, I'm looking at using Neo4J for my current ..."
280227,jruby/jruby,IssuesEvent,45967,kares,closed,https://api.github.com/repos/jruby/jruby/issue...,https://github.com/jruby/jruby/issues/2452,headius,https://api.github.com/users/headius,https://github.com/headius,11.0,54080259.0,2452.0,2015-01-12 17:14:59,2017-06-22 18:08:14,2017-06-22 18:08:14,Add salt to Array#hash,"For #2437, we partially aligned our Array#hash..."
169019,pypiserver/pypiserver,IssuesEvent,501585,ankostis,closed,https://api.github.com/repos/pypiserver/pypise...,https://github.com/pypiserver/pypiserver/issue...,zenzic64,https://api.github.com/users/zenzic64,https://github.com/zenzic64,2.0,58386527.0,77.0,2015-02-20 17:47:09,2015-02-27 12:15:59,2015-02-27 12:15:59,Cross Site Scripting Vulnerability,Versions up to and including 1.1.7 contain a c...


In [147]:
found_issue_urls = cve_issues.issue_url.tolist()

# Getting CVE Positive issues not found in Big Query Data

In [151]:
not_found_issue_urls = list(set(cve_issue_links) - set(found_issue_urls))
not_found_issue_urls[:10], len(not_found_issue_urls)

(['https://github.com/spring-projects/spring-security/issues/3563',
  'https://github.com/DozerMapper/dozer/issues/217',
  'https://github.com/urllib3/urllib3/issues/1316',
  'https://github.com/ajenti/ajenti/issues/602',
  'https://github.com/elastic/elasticsearch/issues/5853',
  'https://github.com/netty/netty/issues/2441',
  'https://github.com/Snorby/snorby/issues/261',
  'https://github.com/Hygieia/Hygieia/issues/1031',
  'https://github.com/esapi/esapi-java-legacy/issues/306',
  'https://github.com/FusionAuth/fusionauth-jwt/issues/3'],
 48)

# Extracting not found CVE issues with GitHub API

In [160]:
import requests
from tqdm import tqdm
import re

In [164]:
not_found_api_urls = ['https://api.github.com/repos/'+re.search(r'.*github.com/(.*)', link, re.I).groups()[0]
                          for link in not_found_issue_urls]
not_found_api_urls[:10]

['https://api.github.com/repos/spring-projects/spring-security/issues/3563',
 'https://api.github.com/repos/DozerMapper/dozer/issues/217',
 'https://api.github.com/repos/urllib3/urllib3/issues/1316',
 'https://api.github.com/repos/ajenti/ajenti/issues/602',
 'https://api.github.com/repos/elastic/elasticsearch/issues/5853',
 'https://api.github.com/repos/netty/netty/issues/2441',
 'https://api.github.com/repos/Snorby/snorby/issues/261',
 'https://api.github.com/repos/Hygieia/Hygieia/issues/1031',
 'https://api.github.com/repos/esapi/esapi-java-legacy/issues/306',
 'https://api.github.com/repos/FusionAuth/fusionauth-jwt/issues/3']

In [172]:
data = []
for link in tqdm(not_found_api_urls):
    response = requests.get(link,
                            auth=('dipanjanS', os.environ['GITHUB_TOKEN']))
    if not response.status_code == 200:
        print('Failed for link: '+link)
        # log this later
    else:
        content = response.json()
        issue_dict = {
            'repo_name': re.search(r'.*github.com/repos/(.*?)/issues', 
                                   link, re.I).groups()[0],
            'event_type': 'IssuesEvent',
            'actor_id': content.get('user').get('id'),
            'actor_name': content.get('user').get('login'),
            'issue_status': content.get('state'),
            'issue_api_url': content.get('url'),
            'issue_url': content.get('html_url'),
            'issue_creator_name': content.get('user').get('login'),
            'issue_creator_api_url': content.get('user').get('url'),
            'issue_creator_url': content.get('user').get('html_url'),
            'comment_count': content.get('comments'),
            'issue_id': content.get('id'),
            'issue_number': content.get('number'),
            'issue_created_at': content.get('created_at'),
            'issue_updated_at': content.get('updated_at'),
            'issue_closed_at': content.get('closed_at'),
            'issue_title': content.get('title'),
            'issue_body': content.get('body')
        }
        if issue_dict:
            data.append(issue_dict)

print(len(data))

100%|██████████| 48/48 [00:59<00:00,  1.14s/it]

48





In [175]:
cve_issues_nf = pd.DataFrame(data)
cve_issues_nf = cve_issues_nf[cve_issues.columns.tolist()]
cve_issues_nf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 18 columns):
repo_name                48 non-null object
event_type               48 non-null object
actor_id                 48 non-null int64
actor_name               48 non-null object
issue_status             48 non-null object
issue_api_url            48 non-null object
issue_url                48 non-null object
issue_creator_name       48 non-null object
issue_creator_api_url    48 non-null object
issue_creator_url        48 non-null object
comment_count            48 non-null int64
issue_id                 48 non-null int64
issue_number             48 non-null int64
issue_created_at         48 non-null object
issue_updated_at         48 non-null object
issue_closed_at          43 non-null object
issue_title              48 non-null object
issue_body               48 non-null object
dtypes: int64(4), object(14)
memory usage: 6.8+ KB


In [176]:
cve_issues_nf.head()

Unnamed: 0,repo_name,event_type,actor_id,actor_name,issue_status,issue_api_url,issue_url,issue_creator_name,issue_creator_api_url,issue_creator_url,comment_count,issue_id,issue_number,issue_created_at,issue_updated_at,issue_closed_at,issue_title,issue_body
0,spring-projects/spring-security,IssuesEvent,362503,rwinch,closed,https://api.github.com/repos/spring-projects/s...,https://github.com/spring-projects/spring-secu...,rwinch,https://api.github.com/users/rwinch,https://github.com/rwinch,0,131819573,3563,2013-06-12T06:49:56Z,2014-03-25T13:35:23Z,2014-03-25T13:35:23Z,SEC-2177: DefaultRedirectStrategy could be les...,Backported https://github.com/spring-projects/...
1,DozerMapper/dozer,IssuesEvent,1059826,dfj,open,https://api.github.com/repos/DozerMapper/dozer...,https://github.com/DozerMapper/dozer/issues/217,dfj,https://api.github.com/users/dfj,https://github.com/dfj,5,51064459,217,2014-12-05T05:20:15Z,2018-01-11T02:00:57Z,,Potential remote code execution via dozer's re...,Dozer uses a reflection-based approach to type...
2,urllib3/urllib3,IssuesEvent,33599956,zockons12,closed,https://api.github.com/repos/urllib3/urllib3/i...,https://github.com/urllib3/urllib3/issues/1316,zockons12,https://api.github.com/users/zockons12,https://github.com/zockons12,17,289108377,1316,2018-01-17T01:02:04Z,2018-12-12T06:01:29Z,2018-03-29T14:38:43Z,Auth header remains during redirects,Requests does it: https://github.com/request/r...
3,ajenti/ajenti,IssuesEvent,4004716,mmetince,closed,https://api.github.com/repos/ajenti/ajenti/iss...,https://github.com/ajenti/ajenti/issues/602,mmetince,https://api.github.com/users/mmetince,https://github.com/mmetince,3,45386090,602,2014-10-09T16:16:36Z,2014-10-09T17:28:35Z,2014-10-09T17:20:46Z,Stored XSS Vulnerability,Hey\n\nI've found Stored XSS vulnerability on ...
4,elastic/elasticsearch,IssuesEvent,2914051,rtoma,closed,https://api.github.com/repos/elastic/elasticse...,https://github.com/elastic/elasticsearch/issue...,rtoma,https://api.github.com/users/rtoma,https://github.com/rtoma,11,31727017,5853,2014-04-17T13:38:05Z,2014-05-20T13:33:52Z,2014-04-25T21:45:03Z,Change default for script.disable_dynamic,Please make setting script.disable_dynamic=tru...


# Labeling CVE and non-CVE GitHub issues 

In [254]:
gh_bq_issues.shape

(241886, 18)

In [184]:
gh_bq_issues_positive = pd.concat([cve_issues, cve_issues_nf], axis=0)
gh_bq_issues_negative = gh_bq_issues.drop(cve_issues.index.tolist())
gh_bq_issues_positive.shape, gh_bq_issues_negative.shape

((249, 18), (241685, 19))

In [185]:
gh_bq_issues_positive['class_label'] = 2
gh_bq_issues_negative['class_label'] = 0

In [187]:
gh_bq_issues_processed = pd.concat([gh_bq_issues_negative, gh_bq_issues_positive], axis=0)
gh_bq_issues_processed.shape

(241934, 19)

In [189]:
gh_bq_issues_processed.tail()

Unnamed: 0,repo_name,event_type,actor_id,actor_name,issue_status,issue_api_url,issue_url,issue_creator_name,issue_creator_api_url,issue_creator_url,comment_count,issue_id,issue_number,issue_created_at,issue_updated_at,issue_closed_at,issue_title,issue_body,class_label
43,primefaces/primefaces,IssuesEvent,13161711,cnsgithub,closed,https://api.github.com/repos/primefaces/primef...,https://github.com/primefaces/primefaces/issue...,cnsgithub,https://api.github.com/users/cnsgithub,https://github.com/cnsgithub,3.0,291533030.0,3213.0,2018-01-25T11:11:31Z,2018-03-15T08:35:07Z,2018-01-26T12:38:10Z,XSS vulnerability in ButtonRenderer,## 1) Environment\r\n- PrimeFaces version: pri...,2
44,IdentityPython/pysaml2,IssuesEvent,15326,mrbrutti,closed,https://api.github.com/repos/IdentityPython/py...,https://github.com/IdentityPython/pysaml2/issu...,mrbrutti,https://api.github.com/users/mrbrutti,https://github.com/mrbrutti,21.0,181454016.0,366.0,2016-10-06T15:50:31Z,2017-11-15T11:17:13Z,2017-11-15T11:17:13Z,PySAML vulnerable to XXE,"Roland (@rohe), \n#### Description\n\nAn XML E...",2
45,saltstack/salt,IssuesEvent,507599,thatch45,closed,https://api.github.com/repos/saltstack/salt/is...,https://github.com/saltstack/salt/issues/19,thatch45,https://api.github.com/users/thatch45,https://github.com/thatch45,2.0,674338.0,19.0,2011-03-15T04:36:32Z,2011-03-15T04:58:35Z,2011-03-15T04:58:35Z,Sending a faulty command kills all the minions!,Catch faulty calls\n,2
46,jmurty/java-xmlbuilder,IssuesEvent,2177830,xiaoyongwu,closed,https://api.github.com/repos/jmurty/java-xmlbu...,https://github.com/jmurty/java-xmlbuilder/issu...,xiaoyongwu,https://api.github.com/users/xiaoyongwu,https://github.com/xiaoyongwu,1.0,38423199.0,6.0,2014-07-22T17:27:44Z,2014-07-22T20:55:04Z,2014-07-22T20:55:04Z,XMLBuilder2 is vulnerable to XML External Enti...,"I noticed that by default, the parser in XMLBu...",2
47,pypa/pip,IssuesEvent,414336,guettli,closed,https://api.github.com/repos/pypa/pip/issues/725,https://github.com/pypa/pip/issues/725,guettli,https://api.github.com/users/guettli,https://github.com/guettli,5.0,8381771.0,725.0,2012-11-15T09:30:29Z,2013-01-26T07:46:41Z,2013-01-26T06:28:27Z,/tmp/pip-build not secure,Well known temporary file names like /tmp/pip-...,2


In [190]:
gh_bq_issues_processed.to_csv('GH_issues_clean_labeled.csv', index=False)

# Processing Pull Requests

In [191]:
gh_bq_prs = pd.read_csv('GH_pull_requests.csv')
gh_bq_prs.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 746701 entries, 0 to 746700
Data columns (total 27 columns):
repo_name                   746701 non-null object
event_type                  746701 non-null object
actor_id                    746701 non-null int64
actor_name                  746701 non-null object
pr_status                   746701 non-null object
pr_id                       746701 non-null int64
pr_number                   746701 non-null int64
pr_api_url                  738849 non-null object
pr_url                      738849 non-null object
pr_diff_url                 738849 non-null object
pr_patch_url                738849 non-null object
pr_creator_name             738847 non-null object
pr_creator_api_url          738847 non-null object
pr_creator_url              679891 non-null object
pr_created_at               738849 non-null object
pr_updated_at               738849 non-null object
pr_closed_at                339114 non-null object
pr_merged_at             

# Adding in missing GitHub Pull Request Links

In [201]:
gh_bq_prs = gh_bq_prs[~pd.isnull(gh_bq_prs.pr_title)]

In [225]:
def fill_missing_links(record):
    repo_name = record['repo_name']
    pr_number = str(record['pr_number'])
    if pd.isnull(record['pr_api_url']):
        record['pr_api_url'] = 'https://api.github.com/repos/'+repo_name+'/pulls/'+pr_number
    if pd.isnull(record['pr_url']):
        record['pr_url'] = 'https://github.com/'+repo_name+'/pull/'+pr_number 
    if pd.isnull(record['pr_diff_url']):
        record['pr_diff_url'] = 'https://github.com/'+repo_name+'/pull/'+pr_number+'.diff' 
    if pd.isnull(record['pr_patch_url']):
        record['pr_patch_url'] = 'https://github.com/'+repo_name+'/pull/'+pr_number+'.patch'
    return record

In [226]:
gh_bq_prs = gh_bq_prs.apply(lambda row: fill_missing_links(row), axis=1)
gh_bq_prs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 746696 entries, 0 to 746700
Data columns (total 27 columns):
repo_name                   746696 non-null object
event_type                  746696 non-null object
actor_id                    746696 non-null int64
actor_name                  746696 non-null object
pr_status                   746696 non-null object
pr_id                       746696 non-null int64
pr_number                   746696 non-null int64
pr_api_url                  746696 non-null object
pr_url                      746696 non-null object
pr_diff_url                 746696 non-null object
pr_patch_url                746696 non-null object
pr_creator_name             738847 non-null object
pr_creator_api_url          738847 non-null object
pr_creator_url              679891 non-null object
pr_created_at               738849 non-null object
pr_updated_at               738849 non-null object
pr_closed_at                339114 non-null object
pr_merged_at             

# Checking and Removing Pull Requests with wrong timestamp

In [228]:
[item for item in gh_bq_prs.pr_created_at.tolist() if not pd.isnull(item) and item.startswith('3')]

[]

In [229]:
[item for item in gh_bq_prs.pr_updated_at.tolist() if not pd.isnull(item) and item.startswith('3')]

[]

In [230]:
[item for item in gh_bq_prs.pr_closed_at.tolist() if not pd.isnull(item) and item.startswith('3')]

[]

In [231]:
[item for item in gh_bq_prs.pr_merged_at.tolist() if not pd.isnull(item) and item.startswith('3')]

[]

# Removing Duplicate Pull Requests 
### Keeping each PR record based on last row where timestamp is missing

In [240]:
gh_bq_prs_missing_info = gh_bq_prs[pd.isnull(gh_bq_prs.pr_updated_at)]
gh_bq_prs_full_info = gh_bq_prs[~pd.isnull(gh_bq_prs.pr_updated_at)]
gh_bq_prs_missing_info.shape, gh_bq_prs_full_info.shape

((7847, 27), (738849, 27))

In [241]:
gh_bq_prs_missing_info = gh_bq_prs_missing_info.drop_duplicates(subset=['pr_url'], keep="last")
gh_bq_prs_missing_info.shape

(4307, 27)

# Removing Duplicate Pull Requests 
### Keeping each PR record based on last updated timestamp

In [244]:
gh_bq_prs_full_info.pr_created_at = pd.to_datetime(gh_bq_prs_full_info.pr_created_at)
gh_bq_prs_full_info.pr_updated_at = pd.to_datetime(gh_bq_prs_full_info.pr_updated_at)
gh_bq_prs_full_info.pr_closed_at = pd.to_datetime(gh_bq_prs_full_info.pr_closed_at)
gh_bq_prs_full_info.pr_merged_at = pd.to_datetime(gh_bq_prs_full_info.pr_merged_at)
gh_bq_prs_full_info.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


(738849, 27)

In [245]:
gh_bq_prs_full_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 738849 entries, 6 to 746700
Data columns (total 27 columns):
repo_name                   738849 non-null object
event_type                  738849 non-null object
actor_id                    738849 non-null int64
actor_name                  738849 non-null object
pr_status                   738849 non-null object
pr_id                       738849 non-null int64
pr_number                   738849 non-null int64
pr_api_url                  738849 non-null object
pr_url                      738849 non-null object
pr_diff_url                 738849 non-null object
pr_patch_url                738849 non-null object
pr_creator_name             738847 non-null object
pr_creator_api_url          738847 non-null object
pr_creator_url              679891 non-null object
pr_created_at               738849 non-null datetime64[ns]
pr_updated_at               738849 non-null datetime64[ns]
pr_closed_at                339114 non-null datetime64[ns]
p

In [246]:
gh_bq_prs_full_info = gh_bq_prs_full_info.loc[gh_bq_prs_full_info.groupby('pr_url').pr_updated_at.idxmax(skipna=False)]
gh_bq_prs_full_info.shape

(355448, 27)

In [248]:
gh_bq_prs = pd.concat([gh_bq_prs_full_info, gh_bq_prs_missing_info], axis=0)
gh_bq_prs.shape

(359755, 27)

# Reading in Positive CVE related GitHub Pull Request Links

In [249]:
gh_pos_pr_links = pd.read_csv('gh_sec_pr_links.csv')
gh_pos_pr_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 329 entries, 0 to 328
Data columns (total 1 columns):
pull_request    329 non-null object
dtypes: object(1)
memory usage: 2.6+ KB


In [250]:
cve_pr_links = gh_pos_pr_links.pull_request.tolist()
len(cve_pr_links)

329

# Extracting CVE Pull Requests from the larger PRs dataset

In [251]:
cve_prs = gh_bq_prs[gh_bq_prs.pr_url.isin(cve_pr_links)]
cve_prs.shape

(280, 27)

In [252]:
cve_prs.head()

Unnamed: 0,repo_name,event_type,actor_id,actor_name,pr_status,pr_id,pr_number,pr_api_url,pr_url,pr_diff_url,...,pr_merged_at,pr_merged_status,pr_comments_count,pr_review_comments_count,pr_commits_count,pr_additions_count,pr_deletions_count,pr_changed_files_count,pr_title,pr_body
343338,3breadt/dd-plist,PullRequestEvent,1140937,3breadt,closed,88005493,26,https://api.github.com/repos/3breadt/dd-plist/...,https://github.com/3breadt/dd-plist/pull/26,https://github.com/3breadt/dd-plist/pull/26.diff,...,2016-10-05 18:08:59,True,3.0,0.0,2,61,48,2.0,Please consider a few minor changes,1) Take steps to guard against XXE attacks (no...
592948,Bedework/bw-webdav,PullRequestEvent,2991622,douglm,closed,232818232,1,https://api.github.com/repos/Bedework/bw-webda...,https://github.com/Bedework/bw-webdav/pull/1,https://github.com/Bedework/bw-webdav/pull/1.diff,...,2018-11-28 15:31:02,True,0.0,2.0,7,149,59,8.0,secure xml,I found out that this library is open to some ...
527042,CVEProject/cvelist,PullRequestEvent,32679304,cve-team,closed,153153827,78,https://api.github.com/repos/CVEProject/cvelis...,https://github.com/CVEProject/cvelist/pull/78,https://github.com/CVEProject/cvelist/pull/78....,...,2017-11-17 20:19:46,True,2.0,1.0,1,62,0,1.0,Added CVE-2017-1000190,
501691,FasterXML/jackson-core,PullRequestEvent,55065,cowtowncoder,closed,88346565,322,https://api.github.com/repos/FasterXML/jackson...,https://github.com/FasterXML/jackson-core/pull...,https://github.com/FasterXML/jackson-core/pull...,...,2017-01-12 03:35:59,True,13.0,0.0,2,98,2,3.0,Trim tokens in error messages to 256 byte to p...,See https://issues.jboss.org/browse/JBEAP-6316...
624805,FasterXML/jackson-modules-java8,PullRequestEvent,55065,cowtowncoder,closed,224078993,87,https://api.github.com/repos/FasterXML/jackson...,https://github.com/FasterXML/jackson-modules-j...,https://github.com/FasterXML/jackson-modules-j...,...,2018-10-24 02:41:33,True,1.0,0.0,3,367,50,6.0,Prevent unbounded latency converting decimals ...,This change prevents latency explosions when w...


# Getting CVE Positive Pull Requests not found in Big Query Data

In [253]:
found_pr_urls = cve_prs.pr_url.tolist()
not_found_pr_urls = list(set(cve_pr_links) - set(found_pr_urls))
not_found_pr_urls[:10], len(not_found_pr_urls)

(['https://github.com/webbynode/webbynode/pull/85',
  'https://github.com/scrapy/scrapy/pull/676',
  'https://github.com/jupyter-widgets/ipywidgets/pull/591',
  'https://github.com/ruby-i18n/i18n/pull/289',
  'https://github.com/resteasy/Resteasy/pull/425',
  'https://github.com/wildfly/wildfly/pull/5234',
  'https://github.com/apache/airflow/pull/2054',
  'https://github.com/divio/cmsplugin-filer/pull/185',
  'https://github.com/richfaces4/core/pull/21',
  'https://github.com/pypa/pip/pull/1098'],
 49)

In [257]:
not_found_api_urls = [re.sub('/pull/', '/pulls/', 
                             ('https://api.github.com/repos/'+re.search(r'.*github.com/(.*)', link, re.I)
                              .groups()[0]), re.I)
                          for link in not_found_pr_urls]
not_found_api_urls[:10]

['https://api.github.com/repos/webbynode/webbynode/pulls/85',
 'https://api.github.com/repos/scrapy/scrapy/pulls/676',
 'https://api.github.com/repos/jupyter-widgets/ipywidgets/pulls/591',
 'https://api.github.com/repos/ruby-i18n/i18n/pulls/289',
 'https://api.github.com/repos/resteasy/Resteasy/pulls/425',
 'https://api.github.com/repos/wildfly/wildfly/pulls/5234',
 'https://api.github.com/repos/apache/airflow/pulls/2054',
 'https://api.github.com/repos/divio/cmsplugin-filer/pulls/185',
 'https://api.github.com/repos/richfaces4/core/pulls/21',
 'https://api.github.com/repos/pypa/pip/pulls/1098']

# Extracting not found CVE Pull Requests with GitHub API

In [259]:
data = []
for link in tqdm(not_found_api_urls):
    response = requests.get(link,
                            auth=('dipanjanS', os.environ['GITHUB_TOKEN']))
    if not response.status_code == 200:
        print('Failed for link: '+link)
        # log this later
    else:
        content = response.json()
        pr_dict = {
            'repo_name': re.search(r'.*github.com/repos/(.*?)/pulls', 
                                   link, re.I).groups()[0],
            'event_type': 'PullRequestEvent',
            'actor_id': content.get('user').get('id'),
            'actor_name': content.get('user').get('login'),
            'pr_status': content.get('state'),
            'pr_id': content.get('id'),
            'pr_number': content.get('number'),
            'pr_api_url': content.get('url'),
            'pr_url': content.get('html_url'),
            'pr_diff_url': content.get('diff_url'),
            'pr_patch_url': content.get('patch_url'),
            'pr_creator_name': content.get('user').get('login'),
            'pr_creator_api_url': content.get('user').get('url'),
            'pr_creator_url': content.get('user').get('html_url'),         
            'pr_created_at': content.get('created_at'),
            'pr_updated_at': content.get('updated_at'),
            'pr_closed_at': content.get('closed_at'),
            'pr_merged_at': content.get('merged_at'),
            'pr_merged_status': content.get('merged'),
            'pr_comments_count': content.get('comments'),
            'pr_review_comments_count': content.get('review_comments'),
            'pr_commits_count': content.get('commits'),
            'pr_additions_count': content.get('additions'),
            'pr_deletions_count': content.get('deletions'),
            'pr_changed_files_count': content.get('changed_files'),
            'pr_title': content.get('title'),
            'pr_body': content.get('body')
        }
        if pr_dict:
            data.append(pr_dict)

print(len(data))

100%|██████████| 49/49 [01:12<00:00,  1.50s/it]

49





In [260]:
cve_prs_nf = pd.DataFrame(data)
cve_prs_nf = cve_prs_nf[cve_prs.columns.tolist()]
cve_prs_nf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 27 columns):
repo_name                   49 non-null object
event_type                  49 non-null object
actor_id                    49 non-null int64
actor_name                  49 non-null object
pr_status                   49 non-null object
pr_id                       49 non-null int64
pr_number                   49 non-null int64
pr_api_url                  49 non-null object
pr_url                      49 non-null object
pr_diff_url                 49 non-null object
pr_patch_url                49 non-null object
pr_creator_name             49 non-null object
pr_creator_api_url          49 non-null object
pr_creator_url              49 non-null object
pr_created_at               49 non-null object
pr_updated_at               49 non-null object
pr_closed_at                48 non-null object
pr_merged_at                33 non-null object
pr_merged_status            49 non-null bool
pr_commen

In [261]:
cve_prs_nf.head()

Unnamed: 0,repo_name,event_type,actor_id,actor_name,pr_status,pr_id,pr_number,pr_api_url,pr_url,pr_diff_url,...,pr_merged_at,pr_merged_status,pr_comments_count,pr_review_comments_count,pr_commits_count,pr_additions_count,pr_deletions_count,pr_changed_files_count,pr_title,pr_body
0,webbynode/webbynode,PullRequestEvent,1929595,lcashdol,closed,10282557,85,https://api.github.com/repos/webbynode/webbyno...,https://github.com/webbynode/webbynode/pull/85,https://github.com/webbynode/webbynode/pull/85...,...,,False,3,0,2,5,3,2,Untested fix for the command injection vulnera...,"Hi,\nI didn't test this fix, but it should mit..."
1,scrapy/scrapy,PullRequestEvent,360285,csalazar,closed,14236088,676,https://api.github.com/repos/scrapy/scrapy/pul...,https://github.com/scrapy/scrapy/pull/676,https://github.com/scrapy/scrapy/pull/676.diff,...,2014-04-08T17:40:04Z,True,10,2,4,34,3,4,Fixed XXE flaw in sitemap reader,"The XML reader, used by SitemapSpider to proce..."
2,jupyter-widgets/ipywidgets,PullRequestEvent,151929,minrk,closed,70481422,591,https://api.github.com/repos/jupyter-widgets/i...,https://github.com/jupyter-widgets/ipywidgets/...,https://github.com/jupyter-widgets/ipywidgets/...,...,2016-05-19T18:59:40Z,True,6,0,2,12,6,1,only filter stored snapshots,don't re-render outputs on the page\n\nThis sh...
3,ruby-i18n/i18n,PullRequestEvent,628,lmarlow,closed,21880255,289,https://api.github.com/repos/ruby-i18n/i18n/pu...,https://github.com/ruby-i18n/i18n/pull/289,https://github.com/ruby-i18n/i18n/pull/289.diff,...,2015-01-10T19:17:49Z,True,10,0,1,7,1,2,Teach Hash#slice to only include keys that exi...,Previously this would blow up if you asked for...
4,resteasy/Resteasy,PullRequestEvent,538611,ronsigal,closed,10075938,425,https://api.github.com/repos/resteasy/Resteasy...,https://github.com/resteasy/Resteasy/pull/425,https://github.com/resteasy/Resteasy/pull/425....,...,2013-12-04T15:39:29Z,True,0,0,7,376,368,15,"Branch_2_3: RESTEASY-869, RESTEASY-938",This is my attempt to unravel the disastrous P...


# Labeling CVE and non-CVE GitHub Pull Requests 

In [262]:
gh_bq_prs.shape

(359755, 27)

In [263]:
gh_bq_prs_positive = pd.concat([cve_prs, cve_prs_nf], axis=0)
gh_bq_prs_negative = gh_bq_prs.drop(cve_prs.index.tolist())
gh_bq_prs_positive.shape, gh_bq_prs_negative.shape

((329, 27), (359475, 27))

In [266]:
gh_bq_prs_positive['class_label'] = 2
gh_bq_prs_negative['class_label'] = 0

In [267]:
gh_bq_prs_processed = pd.concat([gh_bq_prs_negative, gh_bq_prs_positive], axis=0)
gh_bq_prs_processed.shape

(359804, 28)

In [268]:
gh_bq_prs_processed.to_csv('GH_prs_clean_labeled.csv', index=False)