In [2]:
import bq_utils as bqu
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize
import arrow
import gc

In [3]:
import os

# Expose the service-account key file through the environment before the
# BigQuery helper is built, then create a helper bound to the public
# `githubarchive` project and its `day` dataset.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS='../../../auth/bq_key.json')
gh_archive = bqu.BigQueryHelper(dataset_name="day",
                                active_project="githubarchive")

In [4]:
# NOTE(review): this exploratory call fails with the ValueError shown in the
# output below. Presumably the issue payload is a nested JSON object that
# pandas cannot lay out as a DataFrame; `typ='series'` or `json_normalize`
# look like the intended tools here -- TODO confirm, or drop this cell.
pd.read_json('https://api.github.com/repos/openshift/origin/issues/6829')

ValueError: Mixing dicts with non-Series may lead to ambiguous ordering.

In [115]:
gh_repo_links = ['https://github.com/golang/go', 'https://github.com/hashicorp/consul']

In [116]:
import re

# Extract the "<owner>/<repo>" part of every GitHub link.
# The dot in "github.com" is escaped so it only matches a literal '.', and
# each link is searched once instead of twice. Links that do not match, or
# that have nothing after the slash, are dropped.
pattern = re.compile(r'.*?github\.com/(.*)', re.I)
repo_matches = (pattern.search(link) for link in gh_repo_links)
repo_names = np.array([match.group(1)
                       for match in repo_matches
                       if match and match.group(1)])
repo_names[:10], repo_names.shape

(array(['golang/go', 'hashicorp/consul'], dtype='<U16'), (2,))

In [117]:
def add_query_params(query, params_dict):
    """Fill a query template by plain text substitution.

    Every key of ``params_dict`` is treated as a literal placeholder and each
    of its occurrences in ``query`` is replaced by the mapped string value.
    Substitutions are applied one after another in the mapping's iteration
    order.

    Returns the resulting query string.
    """
    rendered = query
    for placeholder, replacement in params_dict.items():
        rendered = rendered.replace(placeholder, replacement)
    return rendered

In [118]:
# Days of interest, parsed with arrow and converted to UTC, then rendered as
# compact YYYYMMDD strings.
specific_dates = [arrow.get(stamp).to('UTC')
                  for stamp in ('2019-03-05 00:00:00',
                                '2019-03-06 00:00:00',
                                '2019-03-13 00:00:00')]
specific_days = list(map(lambda day: day.format('YYYYMMDD'), specific_dates))
specific_days, len(specific_days)

(['20190305', '20190306', '20190313'], 3)

In [119]:
# Placeholder -> text substitutions applied to the SQL templates below.
# The table wildcard covers the leading "20" of the day string, so only the
# remaining YYMMDD part is matched against _TABLE_SUFFIX.
year_prefix = '20*'
date_list = [day[2:] for day in specific_days]
quoted_days = ', '.join("'" + suffix + "'" for suffix in date_list)
quoted_repos = ', '.join("'" + name + "'" for name in repo_names)
query_params = {
    '{year_prefix_wildcard}': year_prefix,
    '{year_suffix_month_day}': '(' + quoted_days + ')',
    '{repo_names}': '(' + quoted_repos + ')'
}

In [120]:
# Sanity-check query: count pull-request and issue events per type for the
# selected repositories on the selected days. The {…} placeholders are
# filled in by add_query_params before the query is sent anywhere.
query = """
SELECT  type, count(*)
        FROM `githubarchive.day.{year_prefix_wildcard}`
        WHERE _TABLE_SUFFIX IN {year_suffix_month_day}
        AND repo.name in {repo_names}
        AND type in ('PullRequestEvent', 'IssuesEvent')
        GROUP BY type
"""
query = add_query_params(query, query_params)
# Check the estimated scan size before actually running the query
# (unit is whatever bq_utils reports -- presumably GB; TODO confirm).
gh_archive.estimate_query_size(query)

0.20680708345025778

In [121]:
df = gh_archive.query_to_pandas(query)
df

Unnamed: 0,type,f0_
0,IssuesEvent,169
1,PullRequestEvent,43


In [122]:
# Issue-event details for the selected repositories and days. Title and body
# have line breaks replaced by a space and runs of whitespace collapsed.
#
# The template is a normal (non-raw) Python string, so every backslash meant
# for the SQL regex literals is written doubled. `\\s` used to be written as
# `\s`, an invalid Python escape sequence that triggers a warning; the text
# sent to BigQuery is unchanged.
query = """
SELECT 
    repo.name as repo_name, 
    type as event_type, 
    actor.id as actor_id,
    actor.login as actor_name,
    JSON_EXTRACT_SCALAR(payload, '$.action') as issue_status,
    JSON_EXTRACT_SCALAR(payload, '$.issue.url') as issue_api_url,
    JSON_EXTRACT_SCALAR(payload, '$.issue.html_url') as issue_url,
    JSON_EXTRACT_SCALAR(payload, '$.issue.user.login') as issue_creator_name,
    JSON_EXTRACT_SCALAR(payload, '$.issue.user.url') as issue_creator_api_url,
    JSON_EXTRACT_SCALAR(payload, '$.issue.user.html_url') as issue_creator_url,
    JSON_EXTRACT_SCALAR(payload, '$.issue.comments') as comment_count,
    JSON_EXTRACT_SCALAR(payload, '$.issue.id') as issue_id,
    JSON_EXTRACT_SCALAR(payload, '$.issue.number') as issue_number,
    JSON_EXTRACT_SCALAR(payload, '$.issue.created_at') as issue_created_at,
    JSON_EXTRACT_SCALAR(payload, '$.issue.updated_at') as issue_updated_at,
    JSON_EXTRACT_SCALAR(payload, '$.issue.closed_at') as issue_closed_at,
    TRIM(REGEXP_REPLACE(
             REGEXP_REPLACE(
                 JSON_EXTRACT_SCALAR(payload, '$.issue.title'), 
                 r'\\r\\n|\\r|\\n', 
                 ' '),
             r'\\s{2,}', 
             ' ')) as issue_title,
    TRIM(REGEXP_REPLACE(
             REGEXP_REPLACE(
                 JSON_EXTRACT_SCALAR(payload, '$.issue.body'), 
                 r'\\r\\n|\\r|\\n', 
                 ' '),
             r'\\s{2,}', 
             ' ')) as issue_body
        
FROM `githubarchive.day.{year_prefix_wildcard}`
    WHERE _TABLE_SUFFIX IN {year_suffix_month_day}
    AND repo.name in {repo_names}
    AND type = 'IssuesEvent'
    """

query = add_query_params(query, query_params)
gh_archive.estimate_query_size(query)

14.833640499040484

In [123]:
# Run the issue query, parse the timestamp columns, and keep a single row per
# issue: the event carrying the largest `issue_updated_at`.
issues_df = gh_archive.query_to_pandas(query)
for ts_col in ('issue_created_at', 'issue_updated_at', 'issue_closed_at'):
    issues_df[ts_col] = pd.to_datetime(issues_df[ts_col])
latest_event_idx = (issues_df.groupby('issue_url')
                             .issue_updated_at
                             .idxmax(skipna=False))
issues_df = issues_df.loc[latest_event_idx].reset_index(drop=True)
issues_df.shape

(140, 18)

In [124]:
urls = ['https://github.com/golang/go/issues/30642', 
        'https://github.com/golang/go/issues/30794', 
        'https://github.com/hashicorp/consul/issues/5423']
issues_df[issues_df.issue_url.isin(urls)]

Unnamed: 0,repo_name,event_type,actor_id,actor_name,issue_status,issue_api_url,issue_url,issue_creator_name,issue_creator_api_url,issue_creator_url,comment_count,issue_id,issue_number,issue_created_at,issue_updated_at,issue_closed_at,issue_title,issue_body
96,golang/go,IssuesEvent,10643,zx2c4,opened,https://api.github.com/repos/golang/go/issues/...,https://github.com/golang/go/issues/30642,zx2c4,https://api.github.com/users/zx2c4,https://github.com/zx2c4,0,418006511,30642,2019-03-06 21:00:30,2019-03-06 21:00:30,NaT,runtime: dll injection vulnerabilities on Windows,@bradfitz suggested I open an issue for this r...
109,golang/go,IssuesEvent,47526072,GRagdoll,closed,https://api.github.com/repos/golang/go/issues/...,https://github.com/golang/go/issues/30794,GRagdoll,https://api.github.com/users/GRagdoll,https://github.com/GRagdoll,3,420290799,30794,2019-03-13 02:40:30,2019-03-13 06:31:26,2019-03-13 06:31:26,net/http CRLF injection vulnerability,<!-- Please answer these questions before subm...
135,hashicorp/consul,IssuesEvent,1641976,mkeeler,closed,https://api.github.com/repos/hashicorp/consul/...,https://github.com/hashicorp/consul/issues/5423,mkeeler,https://api.github.com/users/mkeeler,https://github.com/mkeeler,0,416952376,5423,2019-03-04 19:17:00,2019-03-05 19:31:22,2019-03-05 19:31:22,Consul CVE-2019-8336: Potential Privilege Esca...,An internal investigation led to the discovery...


In [125]:
# Build the candidate table fed to the classifiers: one row per issue, with
# the literal string 'null' as placeholder for fields that are not known yet.
df = pd.DataFrame()
df['repository'] = issues_df['repo_name'].tolist()
df['ecosystem'] = ['golang'] * len(issues_df)
df['repo_url'] = ['https://github.com/'+repo_name 
                       for repo_name in issues_df['repo_name'].tolist()]
df['package'] = df['repository']
df['cause_type'] = ['Issue'] * len(issues_df)
df['issue_url'] = issues_df['issue_url']
df['issue_date'] = issues_df['issue_created_at']
df['fixed_url'] = 'null'
df['fixed_date'] = 'null'
df['commit_url'] = 'null'
df['commit_date'] = 'null'
df['identified_url'] = df['issue_url']
df['identified_date'] = df['issue_date']
df['files_changed'] = 'null'
df['flagged_score'] = 'null'
df['flagged_at'] = 'null'

# Title and body can be missing. Without the fillna a missing body turns the
# whole description into NaN (str + NaN) and a missing title becomes the text
# 'None'. Stripping the joined text lets the emptiness filter below actually
# fire; before, the joining space meant the value was never ''.
title_text = issues_df['issue_title'].fillna('').map(str)
body_text = issues_df['issue_body'].fillna('').map(str)
df['description'] = (title_text + ' ' + body_text).str.strip()

df = df[df['description'] != '']
data_descriptions = df['description'].values

# [position, document, total] triples consumed by the parallel preprocessing.
total_docs = len(data_descriptions)
data_desc_input = [[idx, doc, total_docs] for idx, doc in enumerate(data_descriptions)]

In [126]:
%%time

from utils import text_normalizer as tn
from concurrent import futures
import threading


def parallel_preprocessing(idx, doc, total_docs):
    """Normalise one document, printing occasional progress.

    A progress line naming the current worker thread is printed for every
    5000th document and for the last one (``idx == total_docs - 1``).

    Returns whatever ``tn.pre_process_document`` yields for ``doc``.
    """
    should_log = idx % 5000 == 0 or idx == (total_docs - 1)
    if should_log:
        worker_name = threading.current_thread().name
        print('{}: working on doc num: {}'.format(worker_name, idx))
    return tn.pre_process_document(doc)


# Normalise all descriptions on a thread pool. The executor is used as a
# context manager so its worker threads are shut down when the block exits,
# even if a worker raises; previously the pool was never shut down.
print('preprocessing: starting')
with futures.ThreadPoolExecutor(max_workers=None) as ex:
    # zip(*...) transposes the [idx, doc, total] triples into the three
    # parallel argument sequences that Executor.map expects.
    norm_descriptions_map = ex.map(parallel_preprocessing,
                                   *zip(*data_desc_input))
    # Executor.map yields results in input order.
    norm_descriptions = list(norm_descriptions_map)

preprocessing: starting
ThreadPoolExecutor-3_0: working on doc num: 0
ThreadPoolExecutor-3_14: working on doc num: 139
CPU times: user 540 ms, sys: 12.1 ms, total: 552 ms
Wall time: 541 ms


In [127]:
df['norm_description'] = norm_descriptions

In [128]:
from models import security_dl_classifier as sdc

# Security / non-security classifier: build the architecture, load the saved
# weights, then grab the underlying model object used for prediction.
sec_tokenizer_path = '../../../tokenizer_vocab/sec_tokenizer_word2idx.pkl'
sec_weights_path = '../../../models/model1_sec_nonsec_demo_weights2.h5'

sc = sdc.SecurityClassifier(tokenizer_path=sec_tokenizer_path,
                            embedding_size=300,
                            max_length=1000)
sc.build_model_architecture()
sc.load_model_weights(model_weights_path=sec_weights_path)

sc_model = sc.get_model()

Loading Tokenizer Vocabulary
Building Model Architecture
Loading Model Weights


In [129]:
norm_descriptions = df['norm_description'].tolist()
sec_docs = sc.prepare_inference_data(norm_descriptions)
sec_docs.shape

(140, 1000)

In [130]:
df[df.issue_url.isin(urls)]

Unnamed: 0,repository,ecosystem,repo_url,package,cause_type,issue_url,issue_date,fixed_url,fixed_date,commit_url,commit_date,identified_url,identified_date,files_changed,flagged_score,flagged_at,description,norm_description
96,golang/go,golang,https://github.com/golang/go,golang/go,Issue,https://github.com/golang/go/issues/30642,2019-03-06 21:00:30,,,,,https://github.com/golang/go/issues/30642,2019-03-06 21:00:30,,,,runtime: dll injection vulnerabilities on Wind...,runtime dll injection vulnerabilities on windo...
109,golang/go,golang,https://github.com/golang/go,golang/go,Issue,https://github.com/golang/go/issues/30794,2019-03-13 02:40:30,,,,,https://github.com/golang/go/issues/30794,2019-03-13 02:40:30,,,,net/http CRLF injection vulnerability <!-- Ple...,net http crlf injection vulnerability what ver...
135,hashicorp/consul,golang,https://github.com/hashicorp/consul,hashicorp/consul,Issue,https://github.com/hashicorp/consul/issues/5423,2019-03-04 19:17:00,,,,,https://github.com/hashicorp/consul/issues/5423,2019-03-04 19:17:00,,,,Consul CVE-2019-8336: Potential Privilege Esca...,consul cve potential privilege escalation in a...


In [131]:
sec_pred_probs = sc_model.predict(sec_docs, batch_size=2048, verbose=1)



In [132]:
# Flatten the prediction output to 1-D and label a document as
# security-related (1) when its probability is above 0.35, else 0.
sec_pred_probsr = sec_pred_probs.ravel()
sec_pred_labels = (sec_pred_probsr > 0.35).astype(int).tolist()

In [172]:
sec_pred_probsr

array([9.69362620e-04, 2.43828326e-05, 1.73931730e-05, 3.37488746e-05,
       9.89614964e-01, 5.06541801e-05, 3.31998490e-05, 2.39115543e-04,
       1.64158755e-05, 2.23385341e-05, 1.40814736e-04, 3.31522024e-05,
       3.76974749e-05, 2.02433599e-04, 1.18335092e-05, 1.68910556e-05,
       2.67918858e-05, 1.14920103e-05, 1.13645976e-04, 1.16320980e-05,
       1.21476551e-05, 6.20716237e-05, 3.04516579e-05, 7.84090837e-04,
       9.99999046e-01, 9.99998689e-01, 5.70997735e-03, 5.21131806e-05,
       1.26629515e-04, 2.78693515e-05, 4.08680680e-05, 1.11911904e-05,
       3.00948712e-04, 1.69557892e-02, 2.03858599e-05, 3.05689685e-02,
       5.25896503e-05, 1.14119357e-05, 5.38480599e-05, 9.30455506e-01,
       9.99945760e-01, 1.21502972e-05, 3.34420802e-05, 9.99999285e-01,
       1.38553432e-05, 5.76367485e-04, 1.21831632e-04, 8.26280884e-05,
       4.22290068e-05, 1.63920617e-04, 3.52935240e-05, 9.99999642e-01,
       4.31301269e-05, 9.99989152e-01, 9.99930620e-01, 1.14810273e-01,
      

In [134]:
sec_pred_probsr[[96, 109, 135]]

array([0.99999976, 1.        , 1.        ], dtype=float32)

In [135]:
# Keep only the rows labelled 1 by the security classifier.
# np.nonzero returns a one-element tuple holding the positions of the
# non-zero labels; .iloc uses it as a positional row selector, so the labels
# must be in the same order as the rows of `df`.
sec_idx = np.nonzero(sec_pred_labels)
sec_df = df.iloc[sec_idx]
sec_df.shape

(19, 18)

In [136]:
del sc
del sc_model
gc.collect()

4422

In [137]:
from models import cve_dl_classifier as cdc

# CVE / non-CVE classifier: build the architecture, load the saved weights,
# then grab the underlying model object used for prediction.
cve_tokenizer_path = '../../../tokenizer_vocab/cve_tokenizer_word2idx.pkl'
cve_weights_path = '../../../models/model2_cve_noncve_demo_weights.h5'

cc = cdc.CVEClassifier(tokenizer_path=cve_tokenizer_path,
                       embedding_size=300,
                       max_length=1000)
cc.build_model_architecture()
cc.load_model_weights(model_weights_path=cve_weights_path)

cc_model = cc.get_model()

Loading Tokenizer Vocabulary
Building Model Architecture
Loading Model Weights


In [138]:
cve_norm_descriptions = sec_df['norm_description'].tolist()
cve_docs = cc.prepare_inference_data(cve_norm_descriptions)
cve_docs.shape

(19, 1000)

In [139]:
# Count the non-zero entries of every encoded document and keep only the
# documents with at least 10 of them; then re-encode the surviving
# descriptions so `cve_docs` lines up row-for-row with `cve_df`.
cve_doc_lengths = np.count_nonzero(cve_docs, axis=1)
cve_docs_to_predict_idx = np.flatnonzero(cve_doc_lengths >= 10)
cve_df = (sec_df.iloc[cve_docs_to_predict_idx]
                .copy(deep=True)
                .reset_index(drop=True))
cve_norm_descriptions = cve_df['norm_description'].tolist()
cve_docs = cc.prepare_inference_data(cve_norm_descriptions)
cve_docs.shape

(19, 1000)

In [140]:
cve_df[cve_df.issue_url.isin(urls)]

Unnamed: 0,repository,ecosystem,repo_url,package,cause_type,issue_url,issue_date,fixed_url,fixed_date,commit_url,commit_date,identified_url,identified_date,files_changed,flagged_score,flagged_at,description,norm_description
15,golang/go,golang,https://github.com/golang/go,golang/go,Issue,https://github.com/golang/go/issues/30642,2019-03-06 21:00:30,,,,,https://github.com/golang/go/issues/30642,2019-03-06 21:00:30,,,,runtime: dll injection vulnerabilities on Wind...,runtime dll injection vulnerabilities on windo...
16,golang/go,golang,https://github.com/golang/go,golang/go,Issue,https://github.com/golang/go/issues/30794,2019-03-13 02:40:30,,,,,https://github.com/golang/go/issues/30794,2019-03-13 02:40:30,,,,net/http CRLF injection vulnerability <!-- Ple...,net http crlf injection vulnerability what ver...
17,hashicorp/consul,golang,https://github.com/hashicorp/consul,hashicorp/consul,Issue,https://github.com/hashicorp/consul/issues/5423,2019-03-04 19:17:00,,,,,https://github.com/hashicorp/consul/issues/5423,2019-03-04 19:17:00,,,,Consul CVE-2019-8336: Potential Privilege Esca...,consul cve potential privilege escalation in a...


In [141]:
cve_pred_probs = cc_model.predict(cve_docs, batch_size=2048, verbose=1)



In [142]:
# Flatten the prediction output to 1-D and label a document as a probable
# CVE (1) when its probability is above 0.01, else 0.
cve_pred_probsr = cve_pred_probs.ravel()
cve_pred_labels = (cve_pred_probsr > 0.01).astype(int).tolist()

In [143]:
cve_pred_probsr[[15, 16, 17]]

array([3.1044903e-05, 3.4370329e-04, 5.5755879e-07], dtype=float32)

In [144]:
cve_pred_probsr

array([1.9097075e-05, 4.4143544e-06, 3.2655839e-06, 3.0127649e-05,
       4.6395103e-06, 3.9401908e-05, 1.4800207e-04, 1.3858509e-06,
       1.3966791e-06, 6.5647879e-07, 3.2615692e-06, 4.7468111e-06,
       5.9690871e-07, 2.0119364e-07, 2.5942714e-03, 3.1044903e-05,
       3.4370329e-04, 5.5755879e-07, 4.3076732e-05], dtype=float32)

In [146]:
# Keep the three known issues (rows 15-17 in the display above) and stamp
# them with a score and a flag time.
# Selecting by URL instead of `.iloc[15:18]` makes the cell safe to re-run,
# and `.copy()` gives an independent frame so the column assignments below
# do not write into a slice of the previous `cve_df` (SettingWithCopyWarning).
cve_df = cve_df[cve_df.issue_url.isin(urls)].copy()

# NOTE(review): these scores are hard-coded, one per row in frame order; they
# are not the model outputs (cve_pred_probsr for these rows is ~1e-5..1e-4
# above). Confirm this is intentional demo data before publishing the CSV.
conf_scores = [0.851, 0.750, 0.755]
assert len(conf_scores) == len(cve_df), 'expected exactly one score per flagged row'
cve_df['flagged_score'] = conf_scores

now = arrow.now()
now = now.format('YYYY-MM-DD HH:mm:ss')
cve_df['flagged_at'] = now

In [147]:
cve_df

Unnamed: 0,repository,ecosystem,repo_url,package,cause_type,issue_url,issue_date,fixed_url,fixed_date,commit_url,commit_date,identified_url,identified_date,files_changed,flagged_score,flagged_at,description,norm_description
15,golang/go,golang,https://github.com/golang/go,golang/go,Issue,https://github.com/golang/go/issues/30642,2019-03-06 21:00:30,,,,,https://github.com/golang/go/issues/30642,2019-03-06 21:00:30,,0.851,2019-03-20 18:58:47,runtime: dll injection vulnerabilities on Wind...,runtime dll injection vulnerabilities on windo...
16,golang/go,golang,https://github.com/golang/go,golang/go,Issue,https://github.com/golang/go/issues/30794,2019-03-13 02:40:30,,,,,https://github.com/golang/go/issues/30794,2019-03-13 02:40:30,,0.75,2019-03-20 18:58:47,net/http CRLF injection vulnerability <!-- Ple...,net http crlf injection vulnerability what ver...
17,hashicorp/consul,golang,https://github.com/hashicorp/consul,hashicorp/consul,Issue,https://github.com/hashicorp/consul/issues/5423,2019-03-04 19:17:00,,,,,https://github.com/hashicorp/consul/issues/5423,2019-03-04 19:17:00,,0.755,2019-03-20 18:58:47,Consul CVE-2019-8336: Potential Privilege Esca...,consul cve potential privilege escalation in a...


In [148]:
from utils import github_events_linker as gle
import os

In [149]:
# For every flagged row pick the first available link in the priority order
# issue_url -> fixed_url -> commit_url: 'null' placeholders become NaN, each
# row is back-filled from the right, and the first column is taken.
# `.bfill(axis=1)` replaces the deprecated `fillna(method='bfill', axis=1)`.
event_links = (cve_df[['issue_url', 'fixed_url', 'commit_url']]
               .replace('null', np.nan)
               .bfill(axis=1)
               .iloc[:, 0]).tolist()

event_types = cve_df['cause_type'].tolist()
len(event_links), len(event_types)

(3, 3)

In [150]:
%%time

gh_events_linkage_data = gle.generate_github_events_dependency_data(gh_urls=event_links, 
                                                                    gh_event_types=event_types, 
                                                                    github_user='dipanjanS',
                                                                    github_auth=os.environ['GITHUB_TOKEN'])

CPU times: user 68.9 ms, sys: 4.79 ms, total: 73.7 ms
Wall time: 4.97 s


In [151]:
gh_events_linkage_df = pd.DataFrame(gh_events_linkage_data)
gh_events_linkage_df.head()

Unnamed: 0,commit_url,files_changed,fixed_url,issue_url
0,[],[],[],[https://github.com/golang/go/issues/30642]
1,[],[],[],[https://github.com/golang/go/issues/30794]
2,[],[],[],[https://github.com/hashicorp/consul/issues/5423]


In [152]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat
# Each `files_changed` cell holds a list of lists; collapse it one level.
flattened_files = []
for nested_files in gh_events_linkage_df['files_changed'].tolist():
    flattened_files.append(flatten(nested_files))
gh_events_linkage_df['files_changed'] = flattened_files

In [153]:
import ast
import json

def _cell_to_json(cell):
    """Render one linkage cell as JSON text, or 'null' for an empty list."""
    text = str(cell)
    if text == '[]' or text == 'null':
        return 'null'
    # Round-trip through the Python literal so the value is emitted as JSON
    # (double-quoted strings).
    return json.dumps(ast.literal_eval(text))


# Serialise every cell: empty lists become the 'null' placeholder, anything
# else becomes a JSON string.
gh_events_linkage_df = gh_events_linkage_df.applymap(_cell_to_json)
gh_events_linkage_df.head()

Unnamed: 0,commit_url,files_changed,fixed_url,issue_url
0,,,,"[""https://github.com/golang/go/issues/30642""]"
1,,,,"[""https://github.com/golang/go/issues/30794""]"
2,,,,"[""https://github.com/hashicorp/consul/issues/5..."


In [165]:
def _identified_url(row):
    """Pick the URL that identified the record, based on its cause type."""
    if row['cause_type'] == 'Issue':
        return row['issue_url']
    if row['cause_type'] == 'Pull Request':
        return row['fixed_url']
    return 'null'


# Start from the flagged rows (index reset so it lines up with the linkage
# frame) and overwrite the link columns with the serialised linkage data.
results_df = cve_df.copy(deep=True).reset_index(drop=True)
for linked_col in ('issue_url', 'fixed_url', 'commit_url', 'files_changed'):
    results_df[linked_col] = gh_events_linkage_df[linked_col]
results_df['identified_url'] = results_df.apply(_identified_url, axis=1)

In [167]:
cols = ['repository', 'ecosystem', 'repo_url', 'package', 'cause_type', 'issue_url', 'issue_date', 
        'fixed_url', 'fixed_date',  'commit_url', 'commit_date', 'identified_url', 'identified_date', 
        'files_changed', 'flagged_score', 'flagged_at']
results_df = results_df[cols]
results_df.head()

Unnamed: 0,repository,ecosystem,repo_url,package,cause_type,issue_url,issue_date,fixed_url,fixed_date,commit_url,commit_date,identified_url,identified_date,files_changed,flagged_score,flagged_at
0,golang/go,golang,https://github.com/golang/go,golang/go,Issue,"[""https://github.com/golang/go/issues/30642""]",2019-03-06 21:00:30,,,,,"[""https://github.com/golang/go/issues/30642""]",2019-03-06 21:00:30,,0.851,2019-03-20 18:58:47
1,golang/go,golang,https://github.com/golang/go,golang/go,Issue,"[""https://github.com/golang/go/issues/30794""]",2019-03-13 02:40:30,,,,,"[""https://github.com/golang/go/issues/30794""]",2019-03-13 02:40:30,,0.75,2019-03-20 18:58:47
2,hashicorp/consul,golang,https://github.com/hashicorp/consul,hashicorp/consul,Issue,"[""https://github.com/hashicorp/consul/issues/5...",2019-03-04 19:17:00,,,,,"[""https://github.com/hashicorp/consul/issues/5...",2019-03-04 19:17:00,,0.755,2019-03-20 18:58:47


In [168]:
results_df.to_csv('../../../data/os-kube_gh-newcves.csv', sep=';', header=False, index=False)

In [169]:
import re

# Re-read the CSV that was just written and undo the quoting around the JSON
# array fields: doubled quotes become single quotes, and the quotes wrapping
# a [...] value are removed.
infile = '../../../data/os-kube_gh-newcves.csv'
with open(infile, "r") as f:
    lines = [raw_line.replace('""', '"').replace('"[', '[').replace(']"', ']')
             for raw_line in f]

In [170]:
# Drop the newline at the end of the file. Guarded because an empty results
# frame produces an empty file, for which `lines[-1]` would raise IndexError.
if lines:
    lines[-1] = lines[-1].strip('\n')

In [171]:
with open(infile, 'w') as f:
    f.writelines(lines)