In [31]:
import bq_utils as bqu
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize
import arrow
import gc

In [32]:
import os

# Point the Google client libraries at the service-account key file, then
# build the project helper for the public `githubarchive.day` dataset.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../../../auth/bq_key.json'
gh_archive = bqu.BigQueryHelper(
    active_project="githubarchive",
    dataset_name="day",
)

In [33]:
# Load the newline-separated list of repository URLs.
# The file is opened in a `with` block so the handle is closed deterministically
# (the previous bare open(...).readlines() left it to the garbage collector).
# Blank lines are kept as empty strings on purpose; they are discarded later
# when repository names are extracted.
with open('../../../data/golang-repo-list.txt') as repo_list_file:
    # str.strip() with no argument already removes '\n' along with any other
    # surrounding whitespace, so one call is enough.
    gh_repo_links = np.array([line.strip() for line in repo_list_file])
gh_repo_links[:10], gh_repo_links.shape

(array(['https://github.com/urfave/cli',
        'https://github.com/mreiferson/go-httpclient',
        'https://github.com/crewjam/rfc5424',
        'https://github.com/kubernetes/heapster',
        'https://github.com/go-openapi/spec',
        'https://github.com/andygrunwald/go-gerrit',
        'https://github.com/openshift/ci-secret-mirroring-controller',
        'https://github.com/fsnotify/fsnotify',
        'https://github.com/BurntSushi/toml',
        'https://github.com/kubernetes-csi/drivers'], dtype='<U80'), (851,))

In [34]:
import re

# Capture everything after "github.com/" as the "<owner>/<repo>" name.
# The dot is escaped so that only a literal "github.com" host matches
# (an unescaped "." would also accept e.g. "githubXcom/").
pattern = re.compile(r'.*?github\.com/(.*)', re.I)

# Run the regex once per link (previously it was evaluated twice: once for the
# test and once for the capture). Links that do not match, or that match with
# an empty capture, are dropped - same filtering as before.
link_matches = (pattern.search(link) for link in gh_repo_links)
repo_names = np.array([found.group(1)
                       for found in link_matches
                       if found and found.group(1)])
repo_names[:10], repo_names.shape

(array(['urfave/cli', 'mreiferson/go-httpclient', 'crewjam/rfc5424',
        'kubernetes/heapster', 'go-openapi/spec', 'andygrunwald/go-gerrit',
        'openshift/ci-secret-mirroring-controller', 'fsnotify/fsnotify',
        'BurntSushi/toml', 'kubernetes-csi/drivers'], dtype='<U61'), (845,))

In [35]:
def add_query_params(query, params_dict):
    """Fill placeholder tokens in a query template by plain text substitution.

    Args:
        query: Query template containing literal placeholder tokens
            (for example ``{repo_names}``).
        params_dict: Mapping of placeholder token -> replacement value.
            Replacements are applied one after another in the mapping's
            iteration order, and every occurrence of a token is replaced.

    Returns:
        The query string with all placeholders substituted.

    Note:
        This is raw string replacement: values are inserted verbatim and are
        NOT escaped or quoted, so callers must pass trusted, pre-formatted
        SQL fragments.
    """
    for placeholder, value in params_dict.items():
        # str() lets callers pass non-string values (ints, numpy scalars, ...)
        # instead of failing with a TypeError inside str.replace.
        query = query.replace(placeholder, str(value))
    return query

In [36]:
# Query window: from five days before now up to now, one entry per day.
# In the recorded run this produced six YYYYMMDD strings.
st = arrow.now().shift(days=-5)
et = arrow.now()
last_5_days = list(map(lambda day: day.format('YYYYMMDD'),
                       arrow.Arrow.range('day', st, et)))
last_5_days, len(last_5_days)

(['20190317', '20190318', '20190319', '20190320', '20190321', '20190322'], 6)

In [37]:
# The wildcard table name carries the leading "20"; _TABLE_SUFFIX therefore
# holds the remaining "YYMMDD" part of each date.
year_prefix = '20*'
date_list = [day[2:] for day in last_5_days]

# Pre-render the SQL IN (...) lists as single-quoted, comma-separated values.
quoted_dates = ', '.join("'{}'".format(suffix) for suffix in date_list)
quoted_repos = ', '.join("'{}'".format(name) for name in repo_names)

query_params = {
    '{year_prefix_wildcard}': year_prefix,
    '{year_suffix_month_day}': '({})'.format(quoted_dates),
    '{repo_names}': '({})'.format(quoted_repos),
}

In [38]:
# Sanity-check query: number of PullRequestEvent / IssuesEvent rows for the
# selected repositories and day tables.
# The {...} tokens are placeholders filled in by add_query_params from
# query_params.
# NOTE(review): count(*) has no alias; in the recorded output the column
# comes back as `f0_`.
query = """
SELECT  type, count(*)
        FROM `githubarchive.day.{year_prefix_wildcard}`
        WHERE _TABLE_SUFFIX IN {year_suffix_month_day}
        AND repo.name in {repo_names}
        AND type in ('PullRequestEvent', 'IssuesEvent')
        GROUP BY type
"""
query = add_query_params(query, query_params)
# Size estimate before actually running the query. The unit is whatever the
# bq_utils helper reports (presumably GB - confirm against bq_utils).
gh_archive.estimate_query_size(query)

0.33943079970777035

In [39]:
# Execute the query currently held in `query` and display the result.
# NOTE(review): `df` is reassigned later in the notebook to the combined
# issue/PR frame, so this value is only meaningful right here.
df = gh_archive.query_to_pandas(query)
df

Unnamed: 0,type,f0_
0,PullRequestEvent,2934
1,IssuesEvent,1608


In [40]:
# Detail query for IssuesEvent rows: one row per event, with the issue fields
# pulled out of the JSON `payload` column via JSON_EXTRACT_SCALAR.
# Title and body are flattened to a single line: line breaks are replaced by a
# space, runs of whitespace are collapsed, and the result is trimmed.
#
# Escaping note: this is a regular (non-raw) Python string, so every backslash
# that must reach BigQuery is written as "\\". The whitespace pattern was
# previously written as '\s', which Python flags as an invalid escape sequence;
# it is now '\\s', which sends exactly the same SQL text (r'\s{2,}').
query = """
SELECT 
    repo.name as repo_name, 
    type as event_type, 
    actor.id as actor_id,
    actor.login as actor_name,
    JSON_EXTRACT_SCALAR(payload, '$.action') as issue_status,
    JSON_EXTRACT_SCALAR(payload, '$.issue.url') as issue_api_url,
    JSON_EXTRACT_SCALAR(payload, '$.issue.html_url') as issue_url,
    JSON_EXTRACT_SCALAR(payload, '$.issue.user.login') as issue_creator_name,
    JSON_EXTRACT_SCALAR(payload, '$.issue.user.url') as issue_creator_api_url,
    JSON_EXTRACT_SCALAR(payload, '$.issue.user.html_url') as issue_creator_url,
    JSON_EXTRACT_SCALAR(payload, '$.issue.comments') as comment_count,
    JSON_EXTRACT_SCALAR(payload, '$.issue.id') as issue_id,
    JSON_EXTRACT_SCALAR(payload, '$.issue.number') as issue_number,
    JSON_EXTRACT_SCALAR(payload, '$.issue.created_at') as issue_created_at,
    JSON_EXTRACT_SCALAR(payload, '$.issue.updated_at') as issue_updated_at,
    JSON_EXTRACT_SCALAR(payload, '$.issue.closed_at') as issue_closed_at,
    TRIM(REGEXP_REPLACE(
             REGEXP_REPLACE(
                 JSON_EXTRACT_SCALAR(payload, '$.issue.title'), 
                 r'\\r\\n|\\r|\\n', 
                 ' '),
             r'\\s{2,}', 
             ' ')) as issue_title,
    TRIM(REGEXP_REPLACE(
             REGEXP_REPLACE(
                 JSON_EXTRACT_SCALAR(payload, '$.issue.body'), 
                 r'\\r\\n|\\r|\\n', 
                 ' '),
             r'\\s{2,}', 
             ' ')) as issue_body
        
FROM `githubarchive.day.{year_prefix_wildcard}`
    WHERE _TABLE_SUFFIX IN {year_suffix_month_day}
    AND repo.name in {repo_names}
    AND type = 'IssuesEvent'
    """

# Fill in the table wildcard, date suffixes and repository list, then ask the
# helper for a size estimate before running the query.
query = add_query_params(query, query_params)
gh_archive.estimate_query_size(query)

23.028084501624107

In [41]:
# Run the query currently held in `query` (the IssuesEvent detail query when
# cells are executed in order) and keep the result for the steps below.
issues_df = gh_archive.query_to_pandas(query)

In [42]:
# (rows, columns) of the raw issue-event result, before de-duplication.
issues_df.shape

(1608, 18)

In [43]:
# Parse the ISO timestamp strings into datetimes.
for timestamp_col in ('issue_created_at', 'issue_updated_at', 'issue_closed_at'):
    issues_df[timestamp_col] = pd.to_datetime(issues_df[timestamp_col])

# An issue can appear in several events; keep, per issue URL, only the row
# with the most recent `issue_updated_at`.
latest_issue_rows = (issues_df
                     .groupby('issue_url')['issue_updated_at']
                     .idxmax(skipna=False))
issues_df = issues_df.loc[latest_issue_rows].reset_index(drop=True)
issues_df.shape

(1336, 18)

In [44]:
# Detail query for PullRequestEvent rows: one row per event, with the pull
# request fields pulled out of the JSON `payload` column.
# Title and body are flattened to a single line: line breaks are replaced by a
# space, runs of whitespace are collapsed, and the result is trimmed.
#
# Escaping note: this is a regular (non-raw) Python string, so every backslash
# that must reach BigQuery is written as "\\". The whitespace pattern was
# previously written as '\s', which Python flags as an invalid escape sequence;
# it is now '\\s', which sends exactly the same SQL text (r'\s{2,}').
query = """
SELECT 
    repo.name as repo_name, 
    type as event_type, 
    actor.id as actor_id,
    actor.login as actor_name,
    JSON_EXTRACT_SCALAR(payload, '$.action') as pr_status,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.id') as pr_id,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.number') as pr_number,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.url') as pr_api_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') as pr_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.diff_url') as pr_diff_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.patch_url') as pr_patch_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.user.login') as pr_creator_name,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.user.url') as pr_creator_api_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.user.html_url') as pr_creator_url,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.created_at') as pr_created_at,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.updated_at') as pr_updated_at,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.closed_at') as pr_closed_at,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged_at') as pr_merged_at,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged') as pr_merged_status,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.comments') as pr_comments_count,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.review_comments') as pr_review_comments_count,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.commits') as pr_commits_count,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.additions') as pr_additions_count,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.deletions') as pr_deletions_count,
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.changed_files') as pr_changed_files_count,    
    TRIM(REGEXP_REPLACE(
             REGEXP_REPLACE(
                 JSON_EXTRACT_SCALAR(payload, '$.pull_request.title'), 
                 r'\\r\\n|\\r|\\n', 
                 ' '),
             r'\\s{2,}', 
             ' ')) as pr_title,
    TRIM(REGEXP_REPLACE(
             REGEXP_REPLACE(
                 JSON_EXTRACT_SCALAR(payload, '$.pull_request.body'), 
                 r'\\r\\n|\\r|\\n', 
                 ' '),
             r'\\s{2,}', 
             ' ')) as pr_body
        
FROM `githubarchive.day.{year_prefix_wildcard}`
    WHERE _TABLE_SUFFIX IN {year_suffix_month_day}
    AND repo.name in {repo_names}
    AND type = 'PullRequestEvent'
"""

# Fill in the table wildcard, date suffixes and repository list, then ask the
# helper for a size estimate before running the query.
query = add_query_params(query, query_params)
gh_archive.estimate_query_size(query)

23.028084501624107

In [45]:
# Run the query currently held in `query` (the PullRequestEvent detail query
# when cells are executed in order) and show the raw (rows, columns) count.
prs_df = gh_archive.query_to_pandas(query)
prs_df.shape

(2934, 27)

In [46]:
# Parse the ISO timestamp strings into datetimes.
for timestamp_col in ('pr_created_at', 'pr_updated_at',
                      'pr_closed_at', 'pr_merged_at'):
    prs_df[timestamp_col] = pd.to_datetime(prs_df[timestamp_col])

# A pull request can appear in several events; keep, per PR URL, only the row
# with the most recent `pr_updated_at`.
latest_pr_rows = (prs_df
                  .groupby('pr_url')['pr_updated_at']
                  .idxmax(skipna=False))
prs_df = prs_df.loc[latest_pr_rows].reset_index(drop=True)
prs_df.shape

(2010, 27)

In [72]:
# Map issues and pull requests onto one common record layout (17 columns).
# Fields that do not apply to a record type are filled with the string 'null'.

# --- Issues: the issue itself is the "identified" artefact. ---
df1 = pd.DataFrame()
df1['repository'] = issues_df['repo_name'].tolist()
df1['ecosystem'] = ['golang'] * len(issues_df)
df1['repo_url'] = ['https://github.com/'+repo_name 
                       for repo_name in issues_df['repo_name'].tolist()]
df1['package'] = df1['repository']
df1['cause_type'] = ['Issue'] * len(issues_df)
df1['issue_url'] = issues_df['issue_url']
df1['issue_date'] = issues_df['issue_created_at']
df1['fixed_url'] = 'null'
df1['fixed_date'] = 'null'
df1['commit_url'] = 'null'
df1['commit_date'] = 'null'
df1['identified_url'] = df1['issue_url']
df1['identified_date'] = df1['issue_date']
df1['files_changed'] = 'null'
df1['flagged_score'] = 'null'
df1['flagged_at'] = 'null'
# "<title> <body>", stripped so that a missing title and body yields '' rather
# than the lone separator ' ' (which used to slip past the empty-text filter).
df1['description'] = (issues_df['issue_title'].fillna(value='').map(str) + ' '
                      + issues_df['issue_body'].fillna(value='')).str.strip()

# --- Pull requests: the PR is both the "fixed" and "identified" artefact. ---
df2 = pd.DataFrame()
df2['repository'] = prs_df['repo_name'].tolist()
df2['ecosystem'] = ['golang'] * len(prs_df)
df2['repo_url'] = ['https://github.com/'+repo_name 
                       for repo_name in prs_df['repo_name'].tolist()]
df2['package'] = df2['repository']
df2['cause_type'] = ['Pull Request'] * len(prs_df)
df2['issue_url'] = 'null'
df2['issue_date'] = 'null'
df2['fixed_url'] = prs_df['pr_url']
df2['fixed_date'] = prs_df['pr_created_at']
df2['commit_url'] = 'null'
df2['commit_date'] = 'null'
df2['identified_url'] = df2['fixed_url']
df2['identified_date'] = df2['fixed_date']
df2['files_changed'] = 'null'
df2['flagged_score'] = 'null'
df2['flagged_at'] = 'null'
df2['description'] = (prs_df['pr_title'].fillna(value='').map(str) + ' '
                      + prs_df['pr_body'].fillna(value='')).str.strip()

# Combine and shuffle. The shuffle is seeded so that re-running the notebook
# yields the same row order (and therefore the same row indices downstream).
df = pd.concat([df1, df2], axis=0, sort=False).sample(frac=1, random_state=42)
# Drop records with no text at all; they carry nothing to classify.
df = df[df['description'] != ''].reset_index(drop=True)
df.shape

(3346, 17)

In [73]:
# Build one [position, text, total] record per description; these become the
# three argument streams handed to the preprocessing workers.
data_descriptions = df['description'].values
total_docs = len(data_descriptions)
data_desc_input = [
    [position, text, total_docs]
    for position, text in enumerate(data_descriptions)
]

In [74]:
%%time

from utils import text_normalizer as tn
from concurrent import futures
import threading


def parallel_preprocessing(idx, doc, total_docs):
    """Normalise one document, logging progress for selected positions.

    Args:
        idx: Zero-based position of the document in the batch.
        doc: Raw text passed to ``tn.pre_process_document``.
        total_docs: Batch size, used only to detect the last document.

    Returns:
        Whatever ``tn.pre_process_document(doc)`` returns.
    """
    at_progress_mark = idx % 5000 == 0
    is_final_doc = idx == (total_docs - 1)
    if at_progress_mark or is_final_doc:
        worker_name = threading.current_thread().name
        print('{}: working on doc num: {}'.format(worker_name, idx))
    return tn.pre_process_document(doc)


# Normalise every description on a thread pool.
# The executor is used as a context manager so its worker threads are shut
# down when the block exits (previously the pool was never shut down).
print('preprocessing: starting')
with futures.ThreadPoolExecutor(max_workers=None) as ex:
    # zip(*records) transposes [idx, doc, total] records into the three
    # parallel argument streams that Executor.map expects.
    norm_descriptions_map = ex.map(parallel_preprocessing,
                                   *zip(*data_desc_input))
    # Executor.map yields results in input order, so this list lines up
    # one-to-one with data_desc_input.
    norm_descriptions = list(norm_descriptions_map)

preprocessing: starting
ThreadPoolExecutor-3_0: working on doc num: 0
ThreadPoolExecutor-3_14: working on doc num: 3345
CPU times: user 8.95 s, sys: 192 ms, total: 9.14 s
Wall time: 8.73 s


In [77]:
# Store the normalised text next to the raw description. This relies on
# norm_descriptions having one entry per row of df, in row order.
df['norm_description'] = norm_descriptions

In [75]:
from models import security_dl_classifier as sdc

# Stage 1 model: security vs. non-security classifier.
# The calls run in this order: construct -> build architecture -> load weights
# -> fetch the underlying model object.
# NOTE(review): the meaning of embedding_size / max_length is defined inside
# models.security_dl_classifier, which is not visible here. The recorded
# inference shape (3346, 1000) matches max_length=1000.
sc = sdc.SecurityClassifier(embedding_size=300, max_length=1000, 
                                        tokenizer_path='../../../tokenizer_vocab/sec_tokenizer_word2idx.pkl')
sc.build_model_architecture()
sc.load_model_weights(model_weights_path='../../../models/model1_sec_nonsec_demo_weights2.h5')

# Handle used later for .predict().
sc_model = sc.get_model()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Loading Tokenizer Vocabulary
Building Model Architecture
Loading Model Weights


In [78]:
# Take the normalised text back out of the frame and convert it into the
# model's input form via the classifier's own helper.
norm_descriptions = df['norm_description'].tolist()
# NOTE(review): prepare_inference_data is project code. The recorded result
# has shape (3346, 1000), i.e. one fixed-length row per document.
sec_docs = sc.prepare_inference_data(norm_descriptions)
sec_docs.shape

(3346, 1000)

In [79]:
# Score all prepared documents with the security model.
# NOTE(review): assumed to return one probability per document - the later
# ravel()/threshold step depends on that.
sec_pred_probs = sc_model.predict(sec_docs, batch_size=2048, verbose=1)



In [80]:
# Flatten the prediction array and label a document 1 when its probability
# is strictly greater than 0.35, else 0.
sec_pred_probsr = sec_pred_probs.ravel()
sec_pred_labels = [int(prob > 0.35) for prob in sec_pred_probsr]

In [81]:
# np.nonzero returns a tuple of index arrays; for the 1-D label list this is a
# one-element tuple holding the positions whose label is 1.
sec_idx = np.nonzero(sec_pred_labels)
# Positional row selection of the security-flagged records.
# NOTE(review): no explicit .copy() here; a copy is taken later before the
# frame is modified.
sec_df = df.iloc[sec_idx]
sec_df.shape

(441, 18)

In [82]:
# Drop the notebook's references to the first classifier and run a garbage
# collection pass (presumably to release memory before the next model loads).
del sc
del sc_model
gc.collect()

2059

In [83]:
from models import cve_dl_classifier as cdc

# Stage 2 model: CVE vs. non-CVE classifier, set up with the same call
# sequence as the security classifier (construct -> build -> load weights).
# It uses its own tokenizer vocabulary and weights file.
cc = cdc.CVEClassifier(embedding_size=300, max_length=1000, 
                                        tokenizer_path='../../../tokenizer_vocab/cve_tokenizer_word2idx.pkl')
cc.build_model_architecture()
cc.load_model_weights(model_weights_path='../../../models/model2_cve_noncve_demo_weights.h5')

# Handle used later for .predict().
cc_model = cc.get_model()

Loading Tokenizer Vocabulary
Building Model Architecture
Loading Model Weights


In [84]:
# Encode only the security-flagged records with the CVE model's tokenizer.
cve_norm_descriptions = sec_df['norm_description'].tolist()
cve_docs = cc.prepare_inference_data(cve_norm_descriptions)
cve_docs.shape

(441, 1000)

In [85]:
# Count the non-zero entries in each encoded row and keep only documents
# with at least 10 of them.
cve_doc_lengths = np.count_nonzero(cve_docs, axis=1)
cve_docs_to_predict_idx = np.flatnonzero(cve_doc_lengths >= 10)

# Work on an independent copy of the surviving rows, then re-encode them so
# cve_docs lines up row-for-row with cve_df.
cve_df = (sec_df
          .iloc[cve_docs_to_predict_idx]
          .copy(deep=True)
          .reset_index(drop=True))
cve_norm_descriptions = cve_df['norm_description'].tolist()
cve_docs = cc.prepare_inference_data(cve_norm_descriptions)
cve_docs.shape

(429, 1000)

In [87]:
# Score the remaining documents with the CVE model.
# NOTE(review): assumed to return one probability per document - the later
# ravel()/threshold step depends on that.
cve_pred_probs = cc_model.predict(cve_docs, batch_size=2048, verbose=1)



In [88]:
# Flatten the prediction array and label a document 1 when its probability
# is strictly greater than 0.01, else 0.
cve_pred_probsr = cve_pred_probs.ravel()
cve_pred_labels = [int(prob > 0.01) for prob in cve_pred_probsr]

In [90]:
import arrow

# Positions of the documents labelled 1, and their model scores.
cve_idx = np.nonzero(cve_pred_labels)
conf_scores = cve_pred_probsr[cve_idx]

# Keep only the flagged rows (as an independent copy) and record the score.
cve_df = cve_df.iloc[cve_idx].copy(deep=True)
cve_df['flagged_score'] = conf_scores

# Stamp every flagged row with the current local time.
now = arrow.now().format('YYYY-MM-DD HH:mm:ss')
cve_df['flagged_at'] = now

In [91]:
# Display the final flagged records (score and timestamp filled in).
cve_df

Unnamed: 0,repository,ecosystem,repo_url,package,cause_type,issue_url,issue_date,fixed_url,fixed_date,commit_url,commit_date,identified_url,identified_date,files_changed,flagged_score,flagged_at,description,norm_description
14,json-iterator/go,golang,https://github.com/json-iterator/go,json-iterator/go,Issue,https://github.com/json-iterator/go/issues/349,2019-03-21 16:20:47,,,,,https://github.com/json-iterator/go/issues/349,2019-03-21 16:20:47,,0.019789,2019-03-22 11:05:08,[Java] Retriving string from Any Object Greeti...,java retriving string from any object greeting...
49,istio/istio,golang,https://github.com/istio/istio,istio/istio,Issue,https://github.com/istio/istio/issues/10822,2019-01-09 07:54:40,,,,,https://github.com/istio/istio/issues/10822,2019-01-09 07:54:40,,0.921935,2019-03-22 11:05:08,Pilot goroutine leak and memory leak **Describ...,pilot goroutine leak and memory leak describe ...
54,istio/istio,golang,https://github.com/istio/istio,istio/istio,Pull Request,,,https://github.com/istio/istio/pull/12609,2019-03-19 20:16:37,,,https://github.com/istio/istio/pull/12609,2019-03-19 20:16:37,,0.057859,2019-03-22 11:05:08,Add helm value to make whitebox default mode T...,add helm value to make whitebox default mode t...
63,coreos/ignition,golang,https://github.com/coreos/ignition,coreos/ignition,Issue,https://github.com/coreos/ignition/issues/762,2019-03-20 18:41:21,,,,,https://github.com/coreos/ignition/issues/762,2019-03-20 18:41:21,,0.088262,2019-03-22 11:05:08,Adding users ends up with unlabeled /etc/subui...,adding users ends up with unlabeled etc subuid...
79,brancz/kube-rbac-proxy,golang,https://github.com/brancz/kube-rbac-proxy,brancz/kube-rbac-proxy,Issue,https://github.com/brancz/kube-rbac-proxy/issu...,2019-03-21 04:27:32,,,,,https://github.com/brancz/kube-rbac-proxy/issu...,2019-03-21 04:27:32,,0.994179,2019-03-22 11:05:08,Is versio 0.4.1 supported for Openshift Origin...,is versio supported for openshift origin consi...
95,libopenstorage/openstorage,golang,https://github.com/libopenstorage/openstorage,libopenstorage/openstorage,Pull Request,,,https://github.com/libopenstorage/openstorage/...,2019-03-18 21:02:42,,,https://github.com/libopenstorage/openstorage/...,2019-03-18 21:02:42,,0.95201,2019-03-22 11:05:08,Add authentication to REST Calls for Migration...,add authentication to rest calls for migration...
144,knative/serving,golang,https://github.com/knative/serving,knative/serving,Pull Request,,,https://github.com/knative/serving/pull/3451,2019-03-18 22:11:30,,,https://github.com/knative/serving/pull/3451,2019-03-18 22:11:30,,0.99896,2019-03-22 11:05:08,Remove SetConditions and GetConditions for Clu...,remove setconditions and getconditions for clu...
147,openshift/openshift-azure,golang,https://github.com/openshift/openshift-azure,openshift/openshift-azure,Pull Request,,,https://github.com/openshift/openshift-azure/p...,2019-03-19 14:44:02,,,https://github.com/openshift/openshift-azure/p...,2019-03-19 14:44:02,,0.565799,2019-03-22 11:05:08,disable docker build strategy ```release-note ...,disable docker build strategy release note non...
193,fsouza/fake-gcs-server,golang,https://github.com/fsouza/fake-gcs-server,fsouza/fake-gcs-server,Pull Request,,,https://github.com/fsouza/fake-gcs-server/pull/20,2019-03-18 20:51:34,,,https://github.com/fsouza/fake-gcs-server/pull/20,2019-03-18 20:51:34,,0.999584,2019-03-22 11:05:08,Add CodeLingo Tenets It looks like this is a G...,add codelingo tenets it looks like this is a g...
217,containerd/containerd,golang,https://github.com/containerd/containerd,containerd/containerd,Issue,https://github.com/containerd/containerd/issue...,2019-03-19 09:54:49,,,,,https://github.com/containerd/containerd/issue...,2019-03-19 09:54:49,,0.860145,2019-03-22 11:05:08,io_pgetevents is blocked by the default seccom...,io pgetevents is blocked by the default seccom...


In [93]:
# Show the full raw description text of every flagged record for manual review.
cve_df.description.values

array(['[Java] Retriving string from Any Object Greetings. I have a simple String object like this String blabla = "{"id": 8,"name": "SANTARÉM"}", in which i use JsonIterator.deserialize(blabla).get("name") and what i get is "SANTARɍ" and not "SANTARÉM"; I already try to check if JsonIterator has some configuration for enconding strings but didn\'t find anything. Kind Regards,',
       'Pilot goroutine leak and memory leak **Describe the bug** When Pilot Ads pushAll function return err , the goroutine will return at once . this will cause the receiveThread goroutine leak and also leak the XdsConnection instance which may has large data in a big istio cluster. The bug relative code is in pilot/pkg/proxy/v2/ads.go , (s *DiscoveryServer) StreamAggregatedResources function . **Expected behavior** no goroutine leak no memory leak **Steps to reproduce the bug** 1. building lots of service in your istio cluster 2. try to make pushAll fail , in our production environment there has an config up