# Security vs Non-Security Prediction - Deep Learning with Bi-directional GRUs 

In [0]:
# Mount Google Drive inside the Colab runtime; later cells read and write
# files under this mount point.
from google.colab import drive

# Mounting is interactive: Colab asks for an authorization code.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# List the contents of the mounted Drive folder (shell escape).
!ls "/content/drive/My Drive"

'Colab Notebooks'   dl_model4.h5   first_model_v1.h5   ytest_labels.pkl
 dl_model2.h5	    dl_model.h5    Xtest_norm.pkl      ytrain_labels.pkl
 dl_model3.h5	    file.txt	   Xtrain_norm.pkl


In [0]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
import dill
from sklearn.model_selection import train_test_split
from sklearn import metrics
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D, CuDNNLSTM
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


# Data Retrieval

The following are the normalized (pre-processed) issue/PR descriptions and their corresponding labels, which I had pickled earlier.
- This is the full data set we have, including both positives and negatives
- class label 0 - potentially non-security-related data (not flagged by the regexes)
- class label 1 - potentially security-related data (flagged by the regexes)
- class label 2 - security and CVE-related data which we mapped manually

In [0]:
def _load_pickled_chunks(path):
    """Read every object dumped back-to-back into ``path`` and merge them.

    The file is read with repeated ``dill.load`` calls until end-of-file;
    each loaded object is expected to be an iterable and is appended, in
    order, to one flat list.

    NOTE(security): dill/pickle can execute arbitrary code while loading --
    only point this at files you created yourself.

    Returns:
        list: the concatenation of all loaded chunks.

    Raises:
        Any error other than a clean end-of-file (corrupt pickle, I/O
        failure, ...) propagates to the caller.
    """
    items = []
    with open(path, 'rb') as f:
        while True:
            try:
                items.extend(dill.load(f))
            except EOFError:
                # Only a genuine end-of-file ends the loop. A bare `except:`
                # here would report "EOF reached" for *any* failure and
                # silently hand back truncated data.
                print('EOF reached')
                break
    return items


# Normalized documents were pickled in several chunks per file.
X_train = _load_pickled_chunks('/content/drive/My Drive/Xtrain_norm.pkl')
X_test = _load_pickled_chunks('/content/drive/My Drive/Xtest_norm.pkl')

# Labels are stored as a single pickled object per file.
with open('/content/drive/My Drive/ytrain_labels.pkl', 'rb') as f:
    y_train = dill.load(f)

with open('/content/drive/My Drive/ytest_labels.pkl', 'rb') as f:
    y_test = dill.load(f)

# Later cells zip documents with labels, which would silently drop the
# surplus on a length mismatch -- fail loudly here instead.
assert len(X_train) == len(y_train), 'train documents/labels length mismatch'
assert len(X_test) == len(y_test), 'test documents/labels length mismatch'

len(X_train), len(X_test), len(y_train), len(y_test)

EOF reached
EOF reached


(481390, 120348, 481390, 120348)

- We combine all security data under one label (1); this includes both CVEs and non-CVEs
- Our other class (0) is the negative, non-security-related data
- The intent is to build a model which first tells us whether an issue is really security related; if so, the issue is then passed through our 2nd model (CVE vs. non-CVE)

In [0]:
def _collapse_to_binary(docs, labels):
    """Pair documents with labels and map every non-zero label to 1.

    Documents are passed through unchanged (as a new list); a label equal
    to 0 is kept as is. Pairing stops at the shorter of the two inputs.

    Returns:
        tuple: (list of documents, list of binarized labels).
    """
    pairs = list(zip(docs, labels))
    kept_docs = [text for text, _ in pairs]
    binary_labels = [1 if lbl != 0 else lbl for _, lbl in pairs]
    return kept_docs, binary_labels


train_positives, y_train_positives = _collapse_to_binary(X_train, y_train)
test_positives, y_test_positives = _collapse_to_binary(X_test, y_test)

len(train_positives), len(y_train_positives), len(test_positives), len(y_test_positives)

(481390, 481390, 120348, 120348)

# Data Preparation

In [0]:
# Hold out 10% of the training pool for validation (fixed seed for a
# repeatable split); the test set is used as loaded.
X_train, X_val, y_train, y_val = train_test_split(
    train_positives,
    y_train_positives,
    test_size=0.1,
    random_state=42,
)
X_test = test_positives
y_test = y_test_positives
len(X_train), len(X_val), len(X_test), len(y_train), len(y_val), len(y_test)

(433251, 48139, 120348, 433251, 48139, 120348)

In [0]:
## Hyper-parameters shared by the tokenizer, padding and embedding layer
maxlen = 1000          # number of tokens each document is padded to
max_features = 300000  # vocabulary cap (= number of rows in the embedding matrix)
embed_size = 300       # dimensionality of each word vector

In [0]:
## Build the vocabulary from the training documents only, then encode every
## split as sequences of integer token ids with that same tokenizer.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)
train_X, val_X, test_X = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_val, X_test)
)

In [0]:
## Bring every encoded document to a fixed length of `maxlen` tokens
def _pad_to_maxlen(encoded_docs):
    """Pad the given token-id sequences to `maxlen` with pad_sequences."""
    return pad_sequences(encoded_docs, maxlen=maxlen)


# One split at a time, so each un-padded list can be released as soon as
# its padded replacement exists.
train_X = _pad_to_maxlen(train_X)
val_X = _pad_to_maxlen(val_X)
test_X = _pad_to_maxlen(test_X)

In [0]:
# Convert each split's labels into a NumPy array.
train_y, val_y, test_y = (
    np.array(split_labels) for split_labels in (y_train, y_val, y_test)
)

In [0]:
# Peek at the first three training examples: raw text, padded token ids, label.
print('Sample Data:')
for preview in (X_train[:3], train_X[:3, :], train_y[:3]):
    print(preview)

Sample Data:
['use responsestatus to set response status in error controller see suggestion from rstoyanchev what about checking what the current response status is if it is or not set look for an responsestatus annotation', 'replace none with empty string before creating td', 'sinatra dart better handling of reserved words for sinatra dart generators pr checklist read the contribution guildelines md ran the she will batch script under bin to update petstore sample so that cis can verify the change for instance only need to run bin lang petstore sh and bin security lang petstore sh if updating the lang e g php ruby python etc code generator or lang client s mustache templates filed the pr against the correct branch master for non breaking changes and branch for breaking non backward compatible changes description of the pr better handling of reserved words for sintatra dart generators for']
[[    0     0     0 ...    43 22668   834]
 [    0     0     0 ...   248   553  1774]
 [    0   

We build a `class_weight` dictionary to tell our deep learning model to give a higher weight to each security-related issue. Based on the weights computed below, you can see that the class imbalance is quite bad, but not as terrible as the CVE imbalance in our other model.

In [0]:
from sklearn.utils import class_weight

# 'balanced' weights each class by n_samples / (n_classes * class_count),
# so the rarer class receives the larger weight.
label_values = np.unique(train_y)

# Pass everything by keyword: newer scikit-learn releases make these
# arguments keyword-only and reject the positional form.
balanced_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                     classes=label_values,
                                                     y=train_y)

# Key each weight by its actual class label (not by its position in the
# array), using plain Python ints as dictionary keys.
class_weights = {int(lbl): wt for lbl, wt in zip(label_values, balanced_weights)}
class_weights

{0: 0.5814107347028605, 1: 3.5708481002225336}

# Model Training

In [0]:
# Functional-API graph:
#   token ids -> embeddings -> bidirectional GRU -> max over time
#             -> small dense head -> sigmoid probability
inp = tf.keras.layers.Input(shape=(maxlen,))
embedded = tf.keras.layers.Embedding(max_features, embed_size)(inp)
gru_states = tf.keras.layers.Bidirectional(
    tf.keras.layers.GRU(64, return_sequences=True)
)(embedded)
pooled = tf.keras.layers.GlobalMaxPool1D()(gru_states)
hidden = tf.keras.layers.Dense(16, activation="relu")(pooled)
hidden = tf.keras.layers.Dropout(rate=0.1)(hidden)
probability = tf.keras.layers.Dense(1, activation="sigmoid")(hidden)
model = tf.keras.models.Model(inputs=inp, outputs=probability)
# The model is intentionally left uncompiled in this cell.

model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding (Embedding)        (None, 1000, 300)         90000000  
_________________________________________________________________
bidirectional (Bidirectional (None, 1000, 128)         140160    
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 16)                2064      
_________________________________________________________________
dropout (Dropout)    

## Training on Google's TPUs

This was done only to speed up training. TPU support is still experimental and has a number of issues; for example, the TPU model must first be converted to a CPU model before it can make predictions. We would not recommend this in practice yet, and we will retrain this model on our GPU-based systems.

In [0]:
# Locate the Colab TPU through the address exposed in the environment,
# wrap the Keras model for TPU execution, then compile the wrapped model.
tpu_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
    tpu='grpc://' + os.environ['COLAB_TPU_ADDR']
)
tpu_strategy = tf.contrib.tpu.TPUDistributionStrategy(tpu_resolver)
tpu_model = tf.contrib.tpu.keras_to_tpu_model(model, strategy=tpu_strategy)

tpu_model.compile(
    loss=tf.keras.losses.binary_crossentropy,
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'],
)


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

INFO:tensorflow:Querying Tensorflow master (grpc://10.102.37.26:8470) for TPU system metadata.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, -1, 14581925633064350827)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 17270929687828325762)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 6291855618003165313)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:

In [16]:
# Training settings gathered in one place; class weights counter the label
# imbalance and the held-out split is evaluated after every epoch.
fit_options = dict(
    batch_size=512,
    epochs=10,
    class_weight=class_weights,
    validation_data=(val_X, val_y),
)
tpu_model.fit(train_X, train_y, **fit_options)

Train on 433251 samples, validate on 48139 samples
Epoch 1/10
INFO:tensorflow:New input shapes; (re-)compiling: mode=train (# of cores 8), [TensorSpec(shape=(64,), dtype=tf.int32, name='core_id0'), TensorSpec(shape=(64, 1000), dtype=tf.float32, name='input_1_10'), TensorSpec(shape=(64, 1), dtype=tf.float32, name='dense_1_target_10')]
INFO:tensorflow:Overriding default placeholder.
INFO:tensorflow:Cloning Adam {'lr': 0.0010000000474974513, 'beta_1': 0.8999999761581421, 'beta_2': 0.9990000128746033, 'decay': 0.0, 'epsilon': 1e-07, 'amsgrad': False}
INFO:tensorflow:Remapping placeholder for input_1
Instructions for updating:
Use tf.cast instead.
INFO:tensorflow:KerasCrossShard: <tensorflow.python.keras.optimizers.Adam object at 0x7fcd1fd07748> []
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
INFO:tensorflow:Started compiling
INFO:tensorflow:Finished compiling. Time elapsed: 10.316805362701416 secs
INFO:tensorf

<tensorflow.python.keras.callbacks.History at 0x7fcd1fd95f28>

# Model Evaluation

In [17]:
# Copy the trained weights from the TPU model to a CPU model, then score
# the test set with it.
cpu_model = tpu_model.sync_to_cpu()
test_inputs = [test_X]
pred_y = cpu_model.predict(test_inputs, batch_size=1024, verbose=1)
pred_y

INFO:tensorflow:Copying TPU weights to the CPU
INFO:tensorflow:TPU -> CPU lr: 0.0010000000474974513
INFO:tensorflow:TPU -> CPU beta_1: 0.8999999761581421
INFO:tensorflow:TPU -> CPU beta_2: 0.9990000128746033
INFO:tensorflow:TPU -> CPU decay: 0.0
INFO:tensorflow:TPU -> CPU epsilon: 1e-07
INFO:tensorflow:TPU -> CPU amsgrad: False


array([[0.0000000e+00],
       [0.0000000e+00],
       [0.0000000e+00],
       ...,
       [3.1613294e-09],
       [6.4399886e-05],
       [9.2352337e-08]], dtype=float32)

In [0]:
# Turn the predicted probabilities into hard 0/1 labels with a 0.5 cut-off.
# np.ravel() accepts both the 2-D prediction array and a plain list, so this
# cell can be re-run safely: the previous `pred_y.ravel()` raised
# AttributeError on a second execution because pred_y was by then a list.
# The comparison is vectorised instead of looping over every element.
pred_y = (np.ravel(pred_y) > 0.5).astype(int).tolist()

In [0]:
from sklearn.metrics import confusion_matrix, classification_report

In [20]:
# Rows are the true classes, columns the predicted classes.
conf_mat = confusion_matrix(y_true=test_y, y_pred=pred_y)
conf_mat
# Reference values kept from the original notebook (presumably an earlier run):
#array([[102996,    501],
#       [  1438,  15413]])

array([[102999,    498],
       [  1470,  15381]])

In [21]:
# Per-class precision / recall / F1 on the test set.
report = classification_report(y_true=test_y, y_pred=pred_y)
print(report)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99    103497
           1       0.97      0.91      0.94     16851

   micro avg       0.98      0.98      0.98    120348
   macro avg       0.98      0.95      0.97    120348
weighted avg       0.98      0.98      0.98    120348



- This model uses a Bidirectional GRU and tackles class imbalance with class weights
- Embeddings are trained from scratch
- We have trained on security-related data vs. non-security data
- The model seems to perform quite well at distinguishing security vs. non-security issues/PRs without much tuning


In [0]:
# Persist the CPU copy of the trained model to Drive.
cpu_model_path = '/content/drive/My Drive/first_model_v1cpu.h5'
cpu_model.save(cpu_model_path)

In [23]:
# Persist the TPU-wrapped model to Drive as well.
tpu_model_path = '/content/drive/My Drive/first_model_v1tpu.h5'
tpu_model.save(tpu_model_path)

INFO:tensorflow:Copying TPU weights to the CPU
INFO:tensorflow:TPU -> CPU lr: 0.0010000000474974513
INFO:tensorflow:TPU -> CPU beta_1: 0.8999999761581421
INFO:tensorflow:TPU -> CPU beta_2: 0.9990000128746033
INFO:tensorflow:TPU -> CPU decay: 0.0
INFO:tensorflow:TPU -> CPU epsilon: 1e-07
INFO:tensorflow:TPU -> CPU amsgrad: False
