# Code Vulnerability Detection on IBM/D2A 

## 1. Introduction

The current dataset contains samples generated from 6 open-source projects: OpenSSL, FFmpeg, HTTPD, NGINX, Libtiff, and Libav. For each project, there are 3 `pickle.gz` files, such as `nginx_after_fix_extractor_0.pickle.gz`, `nginx_labeler_1.pickle.gz`, and `nginx_labeler_0.pickle.gz`, which are generated by two slightly different extractors. Each `pickle.gz` file contains compressed samples in JSON.

The [fields and descriptions](https://dax-cdn.cdn.appdomain.cloud/dax-d2a/1.0.0/data-preview/index.html?_ga=2.207765842.1169579792.1649770384-1526562817.1648132481) of the data are:

In [None]:
# import split_data as sd
# data = sd.read_pickle_data_file("./dataset/httpd_labeler_0.pickle.gz")

## 2. Data Preparation and Preprocessing

In the Leaderboard Dataset, there are 4 directories corresponding to the 4 tasks of the leaderboard. Each directory contains 3 csv files corresponding to the train (80%), dev (10%), and test (10%) splits. The columns in the split files are identical, except for the test split, which does not contain the label column. In this project, we are going to predict **whether a code snippet contains bugs or not** using the `Function` dataset.

First, we load the code snippets of the train, dev, and test splits.

In [None]:
import csv

def _load_snippets(path, labeled=True):
    """Read one leaderboard CSV split, skipping its header row.

    Args:
        path: Location of the CSV file.
        labeled: When true, each record is the tuple ``(row[2], row[1])``,
            which the rest of the notebook unpacks as ``(code, label)``.
            When false (the test split has no label column), each record
            is just ``row[1]``.

    Returns:
        A list with one record per data row.
    """
    # The csv module asks for newline='' so that line breaks embedded in
    # quoted fields (the multi-line source snippets) reach the reader
    # untouched instead of being rewritten by universal-newline handling.
    with open(path, newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # drop the header row; tolerate an empty file
        if labeled:
            return [(row[2], row[1]) for row in reader]
        return [row[1] for row in reader]


snippets_train = _load_snippets('./dataset/d2a_lbv1_function_train.csv')
snippets_dev = _load_snippets('./dataset/d2a_lbv1_function_dev.csv')
snippets_test = _load_snippets('./dataset/d2a_lbv1_function_test.csv', labeled=False)

print(f"Loaded: {len(snippets_train)} samples in train; {len(snippets_dev)} samples in dev; {len(snippets_test)} samples in test.")

## 3. Syntax Tree and Code Tensorization

Tree-sitter is a parser generator tool and an incremental parsing library. It can build a concrete syntax tree for a source file and efficiently update the syntax tree as the source file is edited. To use the C++ parser in Python code:

In [None]:
from tree_sitter import Language, Parser

# Build the shared library from the vendored C++ grammar checkout, then
# load the 'cpp' language out of that same file.
GRAMMAR_LIBRARY = 'build/my-languages.so'
Language.build_library(GRAMMAR_LIBRARY, ['vendor/tree-sitter-cpp'])
CPP_LANGUAGE = Language(GRAMMAR_LIBRARY, 'cpp')

# Single parser instance, configured for C++, used by the cells below.
parser = Parser()
parser.set_language(CPP_LANGUAGE)

Try pasting the second snippet of the training set into the [playground](https://tree-sitter.github.io/tree-sitter/playground)!

<details>
<summary>The second snippet:</summary>

```cpp
static ngx_int_t
ngx_http_file_cache_lock(ngx_http_request_t *r, ngx_http_cache_t *c)
{
    ngx_msec_t                 now, timer;
    ngx_http_file_cache_t     *cache;

    if (!c->lock) {
        return NGX_DECLINED;
    }

    now = ngx_current_msec;

    cache = c->file_cache;

    ngx_shmtx_lock(&cache->shpool->mutex);

    timer = c->node->lock_time - now;

    if (!c->node->updating || (ngx_msec_int_t) timer <= 0) {
        c->node->updating = 1;
        c->node->lock_time = now + c->lock_age;
        c->updating = 1;
        c->lock_time = c->node->lock_time;
    }

    ngx_shmtx_unlock(&cache->shpool->mutex);

    ngx_log_debug2(NGX_LOG_DEBUG_HTTP, r->connection->log, 0,
                   "http file cache lock u:%d wt:%M",
                   c->updating, c->wait_time);

    if (c->updating) {
        return NGX_DECLINED;
    }

    if (c->lock_timeout == 0) {
        return NGX_HTTP_CACHE_SCARCE;
    }

    c->waiting = 1;

    if (c->wait_time == 0) {
        c->wait_time = now + c->lock_timeout;

        c->wait_event.handler = ngx_http_file_cache_lock_wait_handler;
        c->wait_event.data = r;
        c->wait_event.log = r->connection->log;
    }

    timer = c->wait_time - now;

    ngx_add_timer(&c->wait_event, (timer > 500) ? 500 : timer);

    r->main->blocked++;

    return NGX_AGAIN;
}    
```

</details>

To tensorize the data, we need to obtain tokens from the leaf nodes of the syntax tree. It's better to use DFS here, since it keeps the original order of the words in the code.

In [None]:
def code_tokenize(code: str):
    """Split source code into the leaf tokens of its syntax tree.

    The code is parsed with the module-level ``parser`` and the resulting
    tree is walked depth-first, left to right, so the tokens come out in
    the order in which they appear in the source.

    Args:
        code: Source text to tokenize.

    Returns:
        A ``(tokens, types)`` pair of equally long lists: the text of each
        leaf node and the ``type`` of that same node.
    """
    # Encode exactly once: node positions are byte offsets into this
    # buffer, and re-encoding the whole snippet for every leaf is wasted
    # work that grows with snippet size times token count.
    source = code.encode()
    tree = parser.parse(source)  # keep a reference while nodes are in use

    tokens = []
    types = []

    # Explicit stack rather than recursion, so deeply nested code cannot
    # run into Python's recursion limit.
    pending = [tree.root_node]
    while pending:
        node = pending.pop()
        children = node.children
        if not children:
            tokens.append(source[node.start_byte:node.end_byte].decode())
            types.append(node.type)
            continue
        # Push right-to-left so the leftmost child is popped first.
        pending.extend(reversed(children))

    return tokens, types

After getting the tokens, we store them in a corpus and use a word2vec model to get the vector of each word.

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

import gensim
from gensim.models import Word2Vec
import os

def _tokenize_snippets(snippets):
    """Tokenize every ``(code, label)`` snippet exactly once.

    Returns:
        ``(token_lists, type_lists)``: two parallel lists holding, per
        snippet, the tokens and the node types from ``code_tokenize``.
    """
    token_lists, type_lists = [], []
    for code, _ in snippets:
        tokens, types = code_tokenize(code)
        token_lists.append(tokens)
        type_lists.append(types)
    return token_lists, type_lists


def _load_or_train_word2vec(path, sentences):
    """Load the Word2Vec model stored at *path*, or train and store it.

    A model found on disk is returned as-is; *sentences* is only used
    when no saved model exists yet.
    """
    if os.path.exists(path):
        return Word2Vec.load(path)
    trained = gensim.models.Word2Vec(sentences, min_count=1, vector_size=3, window=4)
    # Make sure the target directory exists before saving.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    trained.save(path)
    return trained


# One parse per snippet yields both the token corpus and the type corpus.
corpus_train, type_corpus_train = _tokenize_snippets(snippets_train)
corpus_dev, type_corpus_dev = _tokenize_snippets(snippets_dev)

# Separate embeddings for token text and for syntax-node types.
model = _load_or_train_word2vec("./model/word2vec.model", corpus_train + corpus_dev)
type_model = _load_or_train_word2vec("./model/word2vec_type.model", type_corpus_train + type_corpus_dev)

Then, the training set with tensors can be generated. Since the length of the code snippets varies a lot, we use `tensorflow.image.resize` to resize the variable-sized tensors to tensors of the same size.

In [None]:
import numpy as np
import tensorflow as tf

def _resize_snippets(token_seqs, type_seqs):
    """Turn each tokenized snippet into a tensor resized to 64 x 64.

    For every token, its word vector and its type vector are placed side
    by side as two columns; the per-token matrices of one snippet are
    stacked and the stack is resized with ``tf.image.resize``.
    """
    resized = list()
    for words, kinds in zip(token_seqs, type_seqs):
        per_token = [
            np.column_stack((model.wv[word], type_model.wv[kind]))
            for word, kind in zip(words, kinds)
        ]
        stacked = tf.convert_to_tensor(per_token, tf.float32)
        resized.append(tf.image.resize(stacked, (64, 64)))
    return resized


# Labels: second element of every (code, label) tuple, as float32.
labels_train = np.asarray([label for _, label in snippets_train]).astype('float32')
labels_dev = np.asarray([label for _, label in snippets_dev]).astype('float32')
y_train = tf.image.convert_image_dtype(labels_train, tf.float32)
y_dev = tf.image.convert_image_dtype(labels_dev, tf.float32)

# Inputs: one fixed-size tensor per snippet, then one batch tensor per split.
x_train_l = _resize_snippets(corpus_train, type_corpus_train)
x_dev_l = _resize_snippets(corpus_dev, type_corpus_dev)

x_train = tf.convert_to_tensor(x_train_l, tf.float32)
x_dev = tf.convert_to_tensor(x_dev_l, tf.float32)

## 4. Binary Classification

Afterward, the code tensors can be used to train a convolutional neural network, just as in an image binary classification task. Here we use accuracy, loss, recall, precision, and F1 as metrics.

In [None]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall: rounded true positives over rounded actual positives.

    ``K.epsilon()`` is added to the denominator so that a batch without
    positive labels does not divide by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual_positives + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision: rounded true positives over rounded predicted positives.

    ``K.epsilon()`` is added to the denominator so that a batch without
    positive predictions does not divide by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` in the denominator guards against both being zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_half

In [None]:
# Reuse a previously saved classifier when one exists; otherwise build,
# train, and save it.
if os.path.exists("./model/cls-model"):
    # The model was compiled with custom metric functions, so Keras must be
    # given those functions by name to restore the compiled model.
    # NOTE: this branch does not produce a `history` object.
    cls_model = tf.keras.models.load_model(
        "./model/cls-model",
        custom_objects={
            'f1_m': f1_m,
            'precision_m': precision_m,
            'recall_m': recall_m,
        },
    )

else:
    # Five Conv2D + MaxPooling2D stages over the 64 x 64 x 2 input,
    # followed by a dense head with a single sigmoid output.
    cls_model = tf.keras.models.Sequential([
        tf.keras.layers.Conv2D(16, (2,2), activation='relu', input_shape=(64, 64, 2)),
        tf.keras.layers.MaxPooling2D(2, 2),
        tf.keras.layers.Conv2D(32, (2,2), activation='relu'),
        tf.keras.layers.MaxPooling2D(2,2),
        tf.keras.layers.Conv2D(64, (2,2), activation='relu'),
        tf.keras.layers.MaxPooling2D(2,2),
        tf.keras.layers.Conv2D(64, (2,2), activation='relu'),
        tf.keras.layers.MaxPooling2D(2,2),
        tf.keras.layers.Conv2D(64, (2,2), activation='relu'),
        tf.keras.layers.MaxPooling2D(2,2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    cls_model.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001),
        metrics=['accuracy', f1_m, precision_m, recall_m]
    )
    # `history` records the per-epoch loss and metric values.
    history = cls_model.fit(
        x_train,
        y_train,
        epochs=100,
        verbose=0
    )
    cls_model.save(
        './model/cls-model',
        overwrite=True,
        include_optimizer=True,
        save_format=None,
        signatures=None,
        options=None,
        save_traces=True
    )
    
# Report loss and metrics on the train split and on the dev split.
cls_model.evaluate(x_train, y_train)
cls_model.evaluate(x_dev, y_dev)

Here are the curves with respect to different metrics during training.

In [None]:
import matplotlib.pyplot as plt

# `history` only exists when the classifier was trained in this session.
# If the model was loaded from disk instead, there is nothing to plot and
# the bare name would raise NameError.
try:
    training_log = history.history
except NameError:
    training_log = None

if training_log is None:
    print("No training history available (model was loaded from disk); skipping plot.")
else:
    # (history key, legend label, line style) for every curve.
    curves = [
        ('accuracy', 'accuracy', '-'),
        ('loss', 'loss', '--'),
        ('f1_m', 'f1', '--'),
        ('precision_m', 'precision', ':'),
        ('recall_m', 'recall', ':'),
    ]

    plt.figure(figsize=(8, 8))
    for key, label, line_style in curves:
        plt.plot(training_log[key], lw=4, ls=line_style, label=label)
    plt.legend(fontsize=16)
    # One point per training epoch, so the axis is labelled in epochs.
    plt.xlabel('epochs', fontsize=16)
    plt.xticks(size=16)
    plt.yticks(size=16)
    plt.savefig("cnn.png")