In [1]:
import pandas as pd
import json, os

import tensorflow as tf
import numpy as np
import csv
from transformers import AutoTokenizer, TFAutoModel, TFAutoModelForSequenceClassification #, BertModel, BertTokenizer, TFBertForSequenceClassification
import matplotlib.pyplot as plt
import random
from tensorflow.keras.callbacks import CSVLogger
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow.keras.backend as K
from collections import OrderedDict
import time
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, \
roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold
import random

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense, GlobalMaxPool1D

from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split


Set the seeder to have as stable random operations as possible

In [2]:
seed = 123
np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)

Read data

In [3]:
sequences_data = pd.read_csv('sequences_data.csv') # sequences of tokens

In [4]:
print(sequences_data.head())

                                       Vulnerability       Category  Length
0              f"str$id""str$id""str$id"         ...  sql_injection       9
1      client.listentcp()    proxy = proxy(proxy_...           xsrf       8
2  from django.http import httpresponse, httpresp...  open_redirect       9
3  def write_preset(conn, queryin, descriptin):\t...  sql_injection     175
4                          update_query = self.up...  sql_injection      14


In [5]:
label_frequencies = sequences_data['Category'].value_counts()
print("Label Frequencies:\n", label_frequencies)
print("Total samples ", len(sequences_data))

Label Frequencies:
 sql_injection            1424
xsrf                      976
command_injection         721
path_disclosure           481
open_redirect             442
remote_code_execution     334
xss                       145
Name: Category, dtype: int64
Total samples  4523


In [6]:
np.max(sequences_data["Length"])

392

In [7]:
word_counts = sequences_data["Vulnerability"].apply(lambda x: len(x.split()))
max_length = word_counts.max()
print("Maximum number of words:", max_length)


Maximum number of words: 392


Pre-trained CodeBERT model - Fine-tuning

In [8]:
model_variation = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_variation, do_lower_case=True)

# Define New tokens for string and numerical i.e., strId$ and numId$
new_tokens = ["strId$", "numId$"]
for new_token in new_tokens:
    if new_token not in tokenizer.get_vocab().keys():
        tokenizer.add_tokens(new_token)


In [9]:
# user parameters
n_epochs = 100
batch_size = 8
lr = 5e-05
max_len = 512
patience = 5
#train_len = round(len(sequences_data) * 0.9)
#sequences_data = sequences_data.iloc[0:, :]

In [10]:
def recall_metric(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = (true_positives + K.epsilon()) / (possible_positives + K.epsilon())
        return recall

def precision_metric(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = (true_positives + K.epsilon()) / (predicted_positives + K.epsilon())
        return precision

def f1_metric(y_true, y_pred):

    prec = precision_metric(y_true, y_pred)
    rec = recall_metric(y_true, y_pred)
    f1 = 2*((prec*rec)/(prec+rec+K.epsilon()))
    return f1

In [11]:
optimizer = Adam(
    learning_rate=lr, # HF recommendation
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0
)

loss = CategoricalCrossentropy(from_logits=True)


Multi-class Classification: Categorization of all detected vulnerabilities

In [12]:
n_categories = len(label_frequencies) # 7

In [13]:
# Convert categories to numerical indexes
category_numerical_indexes, unique_categories = sequences_data["Category"].factorize()

# Create a dictionary mapping each category to its numerical index
category_to_index = {category: index for index, category in enumerate(unique_categories)}

# Update the categories in the DataFrame with their numerical indexes
sequences_data["Category_Index"] = sequences_data["Category"].map(category_to_index)
sequences_data.head()

Unnamed: 0,Vulnerability,Category,Length,Category_Index
0,"f""str$id""""str$id""""str$id"" ...",sql_injection,9,0
1,client.listentcp() proxy = proxy(proxy_...,xsrf,8,1
2,"from django.http import httpresponse, httpresp...",open_redirect,9,2
3,"def write_preset(conn, queryin, descriptin):\t...",sql_injection,175,0
4,update_query = self.up...,sql_injection,14,0


In [14]:
def getMaxLen(X):

    # Code for identifying max length of the data samples after tokenization using transformer tokenizer
    
    max_length = 0
    # Iterate over each sample in your dataset
    for i, input_ids in enumerate(X['input_ids']):
        # Calculate the length of the tokenized sequence for the current sample
        length = tf.math.reduce_sum(tf.cast(input_ids != 1, tf.int32)).numpy()
        # Update max_length and max_row if the current length is greater
        if length > max_length:
            max_length = length
            max_row = i

    print("Max length of tokenized data:", max_length)
    print("Row with max length:", max_row)

    #X['input_ids'] = np.delete(X['input_ids'], max_row, axis=0)
    
    return max_length

In [15]:
X = sequences_data["Vulnerability"].tolist()

X = tokenizer(
        text=X[0:],
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding=True,
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
        verbose=True
    )

max_len = getMaxLen(X)
print("Max tokenized length", max_len)

Max length of tokenized data: 512
Row with max length: 0
Max tokenized length 512


In [16]:
def tokenize_X(train_data_input, val_data_input, max_len):

    X_train = tokenizer(
        text=train_data_input,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding=True,
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
        verbose=True
    )

    X_test = tokenizer(
        text=val_data_input,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding=True,
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
        verbose=True
    )
    
    return X_train, X_test

In [17]:
## split dataset to train-test sets
### split data into train and test (90% train, 10% test)
shuffle_seeders = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
shuffle_seeder = shuffle_seeders[1]
x_train, x_test, y_train, y_test = train_test_split(sequences_data["Vulnerability"].tolist(), sequences_data["Category_Index"].tolist(), stratify = sequences_data["Category_Index"].tolist(), test_size=0.1, random_state=shuffle_seeder)

In [18]:
# X_train, X_test = tokenize_X(sequences_data["Vulnerability"].tolist()[0:train_len], sequences_data["Vulnerability"].tolist()[train_len:], max_len)
# Y_train = sequences_data["Category_Index"].tolist()[0:train_len]
# Y_test = sequences_data["Category_Index"].tolist()[train_len:]
# Y_train = np.array(Y_train)
# Y_test = np.array(Y_test)

In [19]:
X_train, X_test = tokenize_X(x_train, x_test, max_len)

Y_train = np.array(y_train)
Y_test = np.array(y_test)

Train - Test split, fit and evaluate

In [20]:
print("Training...")
milli_sec1 = int(round(time.time() * 1000))

model = TFAutoModelForSequenceClassification.from_pretrained(model_variation, num_labels=n_categories)
# resize model embedding to match new tokenizer
model.resize_token_embeddings(len(tokenizer))

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[f1_metric]
)

early_stopping = EarlyStopping(monitor='val_loss', mode='min', patience=patience)
model_checkpoint = ModelCheckpoint('./checkpoints/best_weights', monitor='val_loss', mode='min', save_best_only=True)

history = model.fit(
    x = {'input_ids':X_train['input_ids'], 'attention_mask':X_train['attention_mask']},
    y = to_categorical(Y_train.astype(int)),
    validation_data = ({'input_ids':X_test['input_ids'], 'attention_mask':X_test['attention_mask']},
                        to_categorical(Y_test.astype(int))),
    epochs=n_epochs,
    batch_size=batch_size,
    callbacks=[early_stopping, model_checkpoint]
)

#model.save_weights('./checkpoints/my_checkpoint')

#model = TFAutoModelForSequenceClassification.from_pretrained(model_variation, num_labels=n_categories)
#model.resize_token_embeddings(len(tokenizer))

model.load_weights('./checkpoints/best_weights')

predicted = model.predict({'input_ids': X_test['input_ids'], 'attention_mask': X_test['attention_mask']}).logits
predictions = np.argmax(predicted, axis=1)

targets = Y_test.astype(int)

accuracy=accuracy_score(targets, predictions)
precision=precision_score(targets, predictions, average='macro')
recall=recall_score(targets, predictions, average='macro')
f1=f1_score(targets, predictions, average='macro')
conf_matrix = confusion_matrix(targets, predictions)

print("Confusion Matrix:\n", conf_matrix)
print("Accuracy:%.2f%%"%(accuracy*100))
print("Precision:%.2f%%"%(precision*100))
print("Recall:%.2f%%"%(recall*100))
print("F1 score:%.2f%%"%(f1*100))

class_report = classification_report(targets, predictions)
print("Classification Report:\n", class_report)

milli_sec2 = int(round(time.time() * 1000))
print("Training is completed after", milli_sec2-milli_sec1)

Training...


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/100



INFO:tensorflow:Assets written to: ./checkpoints\best_weights\assets


INFO:tensorflow:Assets written to: ./checkpoints\best_weights\assets


Epoch 2/100



INFO:tensorflow:Assets written to: ./checkpoints\best_weights\assets


INFO:tensorflow:Assets written to: ./checkpoints\best_weights\assets


Epoch 3/100



INFO:tensorflow:Assets written to: ./checkpoints\best_weights\assets


INFO:tensorflow:Assets written to: ./checkpoints\best_weights\assets


Epoch 4/100



INFO:tensorflow:Assets written to: ./checkpoints\best_weights\assets


INFO:tensorflow:Assets written to: ./checkpoints\best_weights\assets


Epoch 5/100



INFO:tensorflow:Assets written to: ./checkpoints\best_weights\assets


INFO:tensorflow:Assets written to: ./checkpoints\best_weights\assets


Epoch 6/100



INFO:tensorflow:Assets written to: ./checkpoints\best_weights\assets


INFO:tensorflow:Assets written to: ./checkpoints\best_weights\assets


Epoch 7/100



INFO:tensorflow:Assets written to: ./checkpoints\best_weights\assets


INFO:tensorflow:Assets written to: ./checkpoints\best_weights\assets


Epoch 8/100



INFO:tensorflow:Assets written to: ./checkpoints\best_weights\assets


INFO:tensorflow:Assets written to: ./checkpoints\best_weights\assets


Epoch 9/100



INFO:tensorflow:Assets written to: ./checkpoints\best_weights\assets


INFO:tensorflow:Assets written to: ./checkpoints\best_weights\assets


Epoch 10/100



INFO:tensorflow:Assets written to: ./checkpoints\best_weights\assets


INFO:tensorflow:Assets written to: ./checkpoints\best_weights\assets


Epoch 11/100
 16/509 [..............................] - ETA: 15:52 - loss: 0.2403 - f1_metric: 0.7794

KeyboardInterrupt: 

In [21]:
model.load_weights('./checkpoints/best_weights')

predicted = model.predict({'input_ids': X_test['input_ids'], 'attention_mask': X_test['attention_mask']}).logits
predictions = np.argmax(predicted, axis=1)

targets = Y_test.astype(int)

accuracy=accuracy_score(targets, predictions)
precision=precision_score(targets, predictions, average='macro')
recall=recall_score(targets, predictions, average='macro')
f1=f1_score(targets, predictions, average='macro')
conf_matrix = confusion_matrix(targets, predictions)

print("Confusion Matrix:\n", conf_matrix)
print("Accuracy:%.2f%%"%(accuracy*100))
print("Precision:%.2f%%"%(precision*100))
print("Recall:%.2f%%"%(recall*100))
print("F1 score:%.2f%%"%(f1*100))

class_report = classification_report(targets, predictions)
print("Classification Report:\n", class_report)

milli_sec2 = int(round(time.time() * 1000))
print("Training is completed after", milli_sec2-milli_sec1)

Confusion Matrix:
 [[122   3   2   1   8   5   2]
 [  3  91   2   0   2   0   0]
 [  6   4  32   0   0   0   2]
 [  0   0   1  13   1   0   0]
 [  0   3   3   0  24   1   2]
 [  7   1   2   1   0  60   1]
 [  3   0   3   0   0   1  41]]
Accuracy:84.55%
Precision:82.44%
Recall:82.72%
F1 score:82.55%
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.85      0.86       143
           1       0.89      0.93      0.91        98
           2       0.71      0.73      0.72        44
           3       0.87      0.87      0.87        15
           4       0.69      0.73      0.71        33
           5       0.90      0.83      0.86        72
           6       0.85      0.85      0.85        48

    accuracy                           0.85       453
   macro avg       0.82      0.83      0.83       453
weighted avg       0.85      0.85      0.85       453

Training is completed after 3883123
