In [1]:
import pandas as pd
import json, os

import tensorflow as tf
import numpy as np
import csv
from transformers import AutoTokenizer, TFAutoModel, TFAutoModelForSequenceClassification #, BertModel, BertTokenizer, TFBertForSequenceClassification
import matplotlib.pyplot as plt
import random
from tensorflow.keras.callbacks import CSVLogger
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow.keras.backend as K
from collections import OrderedDict
import time
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, \
roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold
import random

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense, GlobalMaxPool1D

from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold


Set the random seeds so that random operations are as reproducible as possible

In [2]:
# One shared seed for every random number generator the notebook uses:
# NumPy, Python's `random` module and TensorFlow (seeded in that order).
seed = 123
for _seed_rng in (np.random.seed, random.seed, tf.random.set_seed):
    _seed_rng(seed)

Read data

In [3]:
# Load the dataset of token sequences from the CSV file next to the notebook.
sequences_data = pd.read_csv(filepath_or_buffer='sequences_data.csv')

In [4]:
# Plain-text preview of the first five rows.
print(sequences_data.head(n=5))

                                       Vulnerability       Category  Length
0              f"str$id""str$id""str$id"         ...  sql_injection       9
1      client.listentcp()    proxy = proxy(proxy_...           xsrf       8
2  from django.http import httpresponse, httpresp...  open_redirect       9
3  def write_preset(conn, queryin, descriptin):\t...  sql_injection     175
4                          update_query = self.up...  sql_injection      14


In [5]:
# Number of samples per vulnerability category, plus the overall sample count.
category_column = sequences_data['Category']
label_frequencies = category_column.value_counts()
print("Label Frequencies:\n", label_frequencies)
print("Total samples ", sequences_data.shape[0])

Label Frequencies:
 sql_injection            1424
xsrf                      976
command_injection         721
path_disclosure           481
open_redirect             442
remote_code_execution     334
xss                       145
Name: Category, dtype: int64
Total samples  4523


In [6]:
# Largest value stored in the precomputed "Length" column.
sequences_data["Length"].max()

392

In [7]:
# Whitespace-delimited word count of every sample, and the largest such count.
word_counts = sequences_data["Vulnerability"].map(lambda text: len(text.split()))
max_length = word_counts.max()
print("Maximum number of words:", max_length)


Maximum number of words: 392


Pre-trained CodeBERT model - Fine-tuning

In [8]:
# Pre-trained checkpoint to fine-tune, and the tokenizer that belongs to it.
# NOTE(review): `do_lower_case` may be ignored by this tokenizer class — confirm.
model_variation = "microsoft/codebert-base-mlm"
tokenizer = AutoTokenizer.from_pretrained(model_variation, do_lower_case=True)

# Placeholder tokens for string and numeric literals (strId$ and numId$).
# NOTE(review): the sample rows printed earlier in the notebook contain
# "str$id", not "strId$" — confirm these spellings match the dataset.
new_tokens = ["strId$", "numId$"]
known_vocab = tokenizer.get_vocab()
# Only register the placeholders the tokenizer does not know yet.
for new_token in [token for token in new_tokens if token not in known_vocab]:
    tokenizer.add_tokens(new_token)



In [9]:
# User-tunable parameters
n_epochs, batch_size = 2, 2  # training epochs and mini-batch size passed to model.fit
lr = 5e-5                    # learning rate handed to the Adam optimizer
max_len = 512                # max_length used when tokenizing (longer inputs are truncated)
patience = 5                 # patience handed to the EarlyStopping callback
train_len = 4000             # size of the hold-out training split (only used by the commented-out cells)
# Row selection: `0:` keeps every row; raise the start index to work on a subset.
sequences_data = sequences_data.iloc[0:, :]

In [10]:
def recall_metric(y_true, y_pred):
    """Recall over one batch: rounded true positives / rounded actual positives.

    Both ``y_true * y_pred`` and ``y_true`` are clipped to [0, 1] and rounded
    before being summed. ``K.epsilon()`` is added to numerator and denominator,
    so a batch without any positive label evaluates to 1 instead of dividing
    by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    hit_count = K.sum(hit_mask)
    actual_count = K.sum(actual_mask)
    return (hit_count + K.epsilon()) / (actual_count + K.epsilon())

def precision_metric(y_true, y_pred):
    """Precision over one batch: rounded true positives / rounded predicted positives.

    Both ``y_true * y_pred`` and ``y_pred`` are clipped to [0, 1] and rounded
    before being summed. ``K.epsilon()`` is added to numerator and denominator,
    so a batch without any positive prediction evaluates to 1 instead of
    dividing by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    hit_count = K.sum(hit_mask)
    predicted_count = K.sum(predicted_mask)
    return (hit_count + K.epsilon()) / (predicted_count + K.epsilon())

def f1_metric(y_true, y_pred):
    """F1 score for one batch: harmonic mean of precision_metric and recall_metric.

    ``K.epsilon()`` in the denominator guards against division by zero when
    both precision and recall are zero.

    NOTE(review): the helpers clip ``y_pred`` to [0, 1] and round it, which
    reads as if probabilities were expected. This notebook compiles the model
    with a ``from_logits=True`` loss, so ``y_pred`` may be raw logits here —
    confirm the reported training-time F1 is what is intended.
    """
    precision = precision_metric(y_true, y_pred)
    recall = recall_metric(y_true, y_pred)
    harmonic_denominator = precision + recall + K.epsilon()
    return 2 * ((precision * recall) / harmonic_denominator)

Cross-Validation

Binary Classification: Recognition of Injection Vulnerabilities (command_injection and sql_injection merged)

In [11]:
# # Define a function to determine if the category is an injection or not
# def is_injection(category):
#     if category in ['sql_injection', 'command_injection']:
#         return '1'
#     else:
#         return '0'

# sequences_data['Injection'] = sequences_data['Category'].apply(is_injection)

In [12]:
#n_categories = 2

In [13]:
# model = TFAutoModelForSequenceClassification.from_pretrained(model_variation, num_labels=n_categories)
# # resize model embedding to match new tokenizer
# model.resize_token_embeddings(len(tokenizer))

In [14]:
# def tokenize_X(train_data_input, val_data_input, max_len):

#     X_train = tokenizer(
#         text=train_data_input,
#         add_special_tokens=True,
#         max_length=max_len,
#         truncation=True,
#         padding=True,
#         return_tensors='tf',
#         return_token_type_ids=False,
#         return_attention_mask=True,
#         verbose=True
#     )

#     X_test = tokenizer(
#         text=val_data_input,
#         add_special_tokens=True,
#         max_length=max_len,
#         truncation=True,
#         padding=True,
#         return_tensors='tf',
#         return_token_type_ids=False,
#         return_attention_mask=True,
#         verbose=True
#     )
    
#     return X_train, X_test

In [15]:
# X_train, X_test = tokenize_X(sequences_data["Vulnerability"].tolist()[0:train_len], sequences_data["Vulnerability"].tolist()[train_len:], max_len)

# X_train

In [16]:
# Adam hyper-parameters used for fine-tuning.
# NOTE(review): `decay` is presumably a learning-rate decay, not a weight
# decay — confirm this is the intended regularisation.
adam_settings = dict(
    learning_rate=lr,  # HF recommendation
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0,
)
optimizer = Adam(**adam_settings)

# from_logits=True: the loss expects un-normalised scores and applies softmax itself.
loss = CategoricalCrossentropy(from_logits=True)


In [17]:
# model.compile(
#     optimizer=optimizer,
#     loss=loss,
#     metrics=[f1_metric]
# )

In [18]:
# history = model.fit(
#     x = {'input_ids':X_train['input_ids'], 'attention_mask':X_train['attention_mask']},
#     y = to_categorical(sequences_data['Injection'].iloc[0:train_len]),
#     validation_data = ({'input_ids':X_test['input_ids'], 'attention_mask':X_test['attention_mask']},
#                         to_categorical(sequences_data['Injection'].iloc[train_len:].astype(int))),
#     epochs=n_epochs,
#     batch_size=batch_size
# )

In [19]:
# plt.plot(history.history['f1_metric'])
# plt.plot(history.history['val_f1_metric'])
# plt.ylabel('model f1_metric')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='best')
# plt.savefig('train_history.png')
# plt.show()

In [20]:
# targets = sequences_data['Injection'].iloc[train_len:].astype(int)
# predicted = model.predict({'input_ids': X_test['input_ids'], 'attention_mask': X_test['attention_mask']}).logits
# y_predicted = np.argmax(predicted, axis=1)
# print(classification_report(targets, y_predicted))

# tn, fp, fn, tp = confusion_matrix(targets, y_predicted).ravel()
# acc=(tp+tn)/(tp+tn+fp+fn)
# prec=tp/(tp+fp)
# rec=tp/(tp+fn)
# f1=2*prec*rec / (prec+rec)
# print("Accuracy: ", acc)
# print("Precision: ", prec)
# print("Recall: ", rec)
# print("F1-score: ", f1)


In [21]:
# X = sequences_data["Vulnerability"].tolist()
# y = sequences_data["Injection"].tolist()

# X = tokenizer(
#         text=X[0:],
#         add_special_tokens=True,
#         max_length=max_len,
#         truncation=True,
#         padding=True,
#         return_tensors='tf',
#         return_token_type_ids=False,
#         return_attention_mask=True,
#         verbose=True
#     )

# y = np.array(y)

# scores=['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
# values = [np.array([]) for i in range(0, len(scores))]
# score_dict = OrderedDict(zip(scores, values))
# k=10
# f=0
# kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)

# nb_epoch = 100
# BS = 64
# print("Training...")
# milli_sec1 = int(round(time.time() * 1000))

# for train_index, test_index in kfold.split(X['input_ids'].numpy(), y):
#     f = f + 1
#     print('fold number= ',f)
    
#     X_train_inputs, X_train_attention, X_test_inputs, X_test_attention = np.array(X['input_ids'])[train_index], np.array(X['attention_mask'])[train_index], np.array(X['input_ids'])[test_index], np.array(X['attention_mask'])[test_index]
#     Y_train, Y_test = y[train_index], y[test_index]
    
# #         Y_train = np.array(Y_train)
# #         Y_train = Y_train.ravel()
# #         Y_test = np.array(Y_test)
# #         Y_test = Y_test.ravel()

# #         #sampling
# #         X_res, Y_res = RandomOverSampler(random_state=seed, sampling_strategy=0.5).fit_resample(X_train, Y_train)
# #         #X_res, Y_res = RandomUnderSampler(random_state=seed, sampling_strategy=0.5).fit_resample(X_train, Y_train)

# #         #shuffle dataset
# #         X_resampled=pd.DataFrame(X_res)
# #         Y_resampled=pd.DataFrame(Y_res)
# #         newTrain=X_resampled.assign(Label=Y_resampled.values)
# #         newTrain = shuffle(newTrain,random_state=seed)
# #         X_train=np.array(newTrain.iloc[:, 0:-1 ])
# #         X_train=pd.DataFrame(X_train)
# #         Y_train=np.array(newTrain.iloc[:, -1 ])
# #         Y_train=pd.DataFrame(Y_train)

#     model = TFAutoModelForSequenceClassification.from_pretrained(model_variation, num_labels=n_categories)
#     # resize model embedding to match new tokenizer
#     model.resize_token_embeddings(len(tokenizer))

#     model.compile(
#         optimizer=optimizer,
#         loss=loss,
#         metrics=[f1_metric]
#     )
    
#     history = model.fit(
#         x = {'input_ids':X_train_inputs, 'attention_mask':X_train_attention},
#         y = to_categorical(Y_train.astype(int)),
#         validation_data = ({'input_ids':X_test_inputs, 'attention_mask':X_test_attention},
#                             to_categorical(Y_test.astype(int))),
#         epochs=n_epochs,
#         batch_size=batch_size
#     )

#     predicted = model.predict({'input_ids': X_test['input_ids'], 'attention_mask': X_test['attention_mask']}).logits
#     predictions = np.argmax(predicted, axis=1)
    
#     targets = Y_test.astype(int)
    
#     accuracy=accuracy_score(targets, predictions)
#     precision=precision_score(targets, predictions)
#     recall=recall_score(targets, predictions)
#     f1=f1_score(targets, predictions)
#     roc_auc=roc_auc_score(targets, predictions)
#     print(confusion_matrix(targets, predictions, labels=[0, 1]))
#     tn, fp, fn, tp = confusion_matrix(targets, predictions).ravel()
#     acc = ((tp+tn)/(tp+tn+fp+fn))
#     print("Accuracy:%.2f%%"%(acc*100))
#     print("Precision:%.2f%%"%(precision*100))
#     print("Recall:%.2f%%"%(recall*100))
#     print("F1 score:%.2f%%"%(f1*100))
#     print("Roc_Auc score:%.2f%%"%(roc_auc*100))
#     print(classification_report(targets, predictions))
#     del model
#     score_dict['accuracy'] = np.append(score_dict['accuracy'], accuracy)
#     score_dict['precision'] = np.append(score_dict['precision'], precision)
#     score_dict['recall'] = np.append(score_dict['recall'], recall)
#     score_dict['f1'] = np.append(score_dict['f1'], f1)
#     score_dict['roc_auc'] = np.append(score_dict['roc_auc'], roc_auc)

# milli_sec2 = int(round(time.time() * 1000))
# print("Cross Validation is completed after", milli_sec2-milli_sec1)

# print("accuracy: %.2f%% (%.2f%%)" % (score_dict['accuracy'].mean()*100, score_dict['accuracy'].std()*100))
# print("precision: %.2f%% (%.2f%%)" % (score_dict['precision'].mean()*100, score_dict['precision'].std()*100))
# print("recall: %.2f%% (%.2f%%)" % (score_dict['recall'].mean()*100, score_dict['recall'].std()*100))
# print("f1: %.2f%% (%.2f%%)" % (score_dict['f1'].mean()*100, score_dict['f1'].std()*100))
# print("roc_auc: %.2f%% (%.2f%%)" % (score_dict['roc_auc'].mean()*100, score_dict['roc_auc'].std()*100))

Multi-class Classification: Categorization of all detected vulnerabilities

In [22]:
# One output class per distinct category found in the dataset.
n_categories = label_frequencies.shape[0]  # 7

In [23]:
# Encode every category as an integer id, plus the list of distinct categories
category_numerical_indexes, unique_categories = pd.factorize(sequences_data["Category"])

# Lookup table: category name -> integer id (its position in unique_categories)
category_to_index = dict(zip(unique_categories, range(len(unique_categories))))

# Attach the integer id of each row's category as a new column
sequences_data["Category_Index"] = sequences_data["Category"].map(category_to_index)
sequences_data.head()

Unnamed: 0,Vulnerability,Category,Length,Category_Index
0,"f""str$id""""str$id""""str$id"" ...",sql_injection,9,0
1,client.listentcp() proxy = proxy(proxy_...,xsrf,8,1
2,"from django.http import httpresponse, httpresp...",open_redirect,9,2
3,"def write_preset(conn, queryin, descriptin):\t...",sql_injection,175,0
4,update_query = self.up...,sql_injection,14,0


In [24]:
def getMaxLen(X, pad_token_id=1):
    """Return the longest non-padded sequence length in a tokenized batch.

    Parameters
    ----------
    X : mapping
        Tokenizer output; only ``X['input_ids']`` is read. It may be a tensor
        exposing ``.numpy()``, a NumPy array, or a (possibly ragged) list of
        id sequences, one per sample.
    pad_token_id : int, default 1
        Token id that is treated as padding and excluded from the length.
        The default keeps the value that was previously hard-coded.
        NOTE(review): presumably the tokenizer's pad id — consider passing
        ``tokenizer.pad_token_id`` explicitly.

    Returns
    -------
    int
        Number of non-padding ids in the longest sample, or 0 when the batch
        is empty or consists of padding only.

    The maximum length and the index of the first row that reaches it are
    also printed (the row is ``None`` for an empty batch).
    """
    input_ids = X['input_ids']
    # Convert a framework tensor once up front instead of slicing it row by row.
    if hasattr(input_ids, 'numpy'):
        input_ids = input_ids.numpy()

    # Count the non-padding ids of every sample.
    lengths = [
        int(np.count_nonzero(np.asarray(row) != pad_token_id))
        for row in input_ids
    ]

    if lengths:
        # argmax returns the first row that reaches the maximum.
        max_row = int(np.argmax(lengths))
        max_length = lengths[max_row]
    else:
        # Empty batch: nothing to measure, and no row to report.
        max_row = None
        max_length = 0

    print("Max length of tokenized data:", max_length)
    print("Row with max length:", max_row)

    return max_length

In [25]:
# Tokenize the whole corpus once, then measure the longest tokenized sample.
raw_texts = sequences_data["Vulnerability"].tolist()

X = tokenizer(
    text=raw_texts[:],
    max_length=max_len,
    truncation=True,
    padding=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors='tf',
    verbose=True,
)

# From here on max_len holds the measured length instead of the configured one.
max_len = getMaxLen(X)
print("Max tokenized length", max_len)

Max length of tokenized data: 512
Row with max length: 3
Max tokenized length 512


In [None]:
# Stratified k-fold cross-validation of the fine-tuned multi-class classifier.
# Every fold fine-tunes a fresh copy of the pre-trained checkpoint, restores
# the weights with the lowest validation loss and scores them on the held-out
# fold. Per-fold scores are collected in `score_dict` and summarised at the end.
X = sequences_data["Vulnerability"].tolist()
# Integer class ids as an array, converted once so folds can be index-selected.
y = np.array(sequences_data["Category_Index"].tolist())

X = tokenizer(
        text=X[0:],
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding=True,
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
        verbose=True
    )

# Convert the tokenizer output to NumPy once instead of once per fold.
all_input_ids = X['input_ids'].numpy()
all_attention_mask = X['attention_mask'].numpy()

# 'roc_auc' is kept as a key for parity with the binary experiment, but it is
# not computed in this multi-class loop and therefore stays empty.
scores=['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
values = [np.array([]) for _ in scores]
score_dict = OrderedDict(zip(scores, values))
k=10
f=0
kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)

# Weights-only checkpoint that is overwritten by every fold.
checkpoint_path = './checkpoints/best_weights'

print("Training...")
milli_sec1 = int(round(time.time() * 1000))

for f, (train_index, test_index) in enumerate(kfold.split(all_input_ids, y), start=1):
    print('fold number= ',f)

    X_train_inputs, X_train_attention = all_input_ids[train_index], all_attention_mask[train_index]
    X_test_inputs, X_test_attention = all_input_ids[test_index], all_attention_mask[test_index]
    Y_train, Y_test = y[train_index], y[test_index]

    # Resampling of the training fold (RandomOverSampler / RandomUnderSampler
    # followed by a shuffle) was experimented with here and is disabled.

    model = TFAutoModelForSequenceClassification.from_pretrained(model_variation, num_labels=n_categories)
    # resize model embedding to match new tokenizer
    model.resize_token_embeddings(len(tokenizer))

    # Give every fold its own optimizer, cloned from the shared configuration.
    # Re-using one optimizer instance would carry its step counter (and hence
    # the `decay` schedule) and the slot variables of the previous model over
    # into the next fold, so later folds would train with a smaller learning rate.
    fold_optimizer = type(optimizer).from_config(optimizer.get_config())

    model.compile(
        optimizer=fold_optimizer,
        loss=loss,
        metrics=[f1_metric]
    )

    # Early stopping only has an effect when n_epochs exceeds patience.
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience)
    # save_weights_only=True: only load_weights() is used below, so the full
    # model does not need to be exported after every improving epoch.
    model_checkpoint = ModelCheckpoint(
        checkpoint_path,
        monitor='val_loss',
        save_best_only=True,
        save_weights_only=True
    )

    # num_classes is fixed so the one-hot width always matches the model head,
    # even if a fold happens to miss the highest class id.
    history = model.fit(
        x = {'input_ids':X_train_inputs, 'attention_mask':X_train_attention},
        y = to_categorical(Y_train.astype(int), num_classes=n_categories),
        validation_data = ({'input_ids':X_test_inputs, 'attention_mask':X_test_attention},
                            to_categorical(Y_test.astype(int), num_classes=n_categories)),
        epochs=n_epochs,
        batch_size=batch_size,
        callbacks=[early_stopping, model_checkpoint]
    )

    # Evaluate the epoch with the lowest validation loss, not the last epoch.
    model.load_weights(checkpoint_path)

    predicted = model.predict({'input_ids': X_test_inputs, 'attention_mask': X_test_attention}).logits
    predictions = np.argmax(predicted, axis=1)

    targets = Y_test.astype(int)

    # NOTE: for single-label multi-class data, micro-averaged precision, recall
    # and F1 all equal the accuracy; per-class figures are in the
    # classification report printed below.
    accuracy=accuracy_score(targets, predictions)
    precision=precision_score(targets, predictions, average='micro')
    recall=recall_score(targets, predictions, average='micro')
    f1=f1_score(targets, predictions, average='micro')
    conf_matrix = confusion_matrix(targets, predictions)

    print("Confusion Matrix:\n", conf_matrix)
    print("Accuracy:%.2f%%"%(accuracy*100))
    print("Precision:%.2f%%"%(precision*100))
    print("Recall:%.2f%%"%(recall*100))
    print("F1 score:%.2f%%"%(f1*100))

    class_report = classification_report(targets, predictions)
    print("Classification Report:\n", class_report)

    # Drop the fold's model and reset Keras' global state before the next
    # fold builds a new one.
    del model
    K.clear_session()

    score_dict['accuracy'] = np.append(score_dict['accuracy'], accuracy)
    score_dict['precision'] = np.append(score_dict['precision'], precision)
    score_dict['recall'] = np.append(score_dict['recall'], recall)
    score_dict['f1'] = np.append(score_dict['f1'], f1)

milli_sec2 = int(round(time.time() * 1000))
# Elapsed wall-clock time in milliseconds.
print("Cross Validation is completed after", milli_sec2-milli_sec1)

# Mean and standard deviation of every collected score across the folds.
print("accuracy: %.2f%% (%.2f%%)" % (score_dict['accuracy'].mean()*100, score_dict['accuracy'].std()*100))
print("precision: %.2f%% (%.2f%%)" % (score_dict['precision'].mean()*100, score_dict['precision'].std()*100))
print("recall: %.2f%% (%.2f%%)" % (score_dict['recall'].mean()*100, score_dict['recall'].std()*100))
print("f1: %.2f%% (%.2f%%)" % (score_dict['f1'].mean()*100, score_dict['f1'].std()*100))


Training...
fold number=  1


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base-mlm and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2
 458/2035 [=====>........................] - ETA: 6:41 - loss: 1.2286 - f1_metric: 0.5637