In [1]:
import seaborn as sn
import pandas as pd
import json, os

import tensorflow as tf
import numpy as np
import csv
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification  #, BertModel, BertTokenizer, TFBertForSequenceClassification
from transformers import AdamWeightDecay
import matplotlib.pyplot as plt
import random
from tensorflow.keras.callbacks import CSVLogger
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow.keras.backend as K
from collections import OrderedDict
import time
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, \
roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold
import random

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense, GlobalMaxPool1D

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from collections import defaultdict
from imblearn.under_sampling import RandomUnderSampler

from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder


Define method name and root path of the repository

In [2]:
method = "forSequence"

root_path = os.path.join('..', '..', '..')

Define specific seeder for all experiments and processes

In [3]:
seed = 123
np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)

Read data and shuffle them

In [4]:
data = pd.read_csv(os.path.join(root_path, 'data', 'dataset.csv'))
data = data.sample(frac=1, random_state=seed).reset_index(drop=True)
print(data.head())
print(len(data))

                                                func  vul  length
0  static enum test_return test_binary_replace(vo...    0       9
1  force_unmount (device *device,\r\n            ...    0      84
2  bool clipboard::isformatavailablebystring(cons...    0      12
3  int kvm_get_dirty_log(struct kvm *kvm,\r\n\t\t...    0      81
4  static double filter_mitchell(const double x)\...    0     124
178460


Data visualization

In [5]:
np.max(data["length"])

191

In [6]:
word_counts = data["func"].apply(lambda x: len(x.split()))
max_length = word_counts.max()
print("Maximum number of words:", max_length)

Maximum number of words: 191


In [7]:
vc = data["vul"].value_counts()

print(vc)

print("Percentage: ", (vc[1] / vc[0])*100, '%')

n_categories = len(vc)
print(n_categories)

0    170642
1      7818
Name: vul, dtype: int64
Percentage:  4.58152154803624 %
2


In [8]:
data = pd.DataFrame(({'text': data['func'], 'label': data['vul']}))
#data = data[0:100]
data.head()

Unnamed: 0,text,label
0,static enum test_return test_binary_replace(vo...,0
1,"force_unmount (device *device,\r\n ...",0
2,bool clipboard::isformatavailablebystring(cons...,0
3,"int kvm_get_dirty_log(struct kvm *kvm,\r\n\t\t...",0
4,static double filter_mitchell(const double x)\...,0


Train test split with seeder

In [9]:
val_ratio = 0.10

#split to train-val-test
# split dataset to train-test sets
### split data into train and test (90% train, 10% test)
shuffle_seeders = [seed, 10, 15, 20, 25, 30, 35, 40, 45, 50]
shuffle_seeder = shuffle_seeders[0]

train_data, test_data = train_test_split(data, test_size=val_ratio, random_state=shuffle_seeder, stratify=data['label'])
# print(len(data))
# print(len(train_data))
# print(len(test_data))
# print(len(test_data)+len(train_data))

Pre-processing step: Under-sampling

In [10]:
sampling = True
if n_categories == 2 and sampling == True:
    # Apply under-sampling with the specified strategy
    class_counts = pd.Series(train_data["label"]).value_counts()
    print("Class distribution ", class_counts)

    majority_class = class_counts.idxmax()
    print("Majority class ", majority_class)

    minority_class = class_counts.idxmin()
    print("Minority class ", minority_class)

    target_count = class_counts[class_counts.idxmin()] # int(class_counts.iloc[0] / 2) 
    print("Targeted number of majority class", target_count)

    # under
    sampling_strategy = {majority_class: target_count}        
    rus = RandomUnderSampler(random_state=seed, sampling_strategy=sampling_strategy)

    x_train_resampled, y_train_resampled = rus.fit_resample(np.array(train_data["text"]).reshape(-1, 1), train_data["label"]) 
    print("Class distribution after augmentation", pd.Series(y_train_resampled).value_counts())


    # Shuffle the resampled data while preserving the correspondence between features and labels
    x_train_resampled, y_train_resampled = shuffle(x_train_resampled, y_train_resampled, random_state=seed)

    # rename
    X_train = x_train_resampled
    Y_train = y_train_resampled

    X_train = pd.Series(X_train.reshape(-1))

else:
    X_train = train_data["text"]
    Y_train = train_data["label"]

Class distribution  0    153578
1      7036
Name: label, dtype: int64
Majority class  0
Minority class  1
Targeted number of majority class 7036
Class distribution after augmentation 0    7036
1    7036
Name: label, dtype: int64


Choose transformer model

In [11]:
model_variation = "microsoft/codebert-base-mlm"
tokenizer = AutoTokenizer.from_pretrained(model_variation, do_lower_case=True) #Tokenizer
#bert-base-uncased #bert-base #albert-base-v2 # roberta-base # distilbert-base-uncased #distilbert-base # microsoft/codebert-base-mlm # microsoft/codebert-base

Insert new tokens to the tokenizer

In [12]:
# Define New tokens for string and numerical i.e., strId$ and numId$
new_tokens = ["strId$", "numId$"]
for new_token in new_tokens:
    if new_token not in tokenizer.get_vocab().keys():
        tokenizer.add_tokens(new_token)


In [13]:
model = TFAutoModelForSequenceClassification.from_pretrained(model_variation, num_labels=n_categories)

config = model.get_config()
print(config)

model.resize_token_embeddings(len(tokenizer))

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base-mlm and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'chunk_size_feed_forward': 0, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_

<transformers.models.roberta.modeling_tf_roberta.TFRobertaEmbeddings at 0x205a4d9e4f0>

Custom evaluation metrics for monitoring the produced model

In [14]:
def recall_metric(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = (true_positives + K.epsilon()) / (possible_positives + K.epsilon())
        return recall

def precision_metric(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = (true_positives + K.epsilon()) / (predicted_positives + K.epsilon())
        return precision

def f1_metric(y_true, y_pred):

    prec = precision_metric(y_true, y_pred)
    rec = recall_metric(y_true, y_pred)
    f1 = 2*((prec*rec)/(prec+rec+K.epsilon()))
    return f1

def f2_metric(y_true, y_pred):

    prec = precision_metric(y_true, y_pred)
    rec = recall_metric(y_true, y_pred)
    f2 = 5*((prec*rec)/(4*prec+rec+K.epsilon()))
    return f2

Find the max length of the tokined input sequneces of the training set

In [15]:
def getMaxLen(X):

    # Code for identifying max length of the data samples after tokenization using transformer tokenizer
    
    max_length = 0
    # Iterate over each sample in your dataset
    for i, input_ids in enumerate(X['input_ids']):
        # Calculate the length of the tokenized sequence for the current sample
        length = tf.math.reduce_sum(tf.cast(input_ids != 1, tf.int32)).numpy()
        # Update max_length and max_row if the current length is greater
        if length > max_length:
            max_length = length
            max_row = i

    print("Max length of tokenized data:", max_length)
    print("Row with max length:", max_row)

    #X['input_ids'] = np.delete(X['input_ids'], max_row, axis=0)
    
    return max_length

In [16]:
X = tokenizer(
        text=X_train.tolist(),
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
        verbose=True
    )

max_len = getMaxLen(X)
print("Max tokenized length", max_len)

Max length of tokenized data: 512
Row with max length: 0
Max tokenized length 512


Tokenize train and test sets

In [17]:
def tokenize_X(x_train, x_test, max_len):
    X_train = tokenizer(
        text=x_train.tolist(),
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding=True,
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
        verbose=True
    )

    X_test = tokenizer(
        text=x_test.tolist(),
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding=True,
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
        verbose=True
    )
    
    return X_train, X_test

In [18]:
X_train, X_test = tokenize_X(X_train, test_data['text'], max_len)

In [19]:
Y_train = np.array(Y_train)
Y_test = np.array(test_data["label"])

Select Hyper-parameters

In [20]:
n_epochs = 100
patience = 5
batch_size = 6 #16
lr = 5e-4 #5e-05 #  5e-5, 3e-5, 2e-5

optimizer = AdamWeightDecay(
    learning_rate=lr, 
#     epsilon=1e-08,
#     decay=0.01,
#     clipnorm=1.0
)

loss = CategoricalCrossentropy(from_logits=True)
#metric = CategoricalAccuracy('balanced_accuracy')

if n_categories > 2:
    model.compile(
        optimizer=optimizer,
        loss=loss
    )

else:
    model.compile(
        optimizer=optimizer,
        loss=loss,
        metrics=[f1_metric]
    )

print(model.summary())

Model: "tf_roberta_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLayer  multiple                 124056576 
 )                                                               
                                                                 
 classifier (TFRobertaClassi  multiple                 592130    
 ficationHead)                                                   
                                                                 
Total params: 124,648,706
Trainable params: 124,648,706
Non-trainable params: 0
_________________________________________________________________
None


Training

In [21]:
print("Training...")
milli_sec1 = int(round(time.time() * 1000))

early_stopping = EarlyStopping(monitor='val_f1_metric', mode='max', patience=patience)
model_checkpoint = ModelCheckpoint('./checkpoints/best_weights', monitor='val_f1_metric', mode='max', save_best_only=True)

history = model.fit(
    x = {'input_ids':X_train['input_ids'], 'attention_mask':X_train['attention_mask']},
    y = to_categorical(Y_train),
    validation_split=val_ratio,
#     validation_data = ({'input_ids':X_test['input_ids'], 'attention_mask':X_test['attention_mask']},
#                         to_categorical(Y_test)),
    epochs=n_epochs,
    batch_size=batch_size,
    callbacks=[early_stopping, model_checkpoint]
)

milli_sec2 = int(round(time.time() * 1000))
print("Training is completed after", milli_sec2-milli_sec1)

Training...
Epoch 1/100
  85/2111 [>.............................] - ETA: 16:11 - loss: 0.7438 - f1_metric: 0.0501

KeyboardInterrupt: 

Plot history

In [None]:
if n_categories <= 2:
    plt.plot(history.history['f1_metric'])
    plt.plot(history.history['val_f1_metric'])
    plt.ylabel('model f1_metric')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='best')
    plt.savefig('train_history.png')
    plt.show()

Load best model from checkpoint during training with early stopping

In [None]:
model.load_weights('./checkpoints/best_weights')

Make predictions on the testing set and compute evaluation metrics

In [None]:
predicted = model.predict({'input_ids': X_test['input_ids'], 'attention_mask': X_test['attention_mask']}).logits
predictions = np.argmax(predicted, axis=1)

targets = Y_test.astype(int)

accuracy=accuracy_score(targets, predictions)
if n_categories > 2:
    precision=precision_score(targets, predictions, average='macro')
    recall=recall_score(targets, predictions, average='macro')
    f1=f1_score(targets, predictions, average='macro')
else:
    precision=precision_score(targets, predictions)
    recall=recall_score(targets, predictions)
    f1=f1_score(targets, predictions)
    roc_auc=roc_auc_score(targets, predictions)
f2 = (5*precision*recall) / (4*precision+recall)
    
conf_matrix = confusion_matrix(targets, predictions)
#print("Confusion Matrix:\n", conf_matrix)
sn.heatmap(conf_matrix, annot=True)

tn, fp, fn, tp = conf_matrix.ravel()

print("TP=",tp)
print("TN=",tn)
print("FP=",fp)
print("FN=",fn)

acc = ((tp+tn)/(tp+tn+fp+fn))

print("Accuracy:%.2f%%"%(accuracy*100))
print("Precision:%.2f%%"%(precision*100))
print("Recall:%.2f%%"%(recall*100))
print("F1 score:%.2f%%"%(f1*100))
print("F2 score:%.2f%%"%(f2*100))
if roc_auc:
    print("Roc_Auc score:%.2f%%"%(roc_auc*100))

class_report = classification_report(targets, predictions)
print("Classification Report:\n", class_report)

Export classification report

In [None]:
# Create the path
path = os.path.join(root_path, 'results', model_variation.split("/")[-1], method, str(shuffle_seeder))

# Create directory if it doesn't exist
os.makedirs(path, exist_ok=True)

# Define the CSV file path
csv_file_path = os.path.join(path, f"{shuffle_seeder}.csv")

# Write data to CSV
data = {
    "accuracy": accuracy,
    "precision": precision,
    "recall": recall,
    "f1": f1,
    "f2": f2,
    "roc_auc": roc_auc
}

# Write to CSV
with open(csv_file_path, "w", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=data.keys())
    writer.writeheader()
    writer.writerow(data)


Compute the average values of the classication metrics considering the results for all different seeders

In [None]:
# Define a dictionary to store cumulative sum of metrics
cumulative_metrics = defaultdict(float)
count = 0  # Counter to keep track of number of CSV files

# Iterate over all CSV files in the results folder
results_folder = os.path.join(root_path, "results", model_variation.split("/")[-1], method, str(shuffle_seeder))
for filename in os.listdir(results_folder):
    if filename.endswith(".csv"):
        csv_file_path = os.path.join(results_folder, filename)
        with open(csv_file_path, "r", newline="") as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                for metric, value in row.items():
                    cumulative_metrics[metric] += float(value)
        count += 1
        
# Compute average values
average_metrics = {metric: total / count for metric, total in cumulative_metrics.items()}

# Print average values 
print(average_metrics)

# Define the path for the average CSV file
avg_csv_file_path = os.path.join(root_path, "results", model_variation.split("/")[-1], method, "avg.csv")

# Write average metrics to CSV
with open(avg_csv_file_path, "w", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=average_metrics.keys())
    writer.writeheader()
    writer.writerow(average_metrics)