## 1. IMPORT LIBRARIES

In [1]:
import sys
import os
import pandas

## 2. CLEAN GADGET

### Processes code lines by:

- Replacing user-defined function names with symbolic names like `FUN1, FUN2`, etc.
- Replacing user-defined variable names with symbolic names like `VAR1, VAR2`, etc.
- Leaving keywords, built-in functions, and standard arguments (`argc, argv`) untouched.
- Skipping over string and character literals, comments, and non-ASCII characters to focus only on code identifiers.

In [2]:
import re

# Immutable set of keywords up to C11 and C++17 standards.
# These are reserved words in C/C++ that should not be renamed or replaced.
keywords = frozenset({
    '__asm', '__builtin', '__cdecl', '__declspec', '__except', '__export', '__far16', '__far32',
    '__fastcall', '__finally', '__import', '__inline', '__int16', '__int32', '__int64', '__int8',
    '__leave', '__optlink', '__packed', '__pascal', '__stdcall', '__system', '__thread', '__try',
    '__unaligned', '_asm', '_Builtin', '_Cdecl', '_declspec', '_except', '_Export', '_Far16',
    '_Far32', '_Fastcall', '_finally', '_Import', '_inline', '_int16', '_int32', '_int64',
    '_int8', '_leave', '_Optlink', '_Packed', '_Pascal', '_stdcall', '_System', '_try', 'alignas',
    'alignof', 'and', 'and_eq', 'asm', 'auto', 'bitand', 'bitor', 'bool', 'break', 'case',
    'catch', 'char', 'char16_t', 'char32_t', 'class', 'compl', 'const', 'const_cast', 'constexpr',
    'continue', 'decltype', 'default', 'delete', 'do', 'double', 'dynamic_cast', 'else', 'enum',
    'explicit', 'export', 'extern', 'false', 'final', 'float', 'for', 'friend', 'goto', 'if',
    'inline', 'int', 'long', 'mutable', 'namespace', 'new', 'noexcept', 'not', 'not_eq', 'nullptr',
    'operator', 'or', 'or_eq', 'override', 'private', 'protected', 'public', 'register',
    'reinterpret_cast', 'return', 'short', 'signed', 'sizeof', 'static', 'static_assert',
    'static_cast', 'struct', 'switch', 'template', 'this', 'thread_local', 'throw', 'true', 'try',
    'typedef', 'typeid', 'typename', 'union', 'unsigned', 'using', 'virtual', 'void', 'volatile',
    'wchar_t', 'while', 'xor', 'xor_eq', 'NULL'
})

# Known, non-user-defined function names that shouldn't be replaced.
main_set = frozenset({'main'})

# Common arguments in the 'main' function that should not be renamed.
main_args = frozenset({'argc', 'argv'})


def clean_gadget(gadget):
    """
    Anonymize the identifiers of a C/C++ code gadget.

    Each line is processed as follows:
      1. Lines whose end matches ``*/`` (end of a block comment) are dropped entirely.
      2. String and character literals are blanked to ``""`` / ``''`` so that their
         contents are never mistaken for identifiers. Backslash escapes are honoured,
         so ``"a\\"b"`` is treated as ONE literal.
      3. Non-ASCII characters are removed.
      4. Identifiers directly followed by ``(`` are renamed to ``FUN1, FUN2, ...``
         unless they are keywords or ``main``.
      5. The remaining identifiers matched by the variable pattern are renamed to
         ``VAR1, VAR2, ...`` unless they are keywords or ``argc`` / ``argv``.

    The name -> symbol mappings are shared by all lines of the gadget, so the same
    identifier always receives the same symbol within one call.

    Args:
        gadget: list of str, one source line per element.

    Returns:
        list of str: the cleaned lines (comment-ending lines omitted).
    """
    # Maps user-defined function / variable names to their anonymized symbols.
    fun_symbols = {}
    var_symbols = {}

    # A line is skipped when it ends a multi-line comment.
    rx_comment = re.compile(r'\*/\s*$')

    # One pattern for both literal kinds, scanned left to right so that a quote
    # character inside the *other* kind of literal (e.g. '"') cannot open a bogus
    # literal. `\\.` consumes escape sequences such as \" or \'.
    rx_literal = re.compile(
        r'"(?:\\.|[^"\\])*"'
        r"|'(?:\\.|[^'\\])*'"
    )
    rx_non_ascii = re.compile(r'[^\x00-\x7f]')

    # Candidate function names: identifiers followed by an opening parenthesis.
    rx_fun = re.compile(r'\b([_A-Za-z]\w*)\b(?=\s*\()')

    # Candidate variable names: identifiers that are not followed by '(' and that are
    # either followed by a call (`x foo(`) or not followed by another word.
    rx_var = re.compile(r'\b([_A-Za-z]\w*)\b(?:(?=\s*\w+\()|(?!\s*\w+))(?!\s*\()')

    def blank_literal(match):
        # Keep the delimiters, drop the contents: "..." -> "" and '...' -> ''.
        quote = match.group(0)[0]
        return quote * 2

    def rename_fun(match):
        name = match.group(1)
        if name in main_set or name in keywords:
            return name
        if name not in fun_symbols:
            fun_symbols[name] = 'FUN' + str(len(fun_symbols) + 1)
        return fun_symbols[name]

    def rename_var(match):
        name = match.group(1)
        if name in keywords or name in main_args:
            return name
        if name not in var_symbols:
            var_symbols[name] = 'VAR' + str(len(var_symbols) + 1)
        return var_symbols[name]

    cleaned_gadget = []
    for line in gadget:
        if rx_comment.search(line) is not None:
            continue

        # Step 1: blank literals, then strip non-ASCII characters.
        ascii_line = rx_non_ascii.sub('', rx_literal.sub(blank_literal, line))

        # Steps 2-4: rename in a single pass per category. A callback substitution
        # never re-scans text it has just inserted, so a freshly written symbol
        # (e.g. VAR1) cannot be renamed again when the source itself contains an
        # identifier with that spelling.
        ascii_line = rx_fun.sub(rename_fun, ascii_line)
        ascii_line = rx_var.sub(rename_var, ascii_line)

        cleaned_gadget.append(ascii_line)

    return cleaned_gadget

## 3. PARSE FILE

In [3]:
def parse_file(filename):
    """
    Lazily parse a gadget file and yield one cleaned gadget at a time.

    The file is read line by line (blank lines are ignored) and interpreted as:
      - a line containing 33 consecutive dashes closes the current gadget;
      - a line whose first whitespace-separated field is numeric is
          * ignored when no code has been collected yet (the gadget's "index" line),
          * the vulnerability label when the whole line is a number,
          * otherwise an ordinary code line that happens to start with a number;
      - any other line is a code line of the current gadget.

    Each completed gadget is passed through `clean_gadget()` before being yielded.

    Args:
        filename: path of the gadget file (read as UTF-8).

    Yields:
        tuple: (cleaned_gadget_lines, gadget_label)

    Notes:
        - The label is not reset between gadgets: a gadget without a label line is
          yielded with the label of the previous gadget (0 for the first one).
        - Lines collected after the last delimiter are not yielded.
    """
    delimiter = "-" * 33

    with open(filename, "r", encoding="utf8") as file:
        gadget = []       # Code lines of the gadget currently being collected
        gadget_val = 0    # Most recently seen vulnerability label

        for line in file:
            stripped = line.strip()

            if not stripped:
                continue

            if delimiter in stripped:
                # Always consume the delimiter. Previously a delimiter seen while
                # `gadget` was empty fell through and was stored as a code line,
                # which also caused the following index line to be kept as code.
                if gadget:
                    yield clean_gadget(gadget), gadget_val
                    gadget = []
                continue

            if stripped.split()[0].isdigit():
                if not gadget:
                    # Leading "index" line of a gadget: carries no code.
                    continue
                if stripped.isdigit():
                    gadget_val = int(stripped)
                else:
                    gadget.append(stripped)
            else:
                gadget.append(stripped)

In [4]:
# Path of the raw code-gadget dataset that the rest of the notebook processes.
filename = "cwe399_cgd.txt"
# parse_file() is a generator function: this call only creates the generator object
# shown in the cell output; no line of the file is read until it is iterated.
parse_file(filename)

<generator object parse_file at 0x00000294C896A540>

In [5]:
# File name of the dataset without directory and extension; used as the prefix of the
# cache file below and as the BLSTM model/study name later in the notebook.
base = os.path.splitext(os.path.basename(filename))[0]
# Pickle file in which the vectorized gadgets are cached between runs.
vector_filename = base + "_gadget_vectors.pkl"
# Dimension of each token embedding, passed to get_vectors_df().
vector_length = 50

## 4. VECTORIZE GADGET

### Define a GadgetVectorizer class, which:

- Tokenizes C/C++ code snippets (gadgets).
- Trains a Word2Vec model on these tokenized gadgets.
- Generates vector representations of gadgets using the trained Word2Vec embeddings.

In [6]:
import re
import sys

import warnings
warnings.filterwarnings("ignore")

from gensim.models import Word2Vec
import numpy

# =======================
# Operator Sets for Tokenization
# =======================

# Operators with 3 characters
operators3 = {'<<=', '>>='}
# Operators with 2 characters
operators2 = {
    '->', '++', '--',
    '!~', '<<', '>>', '<=', '>=',
    '==', '!=', '&&', '||', '+=',
    '-=', '*=', '/=', '%=', '&=', '^=', '|='
    }
# Operators with 1 character
operators1 = {
    '(', ')', '[', ']', '.',
    '+', '-', '*', '&', '/',
    '%', '<', '>', '^', '|',
    '=', ',', '?', ':' , ';',
    '{', '}'
    }


class GadgetVectorizer:
    """
    Tokenizes code gadgets, trains a Word2Vec model on them and converts gadgets
    into fixed-size matrices of token embeddings.

    Typical use:
        1. `add_gadget()` for every gadget (buffers the tokenized gadget),
        2. `train_model()` once (learns one embedding per token),
        3. `vectorize()` for every gadget.

    Each gadget is treated as one flat token sequence; its matrix has shape
    (max_tokens, vector_length), zero-padded when the gadget has fewer tokens and
    truncated when it has more.
    """

    def __init__(self, vector_length, max_tokens=50):
        """
        Args:
            vector_length: dimension of each token embedding.
            max_tokens: number of rows (tokens) of every matrix returned by
                `vectorize()`. Defaults to 50.
        """
        self.gadgets = []                   # Tokenized gadgets buffered for Word2Vec training.
        self.vector_length = vector_length  # Dimension of each token vector.
        self.max_tokens = max_tokens        # Rows of each gadget matrix.
        self.forward_slices = 0             # Gadgets whose last line has no FUN<n> token.
        self.backward_slices = 0            # Gadgets whose last line has a FUN<n> token.
        self.embeddings = None              # Set by train_model().

    @staticmethod
    def tokenize(line):
        """
        Split one line of C/C++ code into tokens, preserving their order.

        Operators from `operators3`, `operators2` and `operators1` (longest match
        first) become tokens of their own; whitespace separates tokens and is
        discarded; every other run of characters is one word token.

        Args:
            line: a single line of source code.

        Returns:
            list of str: the tokens of the line (no empty or whitespace tokens).
        """
        tokens, word = [], []
        i, length = 0, len(line)

        while i < length:
            ch = line[i]
            if ch.isspace():
                # Any whitespace (spaces and tabs alike) ends the current word.
                operator, step = '', 1
            elif line[i:i + 3] in operators3:
                operator, step = line[i:i + 3], 3
            elif line[i:i + 2] in operators2:
                operator, step = line[i:i + 2], 2
            elif ch in operators1:
                operator, step = ch, 1
            else:
                # Part of a word/identifier; keep collecting characters.
                word.append(ch)
                i += 1
                continue

            if word:
                tokens.append(''.join(word))
                word = []
            if operator:
                tokens.append(operator)
            i += step

        # Flush the word still being built when the line does not end with an
        # operator or whitespace (e.g. "return x" or "else"); without this the last
        # token of such a line is silently lost.
        if word:
            tokens.append(''.join(word))

        return tokens

    @staticmethod
    def tokenize_gadget(gadget):
        """
        Tokenize an entire gadget (list of code lines).

        Args:
            gadget: list of str, one code line per element.

        Returns:
            tuple: (tokens, backwards_slice) where `tokens` is the concatenation of
            the tokens of all lines and `backwards_slice` is True when the LAST line
            of the gadget contains a token starting with FUN<digits> (the flag is
            recomputed for every line, so only the last line decides). It is False
            for an empty gadget.
        """
        tokenized = []
        function_regex = re.compile(r'FUN(\d)+')  # Matches tokens like FUN1, FUN2, etc.
        backwards_slice = False

        for line in gadget:
            tokens = GadgetVectorizer.tokenize(line)
            tokenized += tokens
            backwards_slice = any(function_regex.match(token) for token in tokens)

        return tokenized, backwards_slice

    def add_gadget(self, gadget):
        """
        Tokenize a gadget and append it to the Word2Vec training buffer.

        Also increments `backward_slices` or `forward_slices` according to the flag
        returned by `tokenize_gadget()`.
        """
        tokenized_gadget, backwards_slice = GadgetVectorizer.tokenize_gadget(gadget)
        self.gadgets.append(tokenized_gadget)
        if backwards_slice:
            self.backward_slices += 1
        else:
            self.forward_slices += 1

    def vectorize(self, gadget):
        """
        Build the (max_tokens x vector_length) embedding matrix of a gadget.

        - Forward slices fill the matrix from the first row with the FIRST tokens of
          the gadget; unused trailing rows stay zero.
        - Backward slices fill the matrix from the last row upwards with the LAST
          tokens of the gadget; unused leading rows stay zero.

        Args:
            gadget: list of str, one code line per element.

        Returns:
            numpy.ndarray of shape (max_tokens, vector_length).

        Raises:
            RuntimeError: if `train_model()` has not been called yet.
        """
        if self.embeddings is None:
            raise RuntimeError("train_model() must be called before vectorize()")

        tokenized_gadget, backwards_slice = GadgetVectorizer.tokenize_gadget(gadget)
        rows = self.max_tokens
        vectors = numpy.zeros(shape=(rows, self.vector_length))

        num_tokens = min(len(tokenized_gadget), rows)

        if backwards_slice:
            # Populate the matrix from the bottom up (reverse order)
            for i in range(num_tokens):
                token_index = len(tokenized_gadget) - 1 - i
                vectors[rows - 1 - i] = self.embeddings[tokenized_gadget[token_index]]
        else:
            # Populate the matrix from the top down (forward order)
            for i in range(num_tokens):
                vectors[i] = self.embeddings[tokenized_gadget[i]]

        return vectors

    def train_model(self):
        """
        Train the Word2Vec model on the buffered tokenized gadgets.

        - Uses the skip-gram model (`sg=1`).
        - Uses `min_count=1` so that every token seen in training gets an embedding.
        - Keeps only the word vectors (`self.embeddings`) and empties the gadget
          buffer afterwards to release memory.
        """
        model = Word2Vec(
            sentences=self.gadgets,
            min_count=1,
            vector_size=self.vector_length,
            sg=1  # Skip-gram model
        )

        # Keep only the learned word vectors.
        self.embeddings = model.wv

        # Release the full model and the raw token lists.
        del model
        self.gadgets = []

In [7]:
def get_vectors_df(filename, vector_length=100):
    """
    Build a DataFrame of vectorized gadgets and their labels from a gadget file.

    Steps:
      1. Iterate over `parse_file(filename)`; every gadget is handed to a
         `GadgetVectorizer` (training corpus) and remembered together with its label.
      2. Train the vectorizer's Word2Vec model on the collected tokens.
      3. Convert every remembered gadget into its embedding matrix.
      4. Return a DataFrame with the columns "vector" and "val".

    Progress is written to stdout.

    Args:
        filename: path of the gadget file.
        vector_length: embedding dimension handed to `GadgetVectorizer`.

    Returns:
        pandas.DataFrame: one row per gadget, columns "vector" and "val".
    """
    vectorizer = GadgetVectorizer(vector_length)
    collected = []

    # First pass: feed the vectorizer and keep each cleaned gadget with its label.
    for n_seen, (gadget, val) in enumerate(parse_file(filename), start=1):
        print("Collecting gadgets...", n_seen, end="\r")
        vectorizer.add_gadget(gadget)
        collected.append({"gadget": gadget, "val": val})

    print('Found {} forward slices and {} backward slices'
          .format(vectorizer.forward_slices, vectorizer.backward_slices))
    print()

    print("Training model...", end="\r")
    vectorizer.train_model()
    print()

    # Second pass: embeddings exist now, so every gadget can be turned into a matrix.
    rows = []
    for n_done, entry in enumerate(collected, start=1):
        print("Processing gadgets...", n_done, end="\r")
        rows.append({
            "vector": vectorizer.vectorize(entry["gadget"]),
            "val": entry["val"],
        })

    print()

    return pandas.DataFrame(rows)

In [8]:
# Reuse the vectors cached by an earlier run when the pickle exists; otherwise build
# them from the raw gadget file and cache them.
# NOTE(review): the cache file name depends only on `filename`, so changing
# `vector_length` does not invalidate an existing cache - delete the pickle by hand.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files that
# were produced by this notebook.
if os.path.exists(vector_filename):
    # Load the DataFrame from the cached pickle file
    df = pandas.read_pickle(vector_filename)
else:
    # If not cached, generate the vectors from the raw gadget file
    df = get_vectors_df(filename, vector_length)
    # Save the generated DataFrame as a pickle file for future reuse
    df.to_pickle(vector_filename)

## 5. BLSTM Hyperparameter Tuning using Optuna

- Optuna works by sampling, training, and evaluating many model configurations.
- It uses Bayesian optimization to intelligently choose hyperparameter values and finds the best set that maximizes validation accuracy.

In [9]:
from __future__ import print_function
import warnings
import numpy as np
import optuna
import tensorflow as tf

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Bidirectional, LeakyReLU, ReLU
from keras.optimizers import Adamax
from keras.callbacks import EarlyStopping
from optuna.integration import TFKerasPruningCallback

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.utils import compute_class_weight

warnings.filterwarnings("ignore")


class BLSTM:
    """
    Bidirectional-LSTM binary classifier whose hyperparameters are tuned with Optuna.

    Construction splits the data into train / validation / test parts (the training
    part is class-balanced by undersampling). `run_optimization()` searches the
    hyperparameters and trains the final model, `test()` reports test-set metrics.
    """

    def __init__(self, data, name="blstm_optuna", random_state=42):
        """
        Args:
            data: DataFrame whose first column holds one 2D array per sample and
                whose second column holds the 0/1 label.
            name: study name and prefix of the saved weights file.
            random_state: seed used for both splits AND for the undersampling, so
                that the whole data preparation is reproducible.
        """
        vectors = np.stack(data.iloc[:, 0].values)
        labels = data.iloc[:, 1].values

        # Show original distribution
        unique, counts = np.unique(labels, return_counts=True)
        print(f"Original dataset class distribution: {dict(zip(unique, counts))}")

        # Split into train/test
        X_train_raw, X_test, y_train_raw, y_test = train_test_split(
            vectors, labels, test_size=0.2, stratify=labels, random_state=random_state
        )

        # Show pre-balanced training distribution
        unique, counts = np.unique(y_train_raw, return_counts=True)
        print(f"Train split before balancing: {dict(zip(unique, counts))}")

        # Balance the training data by undersampling whichever class is larger.
        # A seeded generator keeps the selection reproducible, matching the seeded
        # splits; sampling the larger class (instead of always the negatives) also
        # works when the positives are the majority.
        rng = np.random.default_rng(random_state)
        pos_idxs = np.where(y_train_raw == 1)[0]
        neg_idxs = np.where(y_train_raw == 0)[0]
        n_per_class = min(len(pos_idxs), len(neg_idxs))
        if len(neg_idxs) > n_per_class:
            neg_idxs = rng.choice(neg_idxs, size=n_per_class, replace=False)
        if len(pos_idxs) > n_per_class:
            pos_idxs = rng.choice(pos_idxs, size=n_per_class, replace=False)
        balanced_idxs = np.concatenate([pos_idxs, neg_idxs])
        X_balanced = X_train_raw[balanced_idxs]
        y_balanced = y_train_raw[balanced_idxs]

        # Show post-balanced training distribution
        unique, counts = np.unique(y_balanced, return_counts=True)
        print(f"Balanced training class distribution: {dict(zip(unique, counts))}")

        # Split into final train/val
        X_train, X_val, y_train, y_val = train_test_split(
            X_balanced, y_balanced, test_size=0.2, stratify=y_balanced, random_state=random_state
        )

        print(f"Final training class distribution: {dict(zip(*np.unique(y_train, return_counts=True)))}")
        print(f"Validation class distribution: {dict(zip(*np.unique(y_val, return_counts=True)))}")

        self.X_train = X_train
        self.X_val = X_val
        self.X_test = X_test
        self.y_train = to_categorical(y_train)
        self.y_val = to_categorical(y_val)
        self.y_test = to_categorical(y_test)
        self.name = name

        # Set by run_optimization() / train_final_model().
        self.best_params = None
        self.model = None

        # Class weights from original training data (before balancing)
        # NOTE(review): these weights are applied while fitting on the BALANCED
        # data, so the minority class is favoured twice (undersampling + weight).
        # Confirm this is intended.
        classes = np.unique(y_train_raw)
        weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train_raw)
        self.class_weight = dict(zip(classes, weights))

    def build_model(self, trial):
        """
        Sample one hyperparameter configuration from `trial` and build its model.

        The sampled batch size and epoch count are not part of the model; they are
        stored as user attributes on the trial for `objective()`.

        Returns:
            The compiled Keras model.
        """
        params = {
            'lstm_units': trial.suggest_categorical('lstm_units', [192, 256]),
            'dense_units_1': trial.suggest_categorical('dense_units_1', [192, 256]),
            'dense_units_2': trial.suggest_categorical('dense_units_2', [192, 256]),
            'dropout_rate': trial.suggest_float('dropout_rate', 0.30, 0.38),
            'learning_rate': trial.suggest_float('learning_rate', 0.002, 0.0035, log=True),
            'activation': trial.suggest_categorical('activation', ['LeakyReLU']),
            'batch_size': trial.suggest_categorical('batch_size', [64, 128]),
            'epochs': trial.suggest_int('epochs', 8, 12),
        }

        trial.set_user_attr('batch_size', params['batch_size'])
        trial.set_user_attr('epochs', params['epochs'])

        # Single source of truth for the architecture.
        return self.build_model_with_params(params)

    def objective(self, trial):
        """
        Optuna objective: train one sampled configuration and return its accuracy
        on the validation set.

        Raises:
            optuna.TrialPruned: when the pruner asks to stop the trial.
        """
        model = self.build_model(trial)

        batch_size = trial.user_attrs['batch_size']
        epochs = trial.user_attrs['epochs']

        pruning_cb = TFKerasPruningCallback(trial, 'val_accuracy')
        early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

        model.fit(
            self.X_train,
            self.y_train,
            batch_size=batch_size,
            epochs=epochs,
            class_weight=self.class_weight,
            validation_data=(self.X_val, self.y_val),
            callbacks=[pruning_cb, early_stop],
            verbose=0
        )

        val_loss, val_accuracy = model.evaluate(self.X_val, self.y_val, verbose=0)

        if trial.should_prune():
            raise optuna.TrialPruned()

        return val_accuracy

    def run_optimization(self, n_trials=50, n_jobs=1):
        """
        Run the Optuna study, print the best trial and train the final model with
        its hyperparameters (stored in `self.best_params`).

        Args:
            n_trials: number of trials to run.
            n_jobs: number of parallel jobs passed to `study.optimize`.
        """
        study = optuna.create_study(direction='maximize', study_name=self.name)
        study.optimize(self.objective, n_trials=n_trials, n_jobs=n_jobs)

        print("Best trial:")
        trial = study.best_trial
        print(f"  Accuracy: {trial.value}")
        print("  Best hyperparameters:")
        for key, value in trial.params.items():
            print(f"    {key}: {value}")

        self.best_params = trial.params
        # train_final_model() builds a fresh model itself, so none is built here.
        self.train_final_model()

    def build_model_with_params(self, params):
        """
        Build and compile the network described by `params`.

        Architecture: Bidirectional(LSTM) -> [Dense -> activation -> Dropout] x 2
        -> Dense(2, softmax), optimised with Adamax on categorical cross-entropy.

        Args:
            params: mapping with the keys 'lstm_units', 'dense_units_1',
                'dense_units_2', 'dropout_rate', 'learning_rate' and 'activation'
                ('LeakyReLU' selects LeakyReLU, anything else ReLU). Other keys are
                ignored.

        Returns:
            The compiled Keras model.
        """
        model = Sequential()

        model.add(Bidirectional(LSTM(params['lstm_units']), input_shape=(self.X_train.shape[1], self.X_train.shape[2])))

        model.add(Dense(params['dense_units_1']))
        model.add(LeakyReLU() if params['activation'] == 'LeakyReLU' else ReLU())
        model.add(Dropout(params['dropout_rate']))

        model.add(Dense(params['dense_units_2']))
        model.add(LeakyReLU() if params['activation'] == 'LeakyReLU' else ReLU())
        model.add(Dropout(params['dropout_rate']))

        model.add(Dense(2, activation='softmax'))

        optimizer = Adamax(learning_rate=params['learning_rate'])
        model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

        return model

    def train_final_model(self):
        """
        Train a fresh model with `self.best_params` on train + validation data and
        save its weights to "<name>_best_model.weights.h5".

        Because the validation samples are part of the training data here, they
        are no longer held-out data for the resulting model.
        """
        batch_size = self.best_params['batch_size']
        epochs = self.best_params['epochs']

        # Combine train and validation sets
        X_full_train = np.concatenate([self.X_train, self.X_val], axis=0)
        y_full_train = np.concatenate([self.y_train, self.y_val], axis=0)

        # Rebuild model to reset weights
        self.model = self.build_model_with_params(self.best_params)

        early_stop = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)

        self.model.fit(
            X_full_train,
            y_full_train,
            batch_size=batch_size,
            epochs=epochs,
            class_weight=self.class_weight,
            validation_split=0.0,
            callbacks=[early_stop],
            verbose=1
        )

        self.model.save_weights(self.name + "_best_model.weights.h5")

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Return numerator / denominator, or 0.0 when the denominator is zero."""
        return numerator / denominator if denominator else 0.0

    def test(self):
        """
        Load the saved weights and print accuracy, false positive/negative rate,
        recall, precision and F1 score on the test set.

        Rates whose denominator is zero (e.g. no predicted positives) are reported
        as 0.0.
        """
        self.model.load_weights(self.name + "_best_model.weights.h5")

        values = self.model.evaluate(self.X_test, self.y_test, batch_size=self.best_params['batch_size'], verbose=1)
        print("Test Accuracy:", values[1])

        predictions = self.model.predict(self.X_test, batch_size=self.best_params['batch_size']).round()

        # labels=[0, 1] guarantees a 2x2 matrix even when only one class occurs,
        # so the four-way unpacking below cannot fail.
        tn, fp, fn, tp = confusion_matrix(
            np.argmax(self.y_test, axis=1),
            np.argmax(predictions, axis=1),
            labels=[0, 1]
        ).ravel()

        print('False positive rate:', self._safe_ratio(fp, fp + tn))
        print('False negative rate:', self._safe_ratio(fn, fn + tp))

        recall = self._safe_ratio(tp, tp + fn)
        precision = self._safe_ratio(tp, tp + fp)
        f1_score = self._safe_ratio(2 * precision * recall, precision + recall)

        print('True positive rate (Recall):', recall)
        print('Precision:', precision)
        print('F1 score:', f1_score)


In [10]:
# Prepare the train/validation/test splits from the vectorized gadgets; the dataset
# stem is reused as study name and as prefix of the saved weights file.
blstm = BLSTM(df,name=base)
# Search the hyperparameters (10 Optuna trials), then train and save the final model.
blstm.run_optimization(n_trials=10)
# Report the test-set metrics of the final model.
blstm.test()

Original dataset class distribution: {0: 14600, 1: 7285}
Train split before balancing: {0: 11680, 1: 5828}
Balanced training class distribution: {0: 5828, 1: 5828}
Final training class distribution: {0: 4662, 1: 4662}
Validation class distribution: {0: 1166, 1: 1166}


[I 2025-05-23 13:59:00,863] A new study created in memory with name: cwe399_cgd
[I 2025-05-23 14:00:37,002] Trial 0 finished with value: 0.9249570965766907 and parameters: {'lstm_units': 192, 'dense_units_1': 192, 'dense_units_2': 192, 'dropout_rate': 0.3039637249738407, 'learning_rate': 0.0029976023618403544, 'activation': 'LeakyReLU', 'batch_size': 128, 'epochs': 12}. Best is trial 0 with value: 0.9249570965766907.
[I 2025-05-23 14:02:17,695] Trial 1 finished with value: 0.9296740889549255 and parameters: {'lstm_units': 256, 'dense_units_1': 192, 'dense_units_2': 192, 'dropout_rate': 0.35211765305639636, 'learning_rate': 0.002652879992556034, 'activation': 'LeakyReLU', 'batch_size': 128, 'epochs': 12}. Best is trial 1 with value: 0.9296740889549255.
[I 2025-05-23 14:04:14,566] Trial 2 finished with value: 0.9275299906730652 and parameters: {'lstm_units': 256, 'dense_units_1': 256, 'dense_units_2': 192, 'dropout_rate': 0.3592788656688265, 'learning_rate': 0.003382437112364196, 'activa

Best trial:
  Accuracy: 0.9331046342849731
  Best hyperparameters:
    lstm_units: 256
    dense_units_1: 256
    dense_units_2: 256
    dropout_rate: 0.31948108235856754
    learning_rate: 0.0022742445991311092
    activation: LeakyReLU
    batch_size: 64
    epochs: 12
Epoch 1/12
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 63ms/step - accuracy: 0.6724 - loss: 0.5387
Epoch 2/12
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 61ms/step - accuracy: 0.8818 - loss: 0.2752
Epoch 3/12
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 64ms/step - accuracy: 0.8993 - loss: 0.2390
Epoch 4/12
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 66ms/step - accuracy: 0.9029 - loss: 0.2169
Epoch 5/12
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 66ms/step - accuracy: 0.9128 - loss: 0.1950
Epoch 6/12
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 64ms/step - accuracy: 0.9230 - loss: 0.1

## 6. THRESHOLD OPTIMIZATION

### Goal:
To find the best classification threshold for a binary classification model (e.g., vulnerable vs. not vulnerable) by:

- Predicting probabilities from a model.
- Testing different thresholds (from 0.0 to 1.0).
- Selecting the threshold that gives the highest F1 score.
- Returning that optimal threshold (the corresponding F1 score is printed, not returned).

In [11]:
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, accuracy_score
import numpy as np

def find_best_threshold_on_val(model, X_val, y_val, batch_size=128):
    """
    Search the decision threshold that maximises the F1 score on validation data.

    The probability predicted for class 1 is compared against every threshold in
    0.00, 0.01, ..., 1.00; precision, recall and F1 are printed for each one. The
    first threshold reaching the highest F1 wins; 0.5 is returned when no
    threshold achieves an F1 above 0.

    Args:
        model: object whose `predict(X, batch_size=...)` returns per-class
            probabilities with the positive class in column 1.
        X_val: validation inputs.
        y_val: one-hot encoded validation labels.
        batch_size: batch size used for prediction.

    Returns:
        The selected threshold.
    """
    print("\nSearching for optimal threshold using validation set...")

    positive_class_probs = model.predict(X_val, batch_size=batch_size)[:, 1]
    y_true = np.argmax(y_val, axis=1)

    best_threshold, best_f1 = 0.5, 0

    print("\nThreshold\tPrecision\tRecall\t\tF1 Score")
    for th in np.arange(0.0, 1.01, 0.01):
        preds = (positive_class_probs >= th).astype(int)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, preds, average='binary', zero_division=0
        )
        print(f"{th:.2f}\t\t{precision:.4f}\t\t{recall:.4f}\t\t{f1:.4f}")
        # Strict comparison: ties keep the earlier (lower) threshold.
        if f1 > best_f1:
            best_threshold, best_f1 = th, f1

    print(f"\nBest threshold on validation set: {best_threshold:.2f} (F1 = {best_f1:.4f})")
    return best_threshold


def evaluate_on_test(model, X_test, y_test, threshold, batch_size=128):
    """
    Evaluate the model on the test set with a fixed decision threshold.

    A sample is predicted positive when the probability of class 1 is greater than
    or equal to `threshold`. The metrics are printed and returned; every rate whose
    denominator is zero is reported as 0.

    Args:
        model: object whose `predict(X, batch_size=...)` returns per-class
            probabilities with the positive class in column 1.
        X_test: test inputs.
        y_test: one-hot encoded test labels.
        threshold: decision threshold for the positive class.
        batch_size: batch size used for prediction.

    Returns:
        dict with the keys 'threshold', 'accuracy', 'recall', 'precision', 'f1',
        'fpr', 'fnr', 'tp', 'fp', 'tn' and 'fn'.
    """
    print(f"\nEvaluating on test set using threshold {threshold:.2f}...")

    probas = model.predict(X_test, batch_size=batch_size)
    y_true = np.argmax(y_test, axis=1)
    preds = (probas[:, 1] >= threshold).astype(int)

    # labels=[0, 1] forces a 2x2 matrix. Without it, a test set in which labels and
    # predictions contain only one class yields a 1x1 matrix and the four-way
    # unpacking raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, preds, labels=[0, 1]).ravel()

    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    f1_score_final = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    accuracy = accuracy_score(y_true, preds)
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0

    print("\nFinal Test Evaluation:")
    print(f"Test Accuracy: {accuracy:.6f}")
    print(f"False positive rate: {fpr:.6f}")
    print(f"False negative rate: {fnr:.6f}")
    print(f"True positive rate (Recall): {recall:.6f}")
    print(f"Precision: {precision:.6f}")
    print(f"F1 score: {f1_score_final:.6f}")

    return {
        'threshold': threshold,
        'accuracy': accuracy,
        'recall': recall,
        'precision': precision,
        'f1': f1_score_final,
        'fpr': fpr,
        'fnr': fnr,
        'tp': tp,
        'fp': fp,
        'tn': tn,
        'fn': fn
    }

In [12]:
# Restore the weights written by BLSTM.train_final_model().
blstm.model.load_weights(blstm.name + "_best_model.weights.h5")

# Find the F1-optimal threshold on the validation set.
# NOTE(review): train_final_model() fits the final model on X_train and X_val
# combined, so the validation samples used here were seen during training and the
# selected threshold may be optimistic.
best_threshold = find_best_threshold_on_val(
    model=blstm.model,
    X_val=blstm.X_val,
    y_val=blstm.y_val,
    batch_size=blstm.best_params['batch_size']
)

# Evaluate the model on the held-out test set using that threshold.
test_metrics = evaluate_on_test(
    model=blstm.model,
    X_test=blstm.X_test,
    y_test=blstm.y_test,
    threshold=best_threshold,
    batch_size=blstm.best_params['batch_size']
)

# Print a compact summary of the metrics returned by evaluate_on_test().
print(f"\nFinal Test Evaluation with Threshold {test_metrics['threshold']:.2f}:")
print(f"F1 Score: {test_metrics['f1']:.4f}")
print(f"Precision: {test_metrics['precision']:.4f}")
print(f"Recall: {test_metrics['recall']:.4f}")
print(f"Accuracy: {test_metrics['accuracy']:.4f}")
print(f"FPR: {test_metrics['fpr']:.4f}")
print(f"FNR: {test_metrics['fnr']:.4f}")


Searching for optimal threshold using validation set...
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step

Threshold	Precision	Recall		F1 Score
0.00		0.5000		1.0000		0.6667
0.01		0.8041		1.0000		0.8914
0.02		0.8264		1.0000		0.9049
0.03		0.8382		1.0000		0.9120
0.04		0.8517		1.0000		0.9199
0.05		0.8567		1.0000		0.9228
0.06		0.8612		1.0000		0.9254
0.07		0.8650		1.0000		0.9276
0.08		0.8682		1.0000		0.9295
0.09		0.8708		1.0000		0.9309
0.10		0.8734		1.0000		0.9324
0.11		0.8741		1.0000		0.9328
0.12		0.8754		1.0000		0.9335
0.13		0.8767		1.0000		0.9343
0.14		0.8793		1.0000		0.9358
0.15		0.8792		0.9991		0.9354
0.16		0.8798		0.9983		0.9353
0.17		0.8818		0.9983		0.9364
0.18		0.8837		0.9966		0.9367
0.19		0.8891		0.9966		0.9397
0.20		0.8890		0.9957		0.9393
0.21		0.8888		0.9940		0.9385
0.22		0.8902		0.9940		0.9392
0.23		0.8915		0.9940		0.9400
0.24		0.8922		0.9940		0.9404
0.25		0.8922		0.9940		0.9404
0.26		0.8929		0.9940		0.9407
0.27		0.9087		0.9897		0.9475
0.28		0.9087		0.989