# IVUL: WITH CLASS BALANCING (CWE 119)

In [1]:
import os
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, precision_score, f1_score
from math import floor, sqrt

# ---------------------------------------
# 0. Reproducibility Setup
# ---------------------------------------
SEED = 41
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
tf.config.experimental.enable_op_determinism()

# ---------------------------------------
# 1. Data Loading + Preprocessing
# ---------------------------------------
def parse_file(filename):
    with open(filename, "r", encoding="utf8") as file:
        gadget = []
        label = None
        for line in file:
            stripped = line.strip()
            if not stripped:
                continue
            if '-' * 10 in stripped and gadget and label in [0, 1]:
                yield "\n".join(gadget), label
                gadget = []
                label = None
            elif stripped.split()[0].isdigit():
                if stripped.isdigit():
                    label_candidate = int(stripped)
                    if label_candidate in [0, 1]:
                        label = label_candidate
                else:
                    gadget.append(stripped)
            else:
                gadget.append(stripped)

def code_to_image(code_sample):
    byte_array = bytearray(code_sample, 'utf-8')
    flat = np.array(byte_array, dtype=np.uint8)
    size = floor(sqrt(len(flat)))
    cropped = flat[:size * size].reshape((size, size))
    padded = np.zeros((224, 224), dtype=np.uint8)
    h, w = cropped.shape
    padded[:h, :w] = cropped
    return padded

def load_balanced_data(filepath, seed=SEED):
    codes, labels = zip(*parse_file(filepath))
    labels = np.array(labels).astype(np.int32)

    # Show original distribution
    unique, counts = np.unique(labels, return_counts=True)
    print(f"Original dataset class distribution: {dict(zip(unique, counts))}")

    # Train-test split
    X_train_codes, X_test_codes, y_train_all, y_test = train_test_split(
        codes, labels, test_size=0.2, stratify=labels, random_state=seed
    )

    # Before balancing training
    unique_train_pre, count_train_pre = np.unique(y_train_all, return_counts=True)
    print(f"Train split before balancing: {dict(zip(unique_train_pre, count_train_pre))}")

    # Balance training set (undersample class 0)
    y_train_all = np.array(y_train_all)
    pos_idx = np.where(y_train_all == 1)[0]
    neg_idx = np.where(y_train_all == 0)[0]

    rng = np.random.default_rng(seed)
    neg_sample = rng.choice(neg_idx, len(pos_idx), replace=False)
    balanced_idx = np.concatenate([pos_idx, neg_sample])
    rng.shuffle(balanced_idx)

    # Balanced training set
    balanced_codes = [X_train_codes[i] for i in balanced_idx]
    y_train = y_train_all[balanced_idx]

    unique_train_post, count_train_post = np.unique(y_train, return_counts=True)
    print(f"Balanced training class distribution: {dict(zip(unique_train_post, count_train_post))}")

    # Convert to images
    X_train = np.array([code_to_image(code) for code in balanced_codes], dtype=np.float32) / 255.0
    X_train = np.expand_dims(X_train, -1)

    # Test set
    X_test = np.array([code_to_image(code) for code in X_test_codes], dtype=np.float32) / 255.0
    X_test = np.expand_dims(X_test, -1)
    y_test = np.array(y_test).astype(np.int32)

    # Final distributions
    print(f"Final training class distribution: {dict(zip(*np.unique(y_train, return_counts=True)))}")

    class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
    class_weight_dict = dict(zip(np.unique(y_train), class_weights))

    return (X_train, X_test, y_train, y_test), class_weight_dict

# ---------------------------------------
# 2. Model Training Using Given Parameters
# ---------------------------------------
def train_final_model(filepath):
    (X_train, X_test, y_train, y_test), class_weight = load_balanced_data(filepath)

    learning_rate = 0.001
    dropout = 0.4
    dense_units = 128
    batch_size = 32

    model = Sequential([
        tf.keras.Input(shape=(224, 224, 1)),
        Conv2D(32, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Conv2D(32, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Conv2D(64, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Dropout(dropout),
        Flatten(),
        Dense(dense_units, activation="relu"),
        Dense(2, activation="softmax")
    ])

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=["accuracy"]
    )

    print("Training for 15 epochs...")
    model.fit(
        X_train, y_train,
        batch_size=batch_size,
        epochs=15,
        class_weight=class_weight,
        verbose=1
    )

    print("\nEvaluating on test set...")
    loss, acc = model.evaluate(X_test, y_test)
    print(f"\nFinal Test Accuracy: {acc:.4f}")

    print("\nComputing additional metrics...")
    y_pred_probs = model.predict(X_test, batch_size=batch_size)
    y_pred = np.argmax(y_pred_probs, axis=1)

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    accuracy = (tp + tn) / (tp + tn + fp + fn)
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    precision = precision_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"Accuracy        : {accuracy:.4f}")
    print(f"True Pos Rate   : {tpr:.4f}")
    print(f"False Neg Rate  : {fnr:.4f}")
    print(f"False Pos Rate  : {fpr:.4f}")
    print(f"Precision       : {precision:.4f}")
    print(f"F1 Score        : {f1:.4f}")

# --- Run it ---
train_final_model("cwe119_cgd.txt")

Original dataset class distribution: {0: 29313, 1: 10440}
Train split before balancing: {0: 23450, 1: 8352}
Balanced training class distribution: {0: 8352, 1: 8352}
Final training class distribution: {0: 8352, 1: 8352}
Training for 15 epochs...
Epoch 1/15
[1m522/522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 191ms/step - accuracy: 0.6043 - loss: 0.6533
Epoch 2/15
[1m522/522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 189ms/step - accuracy: 0.6288 - loss: 0.6249
Epoch 3/15
[1m522/522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 188ms/step - accuracy: 0.7085 - loss: 0.5527
Epoch 4/15
[1m522/522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 187ms/step - accuracy: 0.7658 - loss: 0.4816
Epoch 5/15
[1m522/522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 189ms/step - accuracy: 0.7843 - loss: 0.4436
Epoch 6/15
[1m522/522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 188ms/step - accuracy: 0.8053 - loss: 0.4126
Epoch 7/15
[1m5

# IVUL: WITH CLASS BALANCING (CWE 399)

In [2]:
import os
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, precision_score, f1_score
from math import floor, sqrt

# ---------------------------------------
# 0. Reproducibility Setup
# ---------------------------------------
SEED = 41
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
tf.config.experimental.enable_op_determinism()

# ---------------------------------------
# 1. Data Loading + Preprocessing
# ---------------------------------------
def parse_file(filename):
    with open(filename, "r", encoding="utf8") as file:
        gadget = []
        label = None
        for line in file:
            stripped = line.strip()
            if not stripped:
                continue
            if '-' * 10 in stripped and gadget and label in [0, 1]:
                yield "\n".join(gadget), label
                gadget = []
                label = None
            elif stripped.split()[0].isdigit():
                if stripped.isdigit():
                    label_candidate = int(stripped)
                    if label_candidate in [0, 1]:
                        label = label_candidate
                else:
                    gadget.append(stripped)
            else:
                gadget.append(stripped)

def code_to_image(code_sample):
    byte_array = bytearray(code_sample, 'utf-8')
    flat = np.array(byte_array, dtype=np.uint8)
    size = floor(sqrt(len(flat)))
    cropped = flat[:size * size].reshape((size, size))
    padded = np.zeros((224, 224), dtype=np.uint8)
    h, w = cropped.shape
    padded[:h, :w] = cropped
    return padded

def load_balanced_data(filepath, seed=SEED):
    codes, labels = zip(*parse_file(filepath))
    labels = np.array(labels).astype(np.int32)

    # Show original distribution
    unique, counts = np.unique(labels, return_counts=True)
    print(f"Original dataset class distribution: {dict(zip(unique, counts))}")

    # Train-test split
    X_train_codes, X_test_codes, y_train_all, y_test = train_test_split(
        codes, labels, test_size=0.2, stratify=labels, random_state=seed
    )

    # Before balancing training
    unique_train_pre, count_train_pre = np.unique(y_train_all, return_counts=True)
    print(f"Train split before balancing: {dict(zip(unique_train_pre, count_train_pre))}")

    # Balance training set (undersample class 0)
    y_train_all = np.array(y_train_all)
    pos_idx = np.where(y_train_all == 1)[0]
    neg_idx = np.where(y_train_all == 0)[0]

    rng = np.random.default_rng(seed)
    neg_sample = rng.choice(neg_idx, len(pos_idx), replace=False)
    balanced_idx = np.concatenate([pos_idx, neg_sample])
    rng.shuffle(balanced_idx)

    # Balanced training set
    balanced_codes = [X_train_codes[i] for i in balanced_idx]
    y_train = y_train_all[balanced_idx]

    unique_train_post, count_train_post = np.unique(y_train, return_counts=True)
    print(f"Balanced training class distribution: {dict(zip(unique_train_post, count_train_post))}")

    # Convert to images
    X_train = np.array([code_to_image(code) for code in balanced_codes], dtype=np.float32) / 255.0
    X_train = np.expand_dims(X_train, -1)

    # Test set
    X_test = np.array([code_to_image(code) for code in X_test_codes], dtype=np.float32) / 255.0
    X_test = np.expand_dims(X_test, -1)
    y_test = np.array(y_test).astype(np.int32)

    # Final distributions
    print(f"Final training class distribution: {dict(zip(*np.unique(y_train, return_counts=True)))}")

    class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
    class_weight_dict = dict(zip(np.unique(y_train), class_weights))

    return (X_train, X_test, y_train, y_test), class_weight_dict

# ---------------------------------------
# 2. Model Training Using Given Parameters
# ---------------------------------------
def train_final_model(filepath):
    (X_train, X_test, y_train, y_test), class_weight = load_balanced_data(filepath)

    learning_rate = 0.001
    dropout = 0.4
    dense_units = 128
    batch_size = 32

    model = Sequential([
        tf.keras.Input(shape=(224, 224, 1)),
        Conv2D(32, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Conv2D(32, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Conv2D(64, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Dropout(dropout),
        Flatten(),
        Dense(dense_units, activation="relu"),
        Dense(2, activation="softmax")
    ])

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=["accuracy"]
    )

    print("Training for 15 epochs...")
    model.fit(
        X_train, y_train,
        batch_size=batch_size,
        epochs=15,
        class_weight=class_weight,
        verbose=1
    )

    print("\nEvaluating on test set...")
    loss, acc = model.evaluate(X_test, y_test)
    print(f"\nFinal Test Accuracy: {acc:.4f}")

    print("\nComputing additional metrics...")
    y_pred_probs = model.predict(X_test, batch_size=batch_size)
    y_pred = np.argmax(y_pred_probs, axis=1)

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    accuracy = (tp + tn) / (tp + tn + fp + fn)
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    precision = precision_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"Accuracy        : {accuracy:.4f}")
    print(f"True Pos Rate   : {tpr:.4f}")
    print(f"False Neg Rate  : {fnr:.4f}")
    print(f"False Pos Rate  : {fpr:.4f}")
    print(f"Precision       : {precision:.4f}")
    print(f"F1 Score        : {f1:.4f}")

# --- Run it ---
train_final_model("cwe399_cgd.txt")

Original dataset class distribution: {0: 14600, 1: 7285}
Train split before balancing: {0: 11680, 1: 5828}
Balanced training class distribution: {0: 5828, 1: 5828}
Final training class distribution: {0: 5828, 1: 5828}
Training for 15 epochs...
Epoch 1/15
[1m365/365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 189ms/step - accuracy: 0.5515 - loss: 0.6793
Epoch 2/15
[1m365/365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 187ms/step - accuracy: 0.6657 - loss: 0.5915
Epoch 3/15
[1m365/365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 187ms/step - accuracy: 0.7792 - loss: 0.4306
Epoch 4/15
[1m365/365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 187ms/step - accuracy: 0.8227 - loss: 0.3561
Epoch 5/15
[1m365/365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 189ms/step - accuracy: 0.8496 - loss: 0.3143
Epoch 6/15
[1m365/365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 187ms/step - accuracy: 0.8671 - loss: 0.2842
Epoch 7/15
[1m365

# HYPERPARAMETER TUNING (CWE 119)

In [2]:
import os
import random
import numpy as np
from joblib import Parallel, delayed
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, precision_score, f1_score
from math import floor, sqrt
import optuna

SEED = 41
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
tf.config.experimental.enable_op_determinism()

TARGET_SIZE = 32

def compute_mean_coverage(filepath, target_size):
    lengths = []
    with open(filepath, "r", encoding="utf8") as file:
        gadget = []
        label = None
        for line in file:
            stripped = line.strip()
            if not stripped:
                continue
            if '-' * 10 in stripped:
                if gadget and label in [0, 1]:
                    byte_array = bytearray("\n".join(gadget), 'utf-8')
                    lengths.append(len(byte_array))
                gadget = []
                label = None
            elif stripped.split()[0].isdigit():
                if stripped.isdigit():
                    label_candidate = int(stripped)
                    if label_candidate in [0, 1]:
                        label = label_candidate
                    else:
                        gadget = []
                        label = None
                else:
                    gadget.append(stripped)
            else:
                gadget.append(stripped)

    max_pixels = target_size * target_size
    coverage = [min(floor(sqrt(l))**2, max_pixels) / l for l in lengths if l > 0]
    mean_coverage = np.mean(coverage)
    print(f"\nMean fraction of data preserved with TARGET_SIZE={target_size}: {mean_coverage:.4f}")
    return mean_coverage

def parse_file(filename):
    with open(filename, "r", encoding="utf8") as file:
        gadget = []
        label = None
        for line in file:
            stripped = line.strip()
            if not stripped:
                continue
            if '-' * 10 in stripped:
                if gadget and label in [0, 1]:
                    yield "\n".join(gadget), label
                gadget = []
                label = None
            elif stripped.split()[0].isdigit():
                if stripped.isdigit():
                    label_candidate = int(stripped)
                    if label_candidate in [0, 1]:
                        label = label_candidate
                    else:
                        gadget = []
                        label = None
                else:
                    gadget.append(stripped)
            else:
                gadget.append(stripped)

def code_to_image(code_sample):
    byte_array = bytearray(code_sample, 'utf-8')
    flat = np.array(byte_array, dtype=np.uint8)
    size = floor(sqrt(len(flat)))
    cropped = flat[:size * size].reshape((size, size))
    padded = np.zeros((TARGET_SIZE, TARGET_SIZE), dtype=np.uint8)
    h, w = cropped.shape
    h = min(h, TARGET_SIZE)
    w = min(w, TARGET_SIZE)
    padded[:h, :w] = cropped[:h, :w]
    return padded

def load_balanced_data(filepath, seed=SEED):
    codes, labels = zip(*parse_file(filepath))
    labels = np.array(labels).astype(np.int32)

    codes_trainval, codes_test, y_trainval_raw, y_test = train_test_split(
        codes, labels, test_size=0.2, stratify=labels, random_state=seed
    )

    codes_train, codes_val, y_train_raw, y_val = train_test_split(
        codes_trainval, y_trainval_raw, test_size=0.125, stratify=y_trainval_raw, random_state=seed
    )

    y_train_raw = np.array(y_train_raw)
    pos_idx = np.where(y_train_raw == 1)[0]
    neg_idx = np.where(y_train_raw == 0)[0]

    rng = np.random.default_rng(seed)
    neg_sample = rng.choice(neg_idx, len(pos_idx), replace=False)
    balanced_idx = np.concatenate([pos_idx, neg_sample])
    rng.shuffle(balanced_idx)

    balanced_codes = [codes_train[i] for i in balanced_idx]
    y_train = y_train_raw[balanced_idx]

    X_train = Parallel(n_jobs=-1)(delayed(code_to_image)(code) for code in balanced_codes)
    X_val = Parallel(n_jobs=-1)(delayed(code_to_image)(code) for code in codes_val)
    X_test = Parallel(n_jobs=-1)(delayed(code_to_image)(code) for code in codes_test)

    X_train = np.expand_dims(np.stack(X_train).astype(np.float32) / 255.0, -1)
    X_val = np.expand_dims(np.stack(X_val).astype(np.float32) / 255.0, -1)
    X_test = np.expand_dims(np.stack(X_test).astype(np.float32) / 255.0, -1)

    y_val = np.array(y_val).astype(np.int32)
    y_test = np.array(y_test).astype(np.int32)

    class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
    class_weight_dict = dict(zip(np.unique(y_train), class_weights))

    return (X_train, X_val, X_test, y_train, y_val, y_test), class_weight_dict

def objective(trial):
    try:
        (X_train, X_val, _, y_train, y_val, _), class_weight = load_balanced_data("cwe119_cgd.txt")
    except ValueError as e:
        raise optuna.exceptions.TrialPruned(str(e))

    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-3, log=True)
    dropout = trial.suggest_float("dropout", 0.2, 0.5)
    dense_units = trial.suggest_categorical("dense_units", [64, 128, 256, 512])
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 128])

    train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(batch_size).prefetch(tf.data.AUTOTUNE)

    model = Sequential([
        tf.keras.Input(shape=(TARGET_SIZE, TARGET_SIZE, 1)),
        Conv2D(32, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Conv2D(32, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Conv2D(64, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Dropout(dropout),
        Flatten(),
        Dense(dense_units, activation="relu"),
        Dense(2, activation="softmax")
    ])

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=["accuracy"]
    )

    model.fit(train_ds, validation_data=val_ds, epochs=10, class_weight=class_weight, verbose=0)
    y_val_probs = model.predict(X_val, batch_size=batch_size, verbose=0)
    y_val_pred = np.argmax(y_val_probs, axis=1)

    return np.mean(y_val_pred == y_val)  

def train_best_model(best_params):
    (X_trainval, _, X_test, y_trainval, _, y_test), class_weight = load_balanced_data("cwe119_cgd.txt")

    batch_size = best_params["batch_size"]
    trainval_ds = tf.data.Dataset.from_tensor_slices((X_trainval, y_trainval)).shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size).prefetch(tf.data.AUTOTUNE)

    model = Sequential([
        tf.keras.Input(shape=(TARGET_SIZE, TARGET_SIZE, 1)),
        Conv2D(32, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Conv2D(32, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Conv2D(64, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Dropout(best_params["dropout"]),
        Flatten(),
        Dense(best_params["dense_units"], activation="relu"),
        Dense(2, activation="softmax")
    ])

    model.compile(
        optimizer=Adam(learning_rate=best_params["learning_rate"]),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=["accuracy"]
    )

    print("\nTraining final model on train + val...")
    model.fit(trainval_ds, epochs=20, class_weight=class_weight, verbose=1)

    print("\nEvaluating on test set...")
    y_pred_probs = model.predict(X_test, batch_size=batch_size, verbose=0)
    y_pred = np.argmax(y_pred_probs, axis=1)

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    precision = precision_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"\nAccuracy        : {accuracy:.4f}")
    print(f"True Pos Rate   : {tpr:.4f}")
    print(f"False Neg Rate  : {fnr:.4f}")
    print(f"False Pos Rate  : {fpr:.4f}")
    print(f"Precision       : {precision:.4f}")
    print(f"F1 Score        : {f1:.4f}")

def run_optuna():
    compute_mean_coverage("cwe119_cgd.txt", TARGET_SIZE)

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=20)

    print("\nBest Trial:")
    best = study.best_trial
    print(f"Accuracy: {best.value:.4f}")
    for k, v in best.params.items():
        print(f"  {k}: {v}")

    train_best_model(best.params)

run_optuna()

[I 2025-05-23 23:06:02,979] A new study created in memory with name: no-name-a5da2157-5aed-4958-9136-15ce0f950a2e



Mean fraction of data preserved with TARGET_SIZE=32: 0.9353


[I 2025-05-23 23:06:28,221] Trial 0 finished with value: 0.5404929577464789 and parameters: {'learning_rate': 1.911834782609028e-05, 'dropout': 0.4415879914992911, 'dense_units': 64, 'batch_size': 64}. Best is trial 0 with value: 0.5404929577464789.
[I 2025-05-23 23:06:50,459] Trial 1 finished with value: 0.596327967806841 and parameters: {'learning_rate': 4.729700700417656e-05, 'dropout': 0.3798021638730412, 'dense_units': 256, 'batch_size': 128}. Best is trial 1 with value: 0.596327967806841.
[I 2025-05-23 23:07:20,370] Trial 2 finished with value: 0.5404929577464789 and parameters: {'learning_rate': 2.145725433638952e-05, 'dropout': 0.4709343571294727, 'dense_units': 64, 'batch_size': 32}. Best is trial 1 with value: 0.596327967806841.
[I 2025-05-23 23:08:02,175] Trial 3 finished with value: 0.7376760563380281 and parameters: {'learning_rate': 0.0029901502741362805, 'dropout': 0.36190349195732446, 'dense_units': 64, 'batch_size': 16}. Best is trial 3 with value: 0.7376760563380281.



Best Trial:
Accuracy: 0.7638
  learning_rate: 0.00041314272442970695
  dropout: 0.32317539094635517
  dense_units: 512
  batch_size: 16

Training final model on train + val...
Epoch 1/20
[1m914/914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.5972 - loss: 0.6558
Epoch 2/20
[1m914/914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.6458 - loss: 0.6155
Epoch 3/20
[1m914/914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.7179 - loss: 0.5451
Epoch 4/20
[1m914/914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.7662 - loss: 0.4762
Epoch 5/20
[1m914/914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.7965 - loss: 0.4355
Epoch 6/20
[1m914/914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.8015 - loss: 0.4098
Epoch 7/20
[1m914/914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.815

# HYPERPARAMETER TUNING (CWE 399)

In [12]:
import os
import random
import numpy as np
from joblib import Parallel, delayed
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, precision_score, f1_score
from math import floor, sqrt
import optuna

SEED = 41
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
tf.config.experimental.enable_op_determinism()

TARGET_SIZE = 32

def compute_mean_coverage(filepath, target_size):
    lengths = []
    with open(filepath, "r", encoding="utf8") as file:
        gadget = []
        label = None
        for line in file:
            stripped = line.strip()
            if not stripped:
                continue
            if '-' * 10 in stripped:
                if gadget and label in [0, 1]:
                    byte_array = bytearray("\n".join(gadget), 'utf-8')
                    lengths.append(len(byte_array))
                gadget = []
                label = None
            elif stripped.split()[0].isdigit():
                if stripped.isdigit():
                    label_candidate = int(stripped)
                    if label_candidate in [0, 1]:
                        label = label_candidate
                    else:
                        gadget = []
                        label = None
                else:
                    gadget.append(stripped)
            else:
                gadget.append(stripped)

    max_pixels = target_size * target_size
    coverage = [min(floor(sqrt(l))**2, max_pixels) / l for l in lengths if l > 0]
    mean_coverage = np.mean(coverage)
    print(f"\nMean fraction of data preserved with TARGET_SIZE={target_size}: {mean_coverage:.4f}")
    return mean_coverage

def parse_file(filename):
    with open(filename, "r", encoding="utf8") as file:
        gadget = []
        label = None
        for line in file:
            stripped = line.strip()
            if not stripped:
                continue
            if '-' * 10 in stripped:
                if gadget and label in [0, 1]:
                    yield "\n".join(gadget), label
                gadget = []
                label = None
            elif stripped.split()[0].isdigit():
                if stripped.isdigit():
                    label_candidate = int(stripped)
                    if label_candidate in [0, 1]:
                        label = label_candidate
                    else:
                        gadget = []
                        label = None
                else:
                    gadget.append(stripped)
            else:
                gadget.append(stripped)

def code_to_image(code_sample):
    byte_array = bytearray(code_sample, 'utf-8')
    flat = np.array(byte_array, dtype=np.uint8)
    size = floor(sqrt(len(flat)))
    cropped = flat[:size * size].reshape((size, size))
    padded = np.zeros((TARGET_SIZE, TARGET_SIZE), dtype=np.uint8)
    h, w = cropped.shape
    h = min(h, TARGET_SIZE)
    w = min(w, TARGET_SIZE)
    padded[:h, :w] = cropped[:h, :w]
    return padded

def load_balanced_data(filepath, seed=SEED):
    codes, labels = zip(*parse_file(filepath))
    labels = np.array(labels).astype(np.int32)

    codes_trainval, codes_test, y_trainval_raw, y_test = train_test_split(
        codes, labels, test_size=0.2, stratify=labels, random_state=seed
    )

    codes_train, codes_val, y_train_raw, y_val = train_test_split(
        codes_trainval, y_trainval_raw, test_size=0.125, stratify=y_trainval_raw, random_state=seed
    )

    y_train_raw = np.array(y_train_raw)
    pos_idx = np.where(y_train_raw == 1)[0]
    neg_idx = np.where(y_train_raw == 0)[0]

    rng = np.random.default_rng(seed)
    neg_sample = rng.choice(neg_idx, len(pos_idx), replace=False)
    balanced_idx = np.concatenate([pos_idx, neg_sample])
    rng.shuffle(balanced_idx)

    balanced_codes = [codes_train[i] for i in balanced_idx]
    y_train = y_train_raw[balanced_idx]

    X_train = Parallel(n_jobs=-1)(delayed(code_to_image)(code) for code in balanced_codes)
    X_val = Parallel(n_jobs=-1)(delayed(code_to_image)(code) for code in codes_val)
    X_test = Parallel(n_jobs=-1)(delayed(code_to_image)(code) for code in codes_test)

    X_train = np.expand_dims(np.stack(X_train).astype(np.float32) / 255.0, -1)
    X_val = np.expand_dims(np.stack(X_val).astype(np.float32) / 255.0, -1)
    X_test = np.expand_dims(np.stack(X_test).astype(np.float32) / 255.0, -1)

    y_val = np.array(y_val).astype(np.int32)
    y_test = np.array(y_test).astype(np.int32)

    class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
    class_weight_dict = dict(zip(np.unique(y_train), class_weights))

    return (X_train, X_val, X_test, y_train, y_val, y_test), class_weight_dict

def objective(trial):
    try:
        (X_train, X_val, _, y_train, y_val, _), class_weight = load_balanced_data("cwe399_cgd.txt")
    except ValueError as e:
        raise optuna.exceptions.TrialPruned(str(e))

    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-3, log=True)
    dropout = trial.suggest_float("dropout", 0.2, 0.5)
    dense_units = trial.suggest_categorical("dense_units", [64, 128, 256, 512])
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 128])

    train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(batch_size).prefetch(tf.data.AUTOTUNE)

    model = Sequential([
        tf.keras.Input(shape=(TARGET_SIZE, TARGET_SIZE, 1)),
        Conv2D(32, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Conv2D(32, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Conv2D(64, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Dropout(dropout),
        Flatten(),
        Dense(dense_units, activation="relu"),
        Dense(2, activation="softmax")
    ])

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=["accuracy"]
    )

    model.fit(train_ds, validation_data=val_ds, epochs=10, class_weight=class_weight, verbose=0)
    y_val_probs = model.predict(X_val, batch_size=batch_size, verbose=0)
    y_val_pred = np.argmax(y_val_probs, axis=1)

    return np.mean(y_val_pred == y_val)  

def train_best_model(best_params):
    (X_trainval, _, X_test, y_trainval, _, y_test), class_weight = load_balanced_data("cwe399_cgd.txt")

    batch_size = best_params["batch_size"]
    trainval_ds = tf.data.Dataset.from_tensor_slices((X_trainval, y_trainval)).shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size).prefetch(tf.data.AUTOTUNE)

    model = Sequential([
        tf.keras.Input(shape=(TARGET_SIZE, TARGET_SIZE, 1)),
        Conv2D(32, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Conv2D(32, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Conv2D(64, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Dropout(best_params["dropout"]),
        Flatten(),
        Dense(best_params["dense_units"], activation="relu"),
        Dense(2, activation="softmax")
    ])

    model.compile(
        optimizer=Adam(learning_rate=best_params["learning_rate"]),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=["accuracy"]
    )

    print("\nTraining final model on train + val...")
    model.fit(trainval_ds, epochs=20, class_weight=class_weight, verbose=1)

    print("\nEvaluating on test set...")
    y_pred_probs = model.predict(X_test, batch_size=batch_size, verbose=0)
    y_pred = np.argmax(y_pred_probs, axis=1)

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    precision = precision_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"\nAccuracy        : {accuracy:.4f}")
    print(f"True Pos Rate   : {tpr:.4f}")
    print(f"False Neg Rate  : {fnr:.4f}")
    print(f"False Pos Rate  : {fpr:.4f}")
    print(f"Precision       : {precision:.4f}")
    print(f"F1 Score        : {f1:.4f}")

def run_optuna():
    compute_mean_coverage("cwe399_cgd.txt", TARGET_SIZE)

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=20)

    print("\nBest Trial:")
    best = study.best_trial
    print(f"Accuracy: {best.value:.4f}")
    for k, v in best.params.items():
        print(f"  {k}: {v}")

    train_best_model(best.params)

run_optuna()

[I 2025-05-23 21:53:55,704] A new study created in memory with name: no-name-4298b01b-de8b-45a3-aa8e-59c084f261e3



Mean fraction of data preserved with TARGET_SIZE=32: 0.9353


[I 2025-05-23 21:54:13,617] Trial 0 finished with value: 0.6368204659661946 and parameters: {'learning_rate': 4.970793410464982e-05, 'dropout': 0.35074186441874666, 'dense_units': 512, 'batch_size': 128}. Best is trial 0 with value: 0.6368204659661946.
[I 2025-05-23 21:54:37,349] Trial 1 finished with value: 0.6391046139789859 and parameters: {'learning_rate': 2.0580541725067192e-05, 'dropout': 0.24182785242427024, 'dense_units': 64, 'batch_size': 32}. Best is trial 1 with value: 0.6391046139789859.
[I 2025-05-23 21:54:53,651] Trial 2 finished with value: 0.8634079488350845 and parameters: {'learning_rate': 0.0028927308577403783, 'dropout': 0.47919020135162466, 'dense_units': 256, 'batch_size': 128}. Best is trial 2 with value: 0.8634079488350845.
[I 2025-05-23 21:55:10,088] Trial 3 finished with value: 0.6057560529922339 and parameters: {'learning_rate': 1.9123017376983858e-05, 'dropout': 0.35086921361875245, 'dense_units': 128, 'batch_size': 128}. Best is trial 2 with value: 0.863407


Best Trial:
Accuracy: 0.8890
  learning_rate: 0.000292664961892896
  dropout: 0.294643590864854
  dense_units: 256
  batch_size: 16

Training final model on train + val...
Epoch 1/20
[1m638/638[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.5654 - loss: 0.6748
Epoch 2/20
[1m638/638[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.6693 - loss: 0.5983
Epoch 3/20
[1m638/638[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.7642 - loss: 0.4707
Epoch 4/20
[1m638/638[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8151 - loss: 0.3845
Epoch 5/20
[1m638/638[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8491 - loss: 0.3399
Epoch 6/20
[1m638/638[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8687 - loss: 0.2929
Epoch 7/20
[1m638/638[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.8817 - 

# BEST MODEL (CWE 119)

In [3]:
import os
import random
import numpy as np
from joblib import Parallel, delayed
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from math import floor, sqrt

# ---------------------------------------
# 0. Reproducibility Setup
# ---------------------------------------
SEED = 41
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
tf.config.experimental.enable_op_determinism()

# ---------------------------------------
# 1. Preprocessing 
# ---------------------------------------
TARGET_SIZE = 32  

def parse_file(filename):
    with open(filename, "r", encoding="utf8") as file:
        gadget = []
        label = None
        for line in file:
            stripped = line.strip()
            if not stripped:
                continue
            if '-' * 10 in stripped:
                if gadget and label in [0, 1]:
                    yield "\n".join(gadget), label
                gadget = []
                label = None
            elif stripped.split()[0].isdigit():
                if stripped.isdigit():
                    label_candidate = int(stripped)
                    if label_candidate in [0, 1]:
                        label = label_candidate
                    else:
                        gadget = []
                        label = None
                else:
                    gadget.append(stripped)
            else:
                gadget.append(stripped)

def code_to_image(code_sample):
    byte_array = bytearray(code_sample, 'utf-8')
    flat = np.array(byte_array, dtype=np.uint8)
    size = floor(sqrt(len(flat)))
    cropped = flat[:size * size].reshape((size, size))

    padded = np.zeros((TARGET_SIZE, TARGET_SIZE), dtype=np.uint8)
    h, w = cropped.shape
    h = min(h, TARGET_SIZE)
    w = min(w, TARGET_SIZE)
    padded[:h, :w] = cropped[:h, :w]

    return padded

# ---------------------------------------
# 2. Data Loader with Fixed Leakage
# ---------------------------------------
def load_balanced_data(filepath, seed=SEED):
    codes, labels = zip(*parse_file(filepath))
    labels = np.array(labels).astype(np.int32)

    # Step 1: Split first
    codes_train, codes_test, y_train_raw, y_test = train_test_split(
        codes, labels, test_size=0.2, stratify=labels, random_state=seed
    )

    # Step 2: Balance training data only
    y_train_raw = np.array(y_train_raw)
    pos_idx = np.where(y_train_raw == 1)[0]
    neg_idx = np.where(y_train_raw == 0)[0]

    rng = np.random.default_rng(seed)
    neg_sample = rng.choice(neg_idx, len(pos_idx), replace=False)
    balanced_idx = np.concatenate([pos_idx, neg_sample])
    rng.shuffle(balanced_idx)

    balanced_codes = [codes_train[i] for i in balanced_idx]
    y_train = y_train_raw[balanced_idx]

    # Step 3: Convert to image arrays
    X_train = Parallel(n_jobs=-1)(delayed(code_to_image)(code) for code in balanced_codes)
    X_test = Parallel(n_jobs=-1)(delayed(code_to_image)(code) for code in codes_test)

    X_train = np.expand_dims(np.stack(X_train).astype(np.float32) / 255.0, -1)
    X_test = np.expand_dims(np.stack(X_test).astype(np.float32) / 255.0, -1)
    y_test = np.array(y_test).astype(np.int32)

    class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
    class_weight_dict = dict(zip(np.unique(y_train), class_weights))

    return (X_train, X_test, y_train, y_test), class_weight_dict

# ---------------------------------------
# 3. Training Function Using tf.data
# ---------------------------------------
def train_final_model(filepath):
    (X_train, X_test, y_train, y_test), class_weight = load_balanced_data(filepath)

    learning_rate = 0.00041314272442970695
    dropout = 0.32317539094635517
    dense_units = 512
    batch_size = 16

    train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size).prefetch(tf.data.AUTOTUNE)

    model = Sequential([
        tf.keras.Input(shape=(TARGET_SIZE, TARGET_SIZE, 1)),
        Conv2D(32, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Conv2D(32, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Conv2D(64, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Dropout(dropout),
        Flatten(),
        Dense(dense_units, activation="relu"),
        Dense(2, activation="softmax")
    ])

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=["accuracy"]
    )

    print("Training for 20 epochs...")
    model.fit(train_ds, epochs=20, class_weight=class_weight, verbose=1)

    print("\nEvaluating on test set...")
    loss, acc = model.evaluate(test_ds)
    print(f"\nFinal Test Accuracy: {acc:.4f}")

    print("\nComputing additional metrics...")
    y_pred_probs = model.predict(X_test, batch_size=batch_size)
    y_pred = np.argmax(y_pred_probs, axis=1)

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    precision = precision_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"Accuracy        : {accuracy:.4f}")
    print(f"True Pos Rate   : {tpr:.4f}")
    print(f"False Neg Rate  : {fnr:.4f}")
    print(f"False Pos Rate  : {fpr:.4f}")
    print(f"Precision       : {precision:.4f}")
    print(f"F1 Score        : {f1:.4f}")

# --- Run the model ---
train_final_model("cwe119_cgd.txt")

Training for 20 epochs...
Epoch 1/20
[1m1044/1044[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.6018 - loss: 0.6519
Epoch 2/20
[1m1044/1044[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - accuracy: 0.6703 - loss: 0.5951
Epoch 3/20
[1m1044/1044[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.7542 - loss: 0.4983
Epoch 4/20
[1m1044/1044[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - accuracy: 0.7828 - loss: 0.4491
Epoch 5/20
[1m1044/1044[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - accuracy: 0.7962 - loss: 0.4218
Epoch 6/20
[1m1044/1044[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.8133 - loss: 0.3909
Epoch 7/20
[1m1044/1044[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.8217 - loss: 0.3702
Epoch 8/20
[1m1044/1044[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.8342 - loss: 0.353

# BEST MODEL (CWE 399)

In [17]:
import os
import random
import numpy as np
from joblib import Parallel, delayed
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from math import floor, sqrt

# ---------------------------------------
# 0. Reproducibility Setup
# ---------------------------------------
SEED = 41
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
tf.config.experimental.enable_op_determinism()

# ---------------------------------------
# 1. Preprocessing 
# ---------------------------------------
TARGET_SIZE = 32  

def parse_file(filename):
    with open(filename, "r", encoding="utf8") as file:
        gadget = []
        label = None
        for line in file:
            stripped = line.strip()
            if not stripped:
                continue
            if '-' * 10 in stripped:
                if gadget and label in [0, 1]:
                    yield "\n".join(gadget), label
                gadget = []
                label = None
            elif stripped.split()[0].isdigit():
                if stripped.isdigit():
                    label_candidate = int(stripped)
                    if label_candidate in [0, 1]:
                        label = label_candidate
                    else:
                        gadget = []
                        label = None
                else:
                    gadget.append(stripped)
            else:
                gadget.append(stripped)

def code_to_image(code_sample):
    byte_array = bytearray(code_sample, 'utf-8')
    flat = np.array(byte_array, dtype=np.uint8)
    size = floor(sqrt(len(flat)))
    cropped = flat[:size * size].reshape((size, size))

    padded = np.zeros((TARGET_SIZE, TARGET_SIZE), dtype=np.uint8)
    h, w = cropped.shape
    h = min(h, TARGET_SIZE)
    w = min(w, TARGET_SIZE)
    padded[:h, :w] = cropped[:h, :w]

    return padded

# ---------------------------------------
# 2. Data Loader with Fixed Leakage
# ---------------------------------------
def load_balanced_data(filepath, seed=SEED):
    codes, labels = zip(*parse_file(filepath))
    labels = np.array(labels).astype(np.int32)

    # Step 1: Split first
    codes_train, codes_test, y_train_raw, y_test = train_test_split(
        codes, labels, test_size=0.2, stratify=labels, random_state=seed
    )

    # Step 2: Balance training data only
    y_train_raw = np.array(y_train_raw)
    pos_idx = np.where(y_train_raw == 1)[0]
    neg_idx = np.where(y_train_raw == 0)[0]

    rng = np.random.default_rng(seed)
    neg_sample = rng.choice(neg_idx, len(pos_idx), replace=False)
    balanced_idx = np.concatenate([pos_idx, neg_sample])
    rng.shuffle(balanced_idx)

    balanced_codes = [codes_train[i] for i in balanced_idx]
    y_train = y_train_raw[balanced_idx]

    # Step 3: Convert to image arrays
    X_train = Parallel(n_jobs=-1)(delayed(code_to_image)(code) for code in balanced_codes)
    X_test = Parallel(n_jobs=-1)(delayed(code_to_image)(code) for code in codes_test)

    X_train = np.expand_dims(np.stack(X_train).astype(np.float32) / 255.0, -1)
    X_test = np.expand_dims(np.stack(X_test).astype(np.float32) / 255.0, -1)
    y_test = np.array(y_test).astype(np.int32)

    class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
    class_weight_dict = dict(zip(np.unique(y_train), class_weights))

    return (X_train, X_test, y_train, y_test), class_weight_dict

# ---------------------------------------
# 3. Training Function Using tf.data
# ---------------------------------------
def train_final_model(filepath):
    (X_train, X_test, y_train, y_test), class_weight = load_balanced_data(filepath)

    learning_rate = 0.000292664961892896
    dropout = 0.294643590864854
    dense_units = 256
    batch_size = 16

    train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size).prefetch(tf.data.AUTOTUNE)

    model = Sequential([
        tf.keras.Input(shape=(TARGET_SIZE, TARGET_SIZE, 1)),
        Conv2D(32, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Conv2D(32, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Conv2D(64, 3, padding="same", activation="relu"),
        MaxPool2D(),
        Dropout(dropout),
        Flatten(),
        Dense(dense_units, activation="relu"),
        Dense(2, activation="softmax")
    ])

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=["accuracy"]
    )

    print("Training for 20 epochs...")
    model.fit(train_ds, epochs=20, class_weight=class_weight, verbose=1)

    print("\nEvaluating on test set...")
    loss, acc = model.evaluate(test_ds)
    print(f"\nFinal Test Accuracy: {acc:.4f}")

    print("\nComputing additional metrics...")
    y_pred_probs = model.predict(X_test, batch_size=batch_size)
    y_pred = np.argmax(y_pred_probs, axis=1)

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    precision = precision_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"Accuracy        : {accuracy:.4f}")
    print(f"True Pos Rate   : {tpr:.4f}")
    print(f"False Neg Rate  : {fnr:.4f}")
    print(f"False Pos Rate  : {fpr:.4f}")
    print(f"Precision       : {precision:.4f}")
    print(f"F1 Score        : {f1:.4f}")

# --- Run the model ---
train_final_model("cwe399_cgd.txt")

Training for 20 epochs...
Epoch 1/20
[1m729/729[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.5648 - loss: 0.6747
Epoch 2/20
[1m729/729[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.7012 - loss: 0.5513
Epoch 3/20
[1m729/729[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.7876 - loss: 0.4264
Epoch 4/20
[1m729/729[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.8244 - loss: 0.3664
Epoch 5/20
[1m729/729[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.8497 - loss: 0.3288
Epoch 6/20
[1m729/729[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.8673 - loss: 0.2928
Epoch 7/20
[1m729/729[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.8755 - loss: 0.2699
Epoch 8/20
[1m729/729[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.8958 - loss: 0.2408
Epoch 9/20
[1