In [None]:
import gc
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from random import sample
from collections import defaultdict
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split, StratifiedKFold
from tensorflow.keras.layers import Input, Embedding, Concatenate, LSTM, Bidirectional, Layer

In [None]:
# Load the raw vendor dataset and flatten it into a list of
# {"profile": ..., "label": ...} dictionaries.
anon_vendor_dataset = pd.read_json('datasets/dataset_1500_seq_length.json')
dataset_list = []
for _, row in anon_vendor_dataset.iterrows():
    profile_data = row['profile']
    # A profile stored as a JSON string is decoded; any other value is kept as-is.
    profile_data = json.loads(profile_data) if isinstance(profile_data, str) else profile_data
    dataset_list.append(dict(profile=profile_data, label=row['label']))

# Preprocessing

In [None]:
class ProfilePreprocessor:
    """
    A pipeline to preprocess and encode sequential profile data.

    Every event of a profile is encoded as four integers, in this order:
    [behavior, method, element_tagname, event_handled].

    This class handles:
    - Integer encoding for 'behavior' and 'method': every value seen during
      fit() gets its own ID; values not seen during fit() map to '__UNKNOWN__'.
    - Top-K encoding for 'element_tagname' and 'event_handled': only the K most
      frequent values seen during fit() get their own ID; every other value
      maps to '__OTHER_TAG__' / '__OTHER_EVENT__'.
    - Integer encoding for labels (IDs start at 0; labels not seen during
      fit() are encoded as -1 by transform()).
    Feature IDs start at 1, so ID 0 is never assigned to a feature value.
    Each categorical feature maintains its own separate vocabulary and mapping.
    """

    def __init__(self, top_k_elements=10, top_k_events=10):
        """
        Initializes the preprocessor with parameters for top-K encoding.

        Args:
            top_k_elements (int): The number of top most frequent element_tagname
                                  values to keep. Others will be mapped to 'other'.
            top_k_events (int): The number of top most frequent event_handled
                                values to keep. Others will be mapped to 'other'.
        """
        self.top_k_elements = top_k_elements
        self.top_k_events = top_k_events
        self._reset_state()

    def _reset_state(self):
        """Clears every learned mapping, counter and statistic."""
        self.behavior_mapping = {}
        self.method_mapping = {}
        self.element_tagname_mapping = {}
        self.event_handled_mapping = {}
        self.label_mapping = {}
        # One-element lists act as mutable "next free ID" counters that
        # _get_or_assign_id can advance in place.
        self._next_behavior_id_ref = [1]
        self._next_method_id_ref = [1]
        self._next_element_tagname_id_ref = [1]
        self._next_event_handled_id_ref = [1]
        self._next_label_id_ref = [0]
        self.max_len = 0
        self._element_tagname_counts = defaultdict(int)
        self._event_handled_counts = defaultdict(int)

    def _get_or_assign_id(self, value, mapping, next_id_ref):
        """Helper to get ID or assign a new one for a specific categorical mapping."""
        if value not in mapping:
            mapping[value] = next_id_ref[0]
            next_id_ref[0] += 1
        return mapping[value]

    def _assign_top_k(self, counts, top_k, mapping, next_id_ref):
        """Assigns IDs to the `top_k` most frequent keys of `counts`, most frequent first."""
        # sorted() is stable, so ties keep the order in which values were first seen.
        ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
        for value, _ in ranked[:max(top_k, 0)]:
            self._get_or_assign_id(value, mapping, next_id_ref)

    def fit(self, dataset):
        """
        Fits the preprocessor to the dataset to learn all necessary mappings.

        Any state learned by a previous call is discarded first, so the
        resulting vocabularies describe `dataset` only. Also records the
        longest profile length in `self.max_len`.

        Args:
            dataset (list): A list of dictionaries, where each dictionary
                            has "profile" (list of event objects) and "label".

        Raises:
            KeyError: If a data point lacks "profile"/"label" or an event lacks
                      "behavior", "method", "element_tagname" or "event_handled".
        """
        self._reset_state()

        for data_point in dataset:
            profile = data_point["profile"]
            self.max_len = max(self.max_len, len(profile))

            self._get_or_assign_id(data_point["label"], self.label_mapping, self._next_label_id_ref)
            for event in profile:
                self._get_or_assign_id(event["behavior"], self.behavior_mapping, self._next_behavior_id_ref)
                self._get_or_assign_id(event["method"], self.method_mapping, self._next_method_id_ref)
                self._element_tagname_counts[event["element_tagname"]] += 1
                self._event_handled_counts[event["event_handled"]] += 1

        # Top-K values first, then the catch-all bucket for everything else.
        self._assign_top_k(
            self._element_tagname_counts, self.top_k_elements,
            self.element_tagname_mapping, self._next_element_tagname_id_ref,
        )
        self._get_or_assign_id("__OTHER_TAG__", self.element_tagname_mapping, self._next_element_tagname_id_ref)

        self._assign_top_k(
            self._event_handled_counts, self.top_k_events,
            self.event_handled_mapping, self._next_event_handled_id_ref,
        )
        self._get_or_assign_id("__OTHER_EVENT__", self.event_handled_mapping, self._next_event_handled_id_ref)

        # Fallback entries used by transform() for values not seen here.
        self._get_or_assign_id("__UNKNOWN__", self.behavior_mapping, self._next_behavior_id_ref)
        self._get_or_assign_id("__UNKNOWN__", self.method_mapping, self._next_method_id_ref)

        print("Fitting complete. Learned mappings:")
        print(f"Behavior mapping: {self.behavior_mapping}")
        print(f"Method mapping: {self.method_mapping}")
        print(f"Element Tagname mapping (top {self.top_k_elements} + other): {self.element_tagname_mapping}")
        print(f"Event Handled mapping (top {self.top_k_events} + other): {self.event_handled_mapping}")
        print(f"Label mapping: {self.label_mapping}")

    def _encode_event(self, event):
        """Encodes one event as [behavior, method, element_tagname, event_handled] IDs."""
        return [
            self.behavior_mapping.get(event.get("behavior"), self.behavior_mapping["__UNKNOWN__"]),
            self.method_mapping.get(event.get("method"), self.method_mapping["__UNKNOWN__"]),
            self.element_tagname_mapping.get(
                event.get("element_tagname"), self.element_tagname_mapping["__OTHER_TAG__"]
            ),
            self.event_handled_mapping.get(
                event.get("event_handled"), self.event_handled_mapping["__OTHER_EVENT__"]
            ),
        ]

    def transform(self, dataset):
        """
        Transforms the dataset using the learned mappings.

        Args:
            dataset (list): A list of dictionaries, where each dictionary
                            has "profile" (list of event objects) and "label".

        Returns:
            list: A new list of dictionaries with transformed profiles. Each
                  event in a profile is now a list of four integer codes. A
                  "version" key is copied over when the data point has one.

        Raises:
            KeyError: If an event is encoded before fit() was called (the
                      fallback entries do not exist yet).
        """
        transformed_dataset = []
        for data_point in dataset:
            encoded_label = self.label_mapping.get(data_point["label"], -1)  # -1 for unknown label
            obj = {
                "profile": [self._encode_event(event) for event in data_point["profile"]],
                "label": encoded_label,
            }
            if 'version' in data_point:
                obj['version'] = data_point['version']
            transformed_dataset.append(obj)

        print("Transformation complete.")
        return transformed_dataset

    def fit_transform(self, dataset):
        """
        Fits the preprocessor and then transforms the dataset.

        Args:
            dataset (list): A list of dictionaries, where each dictionary
                            has "profile" (list of event objects) and "label".

        Returns:
            list: A new list of dictionaries with transformed profiles.
        """
        self.fit(dataset)
        return self.transform(dataset)

    # The vocabulary sizes below are "highest assigned ID + 1", i.e. they
    # include the never-assigned ID 0.

    @property
    def behavior_vocab_size(self):
        """Returns the vocabulary size for the 'behavior' mapping."""
        return self._next_behavior_id_ref[0]

    @property
    def method_vocab_size(self):
        """Returns the vocabulary size for the 'method' mapping."""
        return self._next_method_id_ref[0]

    @property
    def element_tagname_vocab_size(self):
        """Returns the vocabulary size for the 'element_tagname' mapping."""
        return self._next_element_tagname_id_ref[0]

    @property
    def event_handled_vocab_size(self):
        """Returns the vocabulary size for the 'event_handled' mapping."""
        return self._next_event_handled_id_ref[0]

    @property
    def num_classes(self):
        """Returns the number of unique labels."""
        return self._next_label_id_ref[0]

def prepare_model_inputs(processed_dataset, versions=False, max_len=None):
    """
    Turns a processed dataset into padded per-feature arrays for the Keras multi-input model.

    Profiles longer than `max_len` are shortened by keeping their first
    `max_len // 2` events and their last `max_len - max_len // 2` events.
    Shorter profiles are zero-padded at the end.

    Args:
        processed_dataset (list): The output from ProfilePreprocessor.transform().
                                  A list of dictionaries, where each dict has
                                  "profile" (list of numerical features) and "label" (encoded int).
        versions (bool): When True, the "version" entry of every data point is
                         collected as well (each data point must then have one).
        max_len (int, optional): The maximum sequence length to pad to. If None,
                                 it will be determined from the longest profile in the dataset.

    Returns:
        tuple: A tuple containing:
            - inputs_dict (dict): A dictionary of NumPy arrays, where keys are
                                  input layer names and values are the padded feature sequences.
            - labels (np.ndarray): The labels of the data points that have a "label" key.
            - version_labels (np.ndarray): The collected versions (empty unless `versions`).
        For an empty dataset the result is ({}, empty array, empty array).
    """
    if not processed_dataset:
        return {}, np.array([]), np.array([])

    if max_len is not None:
        print(f"Using provided max_len: {max_len}")
    else:
        max_len = max(len(data_point["profile"]) for data_point in processed_dataset)
        print(f"Determined max_len from dataset: {max_len}")

    # Column k of an encoded profile feeds the k-th input below.
    input_names = (
        'behavior_input',
        'method_input',
        'element_tagname_input',
        'event_handled_input',
    )
    sequences = {input_name: [] for input_name in input_names}
    labels, version_labels = [], []

    for data_point in processed_dataset:
        events = np.array(data_point["profile"])
        if max_len and events.shape[0] > max_len:
            head = max_len // 2
            tail = max_len - head
            events = np.concatenate((events[:head, :], events[-tail:, :]), axis=0)

        if "label" in data_point:
            labels.append(data_point["label"])
        if versions:
            version_labels.append(data_point["version"])

        has_events = events.size > 0
        for column, input_name in enumerate(input_names):
            sequences[input_name].append(events[:, column] if has_events else [])

    inputs_dict = {
        input_name: pad_sequences(
            feature_sequences, maxlen=max_len, padding='post', truncating='post', dtype='int32'
        )
        for input_name, feature_sequences in sequences.items()
    }

    return inputs_dict, np.array(labels), np.array(version_labels)

# Model Building

In [None]:
@tf.keras.utils.register_keras_serializable()
class SupervisedContrastiveLoss(tf.keras.losses.Loss):
    """
    Supervised contrastive loss over a batch of embeddings.

    Samples that share a label are pulled together, samples with different
    labels are pushed apart.

    How it is computed:
    - Embeddings are L2-normalized, then pairwise dot products are divided by
      the temperature.
    - For every anchor, the positives are the other samples with the same label.
    - A sample is never paired with itself (the diagonal is masked out of both
      the positives and the softmax denominator).
    """

    def __init__(self, temperature=0.1, reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE, name='SupConLoss'):
        """
        Creates the loss object.

        Args:
            temperature (float): Divisor applied to the pairwise similarities.
            reduction: Reduction mode forwarded to the Keras Loss base class.
            name (str): Name of the loss object.
        """
        super().__init__(reduction=reduction, name=name)
        self.temperature = temperature

    def call(self, labels, embeddings):
        """
        Computes the supervised contrastive loss for one batch.

        Args:
            labels (Tensor): Labels; reshaped to a column and cast to int32.
            embeddings (Tensor): Embedding vectors of shape (batch, dim).

        Returns:
            Tensor: Scalar loss value.
        """
        label_column = tf.cast(tf.reshape(labels, [-1, 1]), tf.int32)
        unit_embeddings = tf.math.l2_normalize(embeddings, axis=1)

        similarity = tf.matmul(unit_embeddings, unit_embeddings, transpose_b=True) / self.temperature

        # positives[i, j] == 1 when i != j and both samples share a label.
        same_label = tf.cast(tf.equal(label_column, tf.transpose(label_column)), tf.float32)
        identity = tf.eye(tf.shape(label_column)[0])
        positives = same_label - identity

        # Subtract the row maximum before exponentiating for numerical stability.
        row_max = tf.reduce_max(similarity, axis=1, keepdims=True)
        shifted = similarity - row_max

        # Softmax denominator over every other sample of the batch.
        exp_without_self = tf.exp(shifted) * (1 - identity)
        log_prob = shifted - tf.math.log(tf.reduce_sum(exp_without_self, axis=1, keepdims=True) + 1e-9)

        # Average log-probability of the positives; the epsilon keeps anchors
        # without any positive at 0 instead of dividing by zero.
        mean_log_prob_pos = tf.reduce_sum(positives * log_prob, axis=1) / (tf.reduce_sum(positives, axis=1) + 1e-9)
        return -tf.reduce_mean(mean_log_prob_pos)

    def get_config(self):
        """Adds `temperature` to the base config so the loss can be re-created by Keras."""
        return {**super().get_config(), "temperature": self.temperature}

@tf.keras.utils.register_keras_serializable()
class L2Normalize(Layer):
    """
    Keras layer that rescales its input to unit L2 norm along the last axis.
    """
    def call(self, inputs):
        """Returns `inputs` L2-normalized over axis -1."""
        unit_norm = tf.math.l2_normalize(inputs, axis=-1)
        return unit_norm


def get_embedding_model_multi_input(input_embedding_dim, final_embedding_dim, vocab_sizes, max_len, bidirectional=True):
    """
    Builds a multi-input embedding model for sequential categorical features.

    Pipeline:
    - One int32 input named "<feature>_input" and one embedding layer named
      "<feature>_embedding" per key of `vocab_sizes`.
    - All embeddings are concatenated along the last axis.
    - An LSTM (optionally wrapped in Bidirectional) pools the sequence.
    - The pooled vector is L2-normalized (layer "out").

    Only the "behavior" embedding is created with mask_zero=True, and the
    padding mask handed to the LSTM is read from the FIRST embedding output,
    so "behavior" is expected to be the first key of `vocab_sizes`.

    Args:
        input_embedding_dim (int): Embedding dimension for each input feature.
        final_embedding_dim (int): Number of units of the LSTM.
            NOTE(review): with bidirectional=True the output is presumably
            2 * final_embedding_dim wide (Bidirectional's default merge) - confirm.
        vocab_sizes (dict): Mapping: feature_name -> vocabulary_size.
        max_len (int): Maximum sequence length.
        bidirectional (bool): Whether to use a bidirectional LSTM.

    Returns:
        tuple: (model, input_layers, output_tensor)
    """
    # All inputs are created before any embedding layer.
    feature_inputs = {}
    for name in vocab_sizes:
        feature_inputs[name] = Input(shape=(max_len,), name=f"{name}_input", dtype='int32')

    embedded = [
        Embedding(
            input_dim=vocab_size,
            output_dim=input_embedding_dim,
            mask_zero=(name == "behavior"),
            name=f"{name}_embedding",
        )(feature_inputs[name])
        for name, vocab_size in vocab_sizes.items()
    ]

    concatenated = Concatenate(axis=-1, name="concat_embeddings")(embedded)

    # `_keras_mask` is a private Keras attribute carrying the mask of the first embedding.
    mask = embedded[0]._keras_mask

    recurrent = LSTM(final_embedding_dim, name="pool")
    encoder = Bidirectional(recurrent) if bidirectional else recurrent
    pooled = encoder(concatenated, mask=mask)

    normalized_output = L2Normalize(name='out')(pooled)

    model = Model(
        inputs=list(feature_inputs.values()),
        outputs=normalized_output,
        name="multi_input_embedding_model",
    )

    return model, list(feature_inputs.values()), normalized_output

# Data Augmentation

In [None]:
def _cut_for_augmentation(non_pad, new_len, middle):
    """Shortens `non_pad` to `new_len` entries: front + back halves if `middle`, else the last `new_len` entries."""
    total = len(non_pad)
    if middle:
        front_len = new_len // 2
        back_len = new_len - front_len
        # Slice from an explicit start index: `non_pad[-0:]` would return everything.
        return np.concatenate((non_pad[:front_len], non_pad[total - back_len:]))
    return non_pad[total - new_len:]


def augment_data(X_dict, y, middle=False, min_len=10):
    """
    Data augmentation for sequential categorical features.

    Strategy:
    - For every sample, draw up to two DISTINCT shorter lengths from
      [max(min_len, 1), number of non-padding events - 1].
    - Shorten the sequence to each length: with `middle=True` the first half
      and the last half of the events are kept; otherwise the LAST `new_len`
      events are kept.
    - Zero-pad back to the full length and repeat the label for every copy.

    A sample yields two copies when at least two shorter lengths exist, one
    copy when exactly one exists, and none when it is already at or below
    `min_len` events. Padding positions are taken from 'behavior_input' and
    applied to every feature so that all features stay aligned.
    Lengths are drawn with NumPy's global random state.

    Args:
        X_dict (dict): Feature_name -> padded sequences (must contain 'behavior_input').
        y (np.ndarray): Labels.
        middle (bool): Whether to truncate using (front + back) slicing.
        min_len (int): Minimum allowed truncated length.

    Returns:
        (X_combined, y_combined): Original + augmented data.

    Raises:
        ValueError: If `min_len` is not smaller than the padded sequence length.
    """
    feature_names = list(X_dict.keys())
    seq_len = next(iter(X_dict.values())).shape[1]

    if min_len >= seq_len:
        raise ValueError(f"min_len ({min_len}) must be smaller than sequence length ({seq_len})")

    shortest = max(min_len, 1)  # never produce an all-padding copy
    X_aug = {feat: [] for feat in feature_names}
    y_aug = []

    for i, label in enumerate(y):
        keep = X_dict['behavior_input'][i] != 0
        len_non_pad = int(keep.sum())

        # Every strictly shorter length that still respects min_len.
        candidates = np.arange(shortest, len_non_pad)
        if candidates.size == 0:
            continue
        new_lens = np.random.choice(candidates, size=min(2, candidates.size), replace=False)

        for new_len in new_lens:
            new_len = int(new_len)
            for feat in feature_names:
                shortened = _cut_for_augmentation(X_dict[feat][i][keep], new_len, middle)
                X_aug[feat].append(np.pad(shortened, (0, seq_len - new_len), constant_values=0))
            y_aug.append(label)

    # reshape keeps the arrays 2-D even when no copy was produced.
    X_aug = {
        feat: np.array(vals, dtype=X_dict[feat].dtype).reshape(-1, seq_len)
        for feat, vals in X_aug.items()
    }
    y_aug = np.array(y_aug, dtype=y.dtype)

    X_combined = {feat: np.concatenate([X_dict[feat], X_aug[feat]], axis=0) for feat in feature_names}
    y_combined = np.concatenate([y, y_aug], axis=0)

    return X_combined, y_combined


def _truncate_and_pad_for_balance(non_pad, new_len, seq_len, middle):
    """Keeps `new_len` entries of `non_pad` (a prefix, or front + back halves if `middle`) and zero-pads to `seq_len`."""
    if middle:
        front_len = new_len // 2
        back_len = new_len - front_len
        # Slice from an explicit start index: `non_pad[-0:]` would return everything.
        kept = np.concatenate((non_pad[:front_len], non_pad[len(non_pad) - back_len:]))
    else:
        kept = non_pad[:new_len]
    return np.pad(kept, (0, seq_len - len(kept)), constant_values=0)


def balance_data(X_dict, y, min_points=10, min_len=10, middle=False):
    """
    Augments classes that have fewer than `min_points` samples.

    Strategy:
    - Identify labels with fewer than `min_points` samples.
    - Create extra samples for them by truncating existing sequences
      (a prefix is kept, or front + back halves with `middle=True`) and
      zero-padding back to the full length.
    - Sequences that cannot be shortened without dropping to `min_len`
      events or below are skipped.

    The resulting class sizes are best effort: a class can stay below
    `min_points` (skipped sequences) or end slightly above it (when
    `min_points - count` is not a multiple of `count`, every sample of the
    class may receive the extra copy).
    NOTE(review): the step logic of the "many copies per sample" branch is kept
    as originally written; confirm it matches the intended distribution.

    Padding positions are taken from 'behavior_input' and applied to every
    feature so that all features stay aligned.

    Args:
        X_dict (dict): Feature_name -> padded sequences (must contain 'behavior_input').
        y (np.ndarray): Label array.
        min_points (int): Minimum number of samples wanted per class.
        min_len (int): Minimum sequence length after truncation.
        middle (bool): Whether to use middle truncation instead of prefix slicing.

    Returns:
        (X_combined, y_combined): Original samples followed by the augmented ones.
    """
    feature_names = list(X_dict.keys())
    seq_len = next(iter(X_dict.values())).shape[1]

    y_series = pd.Series(y)
    class_counts = y_series.value_counts()
    rare_classes = class_counts[class_counts < min_points]

    X_aug = {feat: [] for feat in feature_names}
    y_aug = []

    def _append_copy(i, keep, new_len):
        # Features and label are always appended together so they cannot drift apart.
        if new_len < 1:
            return
        for feat in feature_names:
            X_aug[feat].append(
                _truncate_and_pad_for_balance(X_dict[feat][i][keep], new_len, seq_len, middle)
            )
        y_aug.append(y[i])

    for label, count in rare_classes.items():
        count = int(count)
        to_add = min_points - count
        indexes = list(y_series[y_series == label].index)

        if to_add < count:
            # Fewer copies than samples: one copy from each of `to_add` random samples.
            for i in sample(indexes, to_add):
                keep = X_dict['behavior_input'][i] != 0
                len_non_pad = int(keep.sum())
                truncable_len = len_non_pad - min_len
                if truncable_len <= 0:
                    continue
                # randint's upper bound is exclusive and must exceed the lower bound.
                upper = max(2, min(truncable_len + 1, min_len))
                to_trunc_len = np.random.randint(1, upper)
                _append_copy(i, keep, len_non_pad - to_trunc_len)
        else:
            # At least one copy per sample; each further copy is truncated more.
            pulls_from_index, remaining_pulls = divmod(to_add, count)
            for i in indexes:
                keep = X_dict['behavior_input'][i] != 0
                len_non_pad = int(keep.sum())
                truncable_len = len_non_pad - min_len
                to_trunc_lens = []
                while truncable_len > 0 and len(to_trunc_lens) < pulls_from_index:
                    to_trunc_len = np.random.randint(1, 4)
                    if to_trunc_lens:
                        to_trunc_len += to_trunc_lens[-1]
                    if truncable_len - to_trunc_len >= min_len:
                        to_trunc_lens.append(to_trunc_len)
                    truncable_len -= to_trunc_len
                if remaining_pulls:
                    to_trunc_len = np.random.randint(1, 3)
                    if to_trunc_lens:
                        to_trunc_len += to_trunc_lens[-1]
                    if truncable_len - to_trunc_len >= min_len:
                        to_trunc_lens.append(to_trunc_len)
                for ttl in to_trunc_lens:
                    _append_copy(i, keep, len_non_pad - ttl)

    # reshape keeps the arrays 2-D even when nothing was augmented, so the
    # concatenation below works for already-balanced data too.
    X_aug = {
        feat: np.array(vals, dtype=X_dict[feat].dtype).reshape(-1, seq_len)
        for feat, vals in X_aug.items()
    }
    y_aug = np.array(y_aug, dtype=y.dtype)

    X_combined = {feat: np.concatenate([X_dict[feat], X_aug[feat]], axis=0) for feat in feature_names}
    y_combined = np.concatenate([y, y_aug], axis=0)

    return X_combined, y_combined

# Experiments

In [None]:
# Experiment configuration.
# NOTE(review): the names below suggest training / preprocessing settings
# (epoch budget, early-stopping patience and warm-up, top-K vocabulary sizes);
# confirm each one is actually read where it is meant to apply.
epochs = 100
patience = 10 
# Truthy -> the cross-validation cell calls augment_data; the value 'middle'
# additionally selects middle (front + back) truncation.
augment = 'middle'
# Only consulted when `augment` is falsy: truthy -> balance_data is used instead.
balance = False
top_k_events = 90
top_k_elements = 42
early_stop_start = 20
# Passed to top_k_retrieve_acc as `unknown_threshold` by the cross-validation cell.
unknown_vendor_threshold = 0.88

In [None]:
def top_k_retrieve_acc(emb_train, y_train, emb_test, y_test, k=1, unknown_threshold=0):
    """
    Top-K retrieval accuracy based on cosine similarity between embeddings.

    For every test embedding the K most similar training embeddings are
    retrieved (most similar first). The sample counts as a hit when its true
    label is among the K retrieved labels and, if a threshold is set, the
    similarity of the best-ranked neighbour carrying that label reaches the
    threshold.

    Args:
        emb_train (np.ndarray): Training embeddings, shape (N_train, dim).
        y_train (array-like): Labels for training embeddings.
        emb_test (np.ndarray): Test embeddings, shape (N_test, dim).
        y_test (array-like): Labels for test embeddings.
        k (int): Number of nearest neighbors to retrieve.
        unknown_threshold (float): Minimum cosine similarity required to count a
                                   match as correct. If 0, thresholding is disabled.

    Returns:
        tuple:
            - hit_rate (float): Fraction of test samples that count as a hit.
            - top_k_predictions (np.ndarray): Retrieved labels for each test sample,
                                              shape (N_test, k).
    """
    if not isinstance(y_train, np.ndarray):
        y_train = np.array(y_train)

    sim_matrix = cosine_similarity(emb_test, emb_train)

    # argsort is ascending: keep the last k columns and reverse them (best first).
    top_k_indices = np.argsort(sim_matrix, axis=1)[:, -k:][:, ::-1]
    top_k_predictions = y_train[top_k_indices]
    top_k_similarities = np.take_along_axis(sim_matrix, top_k_indices, axis=1)

    thresholding_enabled = unknown_threshold != 0
    hit_count = 0
    for row, true_label in enumerate(y_test):
        match_positions = np.where(top_k_predictions[row] == true_label)[0]
        if match_positions.size == 0:
            continue
        # The first match is the best-ranked neighbour with the true label.
        if not thresholding_enabled or top_k_similarities[row, match_positions[0]] >= unknown_threshold:
            hit_count += 1

    hit_rate = hit_count / len(y_test)

    return hit_rate, top_k_predictions

## Vendor Identification 

In [None]:
# Perform stratified k-fold cross-validation.
#
# StratifiedKFold creates multiple train/test splits while preserving the
# class distribution of the labels. This ensures each fold has similar
# label proportions, which is important for fair evaluation.
#
# Inside the loop, we:
#     1. Build train/test subsets using these indices
#     2. Fit the preprocessor on training data only
#     3. Transform both train and test sets
#     4. Train the embedding model
#     5. Generate embeddings for retrieval
#     6. Compute top-K retrieval accuracy for the fold
#
# All hyper-parameters come from the experiment configuration cell
# (epochs, patience, early_stop_start, top_k_elements, top_k_events, augment,
# balance, unknown_vendor_threshold) so that editing that cell takes effect here.

hit_rates = []

raw_labels = [dp["label"] for dp in dataset_list]
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(skf.split(np.arange(len(dataset_list)), raw_labels), 1):
    print(f"\nFold {fold}")

    train_raw = [dataset_list[i] for i in train_idx]
    test_raw  = [dataset_list[i] for i in test_idx]

    # Vocabularies are learned from the training fold only.
    preprocessor = ProfilePreprocessor(top_k_elements=top_k_elements,
                                        top_k_events=top_k_events)
    train_processed = preprocessor.fit_transform(train_raw)
    test_processed  = preprocessor.transform(test_raw)

    # Test profiles are cut/padded to the longest TRAINING profile.
    MAX_SEQUENCE_LENGTH = max(len(dp["profile"]) for dp in train_processed)
    X_train, y_train, _ = prepare_model_inputs(train_processed, max_len=MAX_SEQUENCE_LENGTH)
    X_test, y_test, _   = prepare_model_inputs(test_processed, max_len=MAX_SEQUENCE_LENGTH)

    vocab_sizes = {
        'behavior': preprocessor.behavior_vocab_size,
        'method': preprocessor.method_vocab_size,
        'element_tagname': preprocessor.element_tagname_vocab_size,
        'event_handled': preprocessor.event_handled_vocab_size,
    }

    if augment:
        print('Augmenting')
        X_train_aug, y_train_aug = augment_data(X_train, y_train, middle=augment == 'middle', min_len=5)
    elif balance:
        print('Balancing')
        X_train_aug, y_train_aug = balance_data(X_train, y_train, min_points=10, min_len=5, middle=balance == 'middle')
    else:
        X_train_aug, y_train_aug = X_train, y_train

    # NOTE(review): the split below is drawn over originals AND their augmented
    # copies, so a sample and its truncated copy can land on different sides;
    # val_loss may therefore be optimistic.
    indices = np.arange(len(y_train_aug))
    embed_train_idx, embed_val_idx = train_test_split(
        indices, test_size=0.2, stratify=y_train_aug, random_state=42
    )

    embed_train_x = {k: v[embed_train_idx] for k, v in X_train_aug.items()}
    embed_val_x   = {k: v[embed_val_idx] for k, v in X_train_aug.items()}
    embed_y_train = y_train_aug[embed_train_idx]
    embed_y_val   = y_train_aug[embed_val_idx]

    model, inputs, pooled = get_embedding_model_multi_input(
        input_embedding_dim=16,
        final_embedding_dim=128,
        vocab_sizes=vocab_sizes,
        max_len=MAX_SEQUENCE_LENGTH,
    )
    early_stop = EarlyStopping(monitor='val_loss', patience=patience,
                                restore_best_weights=True, start_from_epoch=early_stop_start)
    model.compile(optimizer='adam', loss=SupervisedContrastiveLoss())

    model.fit(
        x=embed_train_x, y=embed_y_train,
        validation_data=(embed_val_x, embed_y_val),
        epochs=epochs, batch_size=32, shuffle=True,
        callbacks=[early_stop]
    )

    # Retrieval is evaluated against the un-augmented training embeddings.
    emb_test = model.predict(X_test, batch_size=32)
    emb_train = model.predict(X_train, batch_size=32)
    hit_rate, _ = top_k_retrieve_acc(emb_train, y_train, emb_test, y_test, unknown_threshold=unknown_vendor_threshold)
    hit_rates.append(hit_rate)
    print(f'Fold {fold} - Hit Rate: {hit_rate:.3f}')

    # Free the graph and model memory before the next fold.
    tf.keras.backend.clear_session()
    del model, preprocessor
    gc.collect()

print("Cross-Validation Summary")
print(f"Average HR: {np.mean(hit_rates):.4f} ± {np.std(hit_rates):.4f}")

## New Vendors

In [None]:
def filter_vendors(dataset_list, unseen_vendors):
    """
    Partition data points by whether their vendor label is held out.

    A data point whose "label" value (None when the key is missing) is
    contained in `unseen_vendors` goes to the unseen group; every other data
    point goes to the seen group. The original order is kept inside each group.

    Returns:
        tuple: (seen_raw, unseen_raw)
    """
    # Keyed by the outcome of the membership test: True -> unseen, False -> seen.
    partition = {False: [], True: []}
    for data_point in dataset_list:
        partition[data_point.get("label") in unseen_vendors].append(data_point)
    return partition[False], partition[True]

In [None]:
# Size categories, ordered from smallest to largest.
categories = ['low', 'medium', 'high']

def categorize(value, q1, q2):
    """
    Map a numeric value onto one of the entries of `categories`.

    With two thresholds (q1, q2):
        - value <= q1  -> categories[0] ('low')
        - value <= q2  -> categories[1] ('medium')
        - otherwise    -> categories[2] ('high')

    The comparisons are evaluated in that order, so a value for which both
    comparisons are false (e.g. NaN) ends up in the last category.
    """
    if value <= q1:
        return categories[0]
    return categories[1] if value <= q2 else categories[2]

def sample_vendors_by_size(stats_df, n, size):
    """
    Sample vendors from a specific size category.

    Vendors are bucketed into 'low', 'medium', or 'high' by passing their
    mean sequence length, together with the 33% and 66% quantiles of
    stats_df['mean_seq_len'], to categorize().

    Args:
        stats_df (pd.DataFrame): One row per vendor with the columns
            'label' and 'mean_seq_len'. The frame is not modified.
        n (int): Number of vendors to draw.
        size (str): Category to draw from.

    Returns:
        list: `n` vendor labels drawn without replacement (random.sample).

    Raises:
        ValueError: If no vendor falls into `size`, or if fewer than `n`
            vendors are available in it.
    """

    mean_q1, mean_q2 = np.quantile(stats_df['mean_seq_len'], [0.33, 0.66])

    # Keep the category as a standalone Series aligned with stats_df, so the
    # caller's frame does not need to be copied or mutated.
    mean_cat = stats_df['mean_seq_len'].apply(lambda x: categorize(x, mean_q1, mean_q2))

    # Select with a boolean mask rather than groupby(...).apply(...): the shape
    # of a groupby-apply result depends on the data (a MultiIndex Series when
    # several categories exist, a DataFrame when only one does), which made the
    # membership test and lookup unreliable. The mask keeps the original row
    # order, so the sampling population is the same as before.
    available_labels = list(stats_df.loc[mean_cat == size, 'label'])

    if not available_labels:
        raise ValueError(f"No vendors in the '{size}' category.")

    if len(available_labels) < n:
        raise ValueError(f"Requested {n} samples but only {len(available_labels)} available in '{size}' group.")

    return sample(available_labels, n)


def sample_diverse_vendors(stats_df, n):
    """
    Draw `n` vendors spread across every entry of `categories`.

    Each category receives n // len(categories) vendors; the n % len(categories)
    leftover vendors are handed out one each to the leading categories
    (so 'low' first, then 'medium').

    The per-category draw is delegated to sample_vendors_by_size(), and any
    error it raises propagates unchanged.

    Returns the sampled vendor labels as one flat list, grouped in
    category order.
    """

    quota, extra = divmod(n, len(categories))
    chosen = []
    for position, category in enumerate(categories):
        # The first `extra` categories absorb one leftover vendor each.
        take = quota + 1 if position < extra else quota
        chosen.extend(sample_vendors_by_size(stats_df, take, category))

    return chosen

In [None]:
# Per-vendor mean sequence length, used to bucket vendors into size categories.
# A profile can still be a JSON-encoded string at this point (the cell that
# builds dataset_list decodes such values), so decode before measuring:
# len() on the raw string would count characters, not sequence steps.
anon_vendor_dataset['seq_len'] = anon_vendor_dataset['profile'].apply(
    lambda profile: len(json.loads(profile) if isinstance(profile, str) else profile)
)
stats_df = (
    anon_vendor_dataset.groupby('label')['seq_len']
    .mean()
    .reset_index(name='mean_seq_len')  # columns: 'label', 'mean_seq_len'
)

In [None]:
# Five independently sampled hold-out groups of nine vendors each,
# every group balanced across the size categories.
unseen_groups = [sample_diverse_vendors(stats_df, 9) for _ in range(5)]

In [None]:
def _seen_vs_unseen_accuracy(emb_unseen, emb_reference, threshold):
    """Fraction of unseen embeddings whose best cosine match in emb_reference is below threshold."""
    if len(emb_unseen) == 0:
        # Undefined without samples (the mean of an empty set is NaN).
        return float("nan")
    # One call for the whole batch: rows are unseen samples, columns are
    # reference samples, so the row-wise max is each sample's best match.
    best_match = cosine_similarity(emb_unseen, emb_reference).max(axis=1)
    # Every sample here truly is unseen, so accuracy is simply the share
    # of samples predicted as unseen.
    return float(np.mean(best_match < threshold))


def _unseen_pair_accuracy(emb_unseen, y_unseen, threshold):
    """Fraction of unseen samples whose nearest other unseen sample shares their label and reaches threshold."""
    n_unseen = len(emb_unseen)
    if n_unseen == 0:
        return 0.0
    # assumes one label per sample; a trailing singleton axis is tolerated — TODO confirm
    labels = np.asarray(y_unseen).reshape(n_unseen, -1)
    sims = cosine_similarity(emb_unseen)
    np.fill_diagonal(sims, -1)  # a sample must not be matched with itself
    nearest = sims.argmax(axis=1)
    nearest_sim = sims[np.arange(n_unseen), nearest]
    same_vendor = (labels[nearest] == labels).all(axis=1)
    return float(np.mean(same_vendor & (nearest_sim >= threshold)))


def hold_out_unseen(dataset_list, unseen_vendors, top_k_elements=30, top_k_events=90,
                    epochs=50, patience=10, early_stop_start=20, test_size=0.25,
                    augment=False, balance=False, max_len=0, unknown_vendor_threshold=0.85):
    """
    Perform hold-out evaluation for unseen vendors.

    The embedding model is trained with SupervisedContrastiveLoss ONLY on the
    vendors not listed in unseen_vendors.

    Workflow:
        1. Split dataset → seen vendors vs unseen vendors (held-out).
        2. Stratified train/test split of the seen data (random_state=42).
        3. Fit the preprocessor on the seen training split and build padded
           model inputs for train, test and unseen data.
        4. Train the embedding model on the seen training split
           (optionally augmented or class-balanced).
        5. Embed the seen test split and the unseen data.
        6. Evaluate two metrics:
            a. Seen-vs-Unseen Detection Accuracy:
                  share of unseen samples whose maximum cosine similarity to
                  the seen *test* embeddings is below the threshold.
            b. Unseen Pair Similarity Accuracy:
                  share of unseen samples whose nearest other unseen sample
                  has the same label AND a similarity of at least the
                  threshold.

    Note: the seen test split serves both as the early-stopping validation
    set and as the reference set for metric (a).

    Args:
        dataset_list: Raw data points (dicts with "profile" and "label").
        unseen_vendors: Vendor labels to hold out entirely from training.
        top_k_elements, top_k_events: Forwarded to ProfilePreprocessor.
        epochs: Maximum number of training epochs.
        patience, early_stop_start: EarlyStopping `patience` and
            `start_from_epoch`.
        test_size: Share of the seen data placed in the test split.
        augment: If truthy, train on augment_data(...) output; the value
            'middle' additionally sets middle=True.
        balance: If truthy (and augment is falsy), train on
            balance_data(...) output; 'middle' sets middle=True.
        max_len: Padded sequence length; a falsy value means "use the
            longest profile in the processed training split".
        unknown_vendor_threshold: Cosine-similarity threshold used by both
            metrics.

    Returns:
        {
            "seen_vs_unseen_acc": float,
            "unseen_pair_similarity_acc": float
        }
    """

    seen_raw, unseen_raw = filter_vendors(dataset_list, unseen_vendors)

    seen_labels = [dp["label"] for dp in seen_raw]
    idx_train, idx_test = train_test_split(
        np.arange(len(seen_raw)),
        test_size=test_size,
        stratify=seen_labels,
        random_state=42
    )
    train_raw = [seen_raw[i] for i in idx_train]
    test_raw  = [seen_raw[i] for i in idx_test]

    # Vocabularies are fitted on the seen training split only; test and
    # unseen data are merely transformed with them.
    preprocessor = ProfilePreprocessor(top_k_elements=top_k_elements,
                                       top_k_events=top_k_events)
    train_processed = preprocessor.fit_transform(train_raw)
    test_processed  = preprocessor.transform(test_raw)
    unseen_processed = preprocessor.transform(unseen_raw)

    if not max_len:
        max_len = max(len(dp["profile"]) for dp in train_processed)
    X_train, y_train, _ = prepare_model_inputs(train_processed, max_len=max_len)
    X_test, y_test, _   = prepare_model_inputs(test_processed,  max_len=max_len)
    X_unseen, y_unseen, _ = prepare_model_inputs(unseen_processed, max_len=max_len)

    vocab_sizes = {
        'behavior': preprocessor.behavior_vocab_size,
        'method': preprocessor.method_vocab_size,
        'element_tagname': preprocessor.element_tagname_vocab_size,
        'event_handled': preprocessor.event_handled_vocab_size,
    }

    model, inputs, pooled = get_embedding_model_multi_input(
        input_embedding_dim=16,
        final_embedding_dim=128,
        vocab_sizes=vocab_sizes,
        max_len=max_len,
    )
    early_stop = EarlyStopping(monitor='val_loss', patience=patience,
                               restore_best_weights=True, start_from_epoch=early_stop_start)
    model.compile(optimizer='adam', loss=SupervisedContrastiveLoss())

    # Choose the training arrays first so the fit call exists only once.
    # augment takes precedence over balance when both are set.
    if augment:
        print('Augmenting')
        X_fit, y_fit = augment_data(X_train, y_train, min_len=5, middle=augment == 'middle')
    elif balance:
        print('Balancing')
        X_fit, y_fit = balance_data(X_train, y_train, min_points=10, min_len=5, middle=balance == 'middle')
    else:
        X_fit, y_fit = X_train, y_train

    model.fit(X_fit, y_fit, validation_data=(X_test, y_test),
              epochs=epochs, callbacks=[early_stop], batch_size=32)

    # Embeddings are read from the `pooled` tensor returned by the model builder.
    embedding_model = tf.keras.Model(inputs=inputs, outputs=pooled)
    emb_test   = embedding_model.predict(X_test)
    emb_unseen = embedding_model.predict(X_unseen)

    seen_vs_unseen_acc = _seen_vs_unseen_accuracy(emb_unseen, emb_test, unknown_vendor_threshold)
    unseen_pair_acc = _unseen_pair_accuracy(emb_unseen, y_unseen, unknown_vendor_threshold)

    # Release the Keras graph/state before the next hold-out run.
    tf.keras.backend.clear_session()
    del model, embedding_model
    gc.collect()

    print(f"Seen-vs-Unseen Detection Accuracy: {seen_vs_unseen_acc:.4f}")
    print(f"Unseen Pair Similarity Accuracy: {unseen_pair_acc:.4f}")

    return {
        "seen_vs_unseen_acc": seen_vs_unseen_acc,
        "unseen_pair_similarity_acc": unseen_pair_acc
    }

In [None]:
"""
Evaluate unseen-vendor performance across multiple unseen vendor groups.

Each entry in unseen_groups is a list of vendor labels that will be held out
entirely during training. For each group:

    1. Run hold-out evaluation using hold_out_unseen()
    2. Train the model on the remaining (seen) vendors
    3. Compute:
         - seen_vs_unseen_acc: ability to detect unseen vendors
         - unseen_pair_similarity_acc: how well embeddings cluster unseen vendors

We store both metrics across all unseen vendor groups and then compute:

    - Mean and standard deviation of seen-vs-unseen detection accuracy
    - Mean and standard deviation of unseen vendor pair similarity accuracy

This provides a robust measurement of how well the model generalizes
to completely new, never-before-seen vendors.
"""

# One hold-out run per unseen-vendor group; both metrics are collected per run.
seen_vs_unseen_accs, unseen_pair_sim_accs = [], []
for unseen_vendors in unseen_groups:
    results = hold_out_unseen(
        dataset_list,
        unseen_vendors,
        epochs=epochs,
        patience=patience,
        early_stop_start=early_stop_start,
        augment=augment,
        balance=balance,
        top_k_elements=top_k_elements,
        top_k_events=top_k_events,
        unknown_vendor_threshold=unknown_vendor_threshold,
    )
    seen_vs_unseen_accs.append(results["seen_vs_unseen_acc"])
    unseen_pair_sim_accs.append(results["unseen_pair_similarity_acc"])

# Report mean and standard deviation of each metric across the groups.
for title, accs in (
    ('Seen Vs. Unseen Detection', seen_vs_unseen_accs),
    ('Unseen Pairs Accuracy', unseen_pair_sim_accs),
):
    print(title)
    print(f"Mean: {np.mean(accs):.4f}, Std: {np.std(accs):.4f}")