In [26]:
from itertools import chain
from pathlib import Path
from typing import Hashable

import numpy as np
import tensorflow as tf


In [21]:
plaid = np.load('out/plaid.npz', allow_pickle=True)
plaid = plaid['arr_0']
print(plaid[0])

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [27]:
def load_files(data_set, nested=False):
    """Loads requested system call data set from disk

    Parameters
    ----------
    data_set : {"adfa", "plaid"}
        The data set to be returned.
    nested : bool
        Return attack sequences nested by application. Default False returns a flat list.

    Returns
    -------
    attack_sequences : List[List[str]] or List[List[List[str]]]
        List of attack system call sequences. When nested=False each element is an attack sequence represented as a list
        of strings. If nested=True each element is a list of all attack sequences belonging to a single application.
    base_sequences : List[List[str]]
        List of baseline system call sequences.

    """
    if data_set not in ["adfa", "plaid"]:
        raise ValueError("data_set must be on of (adfa, plaid)")

    def get_seq(files):
        ret = []
        for f in files:
            with open(f) as file:
                seq = file.read().strip().split(" ")
                if 4495 >= len(seq) >= 8:
                    ret.append(seq)
        return ret

    if data_set == "plaid":
        attack_files = sorted(list(Path("../data/PLAID/attack").rglob("*.txt")))
        baseline_files = Path("../data/PLAID/baseline").rglob("*.txt")
    else:
        root_path = Path("../data/ADFA_decoded_i386/")
        attack_files = sorted(list((root_path / "Attack_Data_Master").rglob("*.txt")))
        baseline_files = list((root_path / "Validation_Data_Master").rglob("*.txt"))
        baseline_files.extend((root_path / "Training_Data_Master").rglob("*.txt"))

    if nested:
        attack_sequences = []
        folders = set([x.parent for x in attack_files])
        for folder in folders:
            tmp = [x for x in attack_files if x.parent == folder]
            attack_sequences.append(get_seq(tmp))
    else:
        attack_sequences = get_seq(attack_files)
    base_sequences = get_seq(baseline_files)
    return attack_sequences, base_sequences



In [28]:
class Encoder:
    """Converts data to a dense integer encoding

    Attributes:
        file_path: location to save/load syscall map
        syscall_map: mapping from item to encoded value
    """

    file_path = Path()
    syscall_map: dict = dict()

    def __init__(self, file_path: str) -> None:
        self.file_path = Path(file_path)
        if self.file_path.exists():
            self.syscall_map = np.load(self.file_path, allow_pickle=True).item()

    def encode(self, syscall: Hashable) -> int:
        """Encodes an individual item

        Unique items are sequentially encoded (ie first item -> 0 next unique item -> 1). The mapping dict is updated
        with new encodings as necessary and immediately written to disk.

        Args:
            syscall: item to encode

        Returns:
            integer encoding of syscall
        """
        if syscall in self.syscall_map:
            return self.syscall_map[syscall]
        syscall_enc = len(self.syscall_map) + 1
        self.syscall_map[syscall] = syscall_enc
        np.save(self.file_path, self.syscall_map)

        return syscall_enc


In [29]:
atk_files, normal_files = load_files("plaid")

In [30]:
print(len(atk_files))

0
