# Los Alamos Cybersecurity dataset tools

Includes tools to manipulate dataset storage, as well as load events from said storage.

## Dataset chunking

This dataset is large! To facilitate its processing, it is best to cut its bigger files into *chunks*, which can be processed in parallel.

In [1]:
%load_ext pycodestyle_magic
%flake8_on --max_line_length 120 --ignore W293,E302

In [2]:
import notebooks

from contextlib import contextmanager
import dask
from glob import glob
from growing import growing
import gzip
import io
from jupytest import Suite, Report, summarize_results
import os
import os.path as op
import sys
from typing import Iterable, ContextManager, Optional, TypeVar, Any, Sequence
from unittest.mock import patch, Mock, call

In [3]:
# Test suite shared by this notebook; it is handed to summarize_results() in the last cell.
suite = Suite()
if __name__ == "__main__":
    # Only when executed as the main program: combine a Report into the suite.
    # NOTE(review): presumably this reports each test's outcome as it runs -- confirm against jupytest.
    suite |= Report()

Let's embody the intended file hierarchy of the LANL dataset into a class.

In [4]:
@growing
class DataStoreLosAlamos:
    """
    Main files making up the Los Alamos Cybersecurity dataset.

    The class is decorated with `@growing`; further methods are attached to it after this definition
    through `DataStoreLosAlamos.method(...)` (see `join_chunked`).
    """

    def __init__(self, path: os.PathLike) -> None:
        """
        Remembers the base directory of the dataset.

        path: base directory; `join_chunked` reads `raw/<stream>.txt.gz` and writes `chunked/<stream>/` under it.
        """
        self._dir_base = path

    @property
    def dir_base(self) -> os.PathLike:
        """Base directory given at construction (read-only)."""
        return self._dir_base

Cutting the raw files into compressed chunks is a long-running computation. Let's structure it so that it runs on a compute cluster, and only when we need it.

In [5]:
SIZE_CHUNK = 96 * (1 << 20)  # 96 MiB maximum (100,663,296 bytes)

In [6]:
@DataStoreLosAlamos.method(wrapped_in=dask.delayed(pure=True))
def join_chunked(self, stream: str, *p: os.PathLike) -> os.PathLike:
    """
    Builds a path under the chunk directory of a stream, cutting the raw file into chunks first if needed.

    The chunk directory `<dir_base>/chunked/<stream>` is created when missing. If it holds no `*.txt.gz` file,
    or if any such file is empty, the raw file `<dir_base>/raw/<stream>.txt.gz` is read line by line and
    distributed over chunk files named `0000.txt.gz`, `0001.txt.gz`, and so on. A new chunk is started every
    time `FileChunk.write()` returns a false value.

    stream: name of the data stream, used for both the raw file name and the chunk directory name.
    p: path components appended to the chunk directory.

    Returns the chunk directory joined with the components of `p`. The function is wrapped in
    `dask.delayed`, so callers get a delayed object and must compute it to obtain the path.
    """
    dir_chunks = op.join(self.dir_base, "chunked", stream)
    os.makedirs(dir_chunks, exist_ok=True)

    paths_existing = glob(op.join(dir_chunks, "*.txt.gz"))
    must_chunk = not paths_existing or any(os.stat(path).st_size == 0 for path in paths_existing)
    if must_chunk:
        # Raw files have not been chunked yet, or some chunks are corrupted. It's chunking time.
        path_raw = op.join(self.dir_base, "raw", f"{stream}.txt.gz")
        with gzip.open(path_raw, "rb") as file_raw:
            for index in range(sys.maxsize):
                chunk_is_full = False
                with FileChunk(op.join(dir_chunks, f"{index:04d}.txt.gz")) as file_chunk:
                    # The raw file iterator is shared between chunks: each one resumes where the last stopped.
                    for line in file_raw:
                        if not file_chunk.write(line):
                            chunk_is_full = True
                            break
                if not chunk_is_full:
                    # The raw file ran out of lines before this chunk filled up: we are done.
                    break

    return op.join(dir_chunks, *p)

**Tests** for method `join_chunked`:

In [7]:
T = TypeVar("T")


@contextmanager
def mocking_global(name: str, value_mock: T) -> ContextManager[T]:
    """
    Temporarily binds a global name of this module to a mock value.

    name: global name to bind.
    value_mock: value bound to the name for the duration of the context; also yielded to the `with` block.

    On exit, the previous binding is put back if there was one; otherwise the name is deleted again.
    """
    namespace = globals()
    was_defined = name in namespace
    value_saved = namespace[name] if was_defined else None
    namespace[name] = value_mock

    try:
        yield value_mock
    finally:
        if not was_defined:
            del namespace[name]
        else:
            namespace[name] = value_saved

In [8]:
def mock_file_raw(lines: Iterable[str]) -> Mock:
    """
    Makes a mock standing in for an open raw file: a context manager that can be iterated line by line.

    lines: the lines the mock yields when iterated over.

    A single iterator over `lines` is shared by every iteration of the mock, so a loop that stops early can be
    resumed by a later loop, as happens with a real file object.
    """
    lines_remaining = iter(lines)

    def enter(self):
        return self

    def leave(self, type_exc, value_exc, tb_exc):
        return False

    def iterate(self):
        return lines_remaining

    file_fake = Mock()
    file_fake.__enter__ = enter
    file_fake.__exit__ = leave
    file_fake.__iter__ = iterate
    return file_fake


@contextmanager
def mocking_gzip_open(lines: Iterable[str]) -> ContextManager[Mock]:
    """
    Patches `gzip.open` so that it returns a mock raw file yielding the given lines.

    lines: the lines yielded by the mock file (see `mock_file_raw`).

    Yields the mock that replaces `gzip.open` for the duration of the context.
    """
    file_fake = mock_file_raw(lines)
    with patch("gzip.open", return_value=file_fake) as mock_open:
        yield mock_open

In [9]:
def mock_file_chunk(**kwargs: Any) -> Mock:
    """
    Makes a mock standing in for a `FileChunk` instance: a context manager with a mocked `write()` method.

    kwargs: passed as is to the `Mock` that implements `write()`, typically `return_value` or `side_effect`,
        to script what successive writes return.
    """
    def enter(self):
        return self

    def leave(self, type_exc, value_exc, tb_exc):
        return False

    chunk_fake = Mock()
    chunk_fake.__enter__ = enter
    chunk_fake.__exit__ = leave
    chunk_fake.write = Mock(**kwargs)
    return chunk_fake


@contextmanager
def mocking_FileChunk(mocks: Sequence[Mock]) -> ContextManager[Mock]:
    """
    Temporarily replaces the global `FileChunk` with a mock class.

    mocks: the objects returned by successive instantiations of the mock class, in order.

    Yields the mock class, so that the way it was called can be checked.
    """
    class_fake = Mock(side_effect=mocks)
    with mocking_global("FileChunk", class_fake) as mock_class:
        yield mock_class

In [10]:
%%test join-chunked/Stop
# Scenario: the raw file holds five lines. The first two chunk mocks return False from their second write(),
# the third one always returns True. The lines are thus expected to be spread 2/2/1 over three chunks.
mocks_chunk = [
    mock_file_chunk(**kwargs)
    for kwargs in [dict(side_effect=[True, False]), dict(side_effect=[True, False]), dict(return_value=True)]
]
# NOTE(review): join_chunked uses the `glob` name bound by `from glob import glob`; patching `glob.glob` below
# presumably does not replace that binding, so the real glob() likely runs against the fake path -- confirm.
with patch("os.makedirs"), patch("glob.glob", return_value=[]),\
        mocking_gzip_open([b"asdf\n", b"qwer\n", b"zxcv\n", b"qwerty\n", b"uiop\n"]) as mock_raw,\
        mocking_FileChunk(mocks_chunk) as mock_class:
    ds = DataStoreLosAlamos("/path/to/data")
    # join_chunked yields a delayed object; it is computed here with the single-threaded scheduler.
    assert ds.join_chunked("dns", "asdf", "qwer").compute(scheduler="single-threaded") ==\
        "/path/to/data/chunked/dns/asdf/qwer"

    # One FileChunk per chunk, named after consecutive zero-padded indices.
    mock_class.assert_has_calls(
        [call(f"/path/to/data/chunked/dns/{i:04d}.txt.gz") for i in range(3)]
    )
    mocks_chunk[0].write.assert_has_calls([call(s) for s in [b"asdf\n", b"qwer\n"]])
    mocks_chunk[1].write.assert_has_calls([call(s) for s in [b"zxcv\n", b"qwerty\n"]])
    mocks_chunk[2].write.assert_has_calls([call(s) for s in [b"uiop\n"]])

Test **join-chunked/Stop** passed.


In [11]:
%%test join-chunked/End of raw file corresponds to end of chunk
# Scenario: the raw file holds four lines, and the second chunk mock returns False from write() right on the
# last line. A third FileChunk is still instantiated, but nothing is expected to be written to it.
mocks_chunk = [
    mock_file_chunk(**kwargs)
    for kwargs in [dict(side_effect=[True, False]), dict(side_effect=[True, False]), dict(return_value=True)]
]
with patch("os.makedirs"), patch("glob.glob", return_value=[]),\
        mocking_gzip_open([b"asdf\n", b"qwer\n", b"zxcv\n", b"qwerty\n"]) as mock_raw,\
        mocking_FileChunk(mocks_chunk) as mock_class:
    ds = DataStoreLosAlamos("/path/to/data")
    assert ds.join_chunked("dns", "asdf", "qwer").compute(scheduler="single-threaded") ==\
        "/path/to/data/chunked/dns/asdf/qwer"

    mock_class.assert_has_calls(
        [call(f"/path/to/data/chunked/dns/{i:04d}.txt.gz") for i in range(3)]
    )
    mocks_chunk[0].write.assert_has_calls([call(s) for s in [b"asdf\n", b"qwer\n"]])
    mocks_chunk[1].write.assert_has_calls([call(s) for s in [b"zxcv\n", b"qwerty\n"]])
    # The third chunk's context is entered, yet it never receives any line.
    mocks_chunk[2].write.assert_not_called()

Test **join-chunked/End of raw file corresponds to end of chunk** passed.


In [12]:
%%test join-chunked/Raw file is empty
# Scenario: the raw file holds no line at all. A single FileChunk is expected to be instantiated (for index 0),
# and no chunk mock should ever be written to.
mocks_chunk = [
    mock_file_chunk(**kwargs)
    for kwargs in [dict(side_effect=[True, False]), dict(side_effect=[True, False]), dict(return_value=True)]
]
with patch("os.makedirs"), patch("glob.glob", return_value=[]),\
        mocking_gzip_open([]) as mock_raw,\
        mocking_FileChunk(mocks_chunk) as mock_class:
    ds = DataStoreLosAlamos("/path/to/data")
    assert ds.join_chunked("dns", "asdf", "qwer").compute(scheduler="single-threaded") ==\
        "/path/to/data/chunked/dns/asdf/qwer"

    mock_class.assert_called_once_with("/path/to/data/chunked/dns/0000.txt.gz")
    for mock in mocks_chunk:
        mock.write.assert_not_called()

Test **join-chunked/Raw file is empty** passed.


Class `FileChunk` then embodies the creation of a chunk and the transfer of its content into the target file. Note that the algorithm of `join_chunked()` enters the context of the `FileChunk` instance before it has any content for the chunk; the creation of the file must thus be delayed until the first call to method `write()`.

In [13]:
class FileChunk:
    """
    Delays the creation of a chunk file until the user commits to writing something in it.

    Instances are context managers. The gzip-compressed file is only created by the first call to `write()`,
    and it is closed when the context is exited. The chunk also counts the bytes it was given, so that
    `write()` can tell its caller when it is time to move on to a new chunk.
    """

    def __init__(self, path: os.PathLike, size_max: Optional[int] = None) -> None:
        """
        path: where the chunk file is created on first write.
        size_max: number of bytes (counted before compression) from which the chunk reports itself as full.
            Defaults to the module-level `SIZE_CHUNK`.
        """
        self._path = path
        self._file: Optional[io.BufferedIOBase] = None
        # SIZE_CHUNK is only looked up when no explicit limit is given.
        self._size_max = SIZE_CHUNK if size_max is None else size_max
        self._num_bytes_written = 0

    def __enter__(self) -> "FileChunk":
        return self

    def __exit__(self, type_exc, value_exc, tb_exc) -> bool:
        """Closes the chunk file, if it was ever created. Exceptions are never suppressed."""
        if self._file is not None:
            self._file.close()
        return False

    def write(self, buf: bytes) -> bool:
        """
        Writes the whole buffer to the chunk, creating the chunk file if this is the first write.

        buf: bytes to append to the chunk.

        Returns True as long as the chunk has room for more, and False once the bytes written so far reach
        the size limit. The buffer is always written in full, so a chunk may exceed the limit by part of its
        last buffer: the limit is checked after the write, as `join_chunked` expects.
        """
        if self._file is None:
            self._file = gzip.open(self._path, "wb")
        # The underlying write may accept only part of the buffer: keep going until all of it is written.
        index = 0
        while index < len(buf):
            index += self._file.write(buf[index:])
        self._num_bytes_written += len(buf)
        return self._num_bytes_written < self._size_max

**Tests**:

In [14]:
%%test FileChunk.write/All written in one single underlying write
# A BytesIO stands in for the file that gzip.open() would return, so that what FileChunk writes can be read back.
bytes_written = io.BytesIO()
with patch("gzip.open", return_value=bytes_written):
    with FileChunk("asdf") as file_chunk:
        file_chunk.write(b"qwerty\n")
        # Checked inside the context: FileChunk.__exit__ closes the underlying file, and a closed BytesIO
        # can no longer be read back with getvalue().
        assert bytes_written.getvalue() == b"qwerty\n"

Test **FileChunk.write/All written in one single underlying write** passed.


In [15]:
%%test FileChunk.write/Multiple underlying writes needed
# The mocked underlying write() reports 3 bytes written on its first call, then 4: FileChunk.write() is
# expected to call it again with the remainder of the buffer.
with patch("gzip.GzipFile") as mock:
    mock.return_value.write = Mock(side_effect=[3, 4])
    with FileChunk("asdf") as file_chunk:
        file_chunk.write(b"qwerty\n")
    mock.return_value.write.assert_has_calls([call(b'qwerty\n',), call(b'rty\n',)])

Test **FileChunk.write/Multiple underlying writes needed** passed.


In [16]:
%%test FileChunk/No file created without write
# Entering and exiting the context without any write must not open (and thus not create) the chunk file.
with patch("gzip.open") as mock:
    with FileChunk("asdf") as file_chunk:
        pass
    mock.assert_not_called()

Test **FileChunk/No file created without write** passed.


# Test summary

In [17]:
if __name__ == "__main__":
    # Summarize the results gathered in `suite`; the value returned by summarize_results() is not used.
    _ = summarize_results(suite)

6 passed, 0 failed, 0 raised an error
