# Los Alamos Cybersecurity dataset tools

Includes tools to manipulate dataset storage, as well as load events from said storage.

## Dataset chunking

This dataset is large! To facilitate its processing, it is best to cut its bigger files into *chunks*, which can be processed in parallel.

In [1]:
# Load the pycodestyle_magic extension, then turn on flake8 checking for the cells that follow.
%load_ext pycodestyle_magic
# Accept lines up to 120 characters; codes W293 and E302 are not reported.
%flake8_on --max_line_length 120 --ignore W293,E302

In [2]:
import notebooks_as_modules

from contextlib import contextmanager
import dask
import dask.dataframe as ddf
from dask.distributed import Client, LocalCluster
from glob import glob
from growing import growing
import gzip
import io
from jupytest import Suite, Report, Magic, summarize_results
import numpy as np
import os
import os.path as op
import pandas as pd
import re
import sys
import time
from typing import Iterable, ContextManager, TypeVar, Any, Optional, Sequence, Tuple, Mapping
from unittest.mock import patch, Mock, call, MagicMock

In [3]:
# Test suite in which the tests declared throughout this notebook get registered.
suite = Suite()
if __name__ == "__main__":
    # Only when the notebook is run directly (as opposed to imported as a module).
    # NOTE(review): presumably Report prints the per-test outcome and Magic provides
    # the %%test cell magic used below -- confirm against the jupytest package.
    suite |= Report()
    suite |= Magic()

Let's embody the intended file hierarchy of the LANL dataset into a class.

In [4]:
@growing
class DataStoreLosAlamos:
    """
    Main files making up the Los Alamos Cybersecurity dataset.

    An instance only records the base directory of the dataset; further methods are
    attached to the class later in this file through `DataStoreLosAlamos.method`.
    """
    
    def __init__(self, path: os.PathLike) -> None:
        # Base directory under which the dataset files are looked up.
        self._dir_base = path
        
    @property
    def dir_base(self) -> os.PathLike:
        """Base directory given at construction (read-only)."""
        return self._dir_base
    
    def __dask_tokenize__(self) -> str:
        # Dask tokenization hook: the instance is identified by its base directory alone.
        return self.dir_base

Cutting the raw files into compressed chunks is a long-running computation. Let's structure it so that it runs on a compute cluster, and only when we need it.

In [5]:
# Default size limit for one chunk, in bytes: 96 MiB.
SIZE_CHUNK = 96 * 1024 * 1024

In [6]:
@DataStoreLosAlamos.method(wrapped_in=dask.delayed(pure=True))
def join_chunked(self, stream: str, *p: os.PathLike, size_chunk: int = SIZE_CHUNK) -> os.PathLike:
    """
    Joins path components under the chunk directory of a stream, chunking the raw file first if needed.

    The chunk directory is `<dir_base>/chunked/<stream>`. If it holds no `*.txt.gz` file, or if any
    such file has size zero, the raw file `<dir_base>/<stream>.txt.gz` is cut again, line by line,
    into chunks named `0000.txt.gz`, `0001.txt.gz`, and so on.

    The method is wrapped in `dask.delayed(pure=True)`: the result must be computed to get the path.

    stream
        Name of the data stream (e.g. "dns").
    p
        Path components to append to the chunk directory.
    size_chunk
        Limit handed to each `FileChunk`; a chunk is closed once it reports being full.
    """
    dir_chunks = op.join(self.dir_base, "chunked", stream)
    os.makedirs(dir_chunks, exist_ok=True)

    chunks_present = glob(op.join(dir_chunks, "*.txt.gz"))
    must_chunk = not chunks_present or any(op.getsize(chunk) == 0 for chunk in chunks_present)
    if must_chunk:
        # Raw files have not been chunked yet, or some chunks are corrupted. It's chunking time.
        with gzip.open(op.join(self.dir_base, f"{stream}.txt.gz"), "rb") as file_raw:
            for index in range(sys.maxsize):
                path_chunk = op.join(dir_chunks, f"{index:04d}.txt.gz")
                chunk_filled = False
                with FileChunk(path_chunk, size_chunk) as file_chunk:
                    # Resumes reading the raw file where the previous chunk stopped.
                    for line in file_raw:
                        if not file_chunk.write(line):
                            chunk_filled = True
                            break
                if not chunk_filled:
                    # The raw file ran out before this chunk got full: nothing left to cut.
                    break

    return op.join(dir_chunks, *p)

**Tests** for method `join_chunked`:

In [7]:
T = TypeVar("T")


@contextmanager
def mocking_global(name: str, value_mock: T) -> ContextManager[T]:
    """
    Temporarily binds a global name of this module to a substitute value.

    On exit, the previous binding is restored; if the name was not bound beforehand,
    it is removed again.
    """
    namespace = globals()
    absent = object()  # Sentinel distinguishing "name was unbound" from any real value.
    previous = namespace.get(name, absent)
    namespace[name] = value_mock

    try:
        yield value_mock
    finally:
        if previous is absent:
            del namespace[name]
        else:
            namespace[name] = previous

In [8]:
def mock_file_raw(lines: Iterable[str]) -> Mock:
    """
    Builds a stand-in for an open raw file: a context manager that iterates over the given lines.

    All iterations share one single iterator, so that a loop picks up where the previous one stopped.
    """
    iter_lines = iter(lines)

    def enter(self):
        return self

    def leave(self, t, v, tb):
        return False

    def iterate(self):
        return iter_lines

    file_raw = Mock()
    file_raw.__enter__ = enter
    file_raw.__exit__ = leave
    file_raw.__iter__ = iterate
    return file_raw


@contextmanager
def mocking_gzip_open(lines: Iterable[str]) -> ContextManager[Mock]:
    """
    Patches `gzip.open` so that it returns a mock raw file iterating over the given lines.

    Yields the mock standing in for `gzip.open` itself.
    """
    file_raw = mock_file_raw(lines)
    patcher = patch("gzip.open", return_value=file_raw)
    with patcher as mock_open:
        yield mock_open

In [9]:
def mock_file_chunk(**kwargs: Any) -> Mock:
    """
    Builds a stand-in for a `FileChunk` instance: a context manager with a mocked `write` method.

    The keyword arguments configure the `write` mock (e.g. `side_effect`, `return_value`).
    """
    def enter(self):
        return self

    def leave(self, t, v, tb):
        return False

    chunk = Mock()
    chunk.__enter__ = enter
    chunk.__exit__ = leave
    chunk.write = Mock(**kwargs)
    return chunk


@contextmanager
def mocking_FileChunk(mocks: Sequence[Mock]) -> ContextManager[Mock]:
    """
    Replaces the global `FileChunk` with a mock class whose successive instantiations
    return the given mocks, in order. Yields that mock class.
    """
    mock_class = Mock(side_effect=mocks)
    with mocking_global("FileChunk", mock_class) as substituted:
        yield substituted

In [10]:
%%test join-chunked/Stop
mocks_chunk = [
    mock_file_chunk(**kwargs)
    for kwargs in [dict(side_effect=[True, False]), dict(side_effect=[True, False]), dict(return_value=True)]
]
with patch("os.makedirs"), patch("glob.glob", return_value=[]),\
        mocking_gzip_open([b"asdf\n", b"qwer\n", b"zxcv\n", b"qwerty\n", b"uiop\n"]) as mock_raw,\
        mocking_FileChunk(mocks_chunk) as mock_class:
    ds = DataStoreLosAlamos("/path/to/data")
    assert ds.join_chunked("dns", "asdf", "qwer", size_chunk=10).compute(scheduler="single-threaded") ==\
        "/path/to/data/chunked/dns/asdf/qwer"

    mock_class.assert_has_calls(
        [call(f"/path/to/data/chunked/dns/{i:04d}.txt.gz", 10) for i in range(3)]
    )
    mocks_chunk[0].write.assert_has_calls([call(s) for s in [b"asdf\n", b"qwer\n"]])
    mocks_chunk[1].write.assert_has_calls([call(s) for s in [b"zxcv\n", b"qwerty\n"]])
    mocks_chunk[2].write.assert_has_calls([call(s) for s in [b"uiop\n"]])

Test [1mjoin-chunked/Stop[0m passed.


In [11]:
%%test join-chunked/End of raw file corresponds to end of chunk
mocks_chunk = [
    mock_file_chunk(**kwargs)
    for kwargs in [dict(side_effect=[True, False]), dict(side_effect=[True, False]), dict(return_value=True)]
]
with patch("os.makedirs"), patch("glob.glob", return_value=[]),\
        mocking_gzip_open([b"asdf\n", b"qwer\n", b"zxcv\n", b"qwerty\n"]) as mock_raw,\
        mocking_FileChunk(mocks_chunk) as mock_class:
    ds = DataStoreLosAlamos("/path/to/data")
    assert ds.join_chunked("dns", "asdf", "qwer", size_chunk=10).compute(scheduler="single-threaded") ==\
        "/path/to/data/chunked/dns/asdf/qwer"

    mock_class.assert_has_calls(
        [call(f"/path/to/data/chunked/dns/{i:04d}.txt.gz", 10) for i in range(3)]
    )
    mocks_chunk[0].write.assert_has_calls([call(s) for s in [b"asdf\n", b"qwer\n"]])
    mocks_chunk[1].write.assert_has_calls([call(s) for s in [b"zxcv\n", b"qwerty\n"]])
    mocks_chunk[2].write.assert_not_called()

Test [1mjoin-chunked/End of raw file corresponds to end of chunk[0m passed.


In [12]:
%%test join-chunked/Raw file is empty
mocks_chunk = [
    mock_file_chunk(**kwargs)
    for kwargs in [dict(side_effect=[True, False]), dict(side_effect=[True, False]), dict(return_value=True)]
]
with patch("os.makedirs"), patch("glob.glob", return_value=[]),\
        mocking_gzip_open([]) as mock_raw,\
        mocking_FileChunk(mocks_chunk) as mock_class:
    ds = DataStoreLosAlamos("/path/to/data")
    assert ds.join_chunked("dns", "asdf", "qwer").compute(scheduler="single-threaded") ==\
        "/path/to/data/chunked/dns/asdf/qwer"

    mock_class.assert_called_once_with("/path/to/data/chunked/dns/0000.txt.gz", SIZE_CHUNK)
    for mock in mocks_chunk:
        mock.write.assert_not_called()

Test [1mjoin-chunked/Raw file is empty[0m passed.


Class `FileChunk` then embodies the creation of a chunk and the transfer of its content into the target file. Note that `join_chunked()` enters the context of a `FileChunk` instance before it has any content for the chunk; the creation of the file is thus delayed until the first call to method `write()`.

In [13]:
class FileChunk:
    """
    Delays the creation of a chunk file until the user commits to writing something in it.

    Used as a context manager: the gzip file at `path` is only opened on the first call to
    `write()`, and it is closed when the context is exited. Leaving the context without
    having written anything thus creates no file at all.
    """

    def __init__(self, path: os.PathLike, limit: int) -> None:
        """
        path
            Where the compressed chunk file gets created.
        limit
            Number of (uncompressed) bytes from which the chunk is deemed full.
        """
        self._path = path
        # Opened lazily by write(); remains None as long as nothing has been written.
        self._file: Optional[io.BufferedIOBase] = None
        self._limit = limit
        self._size = 0  # Uncompressed bytes written so far.

    def __enter__(self) -> "FileChunk":
        return self

    def __exit__(self, type_exc, value_exc, tb_exc) -> bool:
        if self._file is not None:
            self._file.close()
        return False  # Never swallow an exception raised within the context.

    def write(self, buf: bytes) -> bool:
        """
        Writes the whole buffer to the chunk file, creating that file if this is the first write.

        Returns True while the chunk can take more content, and False once the total number of
        bytes written has reached or exceeded the limit. The buffer that crosses the limit is
        still written in full.
        """
        if self._file is None:
            self._file = gzip.open(self._path, "wb")

        # The underlying write may take only part of the buffer: repeat until all of it is in.
        index = 0
        while index < len(buf):
            index += self._file.write(buf[index:])

        self._size += len(buf)
        return self._size < self._limit

**Tests**:

In [14]:
%%test FileChunk/No file created without write
# Entering and leaving the context without any write must never open the file.
with patch("gzip.open") as mock:
    with FileChunk("asdf", 100) as file_chunk:
        pass
    mock.assert_not_called()

Test [1mFileChunk/No file created without write[0m passed.


In [15]:
%%test FileChunk.write/All written in one single underlying write
# An in-memory buffer stands in for the gzip file, so what gets written can be inspected.
bytes_written = io.BytesIO()
with patch("gzip.open", return_value=bytes_written):
    with FileChunk("asdf", 100) as file_chunk:
        assert file_chunk.write(b"qwerty\n")
        # Checked before the context exits, as exiting closes the buffer.
        assert bytes_written.getvalue() == b"qwerty\n"

Test [1mFileChunk.write/All written in one single underlying write[0m passed.


In [16]:
%%test FileChunk.write/Multiple underlying writes needed
# The underlying file takes 3 bytes, then 4: the second write must carry the remainder only.
with patch("gzip.GzipFile") as mock:
    mock.return_value.write = Mock(side_effect=[3, 4])
    with FileChunk("asdf", 100) as file_chunk:
        assert file_chunk.write(b"qwerty\n")
    mock.return_value.write.assert_has_calls([call(b'qwerty\n',), call(b'rty\n',)])

Test [1mFileChunk.write/Multiple underlying writes needed[0m passed.


In [17]:
def test_chunk_filling(last: bytes) -> None:
    """
    Fills a chunk limited to 25 bytes with 20 bytes, then writes `last`: only that final
    write must report the chunk as full, and everything must have reached the file.
    """
    sink = io.BytesIO()
    accepted = [b"asdf\nqwer\n", b"zxcv\n", b"uiop\n"]
    with patch("gzip.open", return_value=sink), FileChunk("asdf", 25) as file_chunk:
        for piece in accepted:
            assert file_chunk.write(piece)
        assert not file_chunk.write(last)
        # Inspected before leaving the context, as doing so closes the buffer.
        assert sink.getvalue() == b"".join(accepted) + last


# One registered test where the last write reaches the limit, one where it goes past it.
for adverb, last in [("exactly", b"1234\n"), ("beyond", b"1234567890\n")]:
    suite.test(
        test_chunk_filling,
        args=(last,),
        name=f"FileChunk.write/Return False once chunk once {adverb} full"
    )

Test [1mFileChunk.write/Return False once chunk once exactly full[0m passed.
Test [1mFileChunk.write/Return False once chunk once beyond full[0m passed.


## Experiments repository

In [18]:
@DataStoreLosAlamos.method
def join_experiments(self, *p: os.PathLike) -> os.PathLike:
    """
    Joins the given path components under the `experiments` subdirectory of the data store.
    """
    dir_experiments = op.join(self.dir_base, "experiments")
    return op.join(dir_experiments, *p)

## Loading a data stream into a Dask dataframe

Dask dataframes easily leverage the chunking of the streams that we have wrought. However, given how the chunks are already sorted, one gets the best benefits from these by supplying knowledge of the *divisions* of the index key (here, time) across the partitions. This is why we implement a custom dataframe loading that quickly extracts the division knowledge.

### Event timestamps

The Los Alamos Cybersecurity dataset was captured over a period of two months, but the exact dates are unknown; the timestamps provided in the dataset start at 0. Mapping these directly to timestamps would give the events funny 1970s dates, so we choose a more modern setting instead. Given that the dataset was released in late 2015, we shall assume the acquisition ran from January 1st, 2015, to February 27th, 2015.

In [19]:
@growing
class Time:
    """
    Bounds of the period over which the dataset is assumed to have been acquired.
    """
    # Moment that corresponds to timestamp 0 of the dataset.
    START = pd.Timestamp("2015-01-01T00:00:00")
    # Last nanosecond of February 27th, 2015.
    END = pd.Timestamp("2015-02-28T00:00:00") - pd.Timedelta(nanoseconds=1)


Time.END

Timestamp('2015-02-27 23:59:59.999999999')

In [20]:
def seconds2ts(n: str) -> pd.Timestamp:
    """
    Maps a number of seconds since the beginning of the acquisition to a timestamp.

    The number is converted through `int()` before it is added to `Time.START`.
    """
    offset = pd.Timedelta(seconds=int(n))
    return Time.START + offset

In [21]:
%%test Timestamp mapping
# 3600 seconds into the acquisition is one hour past Time.START.
assert seconds2ts("3600") == pd.Timestamp("2015-01-01T01:00:00")

Test [1mTimestamp mapping[0m passed.


### Data schemas

The `SCHEMAS` dictionary describes the columns for each of the four main data streams, in addition to the label array stored in `redteam.txt.gz`. Each stream is sorted (and thus indexable) by its `time` column, which is omitted from the schema descriptions to facilitate the usage of the schema objects.

In [22]:
# A schema lists (column name, dtype name) pairs, in the order the columns appear in the files.
# The leading `time` column is left out: it gets prepended where the files are read.
SCHEMA = Sequence[Tuple[str, str]]


# Schema of each data stream, by stream name. The keys also serve to recognize the stream
# a file path belongs to.
SCHEMAS: Mapping[str, SCHEMA] = {
    "dns": [
        ("computer_source", "object"),
        ("computer_destination", "object")
    ],
    "flows": [
        ("duration", "int64"),
        ("computer_source", "object"),
        ("port_source", "object"),
        ("computer_destination", "object"),
        ("port_destination", "object"),
        ("protocol", "category"),
        ("num_packets", "int32"),
        ("num_bytes", "int64")
    ],
    "proc": [
        ("userdomain_source", "object"),
        ("computer_source", "object"),
        ("process", "object"),
        ("action", "category")
    ],
    "auth": [
        ("userdomain_source", "object"),
        ("userdomain_destination", "object"),
        ("computer_source", "object"),
        ("computer_destination", "object"),
        ("auth", "category"),
        ("logon", "category"),
        ("direction", "category"),
        ("result", "category")
    ],
    "redteam": [
        ("userdomain_source", "object"),
        ("computer_source", "object"),
        ("computer_destination", "object")
    ]
}

### Figuring out which stream a file is

In [23]:
# Alternation of all known stream names, as one capturing group.
RXS_NAMES_STREAM = "({})".format("|".join(SCHEMAS))
# A stream name is recognized either as a directory along the path, or as the stem of a `.txt.gz` file.
RX_PATH2STREAM = re.compile(rf"/{RXS_NAMES_STREAM}/|{RXS_NAMES_STREAM}\.txt\.gz")

In [24]:
def path2stream(path: os.PathLike) -> str:
    """
    Figures out which data stream a path relates to.

    The stream name is looked for either as a directory along the path (`.../auth/...`) or as
    the stem of a raw file name (`.../auth.txt.gz`). Returns the stream name, or an empty
    string if the path involves no known stream.
    """
    # Path-like objects (e.g. pathlib.Path) must be turned into a string before regex matching.
    m = RX_PATH2STREAM.search(os.fspath(path))
    if m is None:
        return ""
    # Only one of the two groups is set, depending on which alternative matched.
    return m.group(1) or m.group(2)

In [25]:
%%test Stream name for a raw file
# The stream name is the stem of the `.txt.gz` file name.
assert path2stream("/data/lanl/redteam.txt.gz") == "redteam"

Test [1mStream name for a raw file[0m passed.


In [26]:
%%test Stream name for a chunk file
# The stream name is the directory that holds the chunk.
assert path2stream("/data/lanl/chunked/auth/0034.txt.gz") == "auth"

Test [1mStream name for a chunk file[0m passed.


In [27]:
%%test Stream name for a stream-specific processing result (not chunking)
# The stream name may be any directory along the path, not only one under `chunked`.
assert path2stream("/data/lanl/experiments/asdf/qwer/proc/zxcv") == "proc"

Test [1mStream name for a stream-specific processing result (not chunking)[0m passed.


In [28]:
%%test Path with no stream name
# No known stream along the path: the result is an empty (falsy) string.
assert not path2stream("/data/lanl/wtf")

Test [1mPath with no stream name[0m passed.


### Getting the first timestamp of a LANL file

In [29]:
def get_timestamp_lower(path: os.PathLike) -> pd.Timestamp:
    """
    Reads the timestamp of the first event of a gzip-compressed LANL file.

    Only the first line of the file is read; its first comma-separated field is taken as the
    number of seconds since the beginning of the acquisition.

    Raises ValueError if the file has no line at all.
    """
    with gzip.open(path, "rb") as file:
        line1 = next(file, None)
    if line1 is None:
        # Files are assumed not to be empty. Should one be, fail explicitly rather than leak
        # a StopIteration, which could silently cut short an iteration in the caller.
        raise ValueError(f"File {path} is empty: it has no first timestamp to read.")
    num_seconds, _, _ = line1.partition(b",")
    return seconds2ts(num_seconds)

In [30]:
%%test First timestamp for line of a DNS stream file
# 90842 seconds = 1 day, 1 hour, 14 minutes and 2 seconds past the start.
with patch("gzip.open", return_value=io.BytesIO(b"90842,C326,C89\n")):
    assert get_timestamp_lower("asdf") == pd.Timestamp("2015-01-02T01:14:02")

Test [1mFirst timestamp for line of a DNS stream file[0m passed.


In [31]:
%%test First timestamp for line of a flows stream file
# Only the first line counts: the second one is not even a valid event.
with patch("gzip.open", return_value=io.BytesIO(b"2957021,2,C347,50234,C812,443,https,12,15723\nqwerty\n")):
    assert get_timestamp_lower("asdf") == pd.Timestamp("2015-02-04T05:23:41")

Test [1mFirst timestamp for line of a flows stream file[0m passed.


### Loading a LANL CSV file

In [32]:
def read_lanl_csv(path: os.PathLike, **kwargs: Any) -> pd.DataFrame:
    """
    Loads a gzip-compressed LANL event file into a dataframe indexed by event time.

    The stream the file belongs to is deduced from its path, and determines the column names
    and types. Additional keyword arguments are handed over to `pandas.read_csv`.

    Raises ValueError if the path does not relate to any known stream.
    """
    stream = path2stream(path)
    if not stream:
        raise ValueError(f"Path {path} does not involve a LANL data stream.")

    schema = SCHEMAS[stream]
    columns = ["time"]
    columns.extend(attr for attr, _ in schema)

    # NOTE(review): parameter `date_parser` is deprecated as of pandas 2.0 -- to be revisited
    # when upgrading pandas.
    return pd.read_csv(
        path,
        header=None,
        names=columns,
        dtype=dict(schema),
        parse_dates=["time"],
        date_parser=seconds2ts,
        index_col="time",
        compression="gzip",
        **kwargs
    )

In [33]:
@contextmanager
def dummy_proc_content() -> ContextManager[os.PathLike]:
    """
    Makes any file opened through `gzip` deliver four compressed events of the `proc` stream.

    Yields a dummy path that involves the `proc` stream.
    """
    events = (
        b"3,C3@DOM1,C4,P2,Start\n",
        b"18,C89@DOM1,C23,P78,Start\n",
        b"29,C14@DOM1,C90,P123,Start\n",
        b"53,C90@DOM1,C34,P23,End\n",
    )
    file_compressed = io.BytesIO(gzip.compress(b"".join(events)))
    with patch("gzip.builtins.open", return_value=file_compressed):
        yield "/path/with/proc/"

In [34]:
%%test Reading LANL content
with dummy_proc_content() as path:
    df = read_lanl_csv(path)
# Four events, four columns: the time column has become the index.
assert len(df) == 4
assert len(df.columns) == 4
assert df.index.dtype == np.dtype("datetime64[ns]")
# Column types must be those of the `proc` schema.
assert {c: str(dt) for c, dt in df.dtypes.items()} == {
    "userdomain_source": "object",
    "computer_source": "object",
    "process": "object",
    "action": "category"
}

Test [1mReading LANL content[0m passed.


### Putting it all together into a Dask dataframe

In [35]:
@DataStoreLosAlamos.method
def get_stream(self, name: str) -> ddf.DataFrame:
    """
    Assembles the chunks of a stream into a Dask dataframe, with one partition per chunk.

    The divisions of the time index are supplied up front: the first timestamp of each chunk
    (taken in sorted path order), closed by `Time.END`.
    """
    schema = SCHEMAS[name]
    pattern_chunks = self.join_chunked(name, "*.txt.gz").compute()
    paths_chunk = sorted(glob(pattern_chunks))

    divisions = [get_timestamp_lower(path) for path in paths_chunk]
    divisions.append(Time.END)

    # Each chunk only gets loaded when its partition is computed.
    partitions = [dask.delayed(read_lanl_csv)(path) for path in paths_chunk]
    return ddf.from_delayed(
        partitions,
        meta=dict(schema),
        divisions=divisions,
        prefix="load_chunk"
    )

In [36]:
%%test Stream dataframe coherence
ds = DataStoreLosAlamos("/lanl")
# Chunk paths are delivered out of order by the mocked glob.
indices = [3, 2, 4, 0, 1]
# Chunking is bypassed (join_chunked is mocked), and the first timestamp of each chunk is
# canned: the successive calls to get_timestamp_lower return the timestamps listed below.
with patch("__main__.glob", side_effect=[[f"/lanl/flows/{n:04d}.txt.gz" for n in indices]]),\
        patch.object(ds, "join_chunked", new=MagicMock()),\
        patch(
            "__main__.get_timestamp_lower",
            side_effect=[pd.Timestamp(s) for s in [
                "2015-01-01T00:00:04",
                "2015-01-11T12:45:32",
                "2015-01-27T18:19:19",
                "2015-02-12T14:10:23",
                "2015-02-23T18:02:38"
            ]]
        ):
    df = ds.get_stream("flows")
    # One partition per chunk; divisions are the chunks' first timestamps, closed by Time.END.
    assert df.npartitions == 5
    assert list(df.index.divisions) == [pd.Timestamp(s) for s in [
        "2015-01-01T00:00:04",
        "2015-01-11T12:45:32",
        "2015-01-27T18:19:19",
        "2015-02-12T14:10:23",
        "2015-02-23T18:02:38",
        "2015-02-27T23:59:59.999999999"
    ]]
    assert dict(df.dtypes) == dict(SCHEMAS["flows"])

Test [1mStream dataframe coherence[0m passed.


In [37]:
@suite.test(name="Resolving a complete dataframe")
def resolving_whole_dataframe():
    """
    Builds the dataframe of the `auth` stream out of three fake chunks, computes it, and
    compares every resulting row against the raw events.
    """
    # Content of the three chunks, in order.
    all_events = [
        b"""\
    63,U34@DOM1,U23@DOM1,C98,C98,Kerberos,Network,LogOn,Success
    91,U67@DOM1,SYSTEM@C89,C89,C89,Negotiate,Service,LogOn,Success
    """,
        b"""\
    304,U45@DOM1,U45@DOM1,C234,C329,Kerberos,Network,LogOff,Success
    897,U93@DOM1,U93@DOM1,C123,C123,Kerberos,Network,LogOn,Success
    """,
        b"""\
    956,U93@DOM1,U93@DOM1,C123,C123,Kerberos,Network,LogOff,Success
    3456,U67@DOM1,U45@DOM1,C89,C329,Kerberos,Network,LogOn,Failure
    4127,U980@DOM1,U980@DOM1,C23,C32,Kerberos,Service,LogOn,Success
    """
    ]

    # Chunk path -> chunk content. These paths are what the mocked glob reports below.
    map_content = {
        op.join("/lanl", "chunked", "auth", f"{n:02d}.txt.gz"): content
        for n, content in enumerate(all_events)
    }

    def grab_content(path: os.PathLike, *args, **kwargs) -> io.RawIOBase:
        # Stands in for the built-in open() used by gzip: serves the compressed chunk content.
        return io.BytesIO(gzip.compress(map_content[path]))

    ds = DataStoreLosAlamos("asdf")
    # Chunking is bypassed (join_chunked is mocked); chunk listing and file opening are faked.
    with patch.object(ds, "join_chunked", new=MagicMock()),\
            patch("__main__.glob", side_effect=[list(map_content.keys())]),\
            patch("gzip.builtins.open", side_effect=grab_content):
        df = ds.get_stream("auth")
        assert df.npartitions == 3
        # Divisions: the first timestamp of each chunk (63, 304 and 956 seconds), then Time.END.
        assert list(df.divisions) == [pd.Timestamp(s) for s in [
            "2015-01-01T00:01:03",
            "2015-01-01T00:05:04",
            "2015-01-01T00:15:56"
        ]] + [Time.END]
        # Chunks are read here, hence while file opening is still faked.
        df_realized = df.compute()
        assert dict(df_realized.dtypes) == dict(SCHEMAS["auth"])

    # Compare each row of the dataframe against the corresponding raw event line; zip() stops
    # at the last dataframe row, leaving out what trails the last line break.
    for line, ts_and_row in zip(b"".join(all_events).split(b"\n"), df_realized.iterrows()):
        num_seconds_expected, *cols_expected = line.split(b",")
        ts_expected = seconds2ts(num_seconds_expected)
        ts_obtained, row_obtained = ts_and_row
        assert ts_expected == ts_obtained
        assert [str(c, encoding="utf-8") for c in cols_expected] == list(row_obtained)

Test [1mResolving a complete dataframe[0m passed.


## Provide all streams in one query

In [38]:
@DataStoreLosAlamos.method
def streams(self) -> Mapping[str, ddf.DataFrame]:
    """
    Provides the dataframes of the four main data streams, by stream name.

    The chunking of all four streams is triggered together beforehand, so that it may
    proceed in parallel across the cluster.
    """
    names = ("auth", "dns", "flows", "proc")

    # First ensure all these streams have been chunked; leverage parallel cluster computation.
    chunkings = [self.join_chunked(name).persist() for name in names]
    dask.compute(chunkings)
    del chunkings

    return {name: self.get_stream(name) for name in names}

In [39]:
@suite.test
def gathering_all_streams():
    """
    Checks that `streams()` gathers all four streams, and that their chunking runs in
    parallel: the whole call must take about as long as the slowest chunking alone.
    """
    # Simulated duration of the chunking of each stream, in seconds.
    stream_delay: Mapping[str, float] = {
        "auth": 4.0,
        "dns": 0.5,
        "flows": 1.0,
        "proc": 2.0
    }
    longest = max(stream_delay.values())

    @dask.delayed
    def mock_join_chunked(name: str) -> os.PathLike:
        # Stands in for the chunking: merely takes time.
        time.sleep(stream_delay[name])
        return name  # Unused
    
    def mock_get_stream(name: str) -> Tuple[str]:
        # Recognizable stand-in for the dataframe of a stream.
        return (name,)
    
    # One single-threaded worker per stream.
    cluster = LocalCluster(n_workers=4, threads_per_worker=1, dashboard_address=None)
    client = Client(cluster)
    try:
        ds = DataStoreLosAlamos("dummy")
        with patch.object(ds, "join_chunked", new=Mock(side_effect=mock_join_chunked)),\
                patch.object(ds, "get_stream", side_effect=mock_get_stream):
            tic = time.time()
            assert ds.streams() == {name: (name,) for name in ["auth", "dns", "flows", "proc"]}
            toc = time.time()
            # Elapsed time must be within 0.1 second of the longest delay: had the chunkings
            # run one after the other, it would be the sum of the delays instead.
            assert(abs(toc - tic - longest) < 0.1)
    finally:
        client.close()
        cluster.close()

Test [1mgathering_all_streams[0m passed.


# Test summary

In [40]:
# Only when the notebook is run directly: sum up the outcome of all the tests above.
if __name__ == "__main__":
    _ = summarize_results(suite)

19 passed, [37m0 failed[0m, [37m0 raised an error[0m
