# Los Alamos Cybersecurity dataset tools

Includes tools to manipulate dataset storage, as well as load events from said storage.

In [1]:
%load_ext pycodestyle_magic
%flake8_on --max_line_length 120 --ignore W293,E302

In [2]:
import notebooks_as_modules

from collections import OrderedDict
from contextlib import contextmanager
import dask
import dask.dataframe as ddf
from dask.distributed import Client, LocalCluster
from glob import glob
from growing import growing
import gzip
import igraph as ig
import io
from jupytest import Suite, Report, Magic, summarize_results, assert_, eq, approx, Explanation, ExplanationOnFailure, \
    join_args, fail
import numpy as np
import os
import os.path as op
import pandas as pd
import re
import shutil
import sys
import time
from typing import *  # noqa
from unittest.mock import patch, Mock, call, MagicMock

In [3]:
# Test suite to which the tests declared throughout this notebook are attached.
suite = Suite()
if __name__ == "__main__":
    # Only when the notebook is the main program (not imported as a module): attach the extra plugins --
    # presumably result reporting and the `%%test` cell magic used below; confirm against jupytest.
    suite |= Report()
    suite |= Magic()

## Dataset chunking

This dataset is large! To facilitate its processing, it is best to cut its bigger files into *chunks*, which can be processed in parallel.

Let's embody the intended file hierarchy of the LANL dataset into a class.

In [4]:
@growing
class DataStoreLosAlamos:
    """
    Main files making up the Los Alamos Cybersecurity dataset.

    An instance only remembers the base directory of the dataset; paths built by its methods
    are derived from that directory.
    """

    def __init__(self, path: os.PathLike) -> None:
        # Base directory of the dataset, exposed read-only through `dir_base`.
        self._dir_base = path

    @property
    def dir_base(self) -> os.PathLike:
        """Base directory given at construction."""
        return self._dir_base

    def __dask_tokenize__(self) -> os.PathLike:
        # Hook looked up by Dask when it hashes task arguments: the base directory is all that
        # distinguishes one data store from another.
        return self.dir_base

Cutting the raw files into compressed chunks is a long-running computation. Let's structure it as a delayed Dask task, so that it can run on a compute cluster, and only when we need it.

In [5]:
# Default size limit of a chunk: 64 MiB + 32 MiB. FileChunk compares it against the number of bytes handed
# to write() (i.e. before compression), and only after writing: a chunk may overshoot it by one line.
SIZE_CHUNK = (2 << 25) + (2 << 24)  # 96 MB maximum

In [6]:
@DataStoreLosAlamos.method(wrapped_in=dask.delayed(pure=True))
def join_chunked(self, stream: str, *p: os.PathLike, size_chunk: int = SIZE_CHUNK) -> os.PathLike:
    """
    Joins path components under the chunk directory of a stream, chunking the raw stream file first if needed.

    The raw file `<dir_base>/<stream>.txt.gz` is cut into files `<dir_base>/chunked/<stream>/NNNN.txt.gz`
    when that directory holds no chunk yet, or when any chunk in it has size zero.

    Parameters:
        stream: name of the stream.
        p: path components appended to the chunk directory of the stream.
        size_chunk: size limit handed to each FileChunk.

    Returns:
        The chunk directory of the stream, joined with components `p`.
    """
    dir_chunks = op.join(self.dir_base, "chunked", stream)
    os.makedirs(dir_chunks, exist_ok=True)

    paths_existing = glob(op.join(dir_chunks, "*.txt.gz"))
    must_chunk = not paths_existing or any(os.stat(path).st_size == 0 for path in paths_existing)
    if must_chunk:
        # Raw files have not been chunked yet, or some chunks are corrupted. It's chunking time.
        with gzip.open(op.join(self.dir_base, f"{stream}.txt.gz"), "rb") as file_raw:
            index = 0
            raw_exhausted = False
            while not raw_exhausted:
                path_chunk = op.join(self.dir_base, "chunked", stream, f"{index:04d}.txt.gz")
                with FileChunk(path_chunk, size_chunk) as file_chunk:
                    # Assume the raw file runs dry, unless the chunk fills up first. The iteration over
                    # file_raw resumes where the previous chunk stopped.
                    raw_exhausted = True
                    for line in file_raw:
                        if not file_chunk.write(line):
                            raw_exhausted = False
                            break
                index += 1

    return op.join(dir_chunks, *p)

**Tests** for method `join_chunked`:

In [7]:
T = TypeVar("T")


@contextmanager
def mocking_global(name: str, value_mock: T) -> ContextManager[T]:
    """
    Temporarily binds a global name of this module to a mock value.

    On exit, the original value is restored if the name was bound before entering the context;
    otherwise the name is removed.
    """
    namespace = globals()
    was_defined = name in namespace
    value_orig = namespace.get(name)
    namespace[name] = value_mock

    try:
        yield value_mock
    finally:
        if was_defined:
            namespace[name] = value_orig
        else:
            del namespace[name]

In [8]:
def mock_file_raw(lines: Iterable[str]) -> Mock:
    """
    Makes a mock of an open raw file: a context manager yielding itself, iterable over the given lines.

    Every iteration over the mock shares one iterator, so a loop interrupted by `break` resumes at the next
    line when the mock is iterated again.
    """
    remaining = iter(lines)

    def enter(self):
        return self

    def leave(self, type_exc, value_exc, tb_exc):
        return False

    def iterate(self):
        return remaining

    file_raw = Mock()
    file_raw.__enter__ = enter
    file_raw.__exit__ = leave
    file_raw.__iter__ = iterate
    return file_raw


@contextmanager
def mocking_gzip_open(lines: Iterable[str]) -> ContextManager[Mock]:
    """
    Replaces `gzip.open` with a mock returning a mocked raw file that iterates over the given lines.

    Yields the mock standing for `gzip.open`.
    """
    file_raw = mock_file_raw(lines)
    with patch("gzip.open", return_value=file_raw) as mock_open:
        yield mock_open

In [9]:
def mock_file_chunk(**kwargs: Any) -> Mock:
    """
    Makes a mock of a FileChunk instance: a context manager yielding itself, with a mocked `write` method.

    The keyword arguments configure the mock of `write` (e.g. `return_value`, `side_effect`).
    """
    def enter(self):
        return self

    def leave(self, type_exc, value_exc, tb_exc):
        return False

    file_chunk = Mock()
    file_chunk.write = Mock(**kwargs)
    file_chunk.__enter__ = enter
    file_chunk.__exit__ = leave
    return file_chunk


@contextmanager
def mocking_FileChunk(mocks: Sequence[Mock]) -> ContextManager[Mock]:
    """
    Temporarily replaces global `FileChunk` with a mock class, whose successive instantiations return the
    given mocks, in order.

    Yields the mock standing for the class.
    """
    mock_class = Mock(side_effect=mocks)
    with mocking_global("FileChunk", mock_class) as mock:
        yield mock

In [10]:
%%test join-chunked/Stop
# The first two mocked chunks accept one line, then report being full on the second; the third never fills.
mocks_chunk = [
    mock_file_chunk(**kwargs)
    for kwargs in [dict(side_effect=[True, False]), dict(side_effect=[True, False]), dict(return_value=True)]
]
# `glob` is bound in this module by `from glob import glob`: it must be patched where join_chunked() looks it
# up (this module). Patching `glob.glob` would leave the real function in use.
with patch("os.makedirs"), patch("__main__.glob", return_value=[]),\
        mocking_gzip_open([b"asdf\n", b"qwer\n", b"zxcv\n", b"qwerty\n", b"uiop\n"]),\
        mocking_FileChunk(mocks_chunk) as mock_class:
    ds = DataStoreLosAlamos("/path/to/data")
    assert_(
        eq,
        actual=ds.join_chunked("dns", "asdf", "qwer", size_chunk=10).compute(scheduler="single-threaded"),
        expected="/path/to/data/chunked/dns/asdf/qwer"
    )

    # Five lines, two per full chunk: three chunks are made, the last one holding the remaining line.
    mock_class.assert_has_calls(
        [call(f"/path/to/data/chunked/dns/{i:04d}.txt.gz", 10) for i in range(3)]
    )
    mocks_chunk[0].write.assert_has_calls([call(s) for s in [b"asdf\n", b"qwer\n"]])
    mocks_chunk[1].write.assert_has_calls([call(s) for s in [b"zxcv\n", b"qwerty\n"]])
    mocks_chunk[2].write.assert_has_calls([call(s) for s in [b"uiop\n"]])

Test [1mjoin-chunked/Stop[0m passed.


In [11]:
%%test join-chunked/End of raw file corresponds to end of chunk
# The first two mocked chunks accept one line, then report being full on the second; the third never fills.
mocks_chunk = [
    mock_file_chunk(**kwargs)
    for kwargs in [dict(side_effect=[True, False]), dict(side_effect=[True, False]), dict(return_value=True)]
]
# `glob` is bound in this module by `from glob import glob`: it must be patched where join_chunked() looks it
# up (this module). Patching `glob.glob` would leave the real function in use.
with patch("os.makedirs"), patch("__main__.glob", return_value=[]),\
        mocking_gzip_open([b"asdf\n", b"qwer\n", b"zxcv\n", b"qwerty\n"]),\
        mocking_FileChunk(mocks_chunk) as mock_class:
    ds = DataStoreLosAlamos("/path/to/data")
    assert_(
        eq,
        actual=ds.join_chunked("dns", "asdf", "qwer", size_chunk=10).compute(scheduler="single-threaded"),
        expected="/path/to/data/chunked/dns/asdf/qwer"
    )

    # The raw file runs dry exactly as the second chunk fills up: a third chunk object is still instantiated,
    # but nothing gets written to it.
    mock_class.assert_has_calls(
        [call(f"/path/to/data/chunked/dns/{i:04d}.txt.gz", 10) for i in range(3)]
    )
    mocks_chunk[0].write.assert_has_calls([call(s) for s in [b"asdf\n", b"qwer\n"]])
    mocks_chunk[1].write.assert_has_calls([call(s) for s in [b"zxcv\n", b"qwerty\n"]])
    mocks_chunk[2].write.assert_not_called()

Test [1mjoin-chunked/End of raw file corresponds to end of chunk[0m passed.


In [12]:
%%test join-chunked/Raw file is empty
mocks_chunk = [
    mock_file_chunk(**kwargs)
    for kwargs in [dict(side_effect=[True, False]), dict(side_effect=[True, False]), dict(return_value=True)]
]
# `glob` is bound in this module by `from glob import glob`: it must be patched where join_chunked() looks it
# up (this module). Patching `glob.glob` would leave the real function in use.
with patch("os.makedirs"), patch("__main__.glob", return_value=[]),\
        mocking_gzip_open([]),\
        mocking_FileChunk(mocks_chunk) as mock_class:
    ds = DataStoreLosAlamos("/path/to/data")
    assert_(
        eq,
        actual=ds.join_chunked("dns", "asdf", "qwer").compute(scheduler="single-threaded"),
        expected="/path/to/data/chunked/dns/asdf/qwer"
    )

    # With no line to read, a single chunk object is instantiated (with the default size limit), and nothing
    # is ever written.
    mock_class.assert_called_once_with("/path/to/data/chunked/dns/0000.txt.gz", SIZE_CHUNK)
    for mock in mocks_chunk:
        mock.write.assert_not_called()

Test [1mjoin-chunked/Raw file is empty[0m passed.


Class `FileChunk` then embodies the creation of a chunk and the transfer of its content into the target file. Note that the algorithm of `join_chunked()` made it so the context of the `FileChunk` instance is entered before we have any content for the chunk; the creation of the file should thus be delayed to a call to method `write()`.

In [13]:
class FileChunk:
    """
    Delays the creation of a chunk file until the user commits to writing something in it.

    Used as a context manager: the underlying gzip file, if it was ever opened, is closed on exit.
    """

    def __init__(self, path: os.PathLike, limit: int) -> None:
        """
        Parameters:
            path: path of the gzip-compressed chunk file to create.
            limit: number of bytes (counted before compression) from which the chunk is deemed full.
        """
        self._path = path
        # Opened lazily by write(); stays None as long as nothing has been written.
        self._file: Optional[gzip.GzipFile] = None
        self._limit = limit
        self._size = 0  # Number of bytes accepted so far, counted before compression.

    def __enter__(self) -> "FileChunk":
        return self

    def __exit__(self, type_exc, value_exc, tb_exc) -> bool:
        if self._file is not None:
            self._file.close()
        return False  # Exceptions raised within the context are never swallowed.

    def write(self, buf: bytes) -> bool:
        """
        Writes the whole buffer to the chunk, creating the chunk file on first use.

        Returns:
            True if the chunk can take more data; False once the size limit has been reached or exceeded.

        Raises:
            OSError: the underlying file reports that it wrote nothing, so the buffer can never be
                written out completely.
        """
        if self._file is None:
            self._file = gzip.open(self._path, "wb")

        index = 0
        while index < len(buf):
            # The underlying write may be partial: resume from the first byte not yet written.
            num_written = self._file.write(buf[index:])
            if not num_written:
                # Without progress, this loop would spin forever.
                raise OSError(f"No progress writing to chunk file {self._path}")
            index += num_written

        self._size += len(buf)
        return self._size < self._limit

**Tests**:

In [14]:
%%test FileChunk/No file created without write
# Entering and leaving the context without writing anything must never open the chunk file.
with patch("gzip.open") as mock:
    with FileChunk("asdf", 100) as file_chunk:
        pass
    mock.assert_not_called()

Test [1mFileChunk/No file created without write[0m passed.


In [15]:
%%test FileChunk.write/All written in one single underlying write
# An in-memory buffer stands for the chunk file, so what FileChunk writes can be inspected.
bytes_written = io.BytesIO()
with patch("gzip.open", return_value=bytes_written):
    with FileChunk("asdf", 100) as file_chunk:
        assert file_chunk.write(b"qwerty\n")
        # Checked within the context: leaving it closes the buffer.
        assert_(eq, actual=bytes_written.getvalue(), expected=b"qwerty\n")

Test [1mFileChunk.write/All written in one single underlying write[0m passed.


In [16]:
%%test FileChunk.write/Multiple underlying writes needed
# The underlying file object first reports writing only 3 bytes out of 7, then the 4 remaining ones:
# FileChunk must call it again with what is left of the buffer.
with patch("gzip.GzipFile") as mock:
    mock.return_value.write = Mock(side_effect=[3, 4])
    with FileChunk("asdf", 100) as file_chunk:
        assert file_chunk.write(b"qwerty\n")
    mock.return_value.write.assert_has_calls([call(b'qwerty\n',), call(b'rty\n',)])

Test [1mFileChunk.write/Multiple underlying writes needed[0m passed.


In [17]:
def test_chunk_filling(last: bytes) -> None:
    """
    Checks that a chunk limited to 25 bytes accepts 20 bytes over three writes, then reports being full
    upon writing `last` -- which is still written in its entirety.
    """
    sink = io.BytesIO()
    accepted = [b"asdf\nqwer\n", b"zxcv\n", b"uiop\n"]
    with patch("gzip.open", return_value=sink), FileChunk("asdf", 25) as file_chunk:
        for buf in accepted:
            assert file_chunk.write(buf)
        assert not file_chunk.write(last)
        assert_(eq, actual=sink.getvalue(), expected=b"".join(accepted) + last)
    

# Register the test twice: once with a last write bringing the chunk to exactly 25 bytes (20 + 5), once with
# a last write going beyond that limit (20 + 11).
for adverb, last in [("exactly", b"1234\n"), ("beyond", b"1234567890\n")]:
    suite.test(
        test_chunk_filling,
        args=(last,),
        name=f"FileChunk.write/Return False once chunk once {adverb} full"
    )

Test [1mFileChunk.write/Return False once chunk once exactly full[0m passed.
Test [1mFileChunk.write/Return False once chunk once beyond full[0m passed.


## Experiments repository

In [18]:
@DataStoreLosAlamos.method
def join_experiments(self, *p: os.PathLike) -> os.PathLike:
    """
    Joins the given path components under subdirectory `experiments` of the data store's base directory.
    """
    dir_experiments = op.join(self.dir_base, "experiments")
    return op.join(dir_experiments, *p)

## Loading a data stream into a Dask dataframe

Dask dataframes easily leverage the chunking of the streams that we have wrought. However, given how the chunks are already sorted, one gets the best benefits from these by supplying knowledge of the *divisions* of the index key (here, time) across the partitions. This is why we implement a custom dataframe loading that quickly extracts the division knowledge.

### Event timestamps

The Los Alamos Cybersecurity dataset has been captured over a period of two months, but the exact dates are unknown; the timestamps provided in the dataset start at 0. Mapping these directly to Unix timestamps would give the events funny 1970s dates, so we rather choose a more modern setting. Given that the dataset was released in late 2015, we shall assume the acquisition ran from January 1st, 2015, to the end of February 27th, 2015.

In [19]:
@growing
class Time:
    """Bounds of the period onto which the relative timestamps of the dataset are mapped."""
    # Instant corresponding to timestamp 0.
    START = pd.Timestamp("2015-01-01T00:00:00")
    # One nanosecond before February 28th, 2015: the very end of February 27th.
    END = pd.Timestamp("2015-02-28T00:00:00") - pd.Timedelta(nanoseconds=1)


Time.END

Timestamp('2015-02-27 23:59:59.999999999')

In [20]:
def seconds2ts(n: Union[str, bytes, int]) -> pd.Timestamp:
    """
    Maps a number of seconds since the start of the dataset to a timestamp.

    The number is given as anything `int()` can convert: get_timestamp_lower() passes the bytes read
    from a file, while the test of this function passes a string.
    """
    return Time.START + pd.Timedelta(seconds=int(n))

In [21]:
%%test Timestamp mapping
# 3600 seconds after the start of the period is 1:00 on January 1st, 2015.
assert_(eq, actual=seconds2ts("3600"), expected=pd.Timestamp("2015-01-01T01:00:00"))

Test [1mTimestamp mapping[0m passed.


### Data schemas

The `SCHEMAS` dictionary describes the columns for each of the four main data streams, in addition to the label array stored in `redteam.txt.gz`. Each stream is sorted (and thus indexable) by its `time` column, which is omitted from the schema descriptions to facilitate the usage of the schema objects.

In [22]:
# A schema is a sequence of (column name, dtype name) pairs, in column order.
SCHEMA = Sequence[Tuple[str, str]]


# Schema of each stream, keyed by stream name. The leading `time` column of every stream is left out:
# read_lanl_csv() adds it back as the index.
SCHEMAS: Mapping[str, SCHEMA] = {
    "dns": [
        ("host_focus", "object"),
        ("host_resolved", "object")
    ],
    "flows": [
        ("duration", "int64"),
        ("host_focus", "object"),
        ("port_focus", "object"),
        ("host_server", "object"),
        ("port_server", "object"),
        ("protocol", "category"),
        ("num_packets", "int32"),
        ("num_bytes", "int64")
    ],
    "proc": [
        ("userdomain_focus", "object"),
        ("host_focus", "object"),
        ("process", "object"),
        ("action", "category")
    ],
    "auth": [
        ("userdomain_init", "object"),
        ("userdomain_focus", "object"),
        ("host_init", "object"),
        ("host_focus", "object"),
        ("auth", "category"),
        ("logon", "category"),
        ("direction", "category"),
        ("result", "category")
    ],
    # Not one of the four main streams: the labels stored in `redteam.txt.gz`.
    "redteam": [
        ("userdomain_focus", "object"),
        ("host_init", "object"),
        ("host_focus", "object")
    ]
}

### Figuring out which stream a file is

In [23]:
# Alternation of all the stream names, as one capture group.
RXS_NAMES_STREAM = r"(" + "|".join(SCHEMAS.keys()) + ")"
# A path names a stream either through one of its directories (`/<stream>/`, group 1), or through the name of
# a raw file (`<stream>.txt.gz`, group 2). Neither alternative is anchored to the ends of the path.
RX_PATH2STREAM = re.compile(r"/" + RXS_NAMES_STREAM + r"/|" + RXS_NAMES_STREAM + r"\.txt\.gz")

In [24]:
def path2stream(path: os.PathLike) -> str:
    """
    Determines which stream a path is about.

    Parameters:
        path: path to examine, as a string or any path-like object (e.g. `pathlib.Path`).

    Returns:
        The name of the stream the path matches, either through a directory named after the stream or
        through the name of a raw stream file; the empty string if the path matches no stream.
    """
    # os.fspath() leaves strings untouched and converts path-like objects, which regular expressions reject.
    m = RX_PATH2STREAM.search(os.fspath(path))
    if m is None:
        return ""
    # Only one of the two groups is set, depending on which alternative matched.
    return m.group(1) or m.group(2)

In [25]:
%%test Stream name for a raw file
# The stream is named by the raw file itself.
assert_(eq, actual=path2stream("/data/lanl/redteam.txt.gz"), expected="redteam")

Test [1mStream name for a raw file[0m passed.


In [26]:
%%test Stream name for a chunk file
# The stream is named by the directory holding the chunk, not by the chunk file.
assert_(eq, actual=path2stream("/data/lanl/chunked/auth/0034.txt.gz"), expected="auth")

Test [1mStream name for a chunk file[0m passed.


In [27]:
%%test Stream name for a stream-specific processing result (not chunking)
# The stream is named by a directory in the middle of the path.
assert_(eq, actual=path2stream("/data/lanl/experiments/asdf/qwer/proc/zxcv"), expected="proc")

Test [1mStream name for a stream-specific processing result (not chunking)[0m passed.


In [28]:
%%test Path with no stream name
# No stream name along the path: the empty string comes back.
assert not path2stream("/data/lanl/wtf")

Test [1mPath with no stream name[0m passed.


### Getting the first timestamp of a LANL file

In [29]:
def get_timestamp_lower(path: os.PathLike) -> pd.Timestamp:
    """
    Reads the timestamp of the first event of a gzip-compressed LANL file.

    Parameters:
        path: path to the file; the first field of its comma-separated lines is a number of seconds.

    Returns:
        The timestamp of the first line of the file.

    Raises:
        ValueError: the file holds no line at all.
    """
    with gzip.open(path, "rb") as file:
        line1 = next(file, None)
        if line1 is None:
            # Fail with the offending path, rather than leak a bare StopIteration to the caller.
            raise ValueError(f"LANL file {path} is empty: it has no first timestamp.")
        num_seconds, *_ = line1.split(b",")
        return seconds2ts(num_seconds)

In [30]:
%%test First timestamp for line of a DNS stream file
# 90842 seconds make 1 day, 1 hour, 14 minutes and 2 seconds.
with patch("gzip.open", return_value=io.BytesIO(b"90842,C326,C89\n")):
    assert_(eq, actual=get_timestamp_lower("asdf"), expected=pd.Timestamp("2015-01-02T01:14:02"))

Test [1mFirst timestamp for line of a DNS stream file[0m passed.


In [31]:
%%test First timestamp for line of a flows stream file
# Only the first line is looked at: the second line, which has no timestamp, must not matter.
with patch("gzip.open", return_value=io.BytesIO(b"2957021,2,C347,50234,C812,443,https,12,15723\nqwerty\n")):
    assert_(eq, actual=get_timestamp_lower("asdf"), expected=pd.Timestamp("2015-02-04T05:23:41"))

Test [1mFirst timestamp for line of a flows stream file[0m passed.


### Loading a LANL CSV file

In [32]:
def read_lanl_csv(path: os.PathLike, **kwargs: Any) -> pd.DataFrame:
    """
    Loads a gzip-compressed LANL CSV file into a dataframe indexed by time.

    The stream the file belongs to is deduced from its path, which determines the names and types of the
    columns.

    Parameters:
        path: path to the file.
        kwargs: further keyword arguments passed on to `pd.read_csv`.

    Returns:
        Dataframe with the columns of the stream's schema, indexed by column `time` converted to timestamps.

    Raises:
        ValueError: the path involves no known stream.
    """
    stream = path2stream(path)
    if not stream:
        raise ValueError(f"Path {path} does not involve a LANL data stream.")

    schema = SCHEMAS[stream]
    columns = ["time"]
    columns.extend(attr for attr, _ in schema)
    # NOTE(review): `date_parser` is documented as deprecated since pandas 2.0 -- confirm the pandas version
    # targeted before upgrading.
    options = dict(
        header=None,
        names=columns,
        dtype=dict(schema),
        parse_dates=["time"],
        date_parser=seconds2ts,
        index_col="time",
        compression="gzip"
    )
    return pd.read_csv(path, **options, **kwargs)

In [33]:
@contextmanager
def dummy_proc_content() -> ContextManager[os.PathLike]:
    content = b"""\
3,C3@DOM1,C4,P2,Start
18,C89@DOM1,C23,P78,Start
29,C14@DOM1,C90,P123,Start
53,C90@DOM1,C34,P23,End
"""
    with patch("gzip.builtins.open", return_value=io.BytesIO(gzip.compress(content))):
        yield "/path/with/proc/"

In [34]:
%%test Reading LANL content
# The dummy content holds four events of stream proc: four rows are expected, with the four columns of that
# stream typed as its schema says, and a timestamp index.
with dummy_proc_content() as path:
    df = read_lanl_csv(path)
assert_(eq, actual=len(df), expected=4)
assert_(eq, actual=len(df.columns), expected=4)
assert_(eq, actual=df.index.dtype, expected=np.dtype("datetime64[ns]"))
assert_(
    eq,
    actual={c: str(dt) for c, dt in df.dtypes.items()},
    expected={
        "userdomain_focus": "object",
        "host_focus": "object",
        "process": "object",
        "action": "category"
    }
)

Test [1mReading LANL content[0m passed.


### Putting it all together into a Dask dataframe

In [35]:
@DataStoreLosAlamos.method
def get_stream(self, name: str) -> ddf.DataFrame:
    """
    Makes a Dask dataframe out of the chunks of a stream, one partition per chunk.

    The divisions of the dataframe are the first timestamp of each chunk (chunks taken in sorted path
    order), followed by Time.END.

    Parameters:
        name: name of the stream, a key of SCHEMAS.

    Returns:
        Dask dataframe indexed by time, with the columns of the stream's schema.
    """
    pattern_chunks = self.join_chunked(name, "*.txt.gz").compute()
    paths_chunk = sorted(glob(pattern_chunks))

    divisions = []
    for path in paths_chunk:
        divisions.append(get_timestamp_lower(path))
    divisions.append(Time.END)

    schema = dict(SCHEMAS[name])
    partitions = [dask.delayed(read_lanl_csv)(path) for path in paths_chunk]
    # Empty dataframe telling Dask the column types and index it should expect of each partition.
    index_empty = pd.DatetimeIndex([], name="time")
    meta = pd.DataFrame(columns=schema.keys(), index=index_empty).astype(schema)
    return ddf.from_delayed(partitions, meta=meta, divisions=divisions, prefix="load_chunk", verify_meta=False)

In [36]:
%%test Stream dataframe coherence
# The chunk paths are listed out of order on purpose. Chunking and the reading of first timestamps are both
# mocked: only the structure of the resulting Dask dataframe is examined.
ds = DataStoreLosAlamos("/lanl")
indices = [3, 2, 4, 0, 1]
with patch("__main__.glob", side_effect=[[f"/lanl/flows/{n:04d}.txt.gz" for n in indices]]),\
        patch.object(ds, "join_chunked", new=MagicMock()),\
        patch(
            "__main__.get_timestamp_lower",
            side_effect=[pd.Timestamp(s) for s in [
                "2015-01-01T00:00:04",
                "2015-01-11T12:45:32",
                "2015-01-27T18:19:19",
                "2015-02-12T14:10:23",
                "2015-02-23T18:02:38"
            ]]
        ):
    df = ds.get_stream("flows")
    # One partition per chunk; the divisions are the five first timestamps, closed by Time.END.
    assert_(eq, actual=df.npartitions, expected=5)
    assert_(
        eq,
        actual=list(df.index.divisions),
        expected=[pd.Timestamp(s) for s in [
            "2015-01-01T00:00:04",
            "2015-01-11T12:45:32",
            "2015-01-27T18:19:19",
            "2015-02-12T14:10:23",
            "2015-02-23T18:02:38",
            "2015-02-27T23:59:59.999999999"
        ]]
    )
    assert_(eq, actual=dict(df.dtypes), expected=dict(SCHEMAS["flows"]))

Test [1mStream dataframe coherence[0m passed.


In [37]:
@suite.test(name="Resolving a complete dataframe")
def resolving_whole_dataframe():
    """
    Builds the dataframe of stream auth out of three in-memory chunks, computes it, and compares every row
    to the line of text it comes from.
    """
    # Content of the three chunks, in order.
    all_events = [
        b"""\
    63,U34@DOM1,U23@DOM1,C98,C98,Kerberos,Network,LogOn,Success
    91,U67@DOM1,SYSTEM@C89,C89,C89,Negotiate,Service,LogOn,Success
    """,
        b"""\
    304,U45@DOM1,U45@DOM1,C234,C329,Kerberos,Network,LogOff,Success
    897,U93@DOM1,U93@DOM1,C123,C123,Kerberos,Network,LogOn,Success
    """,
        b"""\
    956,U93@DOM1,U93@DOM1,C123,C123,Kerberos,Network,LogOff,Success
    3456,U67@DOM1,U45@DOM1,C89,C329,Kerberos,Network,LogOn,Failure
    4127,U980@DOM1,U980@DOM1,C23,C32,Kerberos,Service,LogOn,Success
    """
    ]

    # Chunk path -> content of the chunk.
    map_content = {
        op.join("/lanl", "chunked", "auth", f"{n:02d}.txt.gz"): content
        for n, content in enumerate(all_events)
    }

    def grab_content(path: os.PathLike, *args, **kwargs) -> io.RawIOBase:
        # Stands for the builtin open(): serves the compressed content of the chunk at that path.
        return io.BytesIO(gzip.compress(map_content[path]))

    ds = DataStoreLosAlamos("asdf")
    with patch.object(ds, "join_chunked", new=MagicMock()),\
            patch("__main__.glob", side_effect=[list(map_content.keys())]),\
            patch("gzip.builtins.open", side_effect=grab_content):
        df = ds.get_stream("auth")
        assert_(eq, actual=df.npartitions, expected=3)
        # Seconds 63, 304 and 956 start the three chunks.
        assert_(
            eq,
            actual=list(df.divisions),
            expected=[pd.Timestamp(s) for s in [
                "2015-01-01T00:01:03",
                "2015-01-01T00:05:04",
                "2015-01-01T00:15:56"
            ]] + [Time.END]
        )
        # Computed while the mocks are still in place, as this is when the chunks get read.
        df_realized = df.compute()
        assert_(eq, actual=dict(df_realized.dtypes), expected=dict(SCHEMAS["auth"]))

    # zip() stops with the rows of the dataframe, leaving out the blank remainder that ends the text.
    for line, ts_and_row in zip(b"".join(all_events).split(b"\n"), df_realized.iterrows()):
        num_seconds_expected, *cols_expected = line.split(b",")
        ts_expected = seconds2ts(num_seconds_expected)
        ts_obtained, row_obtained = ts_and_row
        assert_(eq, expected=ts_expected, obtained=ts_obtained)
        assert_(eq, expected=[str(c, encoding="utf-8") for c in cols_expected], obtained=list(row_obtained))

Test [1mResolving a complete dataframe[0m passed.


## Provide all streams in one query

In [38]:
@DataStoreLosAlamos.method
def streams(self) -> Mapping[str, ddf.DataFrame]:
    """
    Provides the Dask dataframes of the four main streams, keyed by stream name.

    The chunking of all four streams is submitted before any of it is waited for.
    """
    names = ["auth", "dns", "flows", "proc"]

    # First ensure all these streams have been chunked; leverage parallel cluster computation.
    chunkings = []
    for name in names:
        chunkings.append(self.join_chunked(name).persist())
    dask.compute(chunkings)
    del chunkings

    dataframes = {}
    for name in names:
        dataframes[name] = self.get_stream(name)
    return dataframes

In [39]:
@suite.test
def gathering_all_streams():
    """
    Checks that method streams() chunks the four streams in parallel: with chunking mocked as a sleep of a
    duration specific to each stream, the whole call is expected to last about as long as the longest sleep.
    """
    # Duration (in seconds) of the mocked chunking of each stream.
    stream_delay: Mapping[str, float] = {
        "auth": 4.0,
        "dns": 0.5,
        "flows": 1.0,
        "proc": 2.0
    }
    longest = max(stream_delay.values())

    @dask.delayed
    def mock_join_chunked(name: str) -> os.PathLike:
        time.sleep(stream_delay[name])
        return name  # Unused

    def mock_get_stream(name: str) -> Tuple[str]:
        return (name,)

    # One single-threaded worker per stream, so that the four sleeps can overlap.
    cluster = LocalCluster(n_workers=4, threads_per_worker=1, dashboard_address=None)
    client = Client(cluster)
    try:
        ds = DataStoreLosAlamos("dummy")
        with patch.object(ds, "join_chunked", new=Mock(side_effect=mock_join_chunked)),\
                patch.object(ds, "get_stream", side_effect=mock_get_stream):
            tic = time.time()
            assert_(eq, actual=ds.streams(), expected={name: (name,) for name in ["auth", "dns", "flows", "proc"]})
            toc = time.time()
            # NOTE(review): the second argument of approx() is presumably a tolerance -- confirm with jupytest.
            assert_(approx(longest, 0.1), toc - tic)
    finally:
        client.close()
        cluster.close()

Test [1mgathering_all_streams[0m passed.


## Combination of all four streams in a single stream

This is useful for software that needs to consider the heterogeneous telemetry as a single homogeneous stream.

In [40]:
def combine_streams(
    ds: DataStoreLosAlamos,
    streams: Sequence[str] = [],
    start: Optional[pd.Timestamp] = None,
    end: Optional[pd.Timestamp] = None,
) -> ddf.DataFrame:
    """
    Concatenates streams of a data store into a single dataframe, restricted to a time interval.

    Parameters:
        ds: data store to get the streams from.
        streams: names of the streams to combine; when empty, the four main streams are combined.
        start: beginning of the time interval; Time.START if omitted.
        end: end of the time interval; Time.END if omitted.

    Returns:
        Dask dataframe concatenating the streams, where added column `stream` names the stream each row
        comes from.
    """
    names = streams or ["auth", "dns", "flows", "proc"]
    interval = slice(start or Time.START, end or Time.END)

    restricted = []
    for name in names:
        df_stream = ds.get_stream(name)[interval]
        df_stream["stream"] = name
        restricted.append(df_stream)

    return ddf.concat(restricted, interleave_partitions=True)

In [41]:
def dataframes_equal(**dfs: pd.DataFrame) -> ExplanationOnFailure:
    """
    Assertion predicate that compares exactly two dataframes, given as keyword arguments.

    Returns True when the dataframes are equal, and an explanation that includes both dataframes otherwise.
    """
    left, right = dfs.values()
    if left.equals(right):
        return True
    return Explanation("The two dataframes are not equal", join_args([], dfs))

In [42]:
import numpy as np
import pandas as pd


class MockLANL(DataStoreLosAlamos):
    """
    Data store that serves a handful of hard-coded events for each of the four main streams.
    """

    def __init__(self):
        super().__init__("dummy")

    def get_stream(self, name: str) -> ddf.DataFrame:
        """
        Provides the hard-coded events of the named stream as a single-partition Dask dataframe, indexed by
        the timestamp of the events.
        """
        events_by_stream = {
            "auth": {
                "index": [pd.Timestamp("2015-01-05T13:00:05")],
                "userdomain_init": ["U3@DOM1"],
                "userdomain_focus": ["U3@DOM1"],
                "host_init": ["C328"],
                "host_focus": ["C345"],
                "auth": ["NTLM"],
                "logon": ["?"],
                "direction": ["LogOn"],
                "result": ["Success"]
            },
            "proc": {
                "index": [pd.Timestamp("2015-01-05T13:00:06"), pd.Timestamp("2015-01-05T13:00:32")],
                "userdomain_focus": ["U4@DOM1", "U3@DOM1"],
                "host_focus": ["C45", "C45"],
                "process": ["P3254", "P129"],
                "action": ["Start", "End"]
            },
            "flows": {
                "index": [pd.Timestamp("2015-01-05T13:00:03")],
                "duration": [3],
                "host_focus": ["C89"],
                "port_focus": ["N435"],
                "host_server": ["C2390"],
                "port_server": ["443"],
                "protocol": ["3"],
                "num_packets": [10],
                "num_bytes": [1454]
            },
            "dns": {
                "index": [pd.Timestamp("2015-01-05T13:00:03"), pd.Timestamp("2015-01-05T13:00:24")],
                "host_focus": ["C89", "C234"],
                "host_resolved": ["C2390", "C123"]
            }
        }
        events = pd.DataFrame(data=events_by_stream[name]).set_index("index")
        return ddf.from_pandas(events, npartitions=1)

In [43]:
def test_df(data: Mapping[str, Sequence]) -> pd.DataFrame:
    df = pd.DataFrame(data=data)
    map_types = {
        "category": "object",
        "int64": "float64",
        "int32": "float64"
    }
    dict_dtype = {k: map_types.get(v, v) for k, v in set(sum(list(SCHEMAS.values()), [])) if k in df.columns}
    return df.astype(dict_dtype).set_index("time").sort_index()

In [44]:
%%test Combination of selected streams
# Only streams dns and flows are combined: the columns a stream lacks are expected to hold NaN on the rows
# of that stream.
assert_(
    dataframes_equal,
    expected=test_df({
        "time": [pd.Timestamp(s) for s in ["2015-01-05T13:00:03", "2015-01-05T13:00:03", "2015-01-05T13:00:24"]],
        "host_focus": ["C89", "C89", "C234"],
        "host_resolved": ["C2390", np.nan, "C123"],
        "stream": ["dns", "flows", "dns"],
        "duration": [np.nan, 3, np.nan],
        "port_focus": [np.nan, "N435", np.nan],
        "host_server": [np.nan, "C2390", np.nan],
        "port_server": [np.nan, "443", np.nan],
        "protocol": [np.nan, "3", np.nan],
        "num_packets": [np.nan, 10, np.nan],
        "num_bytes": [np.nan, 1454, np.nan]
    }),
    combination=combine_streams(MockLANL(), ["dns", "flows"]).compute().sort_index()
)

Test [1mCombination of selected streams[0m passed.


In [45]:
%%test Combination of all streams
# No stream named: all four are combined. Expected rows are listed stream by stream (auth, proc, dns, flows);
# test_df() sorts them by time.
assert_(
    dataframes_equal,
    expected=test_df({
        "time": [pd.Timestamp(s) for s in [
            "2015-01-05T13:00:05",
            "2015-01-05T13:00:06",
            "2015-01-05T13:00:32",
            "2015-01-05T13:00:03",
            "2015-01-05T13:00:24",
            "2015-01-05T13:00:03"
        ]],
        "userdomain_init": ["U3@DOM1", np.nan, np.nan, np.nan, np.nan, np.nan],
        "userdomain_focus": ["U3@DOM1", "U4@DOM1", "U3@DOM1", np.nan, np.nan, np.nan],
        "host_init": ["C328", np.nan, np.nan, np.nan, np.nan, np.nan],
        "host_focus": ["C345", "C45", "C45", "C89", "C234", "C89"],
        "auth": ["NTLM", np.nan, np.nan, np.nan, np.nan, np.nan],
        "logon": ["?", np.nan, np.nan, np.nan, np.nan, np.nan],
        "direction": ["LogOn", np.nan, np.nan, np.nan, np.nan, np.nan],
        "result": ["Success", np.nan, np.nan, np.nan, np.nan, np.nan],
        "stream": ["auth", "proc", "proc", "dns", "dns", "flows"],
        "host_resolved": [np.nan, np.nan, np.nan, "C2390", "C123", np.nan],
        "duration": [np.nan, np.nan, np.nan, np.nan, np.nan, 3],
        "port_focus": [np.nan, np.nan, np.nan, np.nan, np.nan, "N435"],
        "host_server": [np.nan, np.nan, np.nan, np.nan, np.nan, "C2390"],
        "port_server": [np.nan, np.nan, np.nan, np.nan, np.nan, "443"],
        "protocol": [np.nan, np.nan, np.nan, np.nan, np.nan, "3"],
        "num_packets": [np.nan, np.nan, np.nan, np.nan, np.nan, 10],
        "num_bytes": [np.nan, np.nan, np.nan, np.nan, np.nan, 1454],
        "process": [np.nan, "P3254", "P129", np.nan, np.nan, np.nan],
        "action": [np.nan, "Start", "End", np.nan, np.nan, np.nan]
    }),
    combination=combine_streams(MockLANL()).compute().sort_index()
)

Test [1mCombination of all streams[0m passed.


In [46]:
%%test Combination of all streams beyond a timestamp
# Only a start is given (13:00:10): of the six mock events, only the proc event at 13:00:32 and the dns event
# at 13:00:24 are expected. The columns of the streams left without any row remain, filled with NaN.
assert_(
    dataframes_equal,
    expected=test_df({
        "time": [pd.Timestamp(s) for s in [
            "2015-01-05T13:00:32",
            "2015-01-05T13:00:24"
        ]],
        "userdomain_init": [np.nan, np.nan],
        "userdomain_focus": ["U3@DOM1", np.nan],
        "host_init": [np.nan, np.nan],
        "host_focus": ["C45", "C234"],
        "auth": [np.nan, np.nan],
        "logon": [np.nan, np.nan],
        "direction": [np.nan, np.nan],
        "result": [np.nan, np.nan],
        "stream": ["proc", "dns"],
        "host_resolved": [np.nan, "C123"],
        "duration": [np.nan, np.nan],
        "port_focus": [np.nan, np.nan],
        "host_server": [np.nan, np.nan],
        "port_server": [np.nan, np.nan],
        "protocol": [np.nan, np.nan],
        "num_packets": [np.nan, np.nan],
        "num_bytes": [np.nan, np.nan],
        "process": ["P129", np.nan],
        "action": ["End", np.nan]
    }),
    combination=combine_streams(MockLANL(), start=pd.Timestamp("2015-01-05T13:00:10")).compute().sort_index()
)

Test [1mCombination of all streams beyond a timestamp[0m passed.


In [47]:
%%test Combination of all streams prior to a timestamp
# Only an end is given (13:00:05), and it is inclusive: the auth event at that very instant is expected,
# along with the dns and flows events at 13:00:03.
assert_(
    dataframes_equal,
    expected=test_df({
        "time": [pd.Timestamp(s) for s in [
            "2015-01-05T13:00:05",
            "2015-01-05T13:00:03",
            "2015-01-05T13:00:03"
        ]],
        "userdomain_init": ["U3@DOM1", np.nan, np.nan],
        "userdomain_focus": ["U3@DOM1", np.nan, np.nan],
        "host_init": ["C328", np.nan, np.nan],
        "host_focus": ["C345", "C89", "C89"],
        "auth": ["NTLM", np.nan, np.nan],
        "logon": ["?", np.nan, np.nan],
        "direction": ["LogOn", np.nan, np.nan],
        "result": ["Success", np.nan, np.nan],
        "stream": ["auth", "dns", "flows"],
        "host_resolved": [np.nan, "C2390", np.nan],
        "duration": [np.nan, np.nan, 3],
        "port_focus": [np.nan, np.nan, "N435"],
        "host_server": [np.nan, np.nan, "C2390"],
        "port_server": [np.nan, np.nan, "443"],
        "protocol": [np.nan, np.nan, "3"],
        "num_packets": [np.nan, np.nan, 10],
        "num_bytes": [np.nan, np.nan, 1454],
        "process": [np.nan, np.nan, np.nan],
        "action": [np.nan, np.nan, np.nan]
    }),
    combination=combine_streams(MockLANL(), end=pd.Timestamp("2015-01-05T13:00:05")).compute().sort_index()
)

Test [1mCombination of all streams prior to a timestamp[0m passed.


In [48]:
%%test Combination of all streams within a time interval
# Both bounds are given (13:00:05 to 13:00:10), the start being inclusive: the auth event at 13:00:05 and the
# proc event at 13:00:06 are expected.
assert_(
    dataframes_equal,
    expected=test_df({
        "time": [pd.Timestamp(s) for s in [
            "2015-01-05T13:00:05",
            "2015-01-05T13:00:06"
        ]],
        "userdomain_init": ["U3@DOM1", np.nan],
        "userdomain_focus": ["U3@DOM1", "U4@DOM1"],
        "host_init": ["C328", np.nan],
        "host_focus": ["C345", "C45"],
        "auth": ["NTLM", np.nan],
        "logon": ["?", np.nan],
        "direction": ["LogOn", np.nan],
        "result": ["Success", np.nan],
        "stream": ["auth", "proc"],
        "host_resolved": [np.nan, np.nan],
        "duration": [np.nan, np.nan],
        "port_focus": [np.nan, np.nan],
        "host_server": [np.nan, np.nan],
        "port_server": [np.nan, np.nan],
        "protocol": [np.nan, np.nan],
        "num_packets": [np.nan, np.nan],
        "num_bytes": [np.nan, np.nan],
        "process": [np.nan, "P3254"],
        "action": [np.nan, "Start"]
    }),
    combination=combine_streams(
        MockLANL(),
        start=pd.Timestamp("2015-01-05T13:00:05"),
        end=pd.Timestamp("2015-01-05T13:00:10")
    ).compute().sort_index()
)

Test [1mCombination of all streams within a time interval[0m passed.


## Writing a stream to CSV

We restrict writing back to CSV to streams that fit in a reasonable amount of memory, meaning that of a single cluster node. In such a case, the stream is reduced from a Dask dataframe to a regular, single-node Pandas dataframe.

In [49]:
import os


def write_stream_csv(path: os.PathLike, stream: ddf.DataFrame, **options: Any) -> None:
    """
    Materializes a stream and writes it as a CSV file.

    The stream is computed into a single dataframe, sorted by its index, and given a leading `time` column that
    holds, for each row, the whole number of seconds elapsed between `Time.START` and the row's index value.
    The index itself is not written.

    path
        Destination of the CSV data.
    stream
        Dataframe to write. It is fully computed, so it must fit in memory.
    options
        Further keyword arguments, forwarded as is to the dataframe's `to_csv` method.
    """
    def seconds_since_start(timestamp) -> int:
        return int((timestamp - Time.START).total_seconds())

    events = stream.compute().sort_index()
    column_time = events.index.to_series().apply(seconds_since_start)
    events.insert(0, "time", column_time)
    events.to_csv(path, index=False, **options)

## Selecting a feature subset for refining representation

When computing artifact-specific numerical representations of data, sometimes one may want to restrict further analysis steps to a subset of artifacts. While embracing the joint analysis of all artifacts can be done (at high runtime and memory usage), it may yield a very generic representation less able to model normal activity *tightly*: such a representation may suffer from reduced sensitivity to anomalous phenomena. So artifact subsets can bring a tighter representation that performs better at anomaly detection. Naturally, only experimental evidence can verify or undermine such hypotheses.

In [50]:
# Maps the name of an artifact to the number of its row in a restricted feature matrix.
ArtifactIndex = Mapping[str, int]
# A restricted feature matrix, paired with the index that locates each artifact's row in it.
FeaturesRestricted = Tuple[np.ndarray, ArtifactIndex]

### Restricting to subset of artifact types

Restricting, for instance, to only host or focused host features could be used to detect as-yet unknown malicious activity over a whole network.

In this dataset, not just any combination of vertex types makes sense. Rather, one may want to work only with `host_focus` artifact features, or with all artifacts whose name starts with `host`. So the type is given as a prefix of the actual encoded artifact name.

In [51]:
def _select_features(features: np.ndarray, vertices: Sequence[ig.Vertex]) -> FeaturesRestricted:
    """
    Restricts a feature matrix to the rows associated to the given vertices.

    The row of a vertex is the one at the vertex's own index. Rows are kept in the order in which vertex names are
    first met; when several vertices share a name, the row of the last one is used.

    Returns the restricted matrix, along with the mapping of each vertex name to its row number in that matrix.
    """
    index_by_name: ArtifactIndex = OrderedDict()
    for vertex in vertices:
        index_by_name[vertex["name"]] = vertex.index

    rows_kept = list(index_by_name.values())
    row_by_name = {name: row for row, name in enumerate(index_by_name)}
    return features[rows_kept, :], row_by_name

In [52]:
def select_by_type(type_artifact: str, features: np.ndarray, graph: ig.Graph) -> FeaturesRestricted:
    """
    Restricts features to the artifacts of a given type.

    type_artifact
        Prefix that the name of a vertex must start with for its features to be kept.
    features
        Feature matrix to restrict.
    graph
        Graph whose vertices, taken in graph order, are filtered on their `name` attribute.
    """
    def is_of_type(vertex: ig.Vertex) -> bool:
        return vertex["name"].startswith(type_artifact)

    return _select_features(features, list(filter(is_of_type, graph.vs)))

In [53]:
def graph_and_features_for_selection_test():
    """
    Builds the small graph and the feature matrix shared by the feature selection tests.

    Returns the graph, made of 8 named vertices tied by 12 edges, and a matrix of 8 rows by 3 columns. Rows are
    meant to be addressed by vertex index, which gets sanity-checked before returning.
    """
    names_vertices = [
        "userdomain_init:U34@DOM1",
        "userdomain_focus:U34@DOM1",
        "host_init:C23",
        "host_focus:C87",
        "host_focus:C27",
        "process:P8",
        "userdomain_focus:U12@DOM1",
        "host_focus:C49"
    ]
    names_edges = [
        ("host_init:C23", "userdomain_init:U34@DOM1"),
        ("host_init:C23", "userdomain_focus:U34@DOM1"),
        ("host_init:C23", "host_focus:C87"),
        ("userdomain_init:U34@DOM1", "userdomain_focus:U34@DOM1"),
        ("userdomain_init:U34@DOM1", "host_focus:C87"),
        ("userdomain_focus:U34@DOM1", "host_focus:C87"),
        ("userdomain_focus:U34@DOM1", "host_focus:C27"),
        ("userdomain_focus:U34@DOM1", "process:P8"),
        ("process:P8", "host_focus:C27"),
        ("process:P8", "userdomain_focus:U12@DOM1"),
        ("process:P8", "host_focus:C49"),
        ("host_focus:C49", "userdomain_focus:U12@DOM1")
    ]

    graph = ig.Graph()
    # Vertices are added one by one, in order, so that each gets the index of its name in the list above.
    vertex_named: Mapping[str, ig.Vertex] = {name: graph.add_vertex(name) for name in names_vertices}
    for name_start, name_end in names_edges:
        graph.add_edge(vertex_named[name_start], vertex_named[name_end])

    features = np.array([
        [0, 6, 8],
        [5, 8, 9],
        [1, 3, 8],
        [4, 0, 0],
        [1, 1, 1],
        [10, 11, 3],
        [7, 3, 6],
        [9, 2, 2]
    ])
    indices_vertices = [vertex.index for vertex in graph.vs]
    assert (features[indices_vertices, :] == features).all()
    return graph, features

In [54]:
def arrays_equal(**kwargs: np.ndarray) -> ExplanationOnFailure:
    """
    Test predicate checking that two arrays have the same shape and nearly equal elements.

    Exactly two arrays must be given, as keyword arguments; the keywords name the arrays in the explanation
    reported on failure. Elements are compared using `np.isclose` with `atol=1e-5`, NaNs at the same position
    being deemed equal.

    Returns True when arrays match, an `Explanation` of the mismatch otherwise.
    """
    left, right = kwargs.values()
    if left.size != right.size:
        return Explanation("Arrays of distinct size", join_args([], kwargs))
    # Equal sizes do not imply equal shapes. Without this check, np.isclose would either raise on shapes that
    # cannot be broadcast together, such as (2, 3) against (3, 2), or silently compare broadcast copies of the
    # arrays, such as (3, 1) against (1, 3), instead of comparing them element to element.
    if left.shape != right.shape:
        return Explanation("Arrays of distinct shape", join_args([], kwargs))
    if not np.isclose(left, right, equal_nan=True, atol=1e-5).all():
        return Explanation("Arrays not equal within tolerance", join_args([], kwargs))
    return True

In [55]:
%%test Selecting features by single vertex type
graph, features = graph_and_features_for_selection_test()
selected, a2i = select_by_type("host_focus", features, graph)
# Only the rows of the three `host_focus` vertices should remain, in graph order.
assert_(
    arrays_equal,
    expected=np.array([
        [4, 0, 0],
        [1, 1, 1],
        [9, 2, 2]
    ]),
    selected=selected
)
# The artifact index returned alongside the rows must be handed to the assertion too, lest it go unchecked.
assert_(eq, expected={"host_focus:C87": 0, "host_focus:C27": 1, "host_focus:C49": 2}, a2i=a2i)

Test [1mSelecting features by single vertex type[0m passed.


In [56]:
%%test Selecting features by more general vertex type
graph, features = graph_and_features_for_selection_test()
selected, a2i = select_by_type("host", features, graph)
# Prefix `host` matches both `host_init` and `host_focus` vertices; their rows should remain, in graph order.
assert_(
    arrays_equal,
    expected=np.array([
        [1, 3, 8],
        [4, 0, 0],
        [1, 1, 1],
        [9, 2, 2]
    ]),
    selected=selected
)
# The artifact index returned alongside the rows must be handed to the assertion too, lest it go unchecked.
assert_(
    eq,
    expected={"host_init:C23": 0, "host_focus:C87": 1, "host_focus:C27": 2, "host_focus:C49": 3},
    a2i=a2i
)

Test [1mSelecting features by more general vertex type[0m passed.


### Restricting to the neighbourhood of a set of artifacts

When investigating a suspected compromise around a certain host, looking at anomalies in a representation of all artifacts one step away from it in the graph (its *family*) may bear fruit.

In [57]:
def select_family(artifacts_: Union[str, Container[str]], features: np.ndarray, graph: ig.Graph) -> FeaturesRestricted:
    """
    Restricts features to a set of artifacts and to their direct neighbours in the graph (their *family*).

    artifacts_
        Either the name of a single artifact, or a collection of such names (it gets iterated to build a set).
    features
        Feature matrix to restrict.
    graph
        Graph where the named vertices and their neighbours are looked up.

    Returns the restricted matrix and the mapping of artifact names to its rows. Rows are ordered by increasing
    vertex index, so the result is reproducible from one run to the next.
    """
    artifacts: Set[str] = {artifacts_} if isinstance(artifacts_, str) else set(artifacts_)

    seeds = graph.vs(name_in=artifacts)
    # Family members are gathered as integer vertex indices rather than as vertex objects: deduplication and
    # ordering then rely neither on how vertex objects hash, nor on the arbitrary iteration order of a set of them.
    # This also spares concatenating neighbour lists with sum(), which takes quadratic time.
    indices_family = {vertex.index for vertex in seeds}
    indices_family.update(neighbour.index for vertex in seeds for neighbour in vertex.neighbors())
    return _select_features(features, [graph.vs[index] for index in sorted(indices_family)])

In [58]:
%%test Select family of a single vertex
# The family of C87 is made of C87 itself and of its three direct neighbours.
names_expected = ["host_focus:C87", "host_init:C23", "userdomain_focus:U34@DOM1", "userdomain_init:U34@DOM1"]
graph, features = graph_and_features_for_selection_test()
selected, a2i = select_family("host_focus:C87", features, graph)
assert_(eq, expected=names_expected, a2i_keys=sorted(a2i))
assert_(eq, expected=list(range(len(names_expected))), a2i_indexes=sorted(a2i.values()))
# Whatever the row order, each selected row must carry the features of the vertex it is indexed under.
for name, row in a2i.items():
    vertex, *_ = graph.vs(name=name)
    assert_(arrays_equal, expected=features[vertex.index, :], selected=selected[row, :])

Test [1mSelect family of a single vertex[0m passed.


In [59]:
%%test Select family of two neighbouring vertices
# C87 and C23 are neighbours with the same two other neighbours, so their joint family has just four members.
names_expected = ["host_focus:C87", "host_init:C23", "userdomain_focus:U34@DOM1", "userdomain_init:U34@DOM1"]
graph, features = graph_and_features_for_selection_test()
selected, a2i = select_family(["host_focus:C87", "host_init:C23"], features, graph)
assert_(eq, expected=names_expected, a2i_keys=sorted(a2i))
assert_(eq, expected=list(range(len(names_expected))), a2i_indexes=sorted(a2i.values()))
# Whatever the row order, each selected row must carry the features of the vertex it is indexed under.
for name, row in a2i.items():
    vertex, *_ = graph.vs(name=name)
    assert_(arrays_equal, expected=features[vertex.index, :], selected=selected[row, :])

Test [1mSelect family of two neighbouring vertices[0m passed.


In [60]:
%%test Select family of vertices with overlapping but not identical neighbourhoods
# C87 and C27 have only neighbour `userdomain_focus:U34@DOM1` in common; it must be counted once.
names_expected = [
    "host_focus:C27",
    "host_focus:C87",
    "host_init:C23",
    "process:P8",
    "userdomain_focus:U34@DOM1",
    "userdomain_init:U34@DOM1"
]
graph, features = graph_and_features_for_selection_test()
selected, a2i = select_family(["host_focus:C87", "host_focus:C27"], features, graph)
assert_(eq, expected=names_expected, a2i_keys=sorted(a2i))
assert_(eq, expected=list(range(len(names_expected))), a2i_indexes=sorted(a2i.values()))
# Whatever the row order, each selected row must carry the features of the vertex it is indexed under.
for name, row in a2i.items():
    vertex, *_ = graph.vs(name=name)
    assert_(arrays_equal, expected=features[vertex.index, :], selected=selected[row, :])

Test [1mSelect family of vertices with overlapping but not identical neighbourhoods[0m passed.


In [61]:
%%test Select family of vertices without a overlapping neighbourhoods
# C87 and C49 share no neighbour: the joint family is the plain union of both neighbourhoods.
names_expected = [
    "host_focus:C49",
    "host_focus:C87",
    "host_init:C23",
    "process:P8",
    "userdomain_focus:U12@DOM1",
    "userdomain_focus:U34@DOM1",
    "userdomain_init:U34@DOM1"
]
graph, features = graph_and_features_for_selection_test()
selected, a2i = select_family(["host_focus:C87", "host_focus:C49"], features, graph)
assert_(eq, expected=names_expected, a2i_keys=sorted(a2i))
assert_(eq, expected=list(range(len(names_expected))), a2i_indexes=sorted(a2i.values()))
# Whatever the row order, each selected row must carry the features of the vertex it is indexed under.
for name, row in a2i.items():
    vertex, *_ = graph.vs(name=name)
    assert_(arrays_equal, expected=features[vertex.index, :], selected=selected[row, :])

Test [1mSelect family of vertices without a overlapping neighbourhoods[0m passed.


## Labeling artifacts

The `redteam.txt.gz` file carries direct information about authentication (`auth.txt.gz`) records corresponding to logon actions by attackers. The following function provides a labeling oracle for *artifacts* from any record within a time interval.

In [62]:
@DataStoreLosAlamos.method
def get_redteam(self) -> pd.DataFrame:
    """
    Loads the red team records, read from file `redteam.txt.gz` under the base directory of the data store.
    """
    path_redteam = op.join(self.dir_base, "redteam.txt.gz")
    return read_lanl_csv(path_redteam)

In [63]:
import dask.dataframe as ddf
import pandas as pd


# An artifact is a (column name, value) pair, such as ("host_focus", "C329").
Artifact = Tuple[str, str]


@DataStoreLosAlamos.method
def label(
    self,
    artifacts: Iterator[Artifact],
    interval_time: Tuple[pd.Timestamp, pd.Timestamp]
) -> Iterator[Tuple[Artifact, float]]:
    """
    Labels the artifacts that take part in red team authentication events over a time interval.

    Records of the `auth` stream sliced to the interval are merged with the red team records, on columns `time`,
    `userdomain_focus`, `host_init` and `host_focus`. Any value found in a column of the merged records is then
    deemed malicious for that column.

    artifacts
        The (column, value) pairs to label.
    interval_time
        Bounds used to slice the `auth` stream.

    Yields ((column, value), 1.0) for each given artifact deemed malicious, and nothing for the others. This is
    a generator, so no data gets loaded before iteration starts.
    """
    time_from, time_to = interval_time
    auth_in_interval = self.get_stream("auth")[time_from:time_to]
    redteam = self.get_redteam()
    keys_event = ["time", "userdomain_focus", "host_init", "host_focus"]
    events_malicious: pd.DataFrame = auth_in_interval.merge(redteam, on=keys_event).compute()

    values_malicious = {}
    for name_column in events_malicious.columns:
        values_malicious[name_column] = set(events_malicious[name_column])

    nothing_malicious = set()
    for column, value in artifacts:
        if value not in values_malicious.get(column, nothing_malicious):
            continue
        yield ((column, value), 1.0)

In [64]:
def test_label(
    interval_time: Tuple[pd.Timestamp, pd.Timestamp],
    expected: List[Tuple[Artifact, float]]
) -> None:
    """
    Checks artifact labeling over a time interval, against a tiny data store written to a scratch directory.

    The store holds empty `proc`, `flows` and `dns` files, seven `auth` records and three `redteam` records.
    A fixed list of six artifacts is submitted for labeling; the labels so obtained are compared, once sorted,
    to the sorted expectation. The scratch directory is removed whatever the outcome.
    """
    contents_files = {
        "proc": b"",
        "flows": b"",
        "dns": b"",
        "auth": b"""\
63,U34@DOM1,U23@DOM1,C98,C98,Kerberos,Network,LogOn,Success
91,U67@DOM1,SYSTEM@C89,C89,C89,Negotiate,Service,LogOn,Success
304,U45@DOM1,U45@DOM1,C234,C329,Kerberos,Network,LogOff,Success
897,U93@DOM1,U93@DOM1,C123,C123,Kerberos,Network,LogOn,Success
956,U93@DOM1,U93@DOM1,C123,C123,Kerberos,Network,LogOff,Success
3456,U67@DOM1,U45@DOM1,C89,C329,Kerberos,Network,LogOn,Failure
4127,U980@DOM1,U980@DOM1,C23,C32,Kerberos,Service,LogOn,Success
""",
        "redteam": b"""\
91,SYSTEM@C89,C89,C89
897,U93@DOM1,C123,C123
3456,U45@DOM1,C89,C329
"""
    }
    artifacts_submitted = [
        ("userdomain_init", "U93@DOM1"),
        ("userdomain_focus", "SYSTEM@C89"),
        ("host_init", "C234"),
        ("host_focus", "C459"),
        ("host_focus", "C329"),
        ("userdomain_focus", "U980@DOM1")
    ]

    dir_lanl = "./...lanl..."
    try:
        os.makedirs(dir_lanl)
        for name_file, content in contents_files.items():
            with gzip.open(op.join(dir_lanl, name_file + ".txt.gz"), "wb") as file:
                file.write(content)

        expected_sorted = sorted(expected)
        store = DataStoreLosAlamos(dir_lanl)
        labeled_sorted = sorted(store.label(artifacts_submitted, interval_time))
        assert_(eq, expected=expected_sorted, labeled=labeled_sorted)
    finally:
        if os.path.isdir(dir_lanl):
            shutil.rmtree(dir_lanl)

In [65]:
%%test Artifact labeling across the whole timeframe
# Over the whole timeframe, each of the three red team events contributes one of the submitted artifacts.
labels_expected = [
    (("userdomain_init", "U93@DOM1"), 1.0),
    (("userdomain_focus", "SYSTEM@C89"), 1.0),
    (("host_focus", "C329"), 1.0)
]
test_label((Time.START, Time.END), labels_expected)

Test [1mArtifact labeling across the whole timeframe[0m passed.


In [66]:
%%test Artifact labeling on a prefix of the timeframe
# Stopping 1000 seconds in leaves out the red team event at second 3456, hence the label of `host_focus` C329.
labels_expected = [
    (("userdomain_init", "U93@DOM1"), 1.0),
    (("userdomain_focus", "SYSTEM@C89"), 1.0)
]
interval_prefix = (Time.START, Time.START + pd.Timedelta(seconds=1000))
test_label(interval_prefix, labels_expected)

Test [1mArtifact labeling on a prefix of the timeframe[0m passed.


# Test summary

In [67]:
# Tally test results only when the notebook runs as the main program, as opposed to being imported as a module.
if __name__ == "__main__":
    # The returned value is bound to `_` so that it does not get displayed as the cell's output.
    _ = summarize_results(suite)

32 passed, [37m0 failed[0m, [37m0 raised an error[0m
