In [None]:
# Load the pycodestyle_magic extension and turn flake8 checking on, with lines of up to 120 characters
# tolerated and the W293 and E302 checks ignored.
%load_ext pycodestyle_magic
%flake8_on --max_line_length 120 --ignore W293,E302

In [None]:
from abc import ABC, abstractmethod
from contextlib import contextmanager
from glob import glob
import gzip
import json
from multiprocessing import cpu_count
from numbers import Number
import os
import os.path as op
import pickle
import sys
from typing import Tuple, Sequence, Mapping, Callable, Any, Optional, ContextManager, List

import dask
from distributed import Client, LocalCluster
import igraph as ig
import matplotlib.pyplot as pp
import networkx as nx
import numpy as np
import pandas as pd

In [None]:
# Have matplotlib figures rendered inline, as cell outputs.
%matplotlib inline

# Anomaly detection of heterogeneous telemetry data using graph embeddings

This anomaly detector is derived from [this paper](https://arxiv.org/abs/1812.02848), whose author applied the methodology to augment the indicator strengths of a heterogeneous alert feed. We apply it here to raw event data that typically feed the heuristic detectors of such alerting appliances.

We demonstrate using the [Los Alamos Cybersecurity](https://csr.lanl.gov/data/cyber1/) dataset, in which we locate offensive events as anomalous behaviour. The dataset is composed of four streams of telemetry, each with its own schema, that report various perspectives on IT behaviour within the Los Alamos National Laboratory IT infrastructure.

In [None]:
# Root directory of the dataset on this machine; every other path of the notebook derives from it.
DIR_DATA = "/data/lanl"

Do change `DIR_DATA` so it points to a directory organized as expected by [this notebook](https://github.com/hamelin/lanl-tools/blob/master/Los%20Alamos%20Cybersecurity%20Data%20partitioning.ipynb), whose functions we use to split the dataset into time intervals.

As we go along, we will add experiment configuration to an instance of the following class. Let's make it so we can easily add methods to that instance as well.

In [None]:
class Config:
    """Named bag of experiment settings, onto which helper methods can be grafted at runtime."""

    def __init__(self, name) -> None:
        # The name is also what identifies the instance to Dask (see __dask_tokenize__).
        self._name = name

    def method(self, fn) -> Callable:
        """
        Decorator that exposes `fn` as an attribute of this very instance, named after `fn`.

        The attribute is a wrapper that passes this instance as first positional argument, ahead of
        the arguments of the caller. The function `fn` itself is returned untouched.
        """
        name_attr = fn.__name__

        def _bound(*args, **kwargs) -> Any:
            return fn(self, *args, **kwargs)

        setattr(self, name_attr, _bound)
        return fn

    def __dask_tokenize__(self, *args, **kwargs) -> str:
        """Return the name of the instance as its Dask token, whatever attributes were set on it."""
        return self._name


# Top-level configuration of the experiment; settings are attached to it as plain attributes.
cfg = Config("the-experiment")
cfg.path_lanl = DIR_DATA

In [None]:
@cfg.method
def join_raw(self, *p) -> os.PathLike:
    """Build a path under the `raw` subdirectory of the dataset root, out of the path components `p`."""
    dir_raw = op.join(self.path_lanl, "raw")
    return op.join(dir_raw, *p)


# Fail early, and say why, when DIR_DATA does not point to a directory organized as expected.
assert op.isdir(cfg.join_raw()), f"Directory {cfg.join_raw()} not found: check DIR_DATA"
assert op.isfile(cfg.join_raw("redteam.txt.gz")), (
    f"File {cfg.join_raw('redteam.txt.gz')} not found: check DIR_DATA"
)

## Prelude: dividing the data streams in workable chunks

When we download the dataset, it comes as five quasi-CSV files (the column header is absent), each compressed as a single Gzip block. This file organization makes parallel processing of substreams nearly impossible.

Given the documentation on the dataset, we know it translates very easily to dataframes. Let's divide the monolithic compressed streams into compressed chunks, each of which can become a *partition* of a Dask dataframe. Best practices when working with Dask dataframes suggest using partitions of about 100MB in size. We will not use Dask dataframes, but our handling of 100MB-partitioned data with delayed tasks should work along the same constraints and best practices.

In [None]:
# Size of each chunk, as handed to the -C option of `split` in the shell cell below.
SIZE_CHUNK = "100M"
# Where the monolithic streams live, and where their chunks will be written.
DIR_RAW, DIR_CHUNKED = (op.join(DIR_DATA, name_dir) for name_dir in ["raw", "chunked"])
os.makedirs(DIR_CHUNKED, exist_ok=True)
DIR_RAW, DIR_CHUNKED

In [None]:
# Streams that have no directory under DIR_CHUNKED yet, one name per line, so the shell cell below can
# pipe them to xargs. Sorting keeps the output of this cell identical from one run to the next, as sets
# iterate in arbitrary order.
STREAMS_TO_CHUNK = "\n".join(
    sorted({"auth", "dns", "flows", "proc"} - set(os.listdir(DIR_CHUNKED)))
)
STREAMS_TO_CHUNK

The `split` UNIX command is a perfect fit for this task. Compose it with `xargs` to chunk as-yet unchunked streams in parallel (file `redteam.txt.gz` is not so much a distinct data stream as a labeler for the `auth.txt.gz` records, so it is left alone). Unfortunately, the `!` magic precludes good visibility of the command; copy-paste it to a raw cell for examination.

In [None]:
# For each stream name received on standard input, xargs runs a bash pipeline (all pipelines in parallel)
# that creates the chunk directory of the stream, decompresses the raw stream and has `split` cut it into
# pieces of at most SIZE_CHUNK made of whole lines, each piece being gzipped on the fly into
# DIR_CHUNKED/<stream>/<4-digit number>.txt.gz.
!echo "$STREAMS_TO_CHUNK" | time xargs -I^ -P0 -t bash -x -c "mkdir -p $DIR_CHUNKED/^ && zcat $DIR_RAW/^.txt.gz | split --verbose -d -a 4 -C $SIZE_CHUNK --additional-suffix=.txt.gz --filter='gzip -c >\$FILE' - $DIR_CHUNKED/^/"

## Choosing the time interval size

Let us take a look at the offensive logons labeling data so that we can determine a data subset and time interval length that would suit the analysis. Naturally, we don't have such information in real life, but the constraints around data capture can initiate a trial-and-error process for setting the time resolution at which to perform anomaly detection. It may also be fruitful to run anomaly detection using large time intervals and periods over certain datasets, and small intervals and periods over other datasets; the practitioner will have to use the methodology and adapt it to their needs.

The moment when the data was captured is not documented. As the dataset was published sometime in 2015, let's anchor it to January 1st, 2015 -- just so we don't have to deal with weird timestamps relative to early Epoch.

In [None]:
# Arbitrary instant to which the relative timestamps of the dataset are anchored.
ORIGIN = pd.Timestamp("2015-01-01T00:00:00")
ORIGIN

In [None]:
# A column is an (attribute name, pandas dtype) pair; a schema lists the columns that follow the
# leading time column of a stream, in file order.
COLUMN = Tuple[str, str]
SCHEMA = Sequence[COLUMN]


def read_lanl_csv(
    path: os.PathLike,
    schema: SCHEMA,
    *,
    origin: Optional[pd.Timestamp] = None,
    **kwargs: Any
) -> pd.DataFrame:
    """
    Read a gzipped, header-less CSV file of the dataset into a frame indexed by event time.

    The first column of the file holds a number of seconds, which is turned into a timestamp relative
    to `origin`; this column becomes the index, named `time`.

    Parameters
    ----------
    path
        Path to the gzip-compressed CSV file.
    schema
        (name, dtype) pairs of the columns that follow the time column.
    origin
        Instant that corresponds to second 0. Defaults to the notebook-level ORIGIN.
    **kwargs
        Passed on to pandas.read_csv (e.g. `usecols`, `nrows`). The legacy `squeeze=True` option is
        handled here rather than by pandas, which dropped it in version 2.0. Options that make
        read_csv return something else than a frame (e.g. `chunksize`) are not supported.

    Returns
    -------
    The frame of events or, when `squeeze=True` was requested and a single column remains aside from
    the time index, that column as a series.
    """
    squeeze = kwargs.pop("squeeze", False)
    if origin is None:
        origin = ORIGIN

    df = pd.read_csv(
        path,
        header=None,
        names=["time"] + [attr for attr, _ in schema],
        dtype={"time": "int64", **dict(schema)},
        compression="gzip",
        **kwargs
    )
    # Convert the whole time column at once: the `date_parser` option of read_csv is deprecated, and
    # with a scalar-only parser it ends up calling Python code for every single row.
    df["time"] = origin + pd.to_timedelta(df["time"], unit="s")
    df = df.set_index("time")
    return df.squeeze("columns") if squeeze else df

In [None]:
# Columns that follow the time column in the red team file.
SCHEMA_REDTEAM = [
    ("user_domain_source", "object"),
    ("computer_source", "object"),
    ("computer_destination", "object"),
]
# Only the source computer is kept, so each labeled event comes down to one entry of a time-indexed series.
sr_labels = read_lanl_csv(
    cfg.join_raw("redteam.txt.gz"),
    SCHEMA_REDTEAM,
    squeeze=True,
    usecols=["time", "computer_source"],
)
sr_labels

The last day where an attack occurs is January 30th; let's consider the attacks happen over the whole month of January, so from January 1st to 31st.

In [None]:
# Period under study, kept as a nested configuration object.
cfg.time_frame = Config("time_frame")
cfg.time_frame.start = ORIGIN
# NOTE(review): `end` looks like an exclusive bound (range_end() steps one tick back from it) -- confirm.
cfg.time_frame.end = pd.Timestamp("2015-02-01T00:00:00")
# Short alias, used to decorate the helper methods of the time frame.
tf = cfg.time_frame


@tf.method
def width(self) -> pd.Timedelta:
    """Duration of the time frame, from its `start` to its `end`."""
    begin, finish = self.start, self.end
    return finish - begin


@tf.method
def range_end(self) -> pd.Timestamp:
    """Instant that comes one nanosecond (the unit of a bare `pd.Timedelta(1)`) before `end`."""
    one_tick = pd.Timedelta(1)
    return self.end - one_tick


@tf.method
def empty(self) -> pd.DatetimeIndex:
    """Index that holds no timestamp: a date range whose end comes right before its start."""
    begin = self.start
    return pd.date_range(begin, begin - pd.Timedelta(1))

Let's look at the distribution of attack events over time, for various time interval lengths.

In [None]:
def plot_attacks(sr: pd.Series, cfg: Config, **timedelta_params: Number) -> None:
    """
    Draw a bar chart of the number of labeled events in each interval of the configured time frame.

    Parameters
    ----------
    sr
        Series of labeled events indexed by timestamp; only its index matters.
    cfg
        Configuration whose `time_frame` provides the `start` and `end` of the period to chart.
    **timedelta_params
        Length of the intervals, spelled as keyword arguments to pd.Timedelta (e.g. `hours=1`).
    """
    interval = pd.Timedelta(**timedelta_params)
    start, end = cfg.time_frame.start, cfg.time_frame.end

    # Lower bounds of the intervals that tile [start, end). Stopping one tick short of `end` leaves out
    # the extra, always-empty bin that would otherwise begin exactly at `end`.
    bins = pd.date_range(start, end - pd.Timedelta(1), freq=interval)
    counts = sr.groupby(lambda ts: ((ts - start) // interval) * interval + start)\
        .count()\
        .reindex(bins, fill_value=0)

    ax = counts.plot(kind="bar", figsize=(15, 4))

    # Bars sit at integer positions 0 .. len(counts) - 1. Put a tick on at most 25 of them, evenly
    # spread, and label each one explicitly with the lower bound of its own bin, so that labels can
    # never drift away from the bars they describe.
    num_ticks = min(25, len(counts))
    positions = sorted(set(np.linspace(0, len(counts) - 1, num_ticks).round().astype(int).tolist()))
    ax.set_xticks(positions)
    ax.set_xticklabels([counts.index[pos].isoformat() for pos in positions], rotation=90)

In [None]:
# Labeled attack events counted per interval of one day.
plot_attacks(sr_labels, cfg, days=1)

The dispersion of attack numbers is very wide here: major incidents range in the hundreds of offensive events, while minor incidents run below 10 offensive events. Likely, such minor incidents, which are crucial to preparing the larger assaults, would be drowned into the noise of normal activity.

In [None]:
# Labeled attack events counted per interval of one hour.
plot_attacks(sr_labels, cfg, hours=1)

The dispersion of the number of hacking events is less drastic: no more than one order of magnitude between small and large incidents. In such a case, the many-attacks events between Jan 9 and Jan 14 would likely be detected; small incidents also stand a chance of being detected, if a somewhat lesser one. Nonetheless, let's run a first analysis at this time resolution. **Decision executed in code cell below.**

In [None]:
# Time resolution retained for the analysis: events are grouped into intervals of one hour.
cfg.interval = pd.Timedelta(hours=1)

In [None]:
@cfg.method
def calc_lb_interval(self, ts: pd.Timestamp) -> pd.Timestamp:
    """Lower bound of the interval that contains `ts`, intervals being laid out from the time frame start."""
    begin = self.time_frame.start
    num_whole_intervals = (ts - begin) // self.interval
    return num_whole_intervals * self.interval + begin

In [None]:
@cfg.method
def index_partition(self) -> pd.DatetimeIndex:
    """Lower bounds of the successive intervals that tile the time frame, in chronological order."""
    frame = self.time_frame
    return pd.date_range(start=frame.start, end=frame.range_end(), freq=self.interval)

## Partitioning data streams

The anomaly detection methodology acts on event subsequences. Let's reorganize these streams according to the chosen time interval. For the sake of partition reuse for various experiments, let's process the full data streams.

Through this partitioning process, we will parse the raw data streams. Let's care for the data schema for each data stream as we do this.

In [None]:
# Schema of each telemetry stream: the ordered (attribute, dtype) pairs of the columns that follow the
# leading time column in the files of that stream. The dtypes are handed as is to pandas when reading.
STREAMS: Mapping[str, SCHEMA] = {
    "dns": [
        ("computer_source", "object"),
        ("computer_destination", "object")
    ],
    "flows": [
        ("duration", "int64"),
        ("computer_source", "object"),
        # NOTE(review): ports are read as strings, presumably because some are not plain numbers -- confirm.
        ("port_source", "object"),
        ("computer_destination", "object"),
        ("port_destination", "object"),
        ("protocol", "category"),
        ("num_packets", "int32"),
        ("num_bytes", "int64")
    ],
    "proc": [
        ("userdomain_source", "object"),
        ("computer_source", "object"),
        ("process", "object"),
        ("action", "category")
    ],
    "auth": [
        ("userdomain_source", "object"),
        ("userdomain_destination", "object"),
        ("computer_source", "object"),
        ("computer_destination", "object"),
        ("auth", "category"),
        ("logon", "category"),
        ("direction", "category"),
        ("result", "category")
    ]
}

The partitioning computations will be run through a Dask compute cluster. The next cell starts a local cluster, with one single-threaded worker per CPU core. If you would rather use a cluster that you started on your own, comment out the next cell, then uncomment and run the one after.

In [None]:
# Local cluster with as many workers as the CPU count reported for this machine, one thread per worker.
cluster = LocalCluster(n_workers=cpu_count(), threads_per_worker=1)
client = Client(cluster)
cluster

In [None]:
# client = Client("localhost:8786")

In [None]:
@cfg.method
def join_partition(self, *p: os.PathLike) -> os.PathLike:
    """Build a path under the partition directory dedicated to the configured interval length."""
    dir_interval = op.join(self.path_lanl, "partitions", self.interval.isoformat())
    return op.join(dir_interval, *p)

In [None]:
def name_partition(path_partition: os.PathLike, ts: pd.Timestamp) -> os.PathLike:
    """Path, under `path_partition`, of the compressed file named after timestamp `ts`."""
    name_file = f"{ts.isoformat()}.txt.gz"
    return op.join(path_partition, name_file)

In [None]:
def write_partition(df_partition: Optional[pd.DataFrame], path_partition: os.PathLike, mode: str = "w") -> None:
    """
    Write the events of one interval to the partition file of that interval.

    The file is written in the format of the raw streams: gzipped CSV without header, whose first
    column is the event time as a number of whole seconds since ORIGIN.

    Parameters
    ----------
    df_partition
        Events indexed by time, with an `interval` column that bears the same value on all rows; that
        value names the file. Nothing happens when None is given.
    path_partition
        Directory of the partition files of the stream; it is created if missing.
    mode
        File opening mode handed to DataFrame.to_csv: "w" to replace the file, "a" to append to it.
    """
    if df_partition is None:
        return
    assert len(df_partition) > 0, "An empty set of events has no interval to name its partition file after."
    ts_interval = df_partition.interval.iloc[0]

    df_partition = df_partition.drop("interval", axis="columns").reset_index()
    # Whole seconds since ORIGIN, computed over the full column at once rather than row by row. Event
    # times never precede ORIGIN, so flooring yields the same numbers as truncating.
    df_partition["time"] = (df_partition["time"] - ORIGIN) // pd.Timedelta(seconds=1)

    os.makedirs(path_partition, exist_ok=True)
    df_partition.to_csv(
        name_partition(path_partition, ts_interval),
        mode=mode,
        header=False,
        index=False,
        compression="gzip"
    )

In [None]:
def get_path_chunk_metadata(path_chunk: os.PathLike, path_partition: os.PathLike) -> os.PathLike:
    """
    Path of the JSON metadata file associated to a chunk.

    The file sits in the `meta` subdirectory of `path_partition`; it is named after the chunk file, whose
    last extension is swapped for `.json`.
    """
    stem = op.splitext(op.basename(path_chunk))[0]
    return op.join(path_partition, "meta", f"{stem}.json")


def get_chunk_metadata(path_chunk: os.PathLike, path_partition: os.PathLike) -> List[pd.Timestamp]:
    """
    Load the timestamps recorded in the metadata file of a chunk.

    Returns an empty list when no metadata file exists for the chunk.
    """
    path = get_path_chunk_metadata(path_chunk, path_partition)
    if not op.isfile(path):
        return []
    with open(path, "rt", encoding="utf-8") as file_meta:
        return list(map(pd.Timestamp, json.load(file_meta)))


def persist_chunk_metadata(path_chunk: os.PathLike, path_partition: os.PathLike, meta: Sequence[pd.Timestamp]) -> None:
    """
    Record the timestamps `meta` in the metadata file of a chunk, as a JSON list of ISO 8601 strings.

    The directory of the metadata file is created if missing; a previous file is overwritten.
    """
    path = get_path_chunk_metadata(path_chunk, path_partition)
    dir_meta = op.dirname(path)
    os.makedirs(dir_meta, exist_ok=True)
    with open(path, "wt", encoding="utf-8") as file_meta:
        intervals_iso = [ts.isoformat() for ts in meta]
        json.dump(intervals_iso, file_meta)

In [None]:
class Extremes:
    """Two interval timestamps (a minimum and a maximum) paired with the path of the chunk they describe."""

    def __init__(self, min_interval: pd.Timestamp, max_interval: pd.Timestamp, path_chunk: os.PathLike) -> None:
        self.min_interval, self.max_interval = min_interval, max_interval
        self.path_chunk = path_chunk

    def __dask_tokenize__(self, *args, **kwargs) -> str:
        """Dask token of the instance: the string forms of its three attributes, stuck together."""
        parts = (self.min_interval, self.max_interval, self.path_chunk)
        return "".join(map(str, parts))

In [None]:
def read_chunk(path: os.PathLike, schema: SCHEMA) -> pd.DataFrame:
    """
    Read a chunk file and tag each of its events with the interval it falls in.

    Parameters
    ----------
    path
        Path to the gzipped chunk file.
    schema
        Schema of the stream the chunk belongs to.

    Returns
    -------
    The events of the chunk, indexed by time, with an additional `interval` column that holds the lower
    bound of the interval of each event, as computed by the global configuration `cfg`.
    """
    chunk = read_lanl_csv(path, schema)
    # calc_lb_interval relies on arithmetic only, so it applies to the whole index at once; this spares
    # one Python-level call per event. The resulting index is assigned position-wise.
    chunk["interval"] = cfg.calc_lb_interval(chunk.index)
    return chunk

In [None]:
def partition_file_exists(path_partition: os.PathLike, ts: pd.Timestamp) -> bool:
    """Whether the partition file named after `ts` is present under `path_partition`."""
    path_file = name_partition(path_partition, ts)
    return op.isfile(path_file)

In [None]:
@dask.delayed(pure=True)
def partition_chunk(path_chunk: os.PathLike, stream: str, cfg: Config) -> Extremes:
    """
    Write the partition files of the intervals that lie strictly inside a chunk.

    The events of the earliest and of the latest interval of the chunk are not written here; these two
    intervals are reported in the returned Extremes instead. The list of the intervals met in the chunk
    is persisted as metadata once the files are written, so that a later run may skip the chunk.

    Parameters
    ----------
    path_chunk
        Path to the chunk file.
    stream
        Name of the stream the chunk belongs to (a key of STREAMS).
    cfg
        Experiment configuration, which provides the partition directory.
    """
    path_partition = cfg.join_partition(stream)

    # Shortcut: metadata left by a previous run, along with all the files this function is in charge of
    # (the first and last recorded intervals are left out of the check).
    intervals_known = get_chunk_metadata(path_chunk, path_partition)
    if intervals_known and all(partition_file_exists(path_partition, ts) for ts in intervals_known[1:-1]):
        return Extremes(intervals_known[0], intervals_known[-1], path_chunk)

    chunk = read_chunk(path_chunk, STREAMS[stream])
    intervals = [pd.Timestamp(ts) for ts in chunk.interval.unique()]
    min_interval, max_interval = chunk.interval.agg(['min', 'max'])

    is_interior = (chunk.interval > min_interval) & (chunk.interval < max_interval)
    for _, df_partition in chunk[is_interior].groupby("interval"):
        write_partition(df_partition, path_partition)

    persist_chunk_metadata(path_chunk, path_partition, intervals)
    return Extremes(min_interval, max_interval, path_chunk)

In [None]:
def transfer_partition_increment(
    path_chunk: os.PathLike,
    schema: SCHEMA,
    path_partition: os.PathLike,
    interval: pd.Timestamp
) -> None:
    """Append the events of a chunk that belong to `interval` to the partition file of that interval."""
    chunk = read_chunk(path_chunk, schema)
    in_interval = chunk.interval == interval
    write_partition(chunk[in_interval], path_partition, mode="a")

In [None]:
@dask.delayed(pure=True)
def consolidate_partial_partitions(
    path_partition: os.PathLike,
    stream: str,
    extremes: Sequence[Extremes],
    index: int
) -> None:
    """
    Write the partition files of the extreme intervals that chunk number `index` is in charge of.

    The events of an extreme interval may be spread over several consecutive chunks. Such an interval
    is assigned to the last chunk that has it as an extreme, so exactly one invocation writes each
    file. That invocation appends, in chunk order, the events from every chunk that has the interval
    as an extreme, each chunk contributing exactly once.

    This relies on chunks being listed in chronological order, each one internally sorted by time.

    Parameters
    ----------
    path_partition
        Directory of the partition files of the stream.
    stream
        Name of the stream (a key of STREAMS).
    extremes
        Extreme intervals of all the chunks of the stream, in chunk order.
    index
        Position, in `extremes`, of the chunk this invocation stands for.
    """
    extreme_here = extremes[index]
    contributors_so_far = extremes[:index + 1]
    extremes_later = extremes[index + 1:]

    # dict.fromkeys keeps the order and drops the duplicate when both extremes are the same interval.
    for interval_target in dict.fromkeys([extreme_here.min_interval, extreme_here.max_interval]):
        if any(interval_target in (e.min_interval, e.max_interval) for e in extremes_later):
            # A later chunk also holds events of this interval: its consolidator takes care of it.
            continue
        if partition_file_exists(path_partition, interval_target):
            # Already written by a previous run.
            continue
        for extreme in contributors_so_far:
            if interval_target in (extreme.min_interval, extreme.max_interval):
                transfer_partition_increment(extreme.path_chunk, STREAMS[stream], path_partition, interval_target)

In [None]:
@dask.delayed(pure=True)
def complete_partition(c, path_partition):
    """
    Return `path_partition` as is; argument `c` is ignored.

    As a delayed task, this only yields the partition path once whatever is passed as `c` is computed.
    """
    return path_partition
    

def partition_stream(stream: str, cfg: Config) -> dask.delayed:
    """
    Assemble the delayed computation that partitions all the chunks of a stream.

    One task is set up per chunk found under DIR_CHUNKED for the stream, followed by one consolidation
    task per chunk. The delayed object returned stands for the path of the partition directory of the
    stream, available once all consolidation tasks are done.
    """
    path_partition = cfg.join_partition(stream)
    pattern_chunks = op.join(DIR_CHUNKED, stream, "*.txt.gz")

    extremes_with_paths: List[Extremes] = []
    for path in sorted(glob(pattern_chunks)):
        extremes_with_paths.append(partition_chunk(path, stream, cfg))

    consolidated: Sequence[None] = [
        consolidate_partial_partitions(path_partition, stream, extremes_with_paths, index)
        for index, _ in enumerate(extremes_with_paths)
    ]

    return complete_partition(consolidated, path_partition)

In [None]:
# Submit the partitioning of every stream to the cluster; `futs` holds what the client hands back.
ld = [partition_stream(stream, cfg) for stream in STREAMS.keys()]
futs = client.compute(ld)
ld, futs

## Creating the dynamic artifact graph

The anomaly detection methodology associates each *artifact* of each event to a vertex in an *artifact graph*, a dynamic graph instantiated at each time interval of our partition; artifacts that appear in a given event are linked by an edge (or the weight of this edge is incremented), so that each event translates to a clique in the artifact graph.

For each stream, each event attribute that's tied to a closed set of values can be used as an artifact. What event fields are associated to artifacts is the purpose of the *graph architecture object*.

In [None]:
# Artifacts of an event: the names of the graph vertices the event ties together.
EventArtifacts = Sequence[str]
# Further attributes of an event, by name.
EventAttributes = Mapping[str, Any]


class GraphArch(ABC):
    """
    Graph architecture: determines which fields of an event become artifacts of the artifact graph.

    This is an abstract base class, so that its `abstractmethod` marker is enforced: subclasses must
    implement get_event_artifacts() and should set their own NAME.
    """

    # Identifies the architecture; also used as its Dask token.
    NAME = "dont-use-this"

    def __init__(self, schema_data: SCHEMA) -> None:
        self._schema = dict(schema_data)

    @property
    def schema(self) -> Mapping[str, str]:
        """Schema of the data at hand, as a mapping from attribute name to dtype."""
        return self._schema

    @abstractmethod
    def get_event_artifacts(self, event: pd.Series) -> EventArtifacts:
        """List the artifacts carried by an event. This base implementation reports none."""
        return []

    # Not used yet -- use it to determine how to integrate non-artifact attributes into
    # embedding scheme.
    def get_event_attributes(self, event: pd.Series) -> EventAttributes:
        """Gather the non-artifact attributes of an event. This base implementation reports none."""
        return {}

    def __dask_tokenize__(self, *args, **kwargs) -> str:
        """Dask token of an instance: the NAME of its class."""
        return type(self).NAME

In this case, we will use as artifacts every feature of categorical or string (`object`) dtype. Even though the set of values for some of these attributes is so small as to question the meaning of including them in the artifact graph (they will be very highly connected vertices in all steps of the dynamic graph, with variations unlikely to be understood as anomalous), we pull them in until we have data to support their exclusion.

In addition, the methodology of Palladino and Thissen does not specify how to take into account non-categorical attributes. We thus leave all of them out for the time being.

In [None]:
class AllCategoryNoAttr(GraphArch):
    """Architecture that takes every `object` or `category` field as an artifact, and no attribute."""

    NAME = "all-category-no-attr"

    def get_event_artifacts(self, event: pd.Series) -> EventArtifacts:
        """
        List the artifacts of an event, each named `<attribute>::<value>`.

        Fields of the event that are absent from the schema, or whose dtype in the schema is neither
        `object` nor `category`, are left out.
        """
        # Series.items() exists in all pandas versions, unlike iteritems(), removed in pandas 2.0.
        return [
            f"{attr}::{value}"
            for attr, value in event.items()
            if self.schema.get(attr, "") in ["object", "category"]
        ]


# Graph architecture class of the experiment; it gets instantiated with the schema of each stream.
cfg.graph_build = AllCategoryNoAttr

So as not to lock ourselves to a certain graph handling library at this stage, let's hide it behind an abstraction. We are here involved with building graphs from events, so let's implement this abstraction according to the [Builder](https://en.wikipedia.org/wiki/Builder_pattern) design pattern.

In [None]:
class BuilderGraph(ABC):
    """Interface through which graphs get built, stored and merged, whatever the graph library behind."""

    @abstractmethod
    def add_clique(self, vertices: EventArtifacts, attr: EventAttributes) -> None:
        """Account for one event in the graph under construction, given its artifacts and attributes."""

    @abstractmethod
    def get_ext_path(self) -> str:
        """File name extension (without the dot) for graphs written by this builder."""
        return ""

    @abstractmethod
    def write(self, path: os.PathLike) -> None:
        """Store the graph under construction in file `path`."""

    @abstractmethod
    def absorb(self, path: os.PathLike) -> None:
        """Merge the graph stored in file `path` into the graph under construction."""

So here's a builder now that builds up NetworkX graphs.

In [None]:
class BuilderNetworkX(BuilderGraph):
    """Builder of undirected NetworkX graphs whose edges carry a `weight` counter."""

    def __init__(self) -> None:
        self._graph = nx.Graph()

    def add_clique(self, vertices: EventArtifacts, attributes: EventAttributes) -> None:
        """
        Link every pair of `vertices`; an edge gets weight 1 when new, one more than before otherwise.

        The `attributes` are set on each edge touched. Nothing is added when fewer than two vertices
        are given.
        """
        graph = self._graph
        for position, left in enumerate(vertices):
            for right in vertices[position + 1:]:
                # Looked up anew for each pair, as the edge added just before may change the answer.
                neighbours = graph.adj.get(left, {})
                weight_so_far = neighbours.get(right, {"weight": 0})["weight"]
                graph.add_edge(left, right, weight=weight_so_far + 1, **attributes)

    def get_ext_path(self) -> str:
        """Extension of the files written by this builder."""
        return "nxz"

    def write(self, path: os.PathLike) -> None:
        """Pickle the graph into gzip-compressed file `path`."""
        with gzip.open(path, "wb") as file:
            pickle.dump(self._graph, file)

    def absorb(self, path: os.PathLike) -> None:
        """
        Add up the edge weights of the graph stored in file `path` to those of the graph at hand.

        Only edges and their weights are taken from the stored graph.
        """
        # NOTE(review): unpickling runs arbitrary code; only absorb files written by this notebook.
        with gzip.open(path, "rb") as file:
            other = pickle.load(file)
        graph = self._graph
        for left, right in other.edges:
            weight_here = graph.adj.get(left, {}).get(right, {}).get("weight", 0)
            weight_there = other[left][right]["weight"]
            graph.add_edge(left, right, weight=weight_here + weight_there)

In [None]:
@cfg.method
def join_experiment(self, *p: os.PathLike) -> os.PathLike:
    """
    Build a path under the directory of the current experiment.

    That directory is determined by the interval length, the bounds of the time frame and the name of
    the graph architecture.
    """
    frame = self.time_frame
    name_frame = f"{frame.start.isoformat()}__{frame.end.isoformat()}"
    dir_experiment = op.join(
        self.path_lanl,
        "experiments",
        self.interval.isoformat(),
        name_frame,
        self.graph_build.NAME
    )
    return op.join(dir_experiment, *p)

In [None]:
@cfg.method
def join_graphs(self, *p: os.PathLike) -> os.PathLike:
    """Build a path under the `graphs` subdirectory of the current experiment."""
    components = ("graphs",) + p
    return self.join_experiment(*components)

Have Pandas timestamps be tokenizable by Dask, so the graph building is understood as a *pure function* (in the sense of Dask).

In [None]:
def timestamp_tokenize(self, *args, **kwargs) -> str:
    """Dask token of a timestamp: its ISO 8601 representation. Any further argument is ignored."""
    token = self.isoformat()
    return token


# Graft the tokenizer onto all Pandas timestamps.
pd.Timestamp.__dask_tokenize__ = timestamp_tokenize

For building graphs, we will distribute computations per partition, and then by stream.

In [None]:
@dask.delayed(pure=True)
def make_graph(
    path_partition: os.PathLike,
    ts: pd.Timestamp,
    klass_builder: type,
    cfg: Config
) -> os.PathLike:
    """
    Build the artifact graph of one stream over one interval, and store it as a temporary file.

    Parameters
    ----------
    path_partition
        Directory of the partition files of the stream; its base name is the name of the stream.
    ts
        Lower bound of the interval.
    klass_builder
        Class of the graph builder to use; it is instantiated without argument.
    cfg
        Experiment configuration.

    Returns
    -------
    Path of the stream-specific graph file. The file is not written when the combined graph of the
    interval exists already, in which case the path may designate a file that does not exist.
    """
    builder = klass_builder()
    stream = op.basename(path_partition)
    ext = builder.get_ext_path()

    # Skip if this graph is already on disk.
    path_graph = cfg.join_graphs("tmp", f"{stream}_{ts.isoformat()}.{ext}")
    if op.isfile(path_graph):
        return path_graph

    # Skip building this graph if the graph resulting from the combination of all stream-specific
    # graphs is present. That graph bears the extension of the builder, as written by combine_graphs().
    if op.isfile(cfg.join_graphs(f"{ts.isoformat()}.{ext}")):
        return path_graph

    path_data = name_partition(path_partition, ts)
    if op.isfile(path_data):
        df_partition = read_lanl_csv(path_data, STREAMS[stream])
    else:
        # No event in this interval for this stream: go on with an empty frame, so that an empty graph
        # gets written. The empty index comes from the time frame, which is where empty() is defined.
        df_partition = pd.DataFrame({name: [] for name, _ in STREAMS[stream]}, index=cfg.time_frame.empty())

    arch = cfg.graph_build(STREAMS[stream])
    for _, event in df_partition.iterrows():
        builder.add_clique(arch.get_event_artifacts(event), arch.get_event_attributes(event))

    os.makedirs(op.dirname(path_graph), exist_ok=True)
    builder.write(path_graph)

    return path_graph

In [None]:
@dask.delayed(pure=True)
def combine_graphs(
    ts: pd.Timestamp,
    klass_builder: type,
    paths_tmp: Sequence[os.PathLike]
) -> os.PathLike:
    """
    Merge the stream-specific graphs of one interval into a single graph file.

    Nothing is done if the combined file exists already. Otherwise, the graphs stored at `paths_tmp`
    are absorbed in turn by a new builder, the result is written, and the files at `paths_tmp` are
    deleted. The path of the combined graph file is returned in both cases; it is determined by the
    global configuration `cfg`.
    """
    builder = klass_builder()
    name_graph = f"{ts.isoformat()}.{builder.get_ext_path()}"
    path_graph = cfg.join_graphs(name_graph)

    if not op.isfile(path_graph):
        for path in paths_tmp:
            builder.absorb(path)

        os.makedirs(op.dirname(path_graph), exist_ok=True)
        builder.write(path_graph)

        # The stream-specific graphs were only a means to this end.
        for path in paths_tmp:
            os.remove(path)

    return path_graph

In [None]:
# Streams that take part in the artifact graphs; the commented-out alternative restricts them to one.
STREAMS_INVOLVED = list(STREAMS.keys())
# STREAMS_INVOLVED = ["auth"]

# For each stream involved, what partition_stream() returns: a delayed object that stands for the
# partition directory of the stream.
PARTITIONS: Mapping[str, os.PathLike] = {
    stream: partition_stream(stream, cfg)
    for stream in STREAMS_INVOLVED
}
PARTITIONS

In [None]:
# Delayed stream-specific graphs, keyed by graph library then by stream; each list follows the order
# of the intervals given by cfg.index_partition().
GRAPHS_BY_STREAM: Mapping[str, Mapping[str, Sequence[os.PathLike]]] = {
    type_graph: {
        stream: [
            make_graph(path_partition, ts, klass_builder, cfg)
            for ts in cfg.index_partition()
        ]
        for stream, path_partition in PARTITIONS.items()
    }
    for type_graph, klass_builder in [("networkx", BuilderNetworkX)]
}
GRAPHS_BY_STREAM

In [None]:
# Delayed combined graphs, keyed by graph library: for each interval, the i-th stream-specific graph of
# every stream involved gets merged into one graph.
GRAPHS_COMBINED: Mapping[str, Sequence[os.PathLike]] = {
    type_graph: [
        combine_graphs(
            ts,
            klass_builder,
            [GRAPHS_BY_STREAM[type_graph][stream][i] for stream in STREAMS_INVOLVED]
        )
        for i, ts in enumerate(cfg.index_partition())
    ]
    for type_graph, klass_builder in [("networkx", BuilderNetworkX)]
}

# Submit the whole graph-building computation to the cluster.
futs = client.compute(GRAPHS_COMBINED)

## Extracting features from graphs

In [None]:
# NOTE(review): this cell and the ones that follow look like scratch exploration of the igraph API on a
# toy graph; graph `g` is not used by any other part of the notebook visible here.
g = ig.Graph()

In [None]:
# Edges of the graph, which was just created.
list(g.es)

In [None]:
# Vertices are given string names.
g.add_vertices(["asdf", "qwer"])
g.add_vertices(["zxcv"])

In [None]:
# Vertices are designated by name when adding an edge.
g.add_edge("asdf", "qwer")
list(g.es)

In [None]:
# Same edge as in the previous cell -- presumably to see how igraph deals with repeated edges.
g.add_edge("asdf", "qwer")
list(g.es)

In [None]:
g.add_edge("zxcv", "asdf")
list(g.es)

In [None]:
g.incident("asdf")

In [None]:
# Lookup by a pair of vertex names that no cell above linked with an edge.
g["qwer", "zxcv"]

In [None]:
[v["name"] for v in g.es[2].vertex_tuple]

In [None]:
("asdf", "qwer") in g.es

In [None]:
# Lookup by a pair of vertex names that the cells above linked twice.
g["asdf", "qwer"]

In [None]:
list(g.vs.select(name="asdf"))

In [None]:
list(g.vs)