In [2]:
from pyarrow import csv

In [3]:
read_options = csv.ReadOptions(
               column_names=["time","source user@domain","destination user@domain", \
                             "source computer","destination computer","authentication type", \
                             "logon type","authentication orientation","success/failure","day"])


In [5]:
table_file = csv.read_csv('../lm-vol/redteam_complete.csv', read_options=read_options)

In [6]:
help(table_file.filter)

Help on built-in function filter:

filter(...) method of pyarrow.lib.Table instance
    Table.filter(self, mask, null_selection_behavior=u'drop')
    
    Select rows from the table.
    
    The Table can be filtered based on a mask, which will be passed to
    :func:`pyarrow.compute.filter` to perform the filtering, or it can
    be filtered through a boolean :class:`.Expression`
    
    Parameters
    ----------
    mask : Array or array-like or .Expression
        The boolean mask or the :class:`.Expression` to filter the table with.
    null_selection_behavior : str, default "drop"
        How nulls in the mask should be handled, does nothing if
        an :class:`.Expression` is used.
    
    Returns
    -------
    filtered : Table
        A table of the same schema, with only the rows selected
        by applied filtering
    
    Examples
    --------
    >>> import pyarrow as pa
    >>> import pandas as pd
    >>> df = pd.DataFrame({'year': [2020, 2022, 2019, 2021],
    ...

In [7]:
pandas_lanl = table_file.to_pandas()

In [8]:
pandas_lanl.shape

(740, 10)

In [9]:
pandas_lanl.columns

Index(['time', 'source user@domain', 'destination user@domain',
       'source computer', 'destination computer', 'authentication type',
       'logon type', 'authentication orientation', 'success/failure', 'day'],
      dtype='object')

In [10]:
import pandas as pd 
import torch
from torch import Tensor

In [11]:
pandas_lanl.sample(5)

Unnamed: 0,time,source user@domain,destination user@domain,source computer,destination computer,authentication type,logon type,authentication orientation,success/failure,day
171,170.0,760075,U1653@DOM1,C17693,C395,U1653@DOM1,NTLM,Network,LogOn,Success
246,245.0,767813,U66@DOM1,C17693,C2944,U66@DOM1,NTLM,Network,LogOn,Success
415,414.0,1076742,U453@DOM1,C17693,C2388,U453@DOM1,NTLM,Network,LogOn,Success
97,96.0,739706,U1600@DOM1,C17693,C92,U1600@DOM1,NTLM,Network,LogOn,Success
250,249.0,768546,U4353@DOM1,C17693,C17860,U4353@DOM1,NTLM,Network,LogOn,Success


In [10]:
pandas_lanl = pandas_lanl.rename(columns={'source user@domain': "src_user", 'destination user@domain': "dest_user"})

In [11]:
df = pandas_lanl ##shallow copy lol

In [12]:
df.columns

Index(['time', 'src_user', 'dest_user', 'source computer',
       'destination computer', 'authentication type', 'logon type',
       'authentication orientation', 'success/failure', 'day'],
      dtype='object')

In [13]:
df = df[(df['src_user'].str.contains('LOCAL')==False) & (df['src_user'].str.contains('SYSTEM')==False) & 
        (df['src_user'].str.contains('ANONYMOUS')==False) & (df['src_user'].str.contains('ADMINISTRARTOR')==False) &
        (df['src_user'].str.contains('$')==False) 
       ]

df = df[(df['dest_user'].str.contains('LOCAL')==False) & (df['dest_user'].str.contains('SYSTEM')==False) & 
        (df['dest_user'].str.contains('ANONYMOUS')==False) & (df['dest_user'].str.contains('ADMINISTRARTOR')==False) &
        (df['dest_user'].str.contains('$')==False) 
       ]


In [14]:
df.shape

(0, 10)

In [16]:
pandas_lanl.sample(2)

Unnamed: 0,time,src_user,dest_user,source computer,destination computer,authentication type,logon type,authentication orientation,success/failure,day
620080501,4521322,U341@DOM1,U341@DOM1,C612,C612,?,Network,LogOff,Success,53
99001974,1490866,C2070$@DOM1,C2070$@DOM1,C2071,C2071,?,?,TGT,Success,18


In [21]:
####decide to play with unique test data 
import pandas as pd 
import torch
from torch import Tensor

In [22]:

pandas_lanl = pd.read_csv("../lm-vol/redteam_complete.csv")

In [23]:
pandas_lanl.shape

(739, 10)

In [24]:
pandas_lanl.columns

Index(['Unnamed: 0', 'time', 'source', 'source_computer',
       'destination_computer', 'destination', 'authentication_type',
       'logon_type', 'authentication_orientation', 'Success'],
      dtype='object')

In [29]:
client_service = pandas_lanl[['source','destination_computer']].copy()
client_service.sample(5)

Unnamed: 0,source,destination_computer
418,U66@DOM1,C2388
714,U1145@DOM1,C2877
670,U524@DOM1,C15
545,U66@DOM1,C2388
377,U66@DOM1,C92


In [27]:
#df.rename(columns={"A": "a", "B": "c"})
client_service = client_service.rename(columns={"source user@domain":"client","destination computer":"service"})

In [28]:
unique_client_id = client_service['client'].unique()
unique_client_id = pd.DataFrame(data={
     'clientId': unique_client_id,
     'mappedID': pd.RangeIndex(len(unique_client_id)),
 })

print(unique_client_id)



unique_service_id = client_service['service'].unique()
unique_service_id = pd.DataFrame(data={
     'serviceId': unique_service_id,
     'mappedID': pd.RangeIndex(len(unique_service_id)),
 })

KeyError: 'client'

In [13]:
print(len(unique_client_id), len(unique_service_id))

68741 14247


In [14]:
client_service

Unnamed: 0,client,service
0,ANONYMOUS LOGON@C586,C586
1,ANONYMOUS LOGON@C586,C586
2,C101$@DOM1,C988
3,C1020$@DOM1,C1020
4,C1021$@DOM1,C625
...,...,...
1062869,U7319@?,C528
1062870,U1345@DOM1,C1640
1062871,U5462@DOM1,C1065
1062872,U5462@DOM1,C1065


In [15]:
# Perform merge to obtain the edges from users and movies:
client_id = pd.merge(client_service['client'], unique_client_id, left_on='client', right_on='clientId', how='left')
client_id = torch.from_numpy(client_id['mappedID'].values)
service_id = pd.merge(client_service['service'], unique_service_id, left_on='service', right_on='serviceId', how='left')
service_id = torch.from_numpy(service_id['mappedID'].values)

# following PyG semantics:
edge_index_client_to_service = torch.stack([client_id, service_id], dim=0)

print(edge_index_client_to_service)


tensor([[    0,     0,     1,  ..., 23357, 23357, 50922],
        [    0,     0,     1,  ...,    16,    16,  1384]])


In [16]:
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T


In [17]:
# pip install pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.13.0+cpu.html

In [18]:
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

data = HeteroData()
data["client"].node_id = torch.arange(len(unique_client_id))
data["service"].node_id = torch.arange(len(unique_service_id))

data["client", "auth_attempt", "service"].edge_index = edge_index_client_to_service

Transform = T.ToUndirected()
data = Transform(data)

In [19]:
transform = T.RandomLinkSplit(
    num_val=0.1,  # TODO
    num_test=0.1,  # TODO
    disjoint_train_ratio=0.3,  # TODO
    neg_sampling_ratio=2,  # TODO
    add_negative_train_samples=False,  # TODO
    edge_types=("client", "auth_attempt", "service"),
    rev_edge_types=("service", "rev_auth_attempt", "client"),
)

train_data, val_data, test_data = transform(data)

In [20]:
from torch_geometric.loader import LinkNeighborLoader

# Define seed edges:
edge_label_index = train_data["client", "auth_attempt", "service"].edge_label_index
edge_label = train_data["client", "auth_attempt", "service"].edge_label

train_loader = LinkNeighborLoader(
    data=train_data,  # TODO
    num_neighbors=[20,10],  # TODO
    neg_sampling_ratio=2,  # TODO
    edge_label_index=(("client", "auth_attempt", "service"), edge_label_index),
    edge_label=edge_label,
    batch_size=128,
    shuffle=True,
)

# Inspect a sample:
sampled_data = next(iter(train_loader))

print("Sampled mini-batch:")
print("===================")
print(sampled_data)

Sampled mini-batch:
HeteroData(
  [1mclient[0m={ node_id=[4065] },
  [1mservice[0m={ node_id=[1547] },
  [1m(client, auth_attempt, service)[0m={
    edge_index=[2, 5617],
    edge_label=[384],
    edge_label_index=[2, 384],
    input_id=[128]
  },
  [1m(service, rev_auth_attempt, client)[0m={ edge_index=[2, 14954] }
)


In [21]:
from torch_geometric.nn import SAGEConv, to_hetero

In [22]:
class GNN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()

        self.conv1 = SAGEConv(hidden_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)

    def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
        # Define a 2-layer GNN computation graph.
        # Use a *single* `ReLU` non-linearity in-between.
        # TODO:
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x
        # raise NotImplementedError

# Our final classifier applies the dot-product between source and destination
# node embeddings to derive edge-level predictions:
class Classifier(torch.nn.Module):
    def forward(self, x_user: Tensor, x_movie: Tensor, edge_label_index: Tensor) -> Tensor:
        # Convert node embeddings to edge-level representations:
        edge_feat_user = x_user[edge_label_index[0]]
        edge_feat_movie = x_movie[edge_label_index[1]]

        # Apply dot-product to get a prediction per supervision edge:
        return (edge_feat_user * edge_feat_movie).sum(dim=-1)


class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        # Since the dataset does not come with rich features, we also learn two
        # embedding matrices for users and movies:
        # self.movie_lin = torch.nn.Linear(20, hidden_channels)
        self.user_emb = torch.nn.Embedding(data["client"].num_nodes, hidden_channels)
        self.movie_emb = torch.nn.Embedding(data["service"].num_nodes, hidden_channels)

        # Instantiate homogeneous GNN:
        self.gnn = GNN(hidden_channels)

        # Convert GNN model into a heterogeneous variant:
        self.gnn = to_hetero(self.gnn, metadata=data.metadata())

        self.classifier = Classifier()

    def forward(self, data: HeteroData) -> Tensor:
        x_dict = {
          "client": self.user_emb(data["client"].node_id),
          "service": self.movie_emb(data["service"].node_id),
        } 

        # `x_dict` holds feature matrices of all node types
        # `edge_index_dict` holds all edge indices of all edge types
        x_dict = self.gnn(x_dict, data.edge_index_dict)

        pred = self.classifier(
            x_dict["client"],
            x_dict["service"],
            data["client", "auth_attempt", "service"].edge_label_index,
        )

        return pred

        
model = Model(hidden_channels=64)

print(model)

Model(
  (user_emb): Embedding(68741, 64)
  (movie_emb): Embedding(14247, 64)
  (gnn): GraphModule(
    (conv1): ModuleDict(
      (client__auth_attempt__service): SAGEConv(64, 64, aggr=mean)
      (service__rev_auth_attempt__client): SAGEConv(64, 64, aggr=mean)
    )
    (conv2): ModuleDict(
      (client__auth_attempt__service): SAGEConv(64, 64, aggr=mean)
      (service__rev_auth_attempt__client): SAGEConv(64, 64, aggr=mean)
    )
  )
  (classifier): Classifier()
)


In [None]:
import tqdm
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: '{device}'")

model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(1, 8):
    total_loss = total_examples = 0
    for sampled_data in tqdm.tqdm(train_loader):
        optimizer.zero_grad()

        
        # print(sampled_data)

        # TODO: Move `sampled_data` to the respective `device`
        sampled_data = sampled_data.to(device)
        # TODO: Run `forward` pass of the model
        pred = model.forward(sampled_data)

        # print(pred.size() == sampled_data["user", "rates", "movie"].edge_label.size())
        # print(sampled_data["user", "rates", "movie"].edge_label)
        # TODO: Apply binary cross entropy via
        # `F.binary_cross_entropy_with_logits(pred, ground_truth)`
        
        loss = F.binary_cross_entropy_with_logits(pred, sampled_data["client", "auth_attempt", "service"].edge_label)

        # raise NotImplementedError

        loss.backward()
        optimizer.step()
        total_loss += float(loss) * pred.numel()
        total_examples += pred.numel()
    print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")

Device: 'cpu'


100%|████████████████████████████████████████████████████████| 1993/1993 [30:33<00:00,  1.09it/s]


Epoch: 001, Loss: 0.1758


100%|████████████████████████████████████████████████████████| 1993/1993 [25:37<00:00,  1.30it/s]


Epoch: 002, Loss: 0.1307


100%|████████████████████████████████████████████████████████| 1993/1993 [11:17<00:00,  2.94it/s]


Epoch: 003, Loss: 0.1141


100%|████████████████████████████████████████████████████████| 1993/1993 [07:15<00:00,  4.57it/s]


Epoch: 004, Loss: 0.1012


 18%|██████████▎                                              | 360/1993 [01:10<05:15,  5.18it/s]

In [None]:
torch.save(model, '../lm-vol/torchmodel01')

In [None]:
!pip uninstall bokeh

Found existing installation: bokeh 3.1.0
Uninstalling bokeh-3.1.0:
  Would remove:
    /usr/local/bin/bokeh
    /usr/local/lib/python3.8/site-packages/bokeh-3.1.0.dist-info/*
    /usr/local/lib/python3.8/site-packages/bokeh/*
Proceed (y/n)? 