In [1]:
from pyarrow import csv

In [2]:
# Column names for the LANL authentication log; they are handed to the CSV
# reader explicitly instead of being taken from the file.
LANL_COLUMN_NAMES = [
    "time",
    "source user@domain",
    "destination user@domain",
    "source computer",
    "destination computer",
    "authentication type",
    "logon type",
    "authentication orientation",
    "success/failure",
    "day",
]
read_options = csv.ReadOptions(column_names=LANL_COLUMN_NAMES)


In [3]:
# Read the training CSV with pyarrow, using the column names configured in `read_options`.
table_file = csv.read_csv('../lm-vol/LANL_train.csv', read_options=read_options)

In [6]:
# Exploratory only: print the docstring of Table.filter; nothing downstream depends on this cell.
help(table_file.filter)

Help on built-in function filter:

filter(...) method of pyarrow.lib.Table instance
    Table.filter(self, mask, null_selection_behavior=u'drop')
    
    Select rows from the table.
    
    The Table can be filtered based on a mask, which will be passed to
    :func:`pyarrow.compute.filter` to perform the filtering, or it can
    be filtered through a boolean :class:`.Expression`
    
    Parameters
    ----------
    mask : Array or array-like or .Expression
        The boolean mask or the :class:`.Expression` to filter the table with.
    null_selection_behavior : str, default "drop"
        How nulls in the mask should be handled, does nothing if
        an :class:`.Expression` is used.
    
    Returns
    -------
    filtered : Table
        A table of the same schema, with only the rows selected
        by applied filtering
    
    Examples
    --------
    >>> import pyarrow as pa
    >>> import pandas as pd
    >>> df = pd.DataFrame({'year': [2020, 2022, 2019, 2021],
    ...

In [7]:
# Materialise the Arrow table as a pandas DataFrame (the whole log is held in memory).
pandas_lanl = table_file.to_pandas()

In [8]:
# Sanity check: (number of rows, number of columns) of the loaded log.
pandas_lanl.shape

(732190499, 10)

In [9]:
# Confirm the column names supplied through `read_options` were applied.
pandas_lanl.columns

Index(['time', 'source user@domain', 'destination user@domain',
       'source computer', 'destination computer', 'authentication type',
       'logon type', 'authentication orientation', 'success/failure', 'day'],
      dtype='object')

In [10]:
import pandas as pd 
import torch
from torch import Tensor

In [14]:
# Peek at five random rows; no random_state is set, so the rows differ between runs.
pandas_lanl.sample(5)

Unnamed: 0,time,source user@domain,destination user@domain,source computer,destination computer,authentication type,logon type,authentication orientation,success/failure,day
588857630,4372757,C8373$@DOM1,C8373$@DOM1,C612,C612,?,Network,LogOff,Success,51
658724146,4720518,C2980$@DOM1,C2980$@DOM1,C2980,C528,Kerberos,Network,LogOn,Success,55
306481276,3165553,ANONYMOUS LOGON@C586,ANONYMOUS LOGON@C586,C8385,C586,NTLM,Network,LogOn,Success,37
712676775,4937548,C2489$@DOM1,C2489$@DOM1,C2489,C586,Kerberos,Network,LogOn,Success,58
668307435,4764751,U204@DOM1,U204@DOM1,C1308,C457,Kerberos,Network,LogOn,Success,56


In [None]:
# NOTE: `rename` returns a new DataFrame and the result is not assigned here, so this
# cell only displays the renamed frame; `pandas_lanl` keeps its original column names.
pandas_lanl.rename(columns={'source user@domain': "src_user", 'destination user@domain': "dest_user"})

In [16]:
# Working frame for the account filters below, which address the user columns
# as `src_user` / `dest_user`. A plain `df = pandas_lanl` would only alias the
# original frame (and its long column names); `rename` returns a new DataFrame,
# so `pandas_lanl` itself keeps the original names for cells that rely on them.
df = pandas_lanl.rename(columns={'source user@domain': "src_user", 'destination user@domain': "dest_user"})

In [None]:
# Check which column names the working frame `df` currently carries.
df.columns

In [13]:
# Substrings that mark accounts to exclude: built-in / service style accounts
# and names containing '$'.
EXCLUDED_ACCOUNT_MARKERS = ('LOCAL', 'SYSTEM', 'ANONYMOUS', 'ADMINISTRATOR', '$')


def _user_account_mask(accounts):
    """Return a boolean Series that is True where an account name has no excluded marker.

    Parameters
    ----------
    accounts : pandas.Series
        Account names, e.g. ``df['src_user']``.

    Returns
    -------
    pandas.Series
        Boolean mask aligned with ``accounts``.

    Notes
    -----
    Markers are matched literally (``regex=False``). Interpreted as a regular
    expression, '$' is the end-of-string anchor and matches every value, which
    would filter out every row. Missing names count as matches (``na=True``),
    so rows without an account name are dropped.
    """
    keep = pd.Series(True, index=accounts.index)
    for marker in EXCLUDED_ACCOUNT_MARKERS:
        keep = keep & ~accounts.str.contains(marker, regex=False, na=True)
    return keep


# Filter on the source account first, then on the destination account of the
# rows that remain.
df = df[_user_account_mask(df['src_user'])]
df = df[_user_account_mask(df['dest_user'])]


['T',
 '_AXIS_LEN',
 '_AXIS_ORDERS',
 '_AXIS_TO_AXIS_NUMBER',
 '_HANDLED_TYPES',
 '__abs__',
 '__add__',
 '__and__',
 '__annotations__',
 '__array__',
 '__array_priority__',
 '__array_ufunc__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__contains__',
 '__copy__',
 '__dataframe__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdivmod__',
 '__reduce__',
 '__reduce_ex_

In [8]:
# Keep an independent copy of just the two columns used as edge endpoints:
# the source user and the destination computer.
endpoint_columns = ['source user@domain', 'destination computer']
client_service = pandas_lanl.loc[:, endpoint_columns].copy()
client_service.sample(5)

Unnamed: 0,source user@domain,destination computer
523330290,NETWORK SERVICE@C9670,C9670
545907035,C104$@DOM1,C988
729814256,U1552@DOM1,C2106
94720870,C1763$@DOM1,C586
474355051,C612$@DOM1,C612


In [9]:
# Give the endpoint columns short names: source users become "client",
# destination computers become "service".
endpoint_names = {
    "source user@domain": "client",
    "destination computer": "service",
}
client_service = client_service.rename(columns=endpoint_names)

In [10]:
# Lookup table: one row per distinct client name, numbered 0..n-1.
client_names = client_service['client'].unique()
unique_client_id = pd.DataFrame({
    'clientId': client_names,
    'mappedID': pd.RangeIndex(len(client_names)),
})

print(unique_client_id)

# Same construction for the distinct service (destination computer) names.
service_names = client_service['service'].unique()
unique_service_id = pd.DataFrame({
    'serviceId': service_names,
    'mappedID': pd.RangeIndex(len(service_names)),
})

                    clientId  mappedID
0      ANONYMOUS LOGON@C1697         0
1       ANONYMOUS LOGON@C586         1
2               C10081$@DOM1         2
3                 C101$@DOM1         3
4               C10403$@DOM1         4
...                      ...       ...
74987          U10647@C17322     74987
74988                U9322@?     74988
74989            U10647@DOM9     74989
74990  ANONYMOUS LOGON@C3506     74990
74991             U790@DOM40     74991

[74992 rows x 2 columns]


In [11]:
# Number of distinct clients and services (rows in each lookup table).
print(unique_client_id.shape[0], unique_service_id.shape[0])

74992 15520


In [12]:
# Display the renamed (client, service) frame.
client_service

Unnamed: 0,client,service
0,ANONYMOUS LOGON@C1697,C1697
1,ANONYMOUS LOGON@C586,C586
2,ANONYMOUS LOGON@C586,C586
3,C10081$@DOM1,C528
4,C101$@DOM1,C988
...,...,...
732190494,U7813@DOM1,C5618
732190495,U8712@DOM1,C1065
732190496,U939@DOM1,C10
732190497,U9@?,C222


In [13]:
# Translate every (client, service) row into integer node ids by left-joining
# each endpoint column against its lookup table.
client_lookup = client_service[['client']].merge(
    unique_client_id, left_on='client', right_on='clientId', how='left'
)
client_id = torch.from_numpy(client_lookup['mappedID'].to_numpy())

service_lookup = client_service[['service']].merge(
    unique_service_id, left_on='service', right_on='serviceId', how='left'
)
service_id = torch.from_numpy(service_lookup['mappedID'].to_numpy())

# PyG edge_index layout: row 0 holds source (client) ids, row 1 holds
# destination (service) ids.
edge_index_client_to_service = torch.stack((client_id, service_id), dim=0)

print(edge_index_client_to_service)


tensor([[    0,     1,     1,  ..., 26728, 63315,  1894],
        [    0,     1,     1,  ...,  2959,    54,    54]])


In [14]:
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T


In [15]:
# pip install pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.13.0+cpu.html

In [16]:
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

# Heterogeneous graph with two node types ("client", "service") joined by
# "auth_attempt" edges. Nodes carry no features, only their integer id.
num_clients = len(unique_client_id)
num_services = len(unique_service_id)

data = HeteroData()
data["client"].node_id = torch.arange(num_clients)
data["service"].node_id = torch.arange(num_services)
data["client", "auth_attempt", "service"].edge_index = edge_index_client_to_service

# Apply ToUndirected so the graph also has the reverse
# ("service", "rev_auth_attempt", "client") edges.
Transform = T.ToUndirected()
data = Transform(data)

In [17]:
# Edge type being predicted, and its reverse counterpart in the graph.
supervised_edge_type = ("client", "auth_attempt", "service")
reverse_edge_type = ("service", "rev_auth_attempt", "client")

# Split the edges into train / validation / test sets for link prediction.
transform = T.RandomLinkSplit(
    num_val=0.1,                       # share of edges held out for validation
    num_test=0.1,                      # share of edges held out for testing
    disjoint_train_ratio=0.3,          # share of training edges used only as supervision
    neg_sampling_ratio=2,              # negative edges per positive edge
    add_negative_train_samples=False,  # training negatives are left to the loader
    edge_types=supervised_edge_type,
    rev_edge_types=reverse_edge_type,
)

train_data, val_data, test_data = transform(data)

In [18]:
from torch_geometric.loader import LinkNeighborLoader

# Seed (supervision) edges and their labels come from the training split.
auth_edge_type = ("client", "auth_attempt", "service")
edge_label_index = train_data[auth_edge_type].edge_label_index
edge_label = train_data[auth_edge_type].edge_label

train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[20, 10],  # neighbours sampled per node for the 1st and 2nd hop
    neg_sampling_ratio=2,    # negative edges sampled per positive seed edge
    edge_label_index=(auth_edge_type, edge_label_index),
    edge_label=edge_label,
    batch_size=128,
    shuffle=True,
)

# Draw one mini-batch to inspect what the loader yields.
sampled_data = next(iter(train_loader))

print("Sampled mini-batch:")
print("===================")
print(sampled_data)

Sampled mini-batch:
HeteroData(
  [1mclient[0m={ node_id=[2367] },
  [1mservice[0m={ node_id=[1082] },
  [1m(client, auth_attempt, service)[0m={
    edge_index=[2, 9850],
    edge_label=[384],
    edge_label_index=[2, 384],
    input_id=[128]
  },
  [1m(service, rev_auth_attempt, client)[0m={ edge_index=[2, 16156] }
)


In [19]:
from torch_geometric.nn import SAGEConv, to_hetero

In [20]:
class GNN(torch.nn.Module):
    """Two stacked SAGEConv layers that keep the feature width at ``hidden_channels``."""

    def __init__(self, hidden_channels):
        super().__init__()

        self.conv1 = SAGEConv(hidden_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)

    def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
        """Apply conv1, a ReLU, then conv2, and return the result of conv2."""
        hidden = self.conv1(x, edge_index).relu()
        return self.conv2(hidden, edge_index)

# Edge scorer: the prediction for an edge is the dot product of the embeddings
# of its two endpoint nodes.
class Classifier(torch.nn.Module):
    """Score candidate edges by the dot product of their endpoint embeddings."""

    def forward(self, x_user: Tensor, x_movie: Tensor, edge_label_index: Tensor) -> Tensor:
        """Return one score per column of ``edge_label_index``.

        Row 0 of ``edge_label_index`` indexes into ``x_user`` and row 1 indexes
        into ``x_movie``; the selected rows are multiplied element-wise and
        summed over the last dimension.
        """
        source_rows = edge_label_index[0]
        target_rows = edge_label_index[1]
        return torch.sum(x_user[source_rows] * x_movie[target_rows], dim=-1)


class Model(torch.nn.Module):
    """Link-prediction model: learned node embeddings -> hetero GNN -> dot-product scorer."""

    def __init__(self, hidden_channels):
        super().__init__()
        # NOTE: node counts and graph metadata are read from the module-level
        # `data` graph, so that graph must exist before the model is constructed.
        num_clients = data["client"].num_nodes
        num_services = data["service"].num_nodes

        # The nodes have no input features, so one embedding table is learned
        # per node type ("client" -> user_emb, "service" -> movie_emb).
        self.user_emb = torch.nn.Embedding(num_clients, hidden_channels)
        self.movie_emb = torch.nn.Embedding(num_services, hidden_channels)

        # Build the homogeneous GNN and convert it into its heterogeneous
        # variant for this graph's node and edge types.
        self.gnn = to_hetero(GNN(hidden_channels), metadata=data.metadata())

        self.classifier = Classifier()

    def forward(self, data: HeteroData) -> Tensor:
        """Return one score per supervision edge of the given (sub)graph."""
        # Look up the embedding of every node in the batch by its node id.
        embeddings = {
            "client": self.user_emb(data["client"].node_id),
            "service": self.movie_emb(data["service"].node_id),
        }

        # Run the GNN over all node types and all edge types of the batch.
        encoded = self.gnn(embeddings, data.edge_index_dict)

        supervision_edges = data["client", "auth_attempt", "service"].edge_label_index
        return self.classifier(encoded["client"], encoded["service"], supervision_edges)

        
# Instantiate the model with 64-dimensional embeddings / hidden layers and print its layout.
model = Model(hidden_channels=64)

print(model)

Model(
  (user_emb): Embedding(74992, 64)
  (movie_emb): Embedding(15520, 64)
  (gnn): GraphModule(
    (conv1): ModuleDict(
      (client__auth_attempt__service): SAGEConv(64, 64, aggr=mean)
      (service__rev_auth_attempt__client): SAGEConv(64, 64, aggr=mean)
    )
    (conv2): ModuleDict(
      (client__auth_attempt__service): SAGEConv(64, 64, aggr=mean)
      (service__rev_auth_attempt__client): SAGEConv(64, 64, aggr=mean)
    )
  )
  (classifier): Classifier()
)


In [None]:
import tqdm
import torch.nn.functional as F

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: '{device}'")

model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Number of passes over the training loader.
NUM_EPOCHS = 7

model.train()
for epoch in range(1, NUM_EPOCHS + 1):
    total_loss = total_examples = 0
    for sampled_data in tqdm.tqdm(train_loader):
        optimizer.zero_grad()

        sampled_data = sampled_data.to(device)

        # Call the module itself rather than `.forward` so that any registered
        # hooks run as part of the call.
        pred = model(sampled_data)

        # Binary labels of the supervision edges in this mini-batch.
        ground_truth = sampled_data["client", "auth_attempt", "service"].edge_label
        loss = F.binary_cross_entropy_with_logits(pred, ground_truth)

        loss.backward()
        optimizer.step()

        # Weight each batch loss by its number of predictions so the epoch
        # figure is an average per prediction.
        total_loss += float(loss) * pred.numel()
        total_examples += pred.numel()
    print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")

Device: 'cpu'


  0%|                                           | 986/1372858 [05:37<108:49:57,  3.50it/s]

In [None]:
# Persist the trained model. The module object itself is passed (not a state_dict), so the
# whole object is serialised; loading it later presumably requires the Model / GNN /
# Classifier definitions from this notebook to be available — TODO confirm.
torch.save(model, '../lm-vol/torchmodel01')