In [None]:
# %pip install chardet

In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import stellargraph as sg

2023-03-01 07:32:11.689559: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-01 07:32:12.406712: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-03-01 07:32:12.541715: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-01 07:32:12.541771: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore 

In [2]:
# Names of the CSV columns used to build the graph. Later cells read them by
# position: [0] is the user, [1] the source computer, [2] the destination computer.
graph_structure = ('source user@domain', 'source computer', 'destination computer')
# Graph that the training events are loaded into (users and computers share one node set).
G = nx.Graph()

In [3]:
# Build the graph from the training log. The CSV is streamed in chunks of one
# million rows so the whole file never has to be held in memory at once.
user_col, src_computer_col, dst_computer_col = graph_structure
for chunk in pd.read_csv('../lm-vol/LANL_train_unique_v2.csv', chunksize=1000000):
    # Walk the three relevant columns in lockstep instead of `iterrows()`, which
    # materialises a full Series for every row and is far slower on large frames.
    for user_node, ip_node, service_node in zip(
        chunk[user_col], chunk[src_computer_col], chunk[dst_computer_col]
    ):
        # `add_edge` would create missing endpoints on its own; the explicit call
        # is kept so nodes are inserted in exactly the same order as before.
        G.add_nodes_from([ip_node, service_node, user_node])
        # The source computer is the hub: it is linked to the user that
        # authenticated from it and to the destination computer it reached.
        G.add_edge(ip_node, user_node)
        G.add_edge(ip_node, service_node)

In [4]:
# Save the assembled graph to disk in GraphML format.
nx.write_graphml(G, "../lm-vol/LANL_train_v2.graphml")

In [5]:
from stellargraph import StellarGraph
from stellargraph.data import EdgeSplitter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV

# Wrap the networkx graph as a StellarGraph; the random-walk cell below operates on `graph`.
graph = StellarGraph.from_networkx(G)


In [6]:
# Sanity check: print StellarGraph's summary of node/edge counts and types.
print(graph.info())

StellarGraph: Undirected multigraph
 Nodes: 92127, Edges: 981716

 Node types:
  default: [92127]
    Features: none
    Edge types: default-default->default

 Edge types:
    default-default->default: [981716]
        Weights: all 1 (default)
        Features: none


In [7]:
# Hyperparameters for the random walks and the Word2Vec embedding model.
p = 1.0  # passed as `p` to BiasedRandomWalk.run
q = 1.0  # passed as `q` to BiasedRandomWalk.run
dimensions = 128  # Word2Vec `vector_size` (length of each node embedding)
num_walks = 20  # passed as `n` to BiasedRandomWalk.run
walk_length = 10  # passed as `length` to BiasedRandomWalk.run
window_size = 5  # Word2Vec `window`
num_iter = 100  # Word2Vec `epochs` for the first training run
workers = 32  # Word2Vec `workers`

In [8]:
from stellargraph.data import BiasedRandomWalk
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

# init callback class
class callback(CallbackAny2Vec):
    """
    Callback to print loss after each epoch.

    The value returned by ``model.get_latest_training_loss()`` is treated as a
    running total, so the total seen at the end of the previous epoch is
    subtracted to report the loss contributed by the epoch that just finished.
    """
    def __init__(self):
        self.epoch = 0
        # Initialise the previous total here (rather than lazily in
        # `on_epoch_end`) so the attribute always exists and the first epoch
        # needs no special case: `loss - 0` is just `loss`.
        self.loss_previous_step = 0

    def on_epoch_end(self, model):
        """Print the loss for the finished epoch and remember the running total.

        Args:
            model: the model being trained; only ``get_latest_training_loss()``
                is called on it.
        """
        loss = model.get_latest_training_loss()
        print('Loss after epoch {}: {}'.format(self.epoch, loss - self.loss_previous_step))
        self.epoch += 1
        self.loss_previous_step = loss

In [9]:
# Generate biased random walks starting from every node of the graph:
# `num_walks` walks per node, each up to `walk_length` nodes long.
rw = BiasedRandomWalk(graph)
walks = rw.run(graph.nodes(), n=num_walks, length=walk_length, p=p, q=q)
print(f"Number of random walks for Graph: {len(walks)}")

# Learn node embeddings by training Word2Vec on the walks, treating each walk
# as a sentence whose words are node ids.
model = Word2Vec(
    walks,
    vector_size=dimensions,
    window=window_size,
    min_count=0,  # keep every node, however rarely it shows up in the walks
    sg=0,  # CBOW training (sg=1 would select skip-gram)
    hs=0,  # no hierarchical softmax ...
    negative=5,  # ... negative sampling with 5 noise samples instead
    cbow_mean=1,  # average (rather than sum) the context vectors
    workers=workers,
    epochs=num_iter,
    compute_loss=True,  # needed so the callback has a loss to report
    callbacks=[callback()]
)
# Persist the full model and, separately, just the embedding vectors.
model.save("../lm-vol/2_28_new_structure_word2vec.model")
model.wv.save("../lm-vol/2_28_new_structure_word2vec.wordvectors")

Number of random walks for Graph: 1842540
Loss after epoch 0: 1158454.875
Loss after epoch 1: 924504.375
Loss after epoch 2: 806799.75
Loss after epoch 3: 718559.5
Loss after epoch 4: 679811.5
Loss after epoch 5: 599827.5
Loss after epoch 6: 629410.0
Loss after epoch 7: 601714.0
Loss after epoch 8: 596464.0
Loss after epoch 9: 591978.5
Loss after epoch 10: 600476.5
Loss after epoch 11: 593272.5
Loss after epoch 12: 534011.0
Loss after epoch 13: 525481.0
Loss after epoch 14: 523690.0
Loss after epoch 15: 544804.0
Loss after epoch 16: 540934.0
Loss after epoch 17: 534969.0
Loss after epoch 18: 515628.0
Loss after epoch 19: 525492.0
Loss after epoch 20: 525633.0
Loss after epoch 21: 525747.0
Loss after epoch 22: 532589.0
Loss after epoch 23: 549554.0
Loss after epoch 24: 530454.0
Loss after epoch 25: 516066.0
Loss after epoch 26: 527237.0
Loss after epoch 27: 490034.0
Loss after epoch 28: 430248.0
Loss after epoch 29: 425046.0
Loss after epoch 30: 437306.0
Loss after epoch 31: 452064.0
Lo

In [None]:
# Retrain from scratch on the same walks for 500 epochs. This rebinds `model`,
# so every later cell uses this longer-trained model once this cell has run.
model = Word2Vec(
    walks,
    vector_size=dimensions,
    window=window_size,
    min_count=0,  # keep every node, however rarely it shows up in the walks
    sg=0,  # CBOW training (sg=1 would select skip-gram)
    hs=0,  # no hierarchical softmax ...
    negative=5,  # ... negative sampling with 5 noise samples instead
    cbow_mean=1,  # average (rather than sum) the context vectors
    workers=workers,
    epochs=500,
    compute_loss=True,  # needed so the callback has a loss to report
    callbacks=[callback()]
)
# Persist the full model and, separately, just the embedding vectors.
model.save("../lm-vol/2_28_new_structure_500_epochs_word2vec.model")
model.wv.save("../lm-vol/2_28_new_structure_500_epochs_word2vec.wordvectors")

Loss after epoch 0: 1151925.25
Loss after epoch 1: 915426.125
Loss after epoch 2: 767171.125
Loss after epoch 3: 754841.5
Loss after epoch 4: 665682.5
Loss after epoch 5: 597978.0
Loss after epoch 6: 627858.5
Loss after epoch 7: 634333.5
Loss after epoch 8: 620995.5
Loss after epoch 9: 582733.5
Loss after epoch 10: 545363.0
Loss after epoch 11: 606058.5
Loss after epoch 12: 517048.0
Loss after epoch 13: 551043.0
Loss after epoch 14: 553164.0
Loss after epoch 15: 503703.0
Loss after epoch 16: 529072.0
Loss after epoch 17: 557045.0
Loss after epoch 18: 500790.0
Loss after epoch 19: 529170.0
Loss after epoch 20: 564730.0
Loss after epoch 21: 549851.0
Loss after epoch 22: 596270.0
Loss after epoch 23: 538685.0
Loss after epoch 24: 552950.0
Loss after epoch 25: 543224.0
Loss after epoch 26: 588844.0
Loss after epoch 27: 487458.0
Loss after epoch 28: 458192.0
Loss after epoch 29: 467352.0
Loss after epoch 30: 467362.0
Loss after epoch 31: 470612.0
Loss after epoch 32: 472988.0
Loss after epo

In [None]:
# Raw embedding matrix held by the trained model's keyed vectors.
node_embeddings = model.wv.vectors

In [None]:
def operator_hadamard(u, v):
    """Combine two node embeddings into a single edge feature.

    Returns ``u * v``; for NumPy arrays of matching shape this is the
    element-wise (Hadamard) product.
    """
    edge_feature = u * v
    return edge_feature

In [None]:
# Define an edge splitter on the original graph:
edge_splitter_test = EdgeSplitter(G)

# Randomly sample a fraction p=0.99 of all positive links, and the same number of negative links, from the graph,
# and obtain the reduced graph graph_test with the sampled links removed. The sampled links and their 0/1 labels
# (edges_train, labels_train) are what the link classifier further down is fitted on.
# NOTE(review): the embeddings in `model` were learned from the full graph, not from graph_test, so the sampled
# positive links were visible during embedding training — confirm this is intended.
graph_test, edges_train, labels_train = edge_splitter_test.train_test_split(
    p=0.99, method="global"
)

In [None]:
# One feature vector per sampled link: the product of its two endpoint embeddings.
training_edges = [
    operator_hadamard(model.wv[endpoints[0]], model.wv[endpoints[1]])
    for endpoints in edges_train
]

In [None]:
# Fit a cross-validated logistic regression that separates real links from sampled non-links.
clf = LogisticRegressionCV(cv=5, max_iter=1000, random_state=0).fit(training_edges, labels_train)
# NOTE(review): this scores the classifier on the same data it was fitted on, so the
# number shown is training accuracy, not an estimate of held-out performance.
clf.score(training_edges, labels_train)

In [None]:
# Score every event in the test log with the link classifier.
#
# Two candidate links are scored per row: (user, source computer) and
# (source computer, destination computer). `anomalies` receives one entry per
# row: `[client_to_ip, ip_to_service]` (the two `predict_proba` outputs), or ''
# when one of the three endpoints has no embedding. `inconclusive` counts the
# '' entries.
anomalies = []
inconclusive = 0
user_col, src_computer_col, dst_computer_col = graph_structure
for chunk in pd.read_csv('../lm-vol/LANL_test.csv', header=None, chunksize=1000000):
    # With `header=None` pandas labels the columns 0..N-1, so the named columns
    # in `graph_structure` cannot be found. Previously that KeyError was caught
    # by a bare `except` and every row was silently counted as inconclusive;
    # fail loudly instead so the column mapping gets fixed.
    # NOTE(review): the schema of LANL_test.csv is not visible here — either
    # read it with its header row or pass `names=[...]` to `pd.read_csv`.
    missing_columns = [col for col in graph_structure if col not in chunk.columns]
    if missing_columns:
        raise ValueError(
            f"LANL_test.csv has no column(s) {missing_columns}; "
            f"columns found: {list(chunk.columns)}. "
            "Read the file with its header row or pass names=[...] to pd.read_csv."
        )
    # Preview only the first rows instead of dumping the whole chunk.
    display(chunk.head())
    for user, src_computer, dst_computer in zip(
        chunk[user_col], chunk[src_computer_col], chunk[dst_computer_col]
    ):
        try:
            client_embedding = model.wv[user]
            ip_embedding = model.wv[src_computer]
            service_embedding = model.wv[dst_computer]
        except (KeyError, TypeError):
            # KeyError: the node never appeared in the training walks.
            # TypeError: presumably a non-key value such as a missing field (NaN).
            # Either way no prediction is possible for this row.
            anomalies.append('')
            inconclusive += 1
            continue
        # Classifier errors are deliberately NOT caught: they indicate a real
        # problem rather than an unseen node.
        client_to_ip_embedding = [operator_hadamard(client_embedding, ip_embedding)]
        ip_to_service_embedding = [operator_hadamard(ip_embedding, service_embedding)]
        client_to_ip = clf.predict_proba(client_to_ip_embedding)
        ip_to_service = clf.predict_proba(ip_to_service_embedding)
        anomalies.append([client_to_ip, ip_to_service])

In [None]:
# Pull column 1 of each `predict_proba` result out of the scored rows; rows that
# could not be scored (stored as '') become None so both lists stay aligned
# with `anomalies`.
client_to_ip_pred = [
    scored[0][0][1] if scored != '' else None
    for scored in anomalies
]
ip_to_service_pred = [
    scored[1][0][1] if scored != '' else None
    for scored in anomalies
]
# NOTE(review): `test_df` is not defined in any cell visible in this notebook —
# confirm where it is created and that it has one row per entry of `anomalies`.
test_df['client_to_ip_pred'] = client_to_ip_pred
test_df['ip_to_service_pred'] = ip_to_service_pred
print(inconclusive)

In [None]:
# Number of scored rows (including the inconclusive ones). The closing
# parenthesis was missing, which made this cell a SyntaxError.
print(len(anomalies))

In [None]:
# Dump the raw per-row results to CSV.
# NOTE(review): `anomalies` mixes '' placeholders with pairs of predict_proba arrays,
# so the resulting file may be awkward to parse back — verify the output format.
anomalies_df= pd.DataFrame(anomalies)
anomalies_df.to_csv('../lm-vol/anomalies.csv')

In [None]:
# Same scoring as the full run, restricted to the first chunk of the test log
# (the loop stops after one chunk). Results go to `anomalies2`: one entry per
# row, either `[client_to_ip, ip_to_service]` (the two `predict_proba` outputs)
# or '' when an endpoint has no embedding. `inconclusive2` counts the '' entries.
anomalies2 = []
inconclusive2 = 0
user_col, src_computer_col, dst_computer_col = graph_structure
for chunk in pd.read_csv('../lm-vol/LANL_test.csv', header=None, chunksize=1000000):
    # With `header=None` pandas labels the columns 0..N-1, so the named columns
    # in `graph_structure` cannot be found. Previously that KeyError was caught
    # by a bare `except` and every row was silently counted as inconclusive;
    # fail loudly instead so the column mapping gets fixed.
    # NOTE(review): the schema of LANL_test.csv is not visible here — either
    # read it with its header row or pass `names=[...]` to `pd.read_csv`.
    missing_columns = [col for col in graph_structure if col not in chunk.columns]
    if missing_columns:
        raise ValueError(
            f"LANL_test.csv has no column(s) {missing_columns}; "
            f"columns found: {list(chunk.columns)}. "
            "Read the file with its header row or pass names=[...] to pd.read_csv."
        )
    # Preview only the first rows instead of dumping the whole chunk.
    display(chunk.head())
    for user, src_computer, dst_computer in zip(
        chunk[user_col], chunk[src_computer_col], chunk[dst_computer_col]
    ):
        try:
            client_embedding = model.wv[user]
            ip_embedding = model.wv[src_computer]
            service_embedding = model.wv[dst_computer]
        except (KeyError, TypeError):
            # KeyError: the node never appeared in the training walks.
            # TypeError: presumably a non-key value such as a missing field (NaN).
            anomalies2.append('')
            # Count into this cell's own counter; the original incremented
            # `inconclusive`, corrupting the full run's tally and leaving
            # `inconclusive2` stuck at 0.
            inconclusive2 += 1
            continue
        # Classifier errors are deliberately NOT caught: they indicate a real
        # problem rather than an unseen node.
        client_to_ip_embedding = [operator_hadamard(client_embedding, ip_embedding)]
        ip_to_service_embedding = [operator_hadamard(ip_embedding, service_embedding)]
        client_to_ip = clf.predict_proba(client_to_ip_embedding)
        ip_to_service = clf.predict_proba(ip_to_service_embedding)
        anomalies2.append([client_to_ip, ip_to_service])
    # Only the first chunk is scored in this cell.
    break