In [2]:
import pickle

import gensim
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.linear_model import LogisticRegressionCV
from sklearn.manifold import TSNE
from stellargraph.data import EdgeSplitter

In [3]:
# Read the training CSV into a DataFrame.
train_df = pd.read_csv('../../../../lm-vol/LANL_train_unique_v2.csv')

In [5]:
# Bare expression: the notebook renders the frame as this cell's output.
train_df

Unnamed: 0,source user@domain,destination computer,source computer
0,ANONYMOUS LOGON@C1697,C1697,C1697
1,ANONYMOUS LOGON@C586,C586,C1505
2,ANONYMOUS LOGON@C586,C586,C586
3,C10081$@DOM1,C528,C528
4,C101$@DOM1,C988,C988
...,...,...,...
1431275,U8500@DOM1,C529,C1128
1431276,U8500@DOM1,C457,C1128
1431277,U8500@DOM1,C1114,C1115
1431278,U8500@DOM1,C1115,C1115


In [6]:
# Read the test CSV into a DataFrame; later cells add score columns to this frame.
test_df = pd.read_csv('../../../../lm-vol/LANL_test_unique_v2.csv')

In [7]:
# Bare expression: the notebook renders the frame as this cell's output.
test_df

Unnamed: 0,source user@domain,destination computer,source computer
0,ANONYMOUS LOGON@C586,C586,C1250
1,ANONYMOUS LOGON@C586,C586,C586
2,C101$@DOM1,C988,C988
3,C1020$@DOM1,C1020,C1020
4,C1021$@DOM1,C625,C1021
...,...,...,...
1062869,U7319@?,C528,C528
1062870,U1345@DOM1,C1640,C23653
1062871,U5462@DOM1,C1065,C17256
1062872,U5462@DOM1,C1065,C1065


In [8]:
# Full Word2Vec model; later cells read embeddings from it through `model.wv[...]`.
model = gensim.models.Word2Vec.load('../../../../lm-vol/2_28_new_structure_500_epochs_word2vec.model')
# Stand-alone KeyedVectors file opened with mmap='r'; the scoring loop below indexes it as `wv[...]`.
wv = KeyedVectors.load('../../../../lm-vol/2_28_new_structure_500_epochs_word2vec.wordvectors', mmap='r')
# Graph read from GraphML; used below as the training graph (node-set comparison and explain_graph).
G = nx.read_graphml('../../../../lm-vol/LANL_train_unique_v2.graphml')

def operator_hadamard(u, v):
    """Return the product ``u * v``.

    The notebook calls this with two embedding vectors to build the feature
    vector of the edge between them; for arrays ``*`` is element-wise.
    """
    edge_feature = u * v
    return edge_feature

In [9]:
# NOTE: pickle.load can execute arbitrary code while deserialising; only point this at a file you trust.
with open('../../../../lm-vol/2_28_new_structure_lr.pkl', 'rb') as file:
    clf = pickle.load(file)  # used below through clf.predict_proba

In [10]:
# Column order follows the path scored per row: user -> source computer -> destination computer.
graph_structure = ('source user@domain', 'source computer', 'destination computer')
client_to_ip_pred = []    # P(class 1) for the user -> source-computer edge, or None
ip_to_service_pred = []   # P(class 1) for the source -> destination-computer edge, or None
inconclusive = 0          # rows that could not be scored
for index, row in test_df.iterrows():
    try:
        client_embedding = wv[row[graph_structure[0]]]
        ip_embedding = wv[row[graph_structure[1]]]
        service_embedding = wv[row[graph_structure[2]]]
        # predict_proba expects a 2-D input, hence the one-element lists.
        client_to_ip_embedding = [operator_hadamard(client_embedding, ip_embedding)]
        ip_to_service_embedding = [operator_hadamard(ip_embedding, service_embedding)]
        client_to_ip = clf.predict_proba(client_to_ip_embedding)
        ip_to_service = clf.predict_proba(ip_to_service_embedding)
    except Exception:
        # Deliberately best-effort: any ordinary failure (presumably a token that
        # has no embedding) marks the row as inconclusive. `Exception` rather
        # than a bare `except:` so KeyboardInterrupt / SystemExit still stop
        # this long loop instead of being counted as an unscored row.
        client_to_ip_pred.append(None)
        ip_to_service_pred.append(None)
        inconclusive += 1
    else:
        # Column 1 of predict_proba is the probability of the classifier's second class.
        client_to_ip_pred.append(client_to_ip[0][1])
        ip_to_service_pred.append(ip_to_service[0][1])

In [12]:
# Sanity check: one entry (score or None) is appended per test row.
len(client_to_ip_pred)

1062874

In [13]:
# Attach the per-row scores to test_df (mutates the frame in place; unscored rows hold None/NaN).
test_df['client_to_ip'] = client_to_ip_pred
test_df['ip_to_service'] = ip_to_service_pred

In [16]:
# A row is "anomalous" when either edge score is at most 0.1 and "normal" when
# both exceed 0.1. A NaN score fails every comparison, so rows left unscored in
# both columns fall into neither count.
ip_service_scores = test_df["ip_to_service"].astype(float)
client_ip_scores = test_df["client_to_ip"].astype(float)
anomalous_mask = (ip_service_scores <= 0.1) | (client_ip_scores <= 0.1)
normal_mask = (ip_service_scores > 0.1) & (client_ip_scores > 0.1)
anomalous_edges = len(test_df[anomalous_mask])
normal_edges = len(test_df[normal_mask])
for reported_value in (
    anomalous_edges/(normal_edges+anomalous_edges),
    anomalous_edges,
    normal_edges,
):
    print(reported_value)

0.3626282239611694
378928
666021


# Undirected graph of the test rows: user -- source computer -- destination computer.
test_G = nx.Graph()
user_col, source_col, destination_col = graph_structure
for row_label, record in test_df.iterrows():
    user, source, destination = record[user_col], record[source_col], record[destination_col]
    # Nodes are registered before the edges, in the order source, destination, user.
    test_G.add_nodes_from((source, destination, user))
    test_G.add_edges_from(((user, source), (source, destination)))

In [None]:
# Nodes that appear in the test graph but not in the training graph.
test_nodes = set(test_G.nodes)
train_nodes = set(G.nodes)
missing_nodes = test_nodes.difference(train_nodes)

# Work on a copy so test_G itself is left intact.
filtered_test_G = test_G.copy()
# Collect first, delete afterwards: removing edges while iterating over
# `.edges` changes the adjacency dicts mid-iteration and raises RuntimeError.
edges_with_unseen_endpoint = [
    (u, v)
    for u, v in filtered_test_G.edges
    if u in missing_nodes or v in missing_nodes
]
filtered_test_G.remove_edges_from(edges_with_unseen_endpoint)

In [None]:
# One (feature, label) pair per remaining test edge; label 1 means the
# classifier's class-1 probability is at most 0.1, otherwise 0.
edge_vectors = []
for u, v in filtered_test_G.edges:
    edge_feature = [operator_hadamard(model.wv[u], model.wv[v])]
    class_one_prob = clf.predict_proba(edge_feature)[0][1]
    edge_vectors.append((edge_feature, 1 if class_one_prob <= 0.1 else 0))

In [None]:
# Materialised as a list: a `filter` object is a one-shot iterator, so counting
# it would leave `anomalous_edge_vectors` exhausted (empty) for any later use.
anomalous_edge_vectors = [pair for pair in edge_vectors if pair[1] == 1]
print(len(anomalous_edge_vectors))

In [None]:
# Stack the edge features (each stored as a one-element list) into a 2-D array
# and project them to two dimensions; random_state is fixed at 7.
embeddings = np.array([feature[0] for feature, _ in edge_vectors])
tsne = TSNE(n_components=2, random_state=7, perplexity=15)
embeddings_2d = tsne.fit_transform(embeddings)

In [None]:
# Scatter the 2-D projection, coloured by edge label (0 / 1) on the 'bwr' colormap.
labels = np.array([label for _, label in edge_vectors])
figure, ax = plt.subplots(figsize=(11, 9))
ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], s=5, c=labels, cmap='bwr')

In [None]:
# Label every test row: 1 = anomalous (either edge scores <= 0.1), 0 = normal,
# -1 = could not be scored.
# Columns are taken from `graph_structure` (user, source computer, destination
# computer), which are the columns test_df actually has.
client_col, ip_col, service_col = graph_structure
missing_columns = [col for col in graph_structure if col not in test_df.columns]
if missing_columns:
    # Fail loudly on a schema mismatch instead of labelling every row -1.
    raise KeyError(f'test_df is missing expected columns: {missing_columns}')

predictions = []
for index, row in test_df.iterrows():
    try:
        client = model.wv[row[client_col]]
        ip = model.wv[row[ip_col]]
        service = model.wv[row[service_col]]
        # predict_proba expects a 2-D input, hence the one-element lists.
        client_to_ip = [operator_hadamard(client, ip)]
        ip_to_service = [operator_hadamard(service, ip)]
        prob1 = clf.predict_proba(client_to_ip)
        prob2 = clf.predict_proba(ip_to_service)
    except Exception:
        # Best-effort: a row that cannot be scored (presumably a token without
        # an embedding) is marked -1. `Exception` rather than a bare `except:`
        # so KeyboardInterrupt / SystemExit still stop the loop.
        predictions.append(-1)
        continue
    if prob1[0][1] <= 0.1 or prob2[0][1] <= 0.1:
        predictions.append(1)
    else:
        predictions.append(0)
test_df['predictions'] = predictions

In [None]:
# NOTE(review): the test_df preview earlier in this notebook shows only the user / source computer /
# destination computer columns; 'client', 'service', 'id.orig_h' and 'malicious' are not among them,
# so this selection presumably raises KeyError on that data. Confirm the intended schema.
test_df[['client', 'service', 'id.orig_h', 'predictions', 'malicious']]

In [None]:
# Confusion-matrix counts of `predictions` against the `malicious` ground truth.
# NOTE(review): the test_df preview earlier in this notebook shows no `malicious`
# column. Confirm the labels are joined in before this cell runs.
red_labels = list(test_df.malicious)
labels = list(test_df.predictions)
true_positives = 0
false_positives = 0
true_negatives = 0
false_negatives = 0
# Only a prediction of exactly 1 counts as "flagged"; every other value,
# including the -1 used for unscored rows, is treated as a negative prediction.
for label, red_label in zip(labels, red_labels):
    flagged = label == 1
    malicious = red_label == 1
    if flagged and malicious:
        true_positives += 1
    elif flagged:
        false_positives += 1
    elif malicious:
        false_negatives += 1
    else:
        true_negatives += 1

# Guard the rates: a label set with no negatives (or no positives) would
# otherwise raise ZeroDivisionError; report NaN in that case.
actual_negatives = true_negatives + false_positives
actual_positives = true_positives + false_negatives
false_positive_rate = false_positives / actual_negatives if actual_negatives else float('nan')
true_positive_rate = true_positives / actual_positives if actual_positives else float('nan')

print(f'true positives: {true_positives}')
print(f'true negatives: {true_negatives}')
print(f'false positives: {false_positives}')
print(f'false negatives: {false_negatives}')
print(f'false positive rate: {false_positive_rate}')
print(f'true positive rate: {true_positive_rate}')

In [None]:
# Distribution of the row labels written by the prediction cell (1, 0 and -1).
test_df['predictions'].value_counts()

In [None]:
# Split out the rows flagged as anomalous (1) and the rows that could not be scored (-1).
prediction_flags = test_df['predictions']
anomaly_df = test_df[prediction_flags == 1]
missing_df = test_df[prediction_flags == -1]

In [None]:
# NOTE(review): 'client', 'id.orig_h', 'service' and 'ts' are not among the columns shown in the
# test_df preview earlier in this notebook, so this selection presumably raises KeyError on that
# data. Confirm the intended schema.
anomaly_df[['client', 'id.orig_h', 'service', 'ts']]

In [None]:
# One [user, source computer, destination computer] list per flagged row, taken
# from the `graph_structure` columns that test_df actually has.
# explain_graph concatenates each entry with neighbour lists, so entries must be
# lists; `.values.tolist()` yields a list of lists.
anomalies = anomaly_df[list(graph_structure)].values.tolist()

In [None]:
def explain_graph(anomaly, train_graph):
    """Draw the training-graph neighbourhood around one anomalous triple.

    Parameters
    ----------
    anomaly : list
        ``[client_node, ip_node, service_node]``. It is concatenated with the
        neighbour lists, so it has to be a list.
    train_graph : networkx graph
        Graph in which the three nodes' neighbours are looked up.

    Prints the neighbour count of each of the three nodes, then draws the
    subgraph induced by the three nodes and their neighbours, with the
    client-ip and ip-service edges added. Each edge is coloured on the
    'RdYlGn' colormap by the class-1 probability that the module-level ``clf``
    assigns to the product of its endpoint embeddings from ``model.wv``. The
    client, ip and service nodes are drawn yellow, orange and red.
    """
    client_node, ip_node, service_node = anomaly

    # Look up all three neighbourhoods before printing anything.
    client_neighbors = list(train_graph.neighbors(client_node))
    ip_neighbors = list(train_graph.neighbors(ip_node))
    service_neighbors = list(train_graph.neighbors(service_node))
    print(f'client {anomaly[0]} has {len(client_neighbors)} neighbors')
    print(f'ip {anomaly[1]} has {len(ip_neighbors)} neighbors')
    print(f'service {anomaly[2]} has {len(service_neighbors)} neighbors')

    # Restrict the training graph to the three nodes and their neighbours, then
    # copy it into an undirected graph so the anomalous edges can be added.
    kept_nodes = set(client_neighbors + ip_neighbors + service_neighbors + anomaly)
    view = nx.subgraph_view(train_graph, filter_node=lambda node: node in kept_nodes)
    subgraph = nx.Graph(view)
    subgraph.add_edges_from([(client_node, ip_node), (ip_node, service_node)])

    # Store each edge's colour under the 'prob' attribute.
    palette = plt.get_cmap('RdYlGn')
    for src, dest in subgraph.edges:
        edge_embedding = [operator_hadamard(model.wv[src],model.wv[dest])]
        subgraph[src][dest]['prob'] = palette(clf.predict_proba(edge_embedding)[0][1])

    # Later keys override earlier ones, so when roles coincide the priority is
    # client, then ip, then service.
    highlight = {service_node: 'red', ip_node: 'orange', client_node: 'yellow'}
    node_colors = [highlight.get(node, '#1f77b4') for node in subgraph.nodes]
    edge_colors = [color for _, _, color in subgraph.edges.data('prob')]
    nx.draw(subgraph, edge_color=edge_colors, node_color=node_colors, with_labels=True)

In [None]:
# Visualise one flagged row against the training graph. The index 75 is hard-coded, so this raises
# IndexError when `anomalies` has fewer than 76 entries.
explain_graph(anomalies[75], G)