In [1]:
from algorithm import GraphWrapper
from pathlib import Path
from tqdm import tqdm

input_path = Path('../data/benign_graphs/tc3-theia/firefox/nd')
input_paths = list(input_path.glob('*.json'))[:200]
input_graphs = [GraphWrapper(input_path) 
                for input_path in tqdm(input_paths, desc='Reading graphs')]
len(input_graphs)

Reading graphs: 100%|██████████| 200/200 [00:08<00:00, 24.58it/s]


200

In [5]:
from collections import deque
from algorithm import GraphWrapper, EdgeWrapper, NodeWrapper, Subgraph, IN, OUT

def get_subgraphs(graph: GraphWrapper, direction: str) -> list[Subgraph]:
    result: list[Subgraph] = []
    visited_edges: set[EdgeWrapper] = set()

    queue = deque([(0, graph.source_edge_id)])
    while len(queue) > 0:
        # Pop
        depth, edge_id = queue.popleft()
        edge: EdgeWrapper = graph.get_edge(edge_id)
        if edge in visited_edges:
            continue
        visited_edges.add(edge)
        
        # Add subgraph
        result.append(Subgraph(graph, edge.get_ref_id(), direction, depth))
        
        # Extend queue
        node_id: int = edge.node_ids[direction]
        node: NodeWrapper = graph.get_node(node_id)
        next_edge_ids: list[int] = node.edge_ids[direction]

        queue.extend([(depth + 1, next_edge_id)
                      for next_edge_id in next_edge_ids])
    return result

subgraphs = []
for graph in input_graphs:
    subgraphs.extend(get_subgraphs(graph, IN))
len(subgraphs)

196148

In [79]:
x = [graph.depth for graph in subgraphs]
min(x), max(x)

(0, 6)

In [9]:
from algorithm import to_nx
nx_graphs = [to_nx(graph) 
             for graph in tqdm(subgraphs)
             if graph.depth > 0]
len(nx_graphs)

100%|██████████| 196148/196148 [00:13<00:00, 14903.99it/s]


196148

In [10]:
from karateclub import Graph2Vec

graph2vec = Graph2Vec(
    wl_iterations=80,
    attributed=True,
    dimensions=128,
    workers=4,
    epochs=5
)

graph2vec.fit(nx_graphs)

graph2vec_embedding = graph2vec.get_embedding()
len(graph2vec_embedding)

196148

In [29]:
sample_graph = nx_graphs[0]
expected = graph2vec_embedding[0]
actual = graph2vec.infer([sample_graph])[0]
cosine_similarity([expected], [actual])[0][0]


0.9835127

In [33]:
actual

array([-0.33686686,  1.3241478 , -0.20804486, -0.7133608 ,  0.16706073,
        2.2758617 , -0.07645781, -1.0982784 , -0.11991999, -0.28946844,
        1.0980189 , -2.2240322 , -0.6862391 , -1.0491819 , -0.7996058 ,
        1.1191347 , -0.9507253 ,  0.8966808 , -1.4844282 , -0.23176792,
        1.2167865 , -0.74406123, -1.7537078 , -0.53550535, -2.1943705 ,
       -0.28732145, -1.3630453 ,  0.6877426 , -2.0306783 ,  0.28121036,
       -1.2813061 ,  0.31585386,  1.6081222 , -0.18307124,  0.03101732,
        1.3529453 ,  0.12766427,  0.82769   , -0.17096856, -0.7875405 ,
       -0.24664208,  0.49636075, -0.30190617,  1.682239  ,  0.8090765 ,
        1.576274  ,  0.20942189,  1.174763  , -0.43624714,  0.92903787,
       -0.7365148 ,  0.8803508 , -0.6727627 ,  1.4832268 , -1.6317298 ,
        1.2458901 , -0.40801254, -1.6215914 , -0.65813184,  0.07842028,
       -1.3635583 ,  0.80056465,  1.2470622 ,  0.8999733 ,  0.9479297 ,
       -0.58597016,  0.37318158,  0.8078734 , -0.93007463, -0.25

In [44]:
x = [graph2vec.infer([sample_graph])[0] for _ in tqdm(range(100))]

100%|██████████| 100/100 [00:18<00:00,  5.53it/s]


In [84]:
import random

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

indices = list(range(len(nx_graphs)))
sample_indices = random.choices(indices, k=20000)
columns = ['index', 'embedding_i', 'inferred_i', 'cosine_similarity']
df = pd.DataFrame(columns=columns)

for i in tqdm(sample_indices):
    graph = subgraphs[i]
    row = {
        'index': i,
        'depth': graph.depth,
        '#edges': len(graph.edges),
        '#nodes': len(graph.nodes),
        'embedding_i': graph2vec_embedding[i],
        'inferred_i': graph2vec.infer([nx_graphs[i]])[0],
        'cosine_similarity': cosine_similarity([graph2vec_embedding[i]], [graph2vec.infer([nx_graphs[i]])[0]])[0, 0]
    }
    df = df.append([row], ignore_index=True)

 34%|███▍      | 33872/100000 [03:10<06:10, 178.26it/s]


KeyboardInterrupt: 

In [None]:
df['#edges'] = df['index'].apply(lambda i: len(subgraphs[i].edges))

In [83]:
# group df by #edges
grouped_data = df.groupby('#edges')['cosine_similarity'].agg(['mean', 'std', 'count'])
grouped_data

Unnamed: 0_level_0,mean,std,count
#edges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,0.971427,0.020596,9878
2.0,0.976955,0.023588,20
3.0,0.956525,0.024739,5
4.0,0.97476,0.003769,5
5.0,0.963782,,1
7.0,0.979989,0.005538,4
17.0,0.987531,,1
22.0,0.966198,,1
25.0,0.980197,,1
28.0,0.989507,,1


In [None]:
import networkx as nx


In [73]:
grouped_data

Unnamed: 0_level_0,mean,std
#edges,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.971014,0.02009
2,0.979394,0.023532
4,0.976383,0.001113
7,0.974751,0.001285
49,0.976232,0.006081
310,0.991879,0.00054
346,0.994548,0.000254
417,0.994258,0.000175
428,0.995232,0.000212
584,0.992591,0.000331


In [65]:
df['cosine_similarity'].std()

0.020090329

In [53]:
np.std(similarities)

0.020014646

In [None]:
import pickle
with open('graph2vec_embedding.p', 'wb') as f:
    pickle.dump(graph2vec_embedding, f)
with open('subgraphs.p', 'wb') as f:
    pickle.dump(subgraphs, f)

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations

data = {
    'delta_depth': [],
    'depth_1': [],
    'depth_2': [],
    'cosine_similarity': [],
    'parent_graph': []
}

graph_ids = list(set([s.graph.source_edge_id for s in subgraphs]))[:5]
for i_g, graph_id in enumerate(graph_ids):
    contained_subgraphs = [(i, s) for i, s in enumerate(subgraphs) if s.graph.source_edge_id == graph_id]
    comparisons = list(combinations(contained_subgraphs, 2))
    for (i, s1), (j, s2) in tqdm(comparisons, desc=f'{i_g+1}/{len(graph_ids)} ({graph_id})'):
        data['depth_1'].append(s1.depth)
        data['depth_2'].append(s2.depth)
        data['cosine_similarity'].append(cosine_similarity([graph2vec_embedding[i]], [graph2vec_embedding[j]])[0][0])


0/5: 100%|██████████| 500500/500500 [01:24<00:00, 5938.67it/s]
1/5: 100%|██████████| 500500/500500 [01:24<00:00, 5953.98it/s]
2/5: 100%|██████████| 500500/500500 [01:23<00:00, 5991.87it/s]
3/5: 100%|██████████| 500500/500500 [01:23<00:00, 5971.07it/s]
4/5: 100%|██████████| 500500/500500 [01:23<00:00, 5969.79it/s]


In [21]:
import pandas as pd
df = pd.DataFrame(data)
df.head()

ValueError: All arrays must be of the same length

In [18]:

grouped_data = df.groupby('delta_depth')['cosine_similarity'].agg(['mean', 'std'])
grouped_data.head()

Unnamed: 0_level_0,mean,std
delta_depth,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.948009,0.076453
1,0.731554,0.067857
2,0.954441,0.070242
3,0.219831,0.023357


In [88]:
import networkx as nx
G = nx_graphs[0]
# This displays the graph as a png
nx.draw(G, with_labels=True)

TypeError: '_AxesStack' object is not callable

<Figure size 640x480 with 0 Axes>