In [None]:
# Mount Google Drive inside the Colab runtime. Every path configured in this
# notebook lives under /content/drive, so this cell has to run first.
# force_remount=True is passed so that re-running the cell requests a fresh mount.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Constants

In [None]:
# Folder that the main cell walks (including sub-folders) to collect the input JSON graph files.
data_folder  = "/content/drive/My Drive/Graph2Vec/data/"
# Degree of parallelism: used as joblib's n_jobs for feature extraction and as Doc2Vec's workers.
workers = 4
# Number of Weisfeiler-Lehman relabelling rounds run for every graph.
wl_iterations = 2
# Embedding size (Doc2Vec vector_size); also the number of x_* columns written to the CSV.
dimensions = 64
# Passed to Doc2Vec as min_count.
min_count = 1
# Passed to Doc2Vec as sample.
down_sampling = 0.0001
# Passed to Doc2Vec as epochs.
epochs = 100
# Passed to Doc2Vec as alpha.
learning_rate = 0.01
# CSV file that save_to_csv writes the per-graph vectors and labels to.
output_path = "/content/drive/My Drive/Graph2Vec/vectors/vector.csv"
# Another model path kept for reference (presumably an earlier saved model):
# "/content/drive/My Drive/Graph2Vec/model/model"
# Destination of model.save(). The variable name is spelled "seve_model_path" (sic) and is
# referenced under exactly that spelling by the main cell, so rename both together or not at all.
seve_model_path = "/content/drive/My Drive/Graph2Vec/model/model1"

# Weisfeiler-Lehman feature extraction

In [None]:
import hashlib
class WeisfeilerLehman:
  """Weisfeiler-Lehman relabelling of one graph.

  Starting from the given node labels, every round replaces each node's label
  with the MD5 hex digest of "<own label>_<sorted neighbour labels>". The
  stringified initial labels followed by the labels produced in every round
  are accumulated, in order, in ``extracted_features``.

  Args:
    graph: object exposing ``nodes()`` and ``neighbors(node)``.
    features: mapping node -> initial label; it must contain every node of
      ``graph`` (a missing node raises ``KeyError`` during relabelling).
    iterations: number of relabelling rounds to run on construction.
  """

  def __init__(self, graph, features, iterations):
    self.graph = graph
    self.nodes = self.graph.nodes()
    self.iterations = iterations
    self.features = features
    # Round 0 of the "document": the initial labels, stringified.
    self.extracted_features = [str(label) for label in features.values()]
    self.do_recursions()

  def do_a_recursion(self):
    """Run one relabelling round and return the new node -> hash mapping."""
    relabelled = {}
    for node in self.nodes:
      # Neighbour labels are sorted as strings so the signature does not
      # depend on the order in which the graph yields neighbours.
      neighbour_labels = sorted(
        str(self.features[neighbour]) for neighbour in self.graph.neighbors(node)
      )
      signature = "_".join([str(self.features[node]), *neighbour_labels])
      relabelled[node] = hashlib.md5(signature.encode()).hexdigest()
    # Build a new list rather than mutating the existing one in place.
    self.extracted_features = [*self.extracted_features, *relabelled.values()]
    return relabelled

  def do_recursions(self):
    """Run ``iterations`` rounds, feeding each round's labels into the next."""
    for _round in range(self.iterations):
      self.features = self.do_a_recursion()

# Dataset reader, feature extraction and CSV export


In [None]:
import json
import networkx as nx
import pandas as pd
def _graph_identifier(path):
  """Return the file name of ``path`` without directories and without a trailing ".json".

  Shared by dataset_reader (which builds the Doc2Vec tag) and save_to_csv
  (which looks the tag up again), so both always agree on the identifier.
  """
  file_name = path.split("/")[-1]
  # str.strip(".json") removes any of the characters '.', 'j', 's', 'o', 'n'
  # from BOTH ends (e.g. "person.json" -> "per", "normal_1.json" -> "rmal_1"),
  # so the literal suffix is cut off instead.
  if file_name.endswith(".json"):
    return file_name[:-len(".json")]
  return file_name


def dataset_reader(path):
  """Read one JSON report and build its process graph and per-process features.

  Args:
    path: path of a JSON file holding a "Processes" list and an "Incidents" list.

  Returns:
    A tuple ``(graph, features, name)``:
      graph: undirected ``nx.Graph`` with one ParentPID-ProcessID edge for every
        entry whose ProcessType is "Child process". Processes that take part
        in no edge are not nodes of the graph.
      features: dict with the keys "CommandLine", "Image", "Incidents" and
        "ProcessType" (in that order), each mapping a process id to a value.
        It is an empty dict when the report lists no processes.
      name: the file name without directory and ".json" suffix.

  Raises:
    KeyError: if an expected key is missing from the report.
  """
  name = _graph_identifier(path)
  # Context manager so the file handle is closed even when parsing fails.
  with open(path, encoding="utf8") as handle:
    data = json.load(handle)

  edges = []
  features = {}
  if len(data["Processes"]) > 0:
    incidents = {}
    for inc in data["Incidents"]:
      if inc["MitreAttacks"] is None:
        continue
      # Every attack id is prefixed with a single space (existing format, kept as is).
      attacks = "".join(" " + attack for attack in inc["MitreAttacks"])
      process_oid = inc["ProcessOID"]
      if process_oid in incidents:
        incidents[process_oid] = incidents[process_oid] + " " + attacks
      else:
        # Previously "" was stored here, which silently dropped the attacks
        # of the first incident seen for each process.
        incidents[process_oid] = attacks

    # Insertion order matters: feature_extractor concatenates the feature
    # groups in this order.
    features = {
      "CommandLine": {},
      "Image": {},
      "Incidents": incidents,
      "ProcessType": {},
    }

    for p in data["Processes"]:
      process_id = p["ProcessID"]
      if p["ProcessType"] == "Child process":
        edges.append([p["ParentPID"], process_id])
      # NOTE(review): incidents are keyed by "ProcessOID" while processes are
      # keyed by "ProcessID"; this lookup assumes both share one id space - confirm.
      if process_id not in incidents:
        incidents[process_id] = ""
      features["CommandLine"][process_id] = p["CommandLine"]
      features["Image"][process_id] = p["Image"]
      features["ProcessType"][process_id] = p["ProcessType"]

  graph = nx.Graph(edges)
  return graph, features, name


def feature_extractor(path, rounds):
  """Turn one JSON report into a gensim TaggedDocument.

  The document's words are the Weisfeiler-Lehman features of every feature
  group returned by dataset_reader, concatenated in the dict's order; its
  single tag is "g_" + <identifier of the file>.

  Args:
    path: path of the JSON report.
    rounds: number of Weisfeiler-Lehman rounds per feature group.
  """
  graph, features, name = dataset_reader(path)
  words = []
  for node_features in features.values():
    words.extend(WeisfeilerLehman(graph, node_features, rounds).extracted_features)
  return TaggedDocument(words=words, tags=["g_" + name])


def save_to_csv(output_path, model, files, dimensions):
  """Write one row per input file: id, embedding components, binary label.

  Args:
    output_path: destination CSV path.
    model: trained model whose ``docvecs["g_" + id]`` yields the vector of a file.
    files: the paths that were fed to feature_extractor.
    dimensions: vector length; determines the x_0 .. x_{dimensions-1} columns.

  The label is 1 when the identifier contains "malware" and 0 otherwise.
  Rows are sorted by id and the DataFrame index is not written.
  """
  rows = []
  for file_path in files:
    # Must be derived exactly like the tag in feature_extractor/dataset_reader.
    identifier = _graph_identifier(file_path)
    label = 1 if "malware" in identifier else 0
    rows.append([identifier] + list(model.docvecs["g_" + identifier]) + [label])
  column_names = ["id"] + ["x_" + str(dim) for dim in range(dimensions)] + ["label"]
  table = pd.DataFrame(rows, columns=column_names).sort_values(["id"])
  table.to_csv(output_path, index=False)

# Main: extract features, train Doc2Vec and export the vectors

In [None]:
from tqdm import tqdm
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from joblib import Parallel, delayed
import glob
from gensim.test.utils import get_tmpfile
import os

# Collect every file below data_folder whose name ends with ".json".
# endswith() is used instead of a substring test so that names such as
# "x.json.bak" or "a.jsonl" are not picked up, and the list is sorted because
# os.walk yields entries in arbitrary order (keeps runs reproducible).
graphs = sorted(
  os.path.join(root, file_name)
  for root, _dirs, file_names in os.walk(data_folder)
  for file_name in file_names
  if file_name.endswith('.json')
)
if not graphs:
  # Fail early with a clear message instead of an obscure Doc2Vec error later.
  raise FileNotFoundError(f"No .json graph files found under {data_folder!r}")

# Create the output folders up front so a long training run is not lost
# because the model or the CSV cannot be written afterwards.
for target in (seve_model_path, output_path):
  directory = os.path.dirname(target)
  if directory:
    os.makedirs(directory, exist_ok=True)

# One TaggedDocument per graph, extracted in parallel.
document_collections = Parallel(n_jobs=workers)(
  delayed(feature_extractor)(g, wl_iterations) for g in tqdm(graphs)
)

# dm=0 selects the distributed bag-of-words (PV-DBOW) training mode.
model = Doc2Vec(document_collections,
                vector_size=dimensions,
                window=2,
                min_count=min_count,
                dm=0,
                sample=down_sampling,
                workers=workers,
                epochs=epochs,
                alpha=learning_rate,
                negative=5)
model.save(seve_model_path)

save_to_csv(output_path, model, graphs, dimensions)