In [None]:
# Install the packages listed in requirements.txt via the %pip magic; -q keeps pip's output quiet.
%pip install -r requirements.txt -q

In [None]:
import json
import numpy as np
import plotly.express as px
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
import chromadb
from pprint import pprint

In [None]:
# Load the baseline export with numpy, skipping the header row and keeping only
# columns 1, 4 and 5 as strings.
# NOTE(review): np.loadtxt treats '#' as a comment marker by default and only honours
# quoted fields when `quotechar` is passed - confirm the export contains neither.
baseline_file = 'assets/552466d6-7728-11ee-b936-00505696604e-sysop-baseline-createprocess-only-enabled.csv'
baseline_data = np.loadtxt(
    baseline_file, delimiter=',', dtype=str,
    # max_rows=10,  # uncomment to load only a small sample
    skiprows=1,
    usecols=(1, 4, 5),
    # Keep a 2-D (rows, 3) array even when the file holds a single data row;
    # without this loadtxt squeezes the result to 1-D and the row-wise
    # processing in later cells would iterate over individual strings.
    ndmin=2,
    )

pprint(baseline_data.shape)


In [None]:
# Load the alert export, skipping the header row and keeping only columns 3, 7 and 8
# as strings.
# NOTE(review): np.loadtxt treats '#' as a comment marker by default and only honours
# quoted fields when `quotechar` is passed - confirm the export contains neither.
alert_file = 'assets/218b4360-d459-11ee-8308-0050569a682f-sysop-alerts-createprocess.csv'
alert_data = np.loadtxt(
    alert_file, delimiter=',', dtype=str,
    # max_rows=10,  # uncomment to load only a small sample
    skiprows=1,
    usecols=(3, 7, 8),
    # Keep a 2-D (rows, 3) array even when the file holds a single data row;
    # without this loadtxt squeezes the result to 1-D and the column indexing
    # below raises an IndexError.
    ndmin=2,
    )

# Swap the last two columns (vendorname <-> productname) so the vendor ends up in
# column 2. Presumably this matches the baseline layout (path, appname, vendor) -
# verify against the CSV headers. Fancy indexing on the right-hand side produces
# a copy, so the swap does not read half-overwritten data.
alert_data[:, [1, 2]] = alert_data[:, [2, 1]]

# No header renaming happens here: the header row is dropped by skiprows=1, so the
# array only contains data values. The substring replacements that used to follow
# ('file' -> 'path', 'productname' -> 'appname', 'vendorname' -> 'vendor') therefore
# never touched a header and instead rewrote matching text inside the values
# themselves (e.g. 'profile' became 'propath'), making alert rows differ from
# otherwise identical baseline rows.

pprint(alert_data.shape)

In [None]:
def normalize_data(data):
    """Lowercase the first three fields of every row and strip '.' and ','.

    Args:
        data: iterable of rows; each row must be indexable at positions 0, 1
            and 2, and those entries must be strings.

    Returns:
        A numpy array built from the cleaned three-field rows (an empty array
        when ``data`` has no rows).
    """
    def _clean(text):
        # Only periods and commas are removed; other punctuation is kept.
        return text.lower().replace('.', '').replace(',', '')

    cleaned_rows = []
    for row in data:
        cleaned_rows.append([_clean(row[0]), _clean(row[1]), _clean(row[2])])
    return np.array(cleaned_rows)

def remove_well_known_vendor(data):
    """Drop every row whose third field (index 2) is a well-known vendor name.

    The comparison is an exact, case-sensitive string match against the names
    listed below.

    Args:
        data: iterable of rows; each row must be indexable at position 2.

    Returns:
        A numpy array containing only the rows that were kept, in their
        original order (an empty array when nothing is left).
    """
    # Entries ending in '.' can only match rows that still contain periods,
    # i.e. rows that have not been passed through normalize_data first.
    well_known_vendors = (
        'microsoft',
        'microsoft corp',
        'microsoft corp.',
        'microsoft corporation',
        'apple',
        'google llc',
        'google llc.',
        'intel® corporation',
        'intel corporation',
    )

    kept_rows = []
    for row in data:
        if row[2] in well_known_vendors:
            continue
        kept_rows.append(row)
    return np.array(kept_rows)

# Normalize first, then filter: the vendor filter compares lowercase names.
baseline_data = remove_well_known_vendor(normalize_data(baseline_data))
pprint(baseline_data)

# Same two steps for the alert rows.
alert_data = remove_well_known_vendor(normalize_data(alert_data))
pprint(alert_data)

In [None]:
# Collapse every row into one string by joining its columns with single spaces.
flat_baseline_data = np.array(list(map(' '.join, baseline_data)))
flat_alert_data = np.array(list(map(' '.join, alert_data)))

# Alternative kept for experimentation: use only the first column instead.
# flat_baseline_data = baseline_data[:, 0]
# flat_alert_data = alert_data[:, 0]

# Sizes before de-duplication.
pprint(flat_baseline_data.shape)
pprint(flat_alert_data.shape)

# np.unique drops repeated strings and returns the remaining ones sorted.
flat_baseline_data = np.unique(flat_baseline_data)
flat_alert_data = np.unique(flat_alert_data)

# Sizes after de-duplication.
pprint(flat_baseline_data.shape)
pprint(flat_alert_data.shape)

In [None]:
# Name of the pre-trained sentence-embedding model, kept in one place so it is easy to swap.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)


In [None]:
# Encode each set of flattened strings with the sentence-transformer model.
baseline_embeddings = model.encode(flat_baseline_data)
alert_embeddings = model.encode(flat_alert_data)

# Show the size of each embedding array.
pprint(baseline_embeddings.shape)
pprint(alert_embeddings.shape)

# Stack both sets into one array, baseline rows first and alert rows after them;
# the plotting cell builds its labels in that same order.
embeddings = np.concatenate([baseline_embeddings, alert_embeddings], axis=0)


In [None]:
# Reduce the combined embeddings to 3 components with t-SNE; random_state pins the seed.
tsne_model = TSNE(n_components=3, random_state=42)
tsne_embeddings_values = tsne_model.fit_transform(embeddings)

# One hover label and one colour group per point, baseline entries first and
# alert entries after them - the same order in which `embeddings` was assembled.
hover_names = [*flat_baseline_data, *flat_alert_data]
colors = ['baseline'] * len(flat_baseline_data) + ['alert'] * len(flat_alert_data)

# 3-D scatter of the t-SNE coordinates, coloured by group, with the source text on hover.
fig = px.scatter_3d(
    x=tsne_embeddings_values[:, 0],
    y=tsne_embeddings_values[:, 1],
    z=tsne_embeddings_values[:, 2],
    color=colors,
    hover_name=hover_names,
)

# Use one fixed marker size (3) for every point.
fig.update_traces(marker={'size': 3})

fig.update_layout(
    # Hide tick labels and axis titles on all three axes.
    scene={
        axis: {'showticklabels': False, 'title': ''}
        for axis in ('xaxis', 'yaxis', 'zaxis')
    },
    autosize=True,
    margin={'l': 50, 'r': 50, 'b': 50, 't': 50, 'pad': 4},  # Margins
    # Optional overrides, currently disabled: showlegend=False, width=600, height=600
)
fig.show()