In [None]:
%pip install -r requirements.txt -q

In [2]:
import json
import numpy as np
import plotly.express as px
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
from pprint import pprint

In [67]:
# load the csv files with numpy
baseline_file = 'assets/d9eaf13e-6aed-11ed-af76-005056b3b165-script-baseline-only-enabled.csv'
baseline_data = np.loadtxt(
    baseline_file, delimiter=',', dtype=str,
    #max_rows=3,
    #skiprows=1,
    usecols=(2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
    )

pprint(baseline_data)
pprint(baseline_data.shape)


array([['process', 'argument', 'parent1', ..., 'parent3argument',
        'parent4', 'parent4argument'],
       ['C:\\Windows\\System32\\WindowsPowerShell\\v1.0\\powershell.exe',
        "-ExecutionPolicy Restricted -Command Write-Host 'Final result: 1';",
        'C:\\Windows\\System32\\CompatTelRunner.exe', ..., '', '', ''],
       ['C:\\Windows\\System32\\WindowsPowerShell\\v1.0\\powershell.exe',
        "-ExecutionPolicy Restricted -Command Write-Host 'Final result: 1';",
        'C:\\Windows\\System32\\CompatTelRunner.exe', ..., '', '', ''],
       ...,
       ['C:\\WINDOWS\\system32\\cscript.exe',
        '/Nologo D:\\TPGMS\\AIDAR1\\SupportFiles\\CodeSource\\DLT_VIEWER\\utilities\\DLTExecute.vbs D:\\TPGMS\\AIDAR1\\SupportFiles\\CodeSource\\DLT_VIEWER\\dlt_viewer.exe D:\\TPGMS\\AIDAR1\\SupportFiles\\CodeSource\\DLT_VIEWER\\projects\\AIDAR1_H1.dlp D:\\RESULTS\\DLT\\1BUH3A10BGAD_H1_20231019_15133.dlt',
        'C:\\Program Files\\National Instruments\\TestStand 2017\\Bin\\SeqEdit.ex

In [69]:
alert_file = 'assets/d9eaf13e-6aed-11ed-af76-005056b3b165-script-alerts.csv'
alert_data = np.loadtxt(
    alert_file, delimiter=',', dtype=str,
    max_rows=10,
    #skiprows=1,
    usecols=(1, 3, 4, 5)
    )

# # switch the columns vendorname and productname
# alert_data[:, 1], alert_data[:, 2] = alert_data[:, 2], alert_data[:, 1].copy()

# # rename file to path, productname to appname, vendorname to vendor
# alert_data[:, 0] = np.char.replace(alert_data[:, 0], 'file', 'path')
# alert_data[:, 1] = np.char.replace(alert_data[:, 1], 'productname', 'appname')
# alert_data[:, 2] = np.char.replace(alert_data[:, 2], 'vendorname', 'vendor')

pprint(alert_data)
pprint(alert_data.shape)

array([['file', 'parent1', 'parent2', 'parent3'],
       ['C:\\Windows\\SysWOW64\\WindowsPowerShell\\v1.0\\powershell.exe',
        'C:\\Windows\\System32\\cmd.exe',
        'C:\\Program Files\\OCS Inventory Agent\\OCSInventory.exe',
        'C:\\Windows\\System32\\cmd.exe'],
       ['C:\\Windows\\SysWOW64\\WindowsPowerShell\\v1.0\\powershell.exe',
        'C:\\Windows\\System32\\cmd.exe',
        'C:\\Program Files\\OCS Inventory Agent\\OCSInventory.exe',
        '<NA>'],
       ['C:\\Windows\\SysWOW64\\WindowsPowerShell\\v1.0\\powershell.exe',
        'C:\\Windows\\System32\\cmd.exe',
        'C:\\Program Files\\OCS Inventory Agent\\OCSInventory.exe',
        '<NA>'],
       ['C:\\Windows\\SysWOW64\\WindowsPowerShell\\v1.0\\powershell.exe',
        'C:\\Windows\\System32\\cmd.exe',
        'C:\\Program Files\\OCS Inventory Agent\\OCSInventory.exe',
        '<NA>'],
       ['C:\\Windows\\SysWOW64\\WindowsPowerShell\\v1.0\\powershell.exe',
        'C:\\Windows\\System32\\cmd.exe',
    

In [None]:
# normalize the data by lowercasing and removing punctuation
def normalize_data(data):
    return np.array([[
        x[0].lower().replace('.', '').replace(',', ''),
        x[1].lower().replace('.', '').replace(',', ''),
        x[2].lower().replace('.', '').replace(',', '')
    ] for x in data])

baseline_data = normalize_data(baseline_data)
pprint(baseline_data)

alert_data = normalize_data(alert_data)
pprint(alert_data)

In [None]:
# concatenate 3 columns into 1 column
flat_baseline_data = np.array([' '.join(x) for x in baseline_data])
flat_alert_data = np.array([' '.join(x) for x in alert_data])

# take only the first column
# flat_baseline_data = baseline_data[:, 0]
# flat_alert_data = alert_data[:, 0]

pprint(flat_baseline_data.shape)
pprint(flat_alert_data.shape)

# remove duplicates
flat_baseline_data = np.unique(flat_baseline_data)
flat_alert_data = np.unique(flat_alert_data)

pprint(flat_baseline_data.shape)
pprint(flat_alert_data.shape)

In [None]:
# Load the pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')


In [None]:
baseline_embeddings = model.encode(flat_baseline_data)
alert_embeddings = model.encode(flat_alert_data)

pprint(baseline_embeddings.shape)
pprint(alert_embeddings.shape)

# combine embeddings in single array
embeddings = np.concatenate((baseline_embeddings, alert_embeddings))


In [None]:
# Perform TSNE to reduce to 3 components
tsne_model = TSNE(n_components=3, random_state=42)
tsne_embeddings_values = tsne_model.fit_transform(embeddings)

hover_names = []
colors = []

# baseline data
for baseline in flat_baseline_data:
    hover_names.append(baseline)
    colors.append('baseline')
# alert data
for alert in flat_alert_data:
    hover_names.append(alert)
    colors.append('alert')

fig = px.scatter_3d(
    x = tsne_embeddings_values[:,0],
    y = tsne_embeddings_values[:,1],
    z = tsne_embeddings_values[:,2],
    hover_name=hover_names,
    color = colors,
)

fig.update_traces(marker=dict(size=3))  # Increase the marker size uniformly

fig.update_layout(
    scene=dict(
        xaxis=dict(showticklabels=False, title=''),
        yaxis=dict(showticklabels=False, title=''),
        zaxis=dict(showticklabels=False, title=''),
    ),
    #showlegend=False,
    autosize=True,
    #width=600,  # Width of the plot
    #height=600,  # Height of the plot
    margin=dict(l=50, r=50, b=50, t=50, pad=4)  # Margins
)
fig.show()