In [None]:
%pip install -r requirements.txt -q

In [None]:
import numpy as np
import plotly.express as px
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE

In [None]:
# Notebook configuration
# Input file and the maximum number of rows kept per application filter.
num_data_per_app = 30
data_file = "sysop-baseline-uniq.csv"

# Model and run identifiers; the output directory is derived from the run name.
# NOTE(review): run_name mentions "mpnet" while base_model_name is a MiniLM model -- confirm this is intended.
base_model_name = "sentence-transformers/all-MiniLM-L6-v2"
run_name = "mpnet-base-all-nli-triplet"
output_model_dir = "models/" + run_name


In [None]:
# Load the data file with numpy: a single column of text preceded by one header row.
def load_csv_file(file_path, skip_header=1):
    """Read a single-column text file into a 1-D array of strings.

    Parameters
    ----------
    file_path : str or path-like
        File to read; passed straight to ``np.genfromtxt``.
    skip_header : int, optional
        Number of leading lines to drop. Defaults to 1 because the file is
        expected to start with a header row.

    Returns
    -------
    numpy.ndarray
        1-D array of ``str`` values, one per remaining line of the file.
    """
    rows = np.genfromtxt(
        file_path,
        delimiter='\t',  # tab delimiter, so commas and spaces inside a record are not split
        dtype=str,
        comments=None,  # the default '#' would cut every record at its first '#'
        skip_header=skip_header,
    )
    # A file with a single record comes back as a 0-d array; callers slice and
    # boolean-index the result, so always hand back at least one dimension.
    return np.atleast_1d(rows)

# Read the whole data file into the module-level `data` array of strings used by the filtering cell.
data = load_csv_file(data_file)

In [None]:
# Substrings used to pick out the rows that belong to each application.
#
# dismhost.exe found in temp folders; background:
# https://answers.microsoft.com/en-us/windows/forum/all/what-is-dismhostexe-in-temp-folder/7fa5bcf5-bfaf-4952-b05d-cec0a57461e3
#
# Applications to extract:
#   - dismhost.exe (in temp)
#   - google\chrome\application
#   - microsoft\edgewebview\application
#   - microsoft\edgeupdate
#   - rockwell software
#   - national instruments

filters = [
    r'dismhost.exe',
    r'google\chrome\application',
    r'microsoft\edgewebview\application',
    r'microsoft\edgeupdate',
    r'rockwell software',
    r'national instruments',
]


In [None]:
# Group the rows of `data` by application filter.
# Key: the filter substring; value: the first `num_data_per_app` rows containing it.
# The loop variable is not named `filter`, so the builtin `filter` stays usable
# in the rest of the notebook.
filtered_data_map = {}
for app_filter in filters:
    # np.char.find gives -1 where the substring is absent (the match is case-sensitive)
    matching_rows = data[np.char.find(data, app_filter) != -1]
    # keep only the first num_data_per_app matches; fewer are kept if fewer matched
    filtered_data_map[app_filter] = matching_rows[:num_data_per_app]


In [None]:
# Load the pre-trained model named by `base_model_name`; this same `model` object is
# used by the later cells for encoding, evaluation and training.
model = SentenceTransformer(base_model_name)


In [None]:
# Encode the rows of every application filter with the current model.
#   app_embeddings: filter -> result of model.encode for that filter's rows
#   details:        the full text of every row, in the same order (hover labels for the plot)
app_embeddings = {}
details = []
for app, app_rows in filtered_data_map.items():
    app_embeddings[app] = model.encode(app_rows)
    # extend() avoids a `for data in ...` loop, which would overwrite the
    # module-level `data` array and break a re-run of the filtering cell
    details.extend(app_rows)

In [None]:
# Combine the per-application embeddings into a single array, one block of rows per
# filter in the order of `filters`. A single vstack call works for any number of
# filters and avoids re-copying the growing array on every iteration.
embeddings = np.vstack([app_embeddings[app_name] for app_name in filters])

In [None]:
# Reduce the embeddings to 3 components with t-SNE and show them in a 3-D scatter plot
tsne_model = TSNE(n_components=3, random_state=42)  # fixed random_state for a repeatable projection
tsne_embeddings_values = tsne_model.fit_transform(embeddings)

hover_names = details
# One colour label per embedded row. The size of each group is read from the
# embeddings themselves instead of assuming num_data_per_app rows per filter, so a
# filter that matched fewer rows cannot shift labels onto the wrong points.
colors = [
    app_name
    for app_name in filters
    for _ in range(len(app_embeddings[app_name]))
]

fig = px.scatter_3d(
    x = tsne_embeddings_values[:,0],
    y = tsne_embeddings_values[:,1],
    z = tsne_embeddings_values[:,2],
    hover_name=hover_names,
    color = colors,
)

fig.update_traces(marker=dict(size=13))  # Increase the marker size uniformly

fig.update_layout(
    scene=dict(
        # t-SNE axes carry no units, so tick labels and axis titles are hidden
        xaxis=dict(showticklabels=False, title=''),
        yaxis=dict(showticklabels=False, title=''),
        zaxis=dict(showticklabels=False, title=''),
    ),
    #showlegend=False,
    autosize=True,
    #width=600,  # Width of the plot
    #height=600,  # Height of the plot
    margin=dict(l=50, r=50, b=50, t=50, pad=4)  # Margins
)
fig.show()

In [None]:
# Build the triplet data.
# Each triplet is a tuple (anchor, positive, negative): anchor and positive come from
# the same application, the negative comes from a different application.
# For row i of an application the positive is the next row of that application
# (wrapping around), and one triplet is emitted per other application.
triplet_data = []
for app_filter in filters:
    app_data = filtered_data_map[app_filter]
    # Use the real group size: a filter may have matched fewer than num_data_per_app rows,
    # and indexing up to num_data_per_app would then raise IndexError.
    app_size = len(app_data)
    if app_size < 2:
        # a group needs at least two rows to provide a positive distinct from the anchor
        continue
    for i in range(app_size):
        anchor = app_data[i]
        positive = app_data[(i + 1) % app_size]
        for other_filter in filters:
            if other_filter == app_filter:
                continue
            other_data = filtered_data_map[other_filter]
            if len(other_data) == 0:
                continue
            # wrap the index so a shorter negative group is still usable;
            # identical to other_data[i] when the groups have equal size
            negative = other_data[i % len(other_data)]
            triplet_data.append((anchor, positive, negative))

# Shuffle the triplets in place with a fixed seed, so the train/validation/test
# split taken from this list is the same on every run.
np.random.default_rng(42).shuffle(triplet_data)


In [None]:
from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator

In [None]:
# 80% / 10% / 10% split of the (already shuffled) triplets into train, validation and test
# NOTE(review): triplets built from the same anchor can land in different splits -- confirm
# that this overlap is acceptable for the evaluation.
triplet_count = len(triplet_data)
train_end = int(triplet_count * 0.8)
validation_end = int(triplet_count * 0.9)

train_data = triplet_data[:train_end]
validation_data = triplet_data[train_end:validation_end]
test_data = triplet_data[validation_end:]


def _triplets_to_dataset(triplets):
    """Convert a list of (anchor, positive, negative) tuples into a column-wise Dataset."""
    column_names = ("anchor", "positive", "negative")
    return Dataset.from_dict({
        column: [triplet[position] for triplet in triplets]
        for position, column in enumerate(column_names)
    })


# one Dataset per split, each with the columns "anchor", "positive" and "negative"
train_dataset = _triplets_to_dataset(train_data)
validation_dataset = _triplets_to_dataset(validation_data)
test_dataset = _triplets_to_dataset(test_data)

In [None]:
# Define the loss function: a MultipleNegativesRankingLoss constructed on the model
# that the trainer cell fine-tunes.
loss = MultipleNegativesRankingLoss(model)


In [None]:
# (Optional) Training configuration for the fine-tuning run
args = SentenceTransformerTrainingArguments(
    # --- Required ---
    output_dir=output_model_dir,

    # --- Run identification ---
    run_name=run_name,  # Used in W&B if `wandb` is installed

    # --- Schedule ---
    num_train_epochs=1,
    warmup_ratio=0.1,

    # --- Batching ---
    # Batch sizes are per device: with per_device_train_batch_size 8 on 2 GPUs, each GPU
    # processes 8 samples per batch, for a total batch size of 16 across all devices.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicates

    # --- Device and precision ---
    use_mps_device=True,
    fp16=False,  # FP16 is switched off
    bf16=True,  # BF16 is switched on; set to False if the device does not support BF16

    # --- Evaluation (tracking/debugging) ---
    eval_strategy="steps",
    eval_steps=100,

    # --- Checkpointing ---
    # save_strategy options:
    #   "no":    no save is done during training
    #   "epoch": save is done at the end of each epoch
    #   "steps": save is done every save_steps
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,

    # --- Logging ---
    logging_steps=100,
)

In [None]:
# (Optional) Build a triplet evaluator over the validation split and score the
# model with it before training starts (base-model reference point)
val_evaluator = TripletEvaluator(
    name="validation",
    anchors=validation_dataset["anchor"],
    positives=validation_dataset["positive"],
    negatives=validation_dataset["negative"],
)
val_evaluator(model)


In [None]:
# Assemble the trainer from the model, loss, training arguments, datasets and the
# validation evaluator, then run training
trainer = SentenceTransformerTrainer(
    model=model,
    loss=loss,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    evaluator=val_evaluator,
)
trainer.train()


In [None]:
# (Optional) After training completes, score the model on the held-out test triplets
test_evaluator = TripletEvaluator(
    name="test",
    anchors=test_dataset["anchor"],
    positives=test_dataset["positive"],
    negatives=test_dataset["negative"],
)
test_evaluator(model)


In [None]:
# Encode the rows of every application filter again with the current state of `model`
# (this cell comes after the training cell).
#   app_embeddings: filter -> result of model.encode for that filter's rows
#   details:        the full text of every row, in the same order (hover labels for the plot)
app_embeddings = {}
details = []
for app, app_rows in filtered_data_map.items():
    app_embeddings[app] = model.encode(app_rows)
    # extend() avoids a `for data in ...` loop, which would overwrite the
    # module-level `data` array and break a re-run of the filtering cell
    details.extend(app_rows)

In [None]:
# Combine the per-application embeddings into a single array, one block of rows per
# filter in the order of `filters`. A single vstack call works for any number of
# filters and avoids re-copying the growing array on every iteration.
embeddings = np.vstack([app_embeddings[app_name] for app_name in filters])

In [None]:
# Reduce the embeddings to 3 components with t-SNE and show them in a 3-D scatter plot
tsne_model = TSNE(n_components=3, random_state=42)  # fixed random_state for a repeatable projection
tsne_embeddings_values = tsne_model.fit_transform(embeddings)

hover_names = details
# One colour label per embedded row. The size of each group is read from the
# embeddings themselves instead of assuming num_data_per_app rows per filter, so a
# filter that matched fewer rows cannot shift labels onto the wrong points.
colors = [
    app_name
    for app_name in filters
    for _ in range(len(app_embeddings[app_name]))
]

fig = px.scatter_3d(
    x = tsne_embeddings_values[:,0],
    y = tsne_embeddings_values[:,1],
    z = tsne_embeddings_values[:,2],
    hover_name=hover_names,
    color = colors,
)

fig.update_traces(marker=dict(size=13))  # Increase the marker size uniformly

fig.update_layout(
    scene=dict(
        # t-SNE axes carry no units, so tick labels and axis titles are hidden
        xaxis=dict(showticklabels=False, title=''),
        yaxis=dict(showticklabels=False, title=''),
        zaxis=dict(showticklabels=False, title=''),
    ),
    #showlegend=False,
    autosize=True,
    #width=600,  # Width of the plot
    #height=600,  # Height of the plot
    margin=dict(l=50, r=50, b=50, t=50, pad=4)  # Margins
)
fig.show()