## **Mount Google Drive for access to the dataset**

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the dataset
# paths configured further down all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **Imports**

In [None]:
! pip install transformers datasets
from google.colab import files
from transformers import AutoTokenizer, AutoModel
import torch
import time

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
Installing collected 

## **Load Model And Tokenizer**

In [None]:
# Tokenizer and encoder are loaded from the same checkpoint; naming the
# checkpoint once keeps the two from drifting apart when it is changed.
checkpoint_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModel.from_pretrained(checkpoint_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

## **Preprocessing Functions**

In [None]:
def chunk_html(html, chunk_size):
    """Split ``html`` into consecutive pieces of at most ``chunk_size`` characters.

    Args:
        html: The string to split.
        chunk_size: Maximum number of characters per piece; must be positive.

    Returns:
        A list of substrings that, joined in order, reproduce ``html``. Every
        piece except possibly the last has exactly ``chunk_size`` characters.
        An empty ``html`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive. (A negative step would
            otherwise silently produce an empty list and drop all content.)
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    return [html[start:start + chunk_size] for start in range(0, len(html), chunk_size)]

import os
import csv
# --- CSV files with the labelled state pairs (on the mounted Drive) ---
train_csv_path = '/content/drive/MyDrive/dataset_labeled/train_data.csv'
test_csv_path = '/content/drive/MyDrive/dataset_labeled/test_data.csv'
overview_csv_path = '/content/drive/MyDrive/dataset_labeled/SS.csv'

# --- Root folder of the state files; process_state() joins it with the
# application name and "<state><file_ending>" ---
base_path = '/content/drive/MyDrive/dataset_labeled/GroundTruthModels-SS'

# --- Suffix selecting which stored variant of a state is read.
# Exactly one assignment should be active; the others are kept for switching.
# file_ending = '.html.content_tags'
# file_ending = '.html.tags'
file_ending = '.html.content'

# --- Numeric label -> readable name
# (presumably the values of the HUMAN_CLASSIFICATION column - TODO confirm) ---
human_class = dict(enumerate(('clone', 'near duplicate', 'distinct')))

## **Compute State Embeddings**

In [None]:
# function to produce the embedding of one state in a state-pair
def process_state(state, base_path, application_name, file_ending, tokenizer, model, max_seq_length):
    """Embed one stored state as a single L2-normalised vector.

    The file ``<base_path>/<application_name>/<state><file_ending>`` is read and
    split into character chunks of ``max_seq_length``. Each chunk is tokenized
    and encoded by ``model``; every token vector is L2-normalised, the token
    vectors of all chunks are concatenated and mean-pooled, and the pooled
    vector is L2-normalised again.

    Args:
        state: State name, i.e. the file name without ``file_ending``.
        base_path: Root directory holding one sub-directory per application.
        application_name: Sub-directory of ``base_path`` to read from.
        file_ending: Suffix appended to ``state`` to form the file name.
        tokenizer: Callable that accepts a string plus the keyword arguments
            ``return_tensors``, ``truncation`` and ``max_length`` and returns a
            mapping containing an ``"input_ids"`` tensor.
        model: Callable that accepts ``input_ids=...`` and returns an object
            with a ``last_hidden_state`` tensor of shape
            ``[batch_size, sequence_length, features]``.
        max_seq_length: Chunk size in characters, also used as the maximum
            number of tokens passed to the model per chunk.

    Returns:
        Tensor of shape ``[1, features]`` whose row has unit L2 norm.

    Raises:
        FileNotFoundError: If the state file does not exist.
        ValueError: If the state file is empty (there is nothing to embed).
    """
    # Construct the file path
    path = os.path.join(base_path, application_name, state + file_ending)

    # Read the stored state; the encoding is explicit so the result does not
    # depend on the locale of the machine running the notebook.
    with open(path, 'r', encoding='utf-8') as file:
        html_state = file.read()

    # Chunk the content
    html_state_chunks = chunk_html(html_state, max_seq_length)
    if not html_state_chunks:
        # torch.cat() on an empty list fails with an opaque RuntimeError;
        # report the actual problem instead.
        raise ValueError(f"State file is empty, nothing to embed: {path}")

    chunk_embeddings = []
    for chunk in html_state_chunks:
        # Chunks are cut by *characters* while the model limit is in *tokens*
        # (special tokens included). A dense chunk can therefore tokenize to
        # more than max_seq_length tokens, which would overflow the model's
        # position embeddings; truncation caps it at the limit.
        tokens = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=max_seq_length)
        # tokenized representation of the chunk
        input_ids = tokens["input_ids"]

        # disable gradient calculations
        with torch.no_grad():
            output = model(input_ids=input_ids)

        # output.last_hidden_state has shape [batch_size, sequence_length, features];
        # normalise along the features dimension (p=2: L2 norm) so every token
        # vector has a consistent scale.
        chunk_embeddings.append(
            torch.nn.functional.normalize(output.last_hidden_state, p=2, dim=2)
        )

    # concatenate the token vectors of all chunks along the sequence dimension
    aggregated_output = torch.cat(chunk_embeddings, dim=1)
    # debug output kept from the original cell
    print(aggregated_output)

    # average over the sequence dimension -> shape [batch_size, features]
    aggregated_output = torch.mean(aggregated_output, dim=1)

    # Give the final embedding unit norm. This must run over the *features*
    # dimension (dim=1): with batch_size == 1, normalising over dim=0 divides
    # every component by its own absolute value and collapses the embedding
    # to a vector of +/-1.
    aggregated_output = torch.nn.functional.normalize(aggregated_output, p=2, dim=1)

    return aggregated_output

In [None]:
"""
Whole-representation version: iterate through the dataset (one application at a
time), produce an embedding for both states of every selected pair, calculate
their cosine similarity and print it.
"""

# 1-based number of the first data row to process; earlier rows are skipped
init_count = 1

try:
    # newline='' is what the csv module expects for files handed to csv.reader
    with open(test_csv_path, 'r', newline='') as overview:
        print(f'file_ending: {file_ending}')
        csv_reader = csv.reader(overview)
        header = next(csv_reader)
        appname_index = header.index('appname')
        state1_index = header.index('state1')
        state2_index = header.index('state2')
        human_classification_index = header.index('HUMAN_CLASSIFICATION')

        # Caches the final, normalized embeddings of already computed states.
        # Keyed by (application, state) because state names are not unique
        # across applications (e.g. index.html.content exists in every one).
        cached = {}
        count = 0

        max_seq_length = model.config.max_position_embeddings
        i = init_count

        # Iterate through each row in the CSV, which always contains one pair and the label assigned to it
        for state_pair in csv_reader:
            # skip the rows before init_count
            i -= 1
            if i > 0:
                continue

            # this run is restricted to a single application
            application_name = state_pair[appname_index]
            if application_name != "addressbook":
                print("not application anymore: ", application_name)
                break
            state1 = state_pair[state1_index]
            state2 = state_pair[state2_index]

            # ... and to one specific pair of states
            if state1 != 'state141' or state2 != 'state239':
                continue

            human_classification = state_pair[human_classification_index]

            # Compute an embedding only on a cache miss and never overwrite an
            # existing entry: an unconditional write-back after a cache hit
            # would store a stale tensor left over from an earlier pair.
            key1 = (application_name, state1)
            key2 = (application_name, state2)
            if key1 not in cached:
                cached[key1] = process_state(state1, base_path, application_name, file_ending, tokenizer, model, max_seq_length)
            if key2 not in cached:
                cached[key2] = process_state(state2, base_path, application_name, file_ending, tokenizer, model, max_seq_length)

            aggregated_output_1 = cached[key1]
            aggregated_output_2 = cached[key2]

            similarity_score = torch.cosine_similarity(aggregated_output_1, aggregated_output_2)

            print(f"({init_count + count - 1}), {application_name}, State1:{state1}, State2:{state2}, Human Classification:{human_classification}, cosine similarity: {similarity_score.item()}")
            count += 1

except FileNotFoundError:
    # raised for the CSV itself as well as for a missing state file
    print(f"File not found")
except Exception as e:
    print(f"An error occurred: {str(e)}")

file_ending: .html.content
tensor([[[-0.0129,  0.0060, -0.0014,  ..., -0.0101,  0.0222,  0.0350],
         [ 0.0008,  0.0180, -0.0011,  ...,  0.0302,  0.0191, -0.0401],
         [-0.0679,  0.0265,  0.0095,  ..., -0.0014,  0.0201, -0.0002],
         ...,
         [ 0.0042, -0.0147,  0.0277,  ..., -0.0055,  0.0286, -0.0298],
         [ 0.0054,  0.0042,  0.0362,  ...,  0.0110,  0.0183, -0.0352],
         [ 0.0461,  0.0182, -0.0141,  ...,  0.0076, -0.0447, -0.0130]]])
tensor([[[-0.0190,  0.0124,  0.0268,  ...,  0.0034,  0.0246,  0.0406],
         [ 0.0294,  0.0537,  0.0294,  ...,  0.0202,  0.0129, -0.0335],
         [-0.0705,  0.0310,  0.0142,  ...,  0.0160,  0.0191,  0.0055],
         ...,
         [-0.0115, -0.0139,  0.0201,  ...,  0.0025,  0.0345, -0.0341],
         [ 0.0065, -0.0062,  0.0427,  ...,  0.0456,  0.0181, -0.0262],
         [ 0.0466,  0.0113, -0.0005,  ...,  0.0048, -0.0482, -0.0136]]])
(0), addressbook, State1:state141, State2:state239, Human Classification:1, cosine simila