In [None]:
! pip install transformers datasets evaluate
! pip install -U accelerate
! pip install -U transformers

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# State-Pair classification

In [3]:
from google.colab import userdata
from huggingface_hub import HfFolder

# Retrieve the access token from Colab's userdata
hf_access_token = userdata.get('hf_accesstoken')

if hf_access_token is not None:
    # Set the token in the Hugging Face Folder (this authenticates you)
    HfFolder.save_token(hf_access_token)
else:
    print("No Hugging Face access token available.")


## Initalize

In [None]:
import os
import csv
import json
import pandas as pd

base_path = '/content/drive/MyDrive/dataset_labeled/GroundTruthModels-SS'
app_num = 8

file_ending = '.html.content_tags'
# file_ending = '.html.content'
# file_ending = '.html.tags'

feature = file_ending.split('.')[-1]

apps = ['addressbook', 'claroline', 'ppma', 'mrbs', 'mantisbt', 'dimeshift', 'pagekit', 'phoenix', 'petclinic'] # order from WebEmbed Paper Table 5 (Results)
cur_test_app = apps[app_num] # the respective application that is not seen by the model during training in the current training process
print(f"Test App: {cur_test_app}")

# Dictionary containing the number of occurrences for each class (0 and 1) in each app's dataset, with [n0, n1], n0: distinct/n1: clone/Near-duplicate
absolute_labels = {
    'addressbook': [6142, 2373],
    'claroline': [14988, 2778],
    'ppma': [4320, 531],
    'mrbs': [7254, 4071],
    'mantisbt': [10206, 1119],
    'dimeshift': [10683, 945],
    'pagekit': [5782, 3948],
    'phoenix': [6569, 4606],
    'petclinic': [9411, 1615]
}

## Load dataset

In [None]:
from datasets import load_dataset, Features, ClassLabel, Value, Dataset, DatasetDict

print(f"Loading Dataset for {cur_test_app}, with for feature: {file_ending.split('.')[-1]}")

data_files = {"train": f"/content/drive/MyDrive/dataset_labeled/train_test_sets/{feature}_archive/train_{cur_test_app}.csv", "test": f"/content/drive/MyDrive/dataset_labeled/train_test_sets/{feature}_archive/test_{cur_test_app}.csv"}
dataset = load_dataset("lgk03/97k-state-pairs", data_files=data_files)

# transforming into binary classification
def map_labels(example):
    if example['HUMAN_CLASSIFICATION'] == 2:
        example['HUMAN_CLASSIFICATION'] = 0
    else:
        example['HUMAN_CLASSIFICATION'] = 1
    return example

dataset = dataset.map(map_labels)


## Preprocess

load a DistilBERT tokenizer

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Preprocess:

In [7]:
import torch

def preprocess(examples):
  tokenized_inputs = tokenizer(examples['trimmed_state1'], examples['trimmed_state2'],
                                padding='max_length',
                                truncation='longest_first',
                                max_length=512,
                                return_tensors='pt') # Return PyTorch tensors

  return {'input_ids': tokenized_inputs['input_ids'],
          'attention_mask': tokenized_inputs['attention_mask'],
          'labels': torch.tensor(examples['HUMAN_CLASSIFICATION'])
          }

In [None]:
print(f"Tokenizing Data for {cur_test_app} with html representation: {file_ending.split('.')[-1]}")
tokenized_data = {split: ds.map(preprocess, batched=True, cache_file_name=None, batch_size=32) for split, ds in dataset.items()}

Create a batch of examples using [DataCollatorWithPadding](https://huggingface.co/docs/transformers/main/en/main_classes/data_collator#transformers.DataCollatorWithPadding). It's more efficient to *dynamically pad* the samples to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

In [9]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Evaluate

Load F_1-Score as metric

In [None]:
import evaluate

f1 = evaluate.load("f1")

In [11]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_score(labels, predictions)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')

    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

## Train

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# num labels 2 => binary classification
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2)

In [13]:
import torch
import torch.nn.functional as F
from transformers import Trainer

# arr: [n0, n1], (refer to dictionary above)
def calculate_class_weights(arr):
    n0 = arr[0]
    n1 = arr[1]

    N = n0 + n1
    w0 = N / n0
    w1 = N / n1
    w_min = min(w0, w1)

    return torch.tensor([w0 / w_min, w1 / w_min])

class FocalLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        class_weights = calculate_class_weights(absolute_labels[cur_test_app]).to(logits.device)  # Adjusted based on class distribution

        gamma = 2.0
        ce_loss = F.cross_entropy(logits, labels, weight=class_weights, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = ((1 - pt) ** gamma * ce_loss).mean()

        return (focal_loss, outputs) if return_outputs else focal_loss


print(f"Training model: ACROSSAPPS_NDD-{cur_test_app}_test-{file_ending.split('.')[-1]}")

training_args = TrainingArguments(
    output_dir=f"ACROSSAPPS_NDD-{cur_test_app}_test-{file_ending.split('.')[-1]}",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = FocalLossTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# BERT Base:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_data["train"],
#     eval_dataset=tokenized_data["test"],
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics,
# )

trainer.train()

Training model: ACROSSAPPS_NDD-petclinic_test-content_tags


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,0.2103,0.354109,0.851442,0.865685,0.898421,0.851442
1,0.1574,0.308872,0.932523,0.934023,0.936368,0.932523


TrainOutput(global_step=1348, training_loss=0.17481904072294602, metrics={'train_runtime': 4572.2974, 'train_samples_per_second': 37.756, 'train_steps_per_second': 0.295, 'total_flos': 2.285367301832909e+16, 'train_loss': 0.17481904072294602, 'epoch': 1.9985174203113418})

In [14]:
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/lgk03/ACROSSAPPS_NDD-petclinic_test-content_tags/commit/0098ea9cb279df5334971f8e2fc2fea8d794f6ba', commit_message='End of training', commit_description='', oid='0098ea9cb279df5334971f8e2fc2fea8d794f6ba', pr_url=None, pr_revision=None, pr_num=None)