In [None]:
! pip install transformers datasets evaluate
! pip install -U accelerate
! pip install -U transformers

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# State-Pair classification

In [3]:
from google.colab import userdata
from huggingface_hub import HfFolder

# Retrieve the access token from Colab's userdata
hf_access_token = userdata.get('hf_accesstoken')

if hf_access_token is not None:
    # Set the token in the Hugging Face Folder (this authenticates you)
    HfFolder.save_token(hf_access_token)
else:
    print("No Hugging Face access token available.")


## Initalize

In [28]:
import os
import csv
import json
import pandas as pd

base_path = '/content/drive/MyDrive/dataset_labeled/GroundTruthModels-SS'
app_num = 8

file_ending = '.html.tags'

apps = ['addressbook', 'claroline', 'ppma', 'mrbs', 'mantisbt', 'dimeshift', 'pagekit', 'phoenix', 'petclinic'] # order from WebEmbed Paper Table 5 (Results)
cur_test_app = apps[app_num]


# Dictionary containing the number of occurrences for each class (0 and 1) in each app's dataset, with [n0, n1], n0: distinct/n1: clone/Near-duplicate
absolute_labels = {
    'addressbook': [6142, 2373],
    'claroline': [14988, 2778],
    'ppma': [4320, 531],
    'mrbs': [7254, 4071],
    'mantisbt': [10206, 1119],
    'dimeshift': [10683, 945],
    'pagekit': [5782, 3948],
    'phoenix': [6569, 4606],
    'petclinic': [9411, 1615]
}


## Load and Prepare Dataset

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict, Features, ClassLabel, Value
from sklearn.model_selection import train_test_split

# Load the CSV file into a pandas DataFrame
file_path = f"/content/drive/MyDrive/dataset_labeled/trimmed_apps/{file_ending.split('.')[-1]}_archive/{cur_test_app}_trimmed.csv"
df = pd.read_csv(file_path)

# Split the DataFrame into training and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert the DataFrames into Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Create a DatasetDict
dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

# Define the feature types
features = Features({
    'HUMAN_CLASSIFICATION': ClassLabel(names=['class_0', 'class_1']),
    'trimmed_state1': Value('string'),
    'trimmed_state2': Value('string'),
})

# Transforming into binary classification
def map_labels(example):
    if example['HUMAN_CLASSIFICATION'] == 2:
        example['HUMAN_CLASSIFICATION'] = 0
    else:
        example['HUMAN_CLASSIFICATION'] = 1
    return example

# Apply the transformation
dataset = dataset.map(map_labels)

print(f"Dataset loaded and split into training and test sets with 80/20 split for {cur_test_app}{file_ending}")

## Preprocess

load a DistilBERT tokenizer

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Preprocess:

In [31]:
import torch
def preprocess(examples):
    tokenized_inputs = tokenizer(examples['trimmed_state1'], examples['trimmed_state2'],
                                 padding='max_length',
                                 truncation='longest_first',
                                 max_length=512,
                                 return_tensors='pt') # Return PyTorch tensors

    return {'input_ids': tokenized_inputs['input_ids'],
            'attention_mask': tokenized_inputs['attention_mask'],
            'labels': torch.tensor(examples['HUMAN_CLASSIFICATION'])
            }

In [None]:
print(f"Tokenizing Data for {cur_test_app} with html representation: {file_ending.split('.')[-1]}")
tokenized_data = {split: ds.map(preprocess, batched=True, cache_file_name=None, batch_size=32) for split, ds in dataset.items()}

In [None]:
tokenized_data

Create a batch of examples using [DataCollatorWithPadding](https://huggingface.co/docs/transformers/main/en/main_classes/data_collator#transformers.DataCollatorWithPadding). It's more efficient to *dynamically pad* the samples to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

In [34]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Evaluate

Load F_1-Score as metric

In [35]:
import evaluate

f1 = evaluate.load("f1")

In [36]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_score(labels, predictions)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')

    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

## Train

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# num labels 2 => binary classification
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2)

In [38]:
import torch
import torch.nn.functional as F
from transformers import Trainer

# arr: [n0, n1], (refer to dictionary above)
def calculate_class_weights(arr):
    n0 = arr[0]
    n1 = arr[1]

    N = n0 + n1
    w0 = N / n0
    w1 = N / n1
    w_min = min(w0, w1)

    return torch.tensor([w0 / w_min, w1 / w_min])

class FocalLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        class_weights = calculate_class_weights(absolute_labels[cur_test_app]).to(logits.device)  # Adjusted based on class distribution

        gamma = 2.0
        ce_loss = F.cross_entropy(logits, labels, weight=class_weights, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = ((1 - pt) ** gamma * ce_loss).mean()

        return (focal_loss, outputs) if return_outputs else focal_loss


print(f"Training model: WITHINAPPS_NDD-{cur_test_app}_test-{file_ending.split('.')[-1]}-CWAdj")

training_args = TrainingArguments(
    output_dir=f"WITHINAPPS_NDD-{cur_test_app}_test-{file_ending.split('.')[-1]}-CWAdj",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = FocalLossTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# BERT Base
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_data["train"],
#     eval_dataset=tokenized_data["test"],
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics,
# )

trainer.train()

Training model: WITHINAPPS_NDD-petclinic_test-tags-CWAdj


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.226465,0.851768,0.870958,0.929076,0.851768
2,No log,0.20311,0.864914,0.8816,0.932231,0.864914
3,No log,0.1772,0.87942,0.893416,0.936088,0.87942
4,No log,0.161476,0.909338,0.918161,0.945603,0.909338
5,No log,0.151044,0.909338,0.918161,0.945603,0.909338


TrainOutput(global_step=345, training_loss=0.20627490057461503, metrics={'train_runtime': 1228.249, 'train_samples_per_second': 35.905, 'train_steps_per_second': 0.281, 'total_flos': 5841812280729600.0, 'train_loss': 0.20627490057461503, 'epoch': 5.0})

In [39]:
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/lgk03/WITHINAPPS_NDD-petclinic_test-tags-CWAdj/commit/6d1094b060ecbfce421652bd80083a0c8792fe62', commit_message='End of training', commit_description='', oid='6d1094b060ecbfce421652bd80083a0c8792fe62', pr_url=None, pr_revision=None, pr_num=None)