In [1]:
import torch
import pandas as pd
from ast import literal_eval
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer

In [2]:
# Data-source switch read by the loading cell: True fetches the splits with
# `load_dataset`, False reads the pickle files under ../data.
LOAD_FROM_HUB = False

In [3]:
# Fetch the train/test splits from whichever source LOAD_FROM_HUB selects.

if not LOAD_FROM_HUB:
    # Local pickles. Unpickling executes arbitrary code, so only load files
    # that come from a trusted source.
    # NOTE(review): this branch presumably yields pandas objects while the hub
    # branch yields `datasets` objects -- confirm downstream cells cope with both.
    train = pd.read_pickle("../data/train.pkl")
    test = pd.read_pickle("../data/test.pkl")
else:
    train = load_dataset("lguenth/backsum", split="train")
    test = load_dataset("lguenth/backsum", split="test")

In [None]:
# Calculate baseline from generic Sentence Transformer

from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import torch

# Baseline inputs: the split to score and the off-the-shelf sentence encoder.
data = train
model = SentenceTransformer('all-MiniLM-L6-v2')

# Module-level accumulators that `evaluate` appends to.
matches_without_cutoff, matches_with_cutoff = list(), list()
scores = list()

def evaluate(row, threshold=0.85):
    """Score one row with the generic sentence encoder and record the results.

    Encodes ``row.source_text`` and ``row.target_doc`` with the module-level
    ``model``, computes their cosine similarities and appends to the
    module-level accumulators:

    * ``scores``: accuracy of the thresholded per-sentence predictions against
      ``row.labels``.
    * ``matches_with_cutoff``: a ``1`` when the thresholded predictions equal
      ``row.labels`` exactly.
    * ``matches_without_cutoff``: a ``1`` when the single most similar entry of
      ``row.target_doc`` equals ``row.target_text``.

    Args:
        row: Object exposing ``source_text``, ``target_text``, ``target_doc``
            and ``labels`` as attributes.
            NOTE(review): assumes ``target_doc`` is a sequence of sentences and
            ``labels`` holds one 0/1 entry per sentence -- confirm.
        threshold: Minimum cosine similarity for an entry to be predicted as a
            match (label ``1``).

    Returns:
        None. All results are appended to the module-level lists.
    """
    corpus = row.target_doc
    source = row.source_text
    target = row.target_text

    source_embedding = model.encode(source, convert_to_tensor=True)
    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
    similarity_scores = util.cos_sim(source_embedding, corpus_embeddings)[0]

    # Threshold the whole tensor at once instead of looping over 0-d tensors.
    predicted_labels = (similarity_scores >= threshold).int().tolist()

    # Plain Python int, so indexing `corpus` does not depend on how the
    # container handles a one-element tensor as an index.
    best_index = torch.topk(similarity_scores, k=1).indices.item()

    # Plain list, so the exact-match comparison below always yields a single
    # bool even when the labels are stored as an array.
    true_labels = list(row.labels)

    accuracy = accuracy_score(true_labels, predicted_labels)
    # f1 = f1_score(row.labels, predicted_labels)
    # precision = precision_score(row.labels, predicted_labels, average="binary")
    # recall = recall_score(row.labels, predicted_labels)

    scores.append({
        "accuracy": accuracy,
        # "f1": f1,
        # "precision": precision,
        # "recall": recall
    })

    if true_labels == predicted_labels:
        matches_with_cutoff.append(1)
    if target == corpus[best_index]:
        matches_without_cutoff.append(1)

n_rows = len(data)
if n_rows == 0:
    raise ValueError("Cannot compute baseline metrics on an empty dataset")

# Score every row first: `evaluate` only fills the accumulators when it is
# called, and the figures below are derived purely from those accumulators.
# NOTE(review): assumes `data` is a pandas DataFrame (rows are read through
# attribute access) -- confirm when loading from the hub.
for row in data.itertuples(index=False):
    evaluate(row)

accuracy_with_cutoff = sum(entry["accuracy"] for entry in scores) / n_rows
# f1_with_cutoff = sum([i["f1"] for i in scores])/len(data)
# recall_with_cutoff = sum([i["recall"] for i in scores])/len(data)
# precision_with_cutoff = sum([i["precision"] for i in scores])/len(data)

accuracy_without_cutoff = sum(matches_without_cutoff) / n_rows

print("Number of matches with similarity threshold: ", sum(matches_with_cutoff))
print("Number of matches without threshold: ", sum(matches_without_cutoff))

print("\n========\n")

print("Average accuracy with similarity threshold: ", accuracy_with_cutoff)
print("Accuracy without threshold: ", accuracy_without_cutoff)

# print("\n========\n")

# print("Average F1 with threshold: ", f1_with_cutoff)
# print("Average recall with threshold: ", recall_with_cutoff)
# print("Average precision with threshold: ", precision_with_cutoff)

In [None]:
# Working example

from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
import numpy as np
import evaluate

def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the module-level tokenizer.

    The tokenizer is called with ``padding="max_length"`` and truncation
    enabled; its result is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into the module-level metric's result.

    The predicted class is the arg-max over the last axis of the logits; the
    predictions and reference labels are passed to ``metric.compute``.
    """
    raw_logits, references = eval_pred
    predicted_classes = np.argmax(raw_logits, axis=-1)
    result = metric.compute(predictions=predicted_classes, references=references)
    return result

# Load the "yelp_review_full" dataset and tokenize it in batches with the
# "bert-base-cased" tokenizer via `tokenize_function`.
dataset = load_dataset("yelp_review_full")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Shuffle each split with a fixed seed and keep the first 1000 examples.
train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

# NOTE(review): `model` is also the name the baseline cell's `evaluate`
# function reads for its sentence encoder; rebinding it here changes what that
# function sees if it is run afterwards.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
# `evaluate` here is the module imported at the top of this cell.
# NOTE(review): that import rebinds the name of the `evaluate(row)` function
# defined in the baseline cell.
metric = evaluate.load("accuracy")

# Write outputs to "test_trainer" with evaluation_strategy="epoch".
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# Train, then evaluate once more on `eval_dataset`.
trainer.train()
trainer.evaluate()

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer
import numpy as np

class BacksumDataset(Dataset):
    """Torch dataset of tokenized source/target sentence pairs.

    Args:
        source_sentences: Sentences whose encodings provide ``input_ids`` and
            ``attention_mask`` for each item.
        target_sentences: Sentences whose encodings provide ``target_ids``;
            must contain as many entries as ``source_sentences``.
        tokenizer: Callable invoked as ``tokenizer(sentences, padding=True,
            truncation=True, max_length=max_seq_length, return_tensors="pt")``;
            its result must expose ``input_ids`` and ``attention_mask``.
        max_seq_length: Value forwarded to the tokenizer as ``max_length``.
        similarity_scores: Optional sequence with one score per sentence pair,
            stored as a ``float32`` tensor. When omitted, items carry no
            ``"similarity_scores"`` entry.

    Raises:
        ValueError: If the number of target encodings or similarity scores
            differs from the number of source encodings.
    """

    def __init__(self, source_sentences, target_sentences, tokenizer, max_seq_length, similarity_scores=None):
        self.source_encodings = tokenizer(source_sentences, padding=True, truncation=True, max_length=max_seq_length, return_tensors="pt")
        self.target_encodings = tokenizer(target_sentences, padding=True, truncation=True, max_length=max_seq_length, return_tensors="pt")

        n_pairs = len(self.source_encodings.input_ids)
        n_targets = len(self.target_encodings.input_ids)
        if n_targets != n_pairs:
            raise ValueError(
                f"Got {n_pairs} source sentences but {n_targets} target sentences"
            )

        # The scores are an explicit argument so the dataset never depends on
        # a notebook-global variable that happens to share the name.
        if similarity_scores is None:
            self.similarity_scores = None
        else:
            self.similarity_scores = torch.tensor(similarity_scores, dtype=torch.float32)
            if len(self.similarity_scores) != n_pairs:
                raise ValueError(
                    f"Got {n_pairs} sentence pairs but {len(self.similarity_scores)} similarity scores"
                )

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.source_encodings.input_ids)

    def __getitem__(self, idx):
        """Return the encodings (and score, when available) of pair ``idx``."""
        item = {
            "input_ids": self.source_encodings.input_ids[idx],
            "attention_mask": self.source_encodings.attention_mask[idx],
            "target_ids": self.target_encodings.input_ids[idx],
        }
        if self.similarity_scores is not None:
            item["similarity_scores"] = self.similarity_scores[idx]
        return item

# Load the "bert-base-uncased" tokenizer and model.
# NOTE(review): `tokenizer` and `model` are also assigned in earlier cells;
# rebinding them here replaces those objects for any cell run afterwards.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# Build the dataset with max_seq_length=512 and wrap it in a DataLoader with
# default settings.
# NOTE(review): `source_sentences` and `target_sentences` are not defined in
# any cell visible here -- confirm where they are meant to come from.
dataset = BacksumDataset(source_sentences, target_sentences, tokenizer, 512)
dataloader = DataLoader(dataset)

In [None]:
from sentence_transformers import SentenceTransformer

sentence_transformer = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# `encode` belongs to the SentenceTransformer and takes the text to embed;
# `model` is bound to a plain `AutoModel` by the preceding cell.
# NOTE(review): assumes the `source_text` column of `train` is the intended
# input -- confirm.
embeddings = sentence_transformer.encode(list(train["source_text"]))