In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import datasets
from sentence_transformers import SentenceTransformer, util

In [125]:
# Load the annotated text pairs from the project's pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load pickles
# that were produced by this project, never files from untrusted sources.
annotations = pd.read_pickle("../data/binary-dataset.pkl")

# Keep only the text pair and its binary label, as an independent copy so
# later edits do not write back into `annotations`.
dataset = annotations[["source_text", "target_text", "label"]].copy()
# dataset["strategy"] = annotations["strategy"].astype("category").cat.codes

# DataFrame.info() writes its report straight to stdout and returns None.
# Interpolating it into an f-string therefore prints the report *before* the
# heading and then the literal text "None"; call it as a statement instead.
print("Non-matching examples:")
dataset.loc[dataset.label == 0].info()
print("\n")
print("Matching examples:")
dataset.loc[dataset.label == 1].info()

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# shuffled split reproducible across runs.
train_data, test_data = train_test_split(dataset, test_size=0.2, shuffle=True, random_state=42)

# Persist each split twice: as CSV (without the index column) and as JSON Lines.
# lines=True is required for a real .jsonl file (one JSON object per line);
# with orient="records" alone pandas writes a single JSON array instead.
train_data.to_csv("../data/train.csv", index=False)
train_data.to_json("../data/train.jsonl", orient="records", lines=True)

test_data.to_csv("../data/test.csv", index=False)
test_data.to_json("../data/test.jsonl", orient="records", lines=True)

In [None]:
# NOTE(review): `df` was not defined anywhere in the visible notebook, so this
# cell failed on a fresh kernel. It is bound here to a copy of `dataset`, which
# has exactly the columns used below - confirm this is the intended frame.
df = dataset.copy()

source_texts = df["source_text"].to_list()
target_texts = df["target_text"].to_list()
labels = df["label"].astype("int").to_list()

# Embed both sides of every pair with the same sentence-embedding model.
sentence_tf = SentenceTransformer('all-MiniLM-L6-v2')
source_embeddings = sentence_tf.encode(source_texts, convert_to_tensor=True)
target_embeddings = sentence_tf.encode(target_texts, convert_to_tensor=True)

# Row-wise similarity: source i is compared with target i only.
# util.cos_sim would return the full N x N matrix, and taking element [0] of
# each row compares every source with the *first* target rather than its own.
# The tensor is detached and moved to the CPU before converting to NumPy so the
# conversion also works when the embeddings live on a GPU.
cos_sim = util.pairwise_cos_sim(source_embeddings, target_embeddings).detach().cpu().numpy()
df["cos_sim"] = cos_sim

In [None]:
# Working example: Binary classification, fine-tuned on distilbert-base-uncased

from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
import datasets
import numpy as np
import evaluate

def preprocess_function(batch):
    """Tokenize one batch of examples with the notebook-level `tokenizer`.

    Args:
        batch: Mapping that provides the "source_text" and "target_text"
            entries of the examples being processed.

    Returns:
        Whatever `tokenizer` returns when called with the source texts as its
        first input and the target texts as its second, with truncation
        enabled and padding set to "max_length".
    """
    first_segments = batch["source_text"]
    second_segments = batch["target_text"]
    return tokenizer(
        first_segments,
        second_segments,
        truncation=True,
        padding="max_length",
    )

# Tokenizer for the "distilbert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Load the train and test JSONL files from the "lguenth/backsum" dataset repo.
dataset = datasets.load_dataset(
    "lguenth/backsum",
    data_files={
        "train": "train/seqclass-train.jsonl",
        "test": "test/seqclass-test.jsonl",
    },
)

# Run preprocess_function over every split in batched mode, then show the result.
tokenized_data = dataset.map(preprocess_function, batched=True)
print(tokenized_data)

def compute_metrics(eval_pred):
    """Compute the F1 score for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each
            example is the argmax of ``logits`` along the last axis.

    Returns:
        The result of the "f1" metric's ``compute`` call for the predicted
        classes against ``labels``.
    """
    # `datasets.load_metric` is deprecated and no longer available in recent
    # `datasets` releases; the metric is loaded through `evaluate` instead.
    f1_metric = evaluate.load("f1")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # The former debug prints of the full label/prediction arrays were removed;
    # they flooded the cell output on every evaluation.
    return f1_metric.compute(predictions=predictions, references=labels)

# Sequence-classification model built from the "distilbert-base-uncased"
# checkpoint with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

# Training configuration: a single epoch, checkpoints under ./results and
# logs under ./logs, written every 10 steps.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir='./logs',
    logging_steps=10,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=50,
    weight_decay=0.01,
)

# Train on the tokenized "train" split, evaluate on the tokenized "test" split,
# and score evaluation passes with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
)

trainer.train()
# Last expression of the cell, so the evaluation result is displayed as its output.
trainer.evaluate()