<a href="https://colab.research.google.com/github/hankserr/paraphrase_detector/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Version: 1.0.2 deberta-v3-small dates dataset

# Install necessary packages
!pip install datasets
!pip install evaluate

# Import required libraries
import torch
import numpy as np
import tensorflow as tf
from datasets import load_dataset
import evaluate
from transformers import (
    TFAutoModelForSequenceClassification, AutoTokenizer,
    DataCollatorWithPadding, create_optimizer,
    DataCollatorForSeq2Seq, TFAutoModelForSeq2SeqLM
)
from tensorflow.keras.callbacks import ModelCheckpoint
from transformers.keras_callbacks import KerasMetricCallback

def load_datasets():
    """Load the dates paraphrase CSV and cut it into train and test splits.

    Reads ``paraphrase_dataset_dates.csv`` from the working directory and
    splits it positionally: the first 80% of rows become the training split
    and the remaining rows the test split. Rows are NOT shuffled first, so
    the split follows file order.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple.
    """
    train_fraction = 0.8

    # The CSV loader's result is indexed by split name; the rows are read
    # from its "train" entry.
    full_dataset = load_dataset('csv', data_files=['paraphrase_dataset_dates.csv'])["train"]

    total_rows = len(full_dataset)
    cutoff = int(train_fraction * total_rows)

    train_split = full_dataset.select(range(cutoff))
    test_split = full_dataset.select(range(cutoff, total_rows))
    return train_split, test_split

def preprocess_datasets(train_dataset, test_dataset, tokenizer):
    """Tokenize both splits and expose them as TensorFlow-formatted datasets.

    Each split goes through the same three steps:
      1. tokenize the ``Input`` / ``Paraphrase`` column pair (batched),
         truncating and padding every example to 128 tokens;
      2. copy the ``Type`` column into a ``labels`` column;
      3. restrict the output format to ``input_ids``, ``attention_mask`` and
         ``labels`` as TensorFlow tensors.

    Args:
        train_dataset: Training split with ``Input``, ``Paraphrase`` and
            ``Type`` columns.
        test_dataset: Test split with the same columns.
        tokenizer: Callable tokenizer accepting two text batches.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple of the processed splits.
    """
    def tokenize_pairs(batch):
        return tokenizer(
            batch["Input"],
            batch["Paraphrase"],
            truncation=True,
            padding="max_length",
            max_length=128,
        )

    def copy_label(row):
        # The CSV stores the class in "Type"; the label is exposed as "labels".
        return {"labels": row["Type"]}

    prepared = []
    for split in (train_dataset, test_dataset):
        split = split.map(tokenize_pairs, batched=True)
        split = split.map(copy_label)
        split.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "labels"])
        prepared.append(split)

    processed_train, processed_test = prepared
    return processed_train, processed_test

def create_data_collator(tokenizer):
    """Build the padding collator used when batching examples.

    Args:
        tokenizer: Tokenizer handed to the collator.

    Returns:
        A ``DataCollatorWithPadding`` configured to return TensorFlow tensors.
    """
    collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
    return collator

def load_model():
    """Load DeBERTa-v3-small with a two-class sequence-classification head.

    The label maps are passed to ``from_pretrained`` so the model config
    carries readable class names. The names follow the legend printed by
    ``main`` (0 = not paraphrased, 1 = paraphrased).

    Returns:
        The ``TFAutoModelForSequenceClassification`` model built from the
        ``microsoft/deberta-v3-small`` checkpoint.
    """
    id2label = {0: "NOT_PARAPHRASE", 1: "PARAPHRASE"}
    # Derive the inverse map so the two can never disagree.
    label2id = {name: index for index, name in id2label.items()}
    return TFAutoModelForSequenceClassification.from_pretrained(
        "microsoft/deberta-v3-small",
        num_labels=2,
        id2label=id2label,
        label2id=label2id,
    )

def create_optimizer_and_schedule(dataset, batch_size=16, num_epochs=4):
    """Build the optimizer and its learning-rate schedule.

    The schedule length is ``(len(dataset) // batch_size) * num_epochs``
    steps, so ``batch_size`` and ``num_epochs`` should match the values
    actually used for training.

    Args:
        dataset: Training dataset; only its length is used.
        batch_size: Examples per training batch.
        num_epochs: Number of epochs the schedule should span.

    Returns:
        Whatever ``create_optimizer`` returns; ``main`` unpacks it as a
        two-item pair and keeps the first item as the optimizer.
    """
    steps_per_epoch = len(dataset) // batch_size
    total_steps = int(steps_per_epoch * num_epochs)
    return create_optimizer(
        init_lr=2e-5,
        num_warmup_steps=0,
        num_train_steps=total_steps,
    )

def _accuracy_metric():
    """Return the ``accuracy`` metric, loading it only on the first call."""
    if not hasattr(_accuracy_metric, "cached"):
        _accuracy_metric.cached = evaluate.load("accuracy")
    return _accuracy_metric.cached


def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is
            reduced with ``argmax`` over axis 1, so it is expected to hold
            one score per class for each example.

    Returns:
        The result of the metric's ``compute`` call for the predicted
        classes against ``labels``.
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    # Reuse one metric object: this function is invoked repeatedly during
    # training and reloading the metric on every call is wasted work.
    return _accuracy_metric().compute(predictions=predicted_classes, references=labels)

def prepare_tf_datasets(model, train_dataset, test_dataset, data_collator, batch_size=16):
    """Turn the processed splits into batched datasets via the model.

    Both splits go through ``model.prepare_tf_dataset`` with the same batch
    size and collator; only the training split is shuffled.

    Args:
        model: Object exposing ``prepare_tf_dataset``.
        train_dataset: Processed training split.
        test_dataset: Processed test split, used for validation.
        data_collator: Collate function applied to each batch.
        batch_size: Examples per batch for both splits.

    Returns:
        A ``(tf_train_set, tf_validation_set)`` tuple.
    """
    def to_tf(split, shuffle):
        return model.prepare_tf_dataset(
            split,
            shuffle=shuffle,
            batch_size=batch_size,
            collate_fn=data_collator,
        )

    training_batches = to_tf(train_dataset, True)
    validation_batches = to_tf(test_dataset, False)
    return training_batches, validation_batches

def train_model(model, tf_train_set, tf_validation_set, optimizer, num_epochs=6):
    """Compile and fit the model, checkpointing weights after every epoch.

    Args:
        model: Keras-compatible model to train (modified in place).
        tf_train_set: Batched training data.
        tf_validation_set: Batched validation data; also fed to the metric
            callback.
        optimizer: Optimizer passed to ``compile``.
        num_epochs: Number of epochs to fit.

    Returns:
        None.
    """
    # No loss is passed here; presumably the model supplies its own.
    model.compile(optimizer=optimizer, metrics=["accuracy"])

    callbacks = [
        # Runs compute_metrics on the validation set.
        KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set),
        # Writes one weights-only file per epoch, named by epoch number.
        ModelCheckpoint(
            filepath='model_checkpoint_epoch_{epoch:02d}.h5',
            save_weights_only=True,
            save_freq='epoch',
        ),
    ]

    model.fit(
        x=tf_train_set,
        validation_data=tf_validation_set,
        epochs=num_epochs,
        callbacks=callbacks,
    )

def predict_paraphrase(sentence1, sentence2, tokenizer, model):
    """Classify a single text pair with the given model.

    The pair is tokenized with truncation and padding to 128 tokens, matching
    the settings used in ``preprocess_datasets``.

    Args:
        sentence1: First text of the pair.
        sentence2: Second text of the pair.
        tokenizer: Tokenizer used to encode the pair as TensorFlow tensors.
        model: Model whose output exposes ``.logits``.

    Returns:
        A ``(predicted_class, probabilities)`` tuple for the single example:
        the argmax class index and the softmax of the logits.
    """
    encoded = tokenizer(
        sentence1,
        sentence2,
        return_tensors='tf',
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    logits = model(encoded['input_ids'], attention_mask=encoded['attention_mask']).logits

    # Index [0] drops the batch dimension: exactly one pair was encoded.
    class_probabilities = tf.nn.softmax(logits, axis=-1).numpy()[0]
    predicted_class = tf.argmax(logits, axis=-1).numpy()[0]
    return predicted_class, class_probabilities


def main():
    """Fine-tune DeBERTa-v3-small on the dates CSV and print spot checks.

    Loads and preprocesses the data, trains the classifier, then prints the
    predicted class and class probabilities for four hand-picked date pairs.
    """
    # One source of truth for these values. The schedule helper defaults to
    # 4 epochs while train_model defaults to 6, so relying on the defaults
    # sizes the learning-rate schedule for fewer steps than training runs.
    batch_size = 16
    num_epochs = 6

    train_dataset, test_dataset = load_datasets()
    tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")
    train_dataset, test_dataset = preprocess_datasets(train_dataset, test_dataset, tokenizer)
    data_collator = create_data_collator(tokenizer)
    model = load_model()
    optimizer, _ = create_optimizer_and_schedule(
        train_dataset, batch_size=batch_size, num_epochs=num_epochs
    )
    tf_train_set, tf_validation_set = prepare_tf_datasets(
        model, train_dataset, test_dataset, data_collator, batch_size=batch_size
    )
    train_model(model, tf_train_set, tf_validation_set, optimizer, num_epochs=num_epochs)

    # (heading, first text, second text) for each manual spot check.
    spot_checks = [
        ("Diff", "02/23/2025", "January 23, 2025"),
        ("Same", "January 23, 2024", "Jan 23, 24"),
        ("Reorder", "January 23, 2024", "01/23/24"),
        ("Para", "5-10-99", "Mar 10 1999"),
    ]

    # Predict whether each pair is a paraphrase.
    for heading, first_text, second_text in spot_checks:
        predicted_class, probabilities = predict_paraphrase(first_text, second_text, tokenizer, model)
        print(heading)
        print(f"Predicted class: {predicted_class} (0 means not paraphrased, 1 means paraphrased)")
        print(f"Probabilities: {probabilities}")



# Run the whole pipeline only when this file is executed directly, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

1
20000

Train: 16000 


Test: 4000 



All model checkpoint layers were used when initializing TFDebertaV2ForSequenceClassification.

Some layers of TFDebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['cls_dropout', 'classifier', 'pooler']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Diff
Predicted class: 1 (0 means not paraphrased, 1 means paraphrased)
Probabilities: [0.00250539 0.9974946 ]
Same
Predicted class: 1 (0 means not paraphrased, 1 means paraphrased)
Probabilities: [2.2308688e-05 9.9997771e-01]
Reorder
Predicted class: 1 (0 means not paraphrased, 1 means paraphrased)
Probabilities: [4.984133e-05 9.999502e-01]
Para
Predicted class: 0 (0 means not paraphrased, 1 means paraphrased)
Probabilities: [0.8426618  0.15733825]
