<a href="https://colab.research.google.com/github/hankserr/paraphrase_detector/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Version: 1.1.1 deberta-v3-small dates dataset enlarged

# Install necessary packages
!pip install datasets
!pip install evaluate

# Import required libraries
import torch
import numpy as np
import tensorflow as tf
from datasets import load_dataset
import evaluate
from transformers import (
    TFAutoModelForSequenceClassification, AutoTokenizer,
    DataCollatorWithPadding, create_optimizer,
    DataCollatorForSeq2Seq, TFAutoModelForSeq2SeqLM
)
from tensorflow.keras.callbacks import ModelCheckpoint
from transformers.keras_callbacks import KerasMetricCallback

def load_datasets(data_file='paraphrase_dataset_dates.csv', split_ratio=0.8):
    """Load the paraphrase dates CSV and split it into train and test sets.

    The file is read with the Hugging Face ``csv`` loader, which places every
    row under a single ``"train"`` split. The rows are then divided
    positionally: the leading ``split_ratio`` fraction becomes the training
    set and the remaining rows become the test set.

    Args:
        data_file: Path to the CSV file to load.
        split_ratio: Fraction of rows assigned to the training set. Must be
            strictly between 0 and 1 so that both splits are non-trivial.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple.

    Raises:
        ValueError: If ``split_ratio`` is not strictly between 0 and 1.
    """
    # Validate before touching the file so a bad ratio fails fast.
    if not 0 < split_ratio < 1:
        raise ValueError(
            f"split_ratio must be strictly between 0 and 1, got {split_ratio!r}"
        )

    dataset = load_dataset('csv', data_files=[data_file])["train"]

    # NOTE(review): the split is positional and unshuffled. If the CSV rows
    # are ordered (e.g. grouped by label), the two splits will not follow the
    # same distribution -- confirm the file is pre-shuffled.
    split_point = int(split_ratio * len(dataset))
    train_dataset = dataset.select(range(split_point))
    test_dataset = dataset.select(range(split_point, len(dataset)))

    print(f"\nTrain dataset size: {len(train_dataset)}")
    print(f"Test dataset size: {len(test_dataset)}\n")

    return train_dataset, test_dataset

def preprocess_datasets(train_dataset, test_dataset, tokenizer, max_length=128):
    """Tokenize both datasets and expose them as TensorFlow-formatted columns.

    Each row must provide the columns ``sentence1``, ``sentence2`` and
    ``label``. The two sentences are tokenized as a pair, truncated and
    padded to ``max_length`` tokens, and the ``label`` column is copied to a
    ``labels`` column. Afterwards only ``input_ids``, ``attention_mask`` and
    ``labels`` are exposed, in TensorFlow format.

    Args:
        train_dataset: Training dataset with the columns listed above.
        test_dataset: Test dataset with the same columns.
        tokenizer: Tokenizer callable applied to the sentence pairs.
        max_length: Fixed sequence length used for truncation and padding.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple of processed datasets.
    """
    def encode(examples):
        encoded = tokenizer(
            examples["sentence1"],
            examples["sentence2"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )
        # Copy the labels in the same batched pass instead of running a
        # second, per-example map over the whole dataset.
        return {**encoded, "labels": examples["label"]}

    model_columns = ["input_ids", "attention_mask", "labels"]

    train_dataset = train_dataset.map(encode, batched=True)
    test_dataset = test_dataset.map(encode, batched=True)

    train_dataset.set_format(type="tensorflow", columns=model_columns)
    test_dataset.set_format(type="tensorflow", columns=model_columns)

    return train_dataset, test_dataset

def create_data_collator(tokenizer):
    """Build the padding collator that assembles examples into TF batches.

    Args:
        tokenizer: Tokenizer handed to the collator for padding.

    Returns:
        A ``DataCollatorWithPadding`` configured to return TensorFlow tensors.
    """
    collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
    return collator

def load_model(model_name="microsoft/deberta-v3-small", num_labels=2):
    """Load a pretrained TensorFlow sequence-classification model.

    By default this loads the ``microsoft/deberta-v3-small`` checkpoint with
    a two-class classification head.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        num_labels: Number of output classes for the classification head.

    Returns:
        The model returned by
        ``TFAutoModelForSequenceClassification.from_pretrained``.
    """
    return TFAutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    )

def create_optimizer_and_schedule(dataset, batch_size=16, num_epochs=4,
                                  init_lr=2e-5, num_warmup_steps=0):
    """Create an optimizer and learning-rate schedule sized to the training run.

    The total number of training steps is the number of full batches per
    epoch (``len(dataset) // batch_size``) multiplied by ``num_epochs``.
    ``num_epochs`` should match the number of epochs actually trained,
    otherwise the schedule will not line up with the end of training.

    Args:
        dataset: Sized training dataset; only ``len(dataset)`` is used.
        batch_size: Number of examples per training batch; must be >= 1.
        num_epochs: Number of epochs the schedule should span.
        init_lr: Initial learning rate passed to ``create_optimizer``.
        num_warmup_steps: Warmup steps passed to ``create_optimizer``.

    Returns:
        The value returned by ``transformers.create_optimizer``.

    Raises:
        ValueError: If ``batch_size`` is below 1, or if the dataset and epoch
            count yield no training steps at all.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size!r}")

    batches_per_epoch = len(dataset) // batch_size
    total_train_steps = int(batches_per_epoch * num_epochs)
    # A zero-step schedule has nothing to decay over; fail loudly instead of
    # handing a degenerate schedule to the optimizer.
    if total_train_steps < 1:
        raise ValueError(
            "No training steps: dataset has "
            f"{len(dataset)} examples for batch_size={batch_size} and "
            f"num_epochs={num_epochs}"
        )

    return create_optimizer(
        init_lr=init_lr,
        num_warmup_steps=num_warmup_steps,
        num_train_steps=total_train_steps,
    )

# Cached accuracy metric; populated on the first call to compute_metrics.
_ACCURACY_METRIC = None


def _get_accuracy_metric():
    """Return the shared accuracy metric, loading it only on first use."""
    global _ACCURACY_METRIC
    if _ACCURACY_METRIC is None:
        _ACCURACY_METRIC = evaluate.load("accuracy")
    return _ACCURACY_METRIC


def compute_metrics(eval_pred):
    """Compute classification accuracy from model scores and true labels.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds
            per-class scores with the class dimension on axis 1; ``labels``
            holds the reference class ids.

    Returns:
        The result of the ``accuracy`` metric's ``compute`` call.
    """
    predictions, labels = eval_pred
    # Reduce per-class scores to the index of the highest-scoring class.
    predictions = np.argmax(predictions, axis=1)
    # Reuse the cached metric rather than reloading it on every evaluation.
    accuracy = _get_accuracy_metric()
    return accuracy.compute(predictions=predictions, references=labels)

def prepare_tf_datasets(model, train_dataset, test_dataset, data_collator, batch_size=16):
    """Wrap the train and test datasets as batched TensorFlow datasets.

    Both datasets go through ``model.prepare_tf_dataset`` with the same batch
    size and collator; only the training set is shuffled.

    Args:
        model: Model exposing ``prepare_tf_dataset``.
        train_dataset: Dataset used for training (shuffled).
        test_dataset: Dataset used for validation (not shuffled).
        data_collator: Collate function used to assemble each batch.
        batch_size: Number of examples per batch.

    Returns:
        A ``(tf_train_set, tf_validation_set)`` tuple.
    """
    # Options common to both conversions.
    shared_options = {"batch_size": batch_size, "collate_fn": data_collator}

    tf_train_set = model.prepare_tf_dataset(train_dataset, shuffle=True, **shared_options)
    tf_validation_set = model.prepare_tf_dataset(test_dataset, shuffle=False, **shared_options)

    return tf_train_set, tf_validation_set

def train_model(model, tf_train_set, tf_validation_set, optimizer, num_epochs=3):
    """Compile and fit the model, checkpointing weights after every epoch.

    Args:
        model: Keras-compatible model to train.
        tf_train_set: Batched training dataset.
        tf_validation_set: Batched validation dataset. It is used both as
            Keras ``validation_data`` and as the evaluation set for the
            metric callback.
        optimizer: Optimizer passed to ``model.compile``.
        num_epochs: Number of epochs to train for.

    Returns:
        The value returned by ``model.fit`` (the Keras training history), so
        callers can inspect per-epoch metrics.
    """
    # No explicit loss is passed here; compile() is left to use whatever
    # loss the model itself provides.
    model.compile(optimizer=optimizer, metrics=["accuracy"])

    metric_callback = KerasMetricCallback(
        metric_fn=compute_metrics, eval_dataset=tf_validation_set
    )

    # Save weights only, one file per epoch, named by the epoch number.
    checkpoint_callback = ModelCheckpoint(
        filepath='model_checkpoint_epoch_{epoch:02d}.h5',
        save_weights_only=True,
        save_freq='epoch',
    )

    return model.fit(
        x=tf_train_set,
        validation_data=tf_validation_set,
        epochs=num_epochs,
        callbacks=[metric_callback, checkpoint_callback],
    )

def predict_paraphrase(sentence1, sentence2, tokenizer, model):
    """Score a single sentence pair with the classification model.

    The pair is tokenized together, truncated and padded to 128 tokens, and
    run through the model using only ``input_ids`` and ``attention_mask``.

    Args:
        sentence1: First text of the pair.
        sentence2: Second text of the pair.
        tokenizer: Tokenizer used to encode the pair as TensorFlow tensors.
        model: Model whose output exposes a ``logits`` attribute.

    Returns:
        A ``(predicted_class, probabilities)`` tuple: the argmax class index
        for the first row of the logits, and that row's softmax
        probabilities as a NumPy array.
    """
    encoded = tokenizer(
        sentence1,
        sentence2,
        return_tensors='tf',
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    logits = model(encoded['input_ids'], attention_mask=encoded['attention_mask']).logits

    # Only one pair is encoded, so take row 0 of each result.
    class_probabilities = tf.nn.softmax(logits, axis=-1).numpy()[0]
    predicted_class = tf.argmax(logits, axis=-1).numpy()[0]

    return predicted_class, class_probabilities


def main():
    """Fine-tune the classifier on the dates dataset and print sample predictions.

    Loads and preprocesses the data, trains the model, then runs a handful
    of hand-written date pairs through the trained model and prints the
    predicted class and class probabilities for each.
    """
    # Shared hyperparameters. The learning-rate schedule must be built for
    # the same number of epochs (and batch size) that training actually
    # uses, otherwise the decay does not finish when training does.
    batch_size = 16
    num_epochs = 3

    train_dataset, test_dataset = load_datasets()
    tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")
    train_dataset, test_dataset = preprocess_datasets(train_dataset, test_dataset, tokenizer)
    data_collator = create_data_collator(tokenizer)
    model = load_model()
    optimizer, _ = create_optimizer_and_schedule(
        train_dataset, batch_size=batch_size, num_epochs=num_epochs
    )
    tf_train_set, tf_validation_set = prepare_tf_datasets(
        model, train_dataset, test_dataset, data_collator, batch_size=batch_size
    )
    train_model(model, tf_train_set, tf_validation_set, optimizer, num_epochs=num_epochs)

    # (heading, first text, second text) pairs used as a quick sanity check.
    sample_pairs = [
        ("Diff", "02/23/2025", "January 23, 2025"),
        ("Same", "January 23, 2024", "Jan 23, 24"),
        ("Reorder", "January 23, 2024", "01/23/24"),
        ("Para", "5-10-99", "Mar 10 1999"),
    ]

    # Predict whether each pair is a paraphrase.
    for heading, first_text, second_text in sample_pairs:
        predicted_class, probabilities = predict_paraphrase(
            first_text, second_text, tokenizer, model
        )
        print(heading)
        print(f"Predicted class: {predicted_class} (0 means not paraphrased, 1 means paraphrased)")
        print(f"Probabilities: {probabilities}")



# Run the full train-and-predict pipeline only when this code is executed
# directly (not when it is imported as a module).
if __name__ == "__main__":
    main()



Generating train split: 0 examples [00:00, ? examples/s]


Train dataset size: 134400
Test dataset size: 33600



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/134400 [00:00<?, ? examples/s]

Map:   0%|          | 0/33600 [00:00<?, ? examples/s]

Map:   0%|          | 0/134400 [00:00<?, ? examples/s]

Map:   0%|          | 0/33600 [00:00<?, ? examples/s]

tf_model.h5:   0%|          | 0.00/565M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFDebertaV2ForSequenceClassification.

Some layers of TFDebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier', 'pooler', 'cls_dropout']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported


Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.




Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Epoch 2/3
Epoch 3/3
Diff
Predicted class: 0 (0 means not paraphrased, 1 means paraphrased)
Probabilities: [9.999896e-01 1.033763e-05]
Same
Predicted class: 1 (0 means not paraphrased, 1 means paraphrased)
Probabilities: [1.4689664e-06 9.9999857e-01]
Reorder
Predicted class: 1 (0 means not paraphrased, 1 means paraphrased)
Probabilities: [8.028607e-06 9.999920e-01]
Para
Predicted class: 0 (0 means not paraphrased, 1 means paraphrased)
Probabilities: [0.9405641 0.0594359]
