<a href="https://colab.research.google.com/github/hankserr/paraphrase_detector/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Version: 0.0.1 distilbert-base-uncased

# Install necessary packages
!pip install datasets
!pip install evaluate

# Import required libraries
import torch
import numpy as np
import tensorflow as tf
from datasets import load_dataset
import evaluate
from transformers import (
    TFAutoModelForSequenceClassification, AutoTokenizer,
    DataCollatorWithPadding, create_optimizer,
    DataCollatorForSeq2Seq, TFAutoModelForSeq2SeqLM
)
from tensorflow.keras.callbacks import ModelCheckpoint
from transformers.keras_callbacks import KerasMetricCallback

def load_datasets():
    """Load the PAWS dataset."""
    train_dataset = load_dataset("paws", "labeled_final", split="train")
    test_dataset = load_dataset("paws", "labeled_final", split="test")
    return train_dataset, test_dataset

def preprocess_datasets(train_dataset, test_dataset, tokenizer):
    """Preprocess the datasets by tokenizing and formatting."""
    def encode(examples):
      return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, padding="max_length", max_length=128)

    train_dataset = train_dataset.map(encode, batched=True)
    test_dataset = test_dataset.map(encode, batched=True)

    train_dataset = train_dataset.map(lambda examples: {"labels": examples["label"]})
    test_dataset = test_dataset.map(lambda examples: {"labels": examples["label"]})

    train_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "labels"])
    test_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "labels"])

    return train_dataset, test_dataset

def create_data_collator(tokenizer):
    """Create a data collator for dynamic padding."""
    return DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

def load_model():
    """Load the DistilBERT model."""
    id2label = {0: "NEGATIVE", 1: "POSITIVE"}
    label2id = {"NEGATIVE": 0, "POSITIVE": 1}
    return TFAutoModelForSequenceClassification.from_pretrained(
        "microsoft/deberta-v3-large", num_labels=2
    )

def create_optimizer_and_schedule(dataset, batch_size=16, num_epochs=4):
    """Create an optimizer and learning rate schedule."""
    batches_per_epoch = len(dataset) // batch_size
    total_train_steps = int(batches_per_epoch * num_epochs)
    return create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

def compute_metrics(eval_pred):
    """Compute evaluation metrics."""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    # decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    accuracy = evaluate.load("accuracy")
    return accuracy.compute(predictions=predictions, references=labels)

def prepare_tf_datasets(model, train_dataset, test_dataset, data_collator, batch_size=16):
    """Convert datasets to TensorFlow datasets."""
    tf_train_set = model.prepare_tf_dataset(
        train_dataset,
        shuffle=True,
        batch_size=batch_size,
        collate_fn=data_collator,
    )

    tf_validation_set = model.prepare_tf_dataset(
        test_dataset,
        shuffle=False,
        batch_size=batch_size,
        collate_fn=data_collator,
    )

    return tf_train_set, tf_validation_set

def train_model(model, tf_train_set, tf_validation_set, optimizer, num_epochs=3):
    """Train the model."""
    model.compile(optimizer=optimizer, metrics=["accuracy"])

    metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

    checkpoint_callback = ModelCheckpoint(
        filepath='model_checkpoint_epoch_{epoch:02d}.h5',
        save_weights_only=True,
        save_freq='epoch',
    )

    model.fit(
        x=tf_train_set,
        validation_data=tf_validation_set,
        epochs=num_epochs,
        callbacks=[metric_callback, checkpoint_callback]
    )

def main():
    train_dataset, test_dataset = load_datasets()
    tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")
    train_dataset, test_dataset = preprocess_datasets(train_dataset, test_dataset, tokenizer)
    data_collator = create_data_collator(tokenizer)
    model = load_model()
    optimizer, _ = create_optimizer_and_schedule(train_dataset)
    tf_train_set, tf_validation_set = prepare_tf_datasets(model, train_dataset, test_dataset, data_collator)
    train_model(model, tf_train_set, tf_validation_set, optimizer)

if __name__ == "__main__":
    main()




############ TEST ##############
##### Prediction Function #####
def predict_paraphrase(sentence1, sentence2):
    inputs = tokenizer(sentence1, sentence2, return_tensors='tf', truncation=True, padding='max_length', max_length=128)
    outputs = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
    logits = outputs.logits
    probabilities = tf.nn.softmax(logits, axis=-1)
    predicted_class = tf.argmax(logits, axis=-1).numpy()[0]

    return predicted_class, probabilities.numpy()[0]

# Test sentences
diff_1 = "Enter the first sentence: "
diff_2 = "Enter the second sentence: "

same_1 = "These sentences are the same"
same_2 = "These sentences are the same"

reorder_1 = "These are the same sentences"
reorder_2 = "These sentences are the same"

para_1 = "In January 2011 , the Deputy Secretary General of FIBA Asia , Hagop Khajirian , inspected the venue together with SBP - President Manuel V. Pangilinan ."
para_2 = "In January 2011 , FIBA Asia deputy secretary general Hagop Khajirian along with SBP president Manuel V. Pangilinan inspected the venue ."

# Predict if the sentences are paraphrases
predicted_class, probabilities = predict_paraphrase(diff_1, diff_2)
print("Diff")
print(f"Predicted class: {predicted_class} (0 means not paraphrased, 1 means paraphrased)")
print(f"Probabilities: {probabilities}")

predicted_class, probabilities = predict_paraphrase(same_1, same_2)
print("Same")
print(f"Predicted class: {predicted_class} (0 means not paraphrased, 1 means paraphrased)")
print(f"Probabilities: {probabilities}")

predicted_class, probabilities = predict_paraphrase(reorder_1, reorder_2)
print("Reorder")
print(f"Predicted class: {predicted_class} (0 means not paraphrased, 1 means paraphrased)")
print(f"Probabilities: {probabilities}")

predicted_class, probabilities = predict_paraphrase(para_1, para_2)
print("Para")
print(f"Predicted class: {predicted_class} (0 means not paraphrased, 1 means paraphrased)")
print(f"Probabilities: {probabilities}")




##### microsoft/deberta-v3-large Results #####

# 3087/3087 [==============================] - 5988s 2s/step - loss: 0.1454 - accuracy: 0.9360 - val_loss: 0.1756 - val_accuracy: 0.9360
# Epoch 2/3
# 3087/3087 [==============================] - 5885s 2s/step - loss: 0.0731 - accuracy: 0.9474 - val_loss: 0.1971 - val_accuracy: 0.9474
# Epoch 3/3
# 3087/3087 [==============================] - 5858s 2s/step - loss: 0.0441 - accuracy: 0.9493 - val_loss: 0.2347 - val_accuracy: 0.9492

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/9.79k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.43M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.24M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/49401 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/49401 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

tf_model.h5:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFDebertaV2ForSequenceClassification.

Some layers of TFDebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['cls_dropout', 'classifier', 'pooler']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported


Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.




Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Epoch 2/3
Epoch 3/3


NameError: name 'tokenizer' is not defined