In [3]:
## Dipper Paraphraser Model 
## Code taken from https://huggingface.co/kalpeshk2011/dipper-paraphraser-xxl
class DipperParaphraser(object):
    """Thin wrapper that loads a DIPPER T5 checkpoint and paraphrases text with it."""

    def __init__(self, model="kalpeshk2011/dipper-paraphraser-xxl", verbose=True):
        """Load the tokenizer and the paraphraser checkpoint, then move the model to the GPU.

        Args:
            model (str): Name or path of the checkpoint passed to
                ``T5ForConditionalGeneration.from_pretrained``.
            verbose (bool): If True, print how long loading took.
        """
        time1 = time.time()
        self.tokenizer = T5Tokenizer.from_pretrained('google/t5-v1_1-xxl',cache_dir = 'cache')
        self.model = T5ForConditionalGeneration.from_pretrained(model,cache_dir = 'cache')
        if verbose:
            # The reported time covers loading only; the GPU transfer below is not included.
            print(f"{model} model loaded in {time.time() - time1}")
        self.model.cuda()
        self.model.eval()

    def paraphrase(self, input_text, lex_diversity, order_diversity, prefix="", sent_interval=3, **kwargs):
        """Paraphrase a text using the DIPPER model.

        The text is split into sentences and paraphrased ``sent_interval`` sentences
        at a time. Each window is wrapped in ``<sent> ... </sent>`` by this method, so
        callers pass plain text. The paraphrase of every window is appended to
        ``prefix`` and therefore becomes context for the following windows.

        Args:
            input_text (str): The text to paraphrase. Whitespace runs are collapsed to single spaces.
            lex_diversity (int): The lexical diversity of the output, choose multiples of 20 from 0 to 100. 0 means no diversity, 100 means maximum diversity.
            order_diversity (int): The order diversity of the output, choose multiples of 20 from 0 to 100. 0 means no diversity, 100 means maximum diversity.
            prefix (str): Optional context placed before the first ``<sent>`` window.
            sent_interval (int): Number of sentences paraphrased per generation call; must be >= 1.
            **kwargs: Additional keyword arguments forwarded to ``generate``, like top_p, top_k, max_length.

        Returns:
            str: The paraphrased windows, each preceded by a single space (so a
            non-empty result starts with a space); ``""`` when there are no sentences.

        Raises:
            AssertionError: If a diversity value is not one of 0, 20, 40, 60, 80, 100.
            ValueError: If ``sent_interval`` is smaller than 1.
        """
        assert lex_diversity in [0, 20, 40, 60, 80, 100], "Lexical diversity must be one of 0, 20, 40, 60, 80, 100."
        assert order_diversity in [0, 20, 40, 60, 80, 100], "Order diversity must be one of 0, 20, 40, 60, 80, 100."
        # A negative step would yield an empty range and silently return "".
        if sent_interval < 1:
            raise ValueError("sent_interval must be a positive integer.")

        # The control codes are the complement of the requested diversity.
        lex_code = int(100 - lex_diversity)
        order_code = int(100 - order_diversity)

        input_text = " ".join(input_text.split())
        sentences = sent_tokenize(input_text)
        prefix = " ".join(prefix.replace("\n", " ").split())
        paraphrased_windows = []
        # Follow the model's actual device instead of assuming the default CUDA device.
        device = self.model.device

        for sent_idx in range(0, len(sentences), sent_interval):
            curr_sent_window = " ".join(sentences[sent_idx:sent_idx + sent_interval])
            final_input_text = f"lexical = {lex_code}, order = {order_code}"
            if prefix:
                final_input_text += f" {prefix}"
            final_input_text += f" <sent> {curr_sent_window} </sent>"

            final_input = self.tokenizer([final_input_text], return_tensors="pt")
            final_input = {k: v.to(device) for k, v in final_input.items()}

            with torch.inference_mode():
                outputs = self.model.generate(**final_input, **kwargs)
            outputs = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
            # Earlier paraphrases become context for the next window.
            prefix += " " + outputs[0]
            paraphrased_windows.append(outputs[0])

        return "".join(" " + window for window in paraphrased_windows)

In [4]:
!pip install torch transformers sklearn nltk
!pip install --editable .

Obtaining file:///mnt/projects/qanon_proj/RobustCrawl/GPT
[31mERROR: file:///mnt/projects/qanon_proj/RobustCrawl/GPT does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.[0m[31m
[0m

In [5]:
from transformers import T5ForConditionalGeneration,T5Tokenizer
from nltk.tokenize import sent_tokenize
import torch
from transformers import T5ForConditionalGeneration
# Load the 'google/t5-v1_1-xxl' tokenizer used by the paraphrasing cells below;
# downloaded files are kept in the local 'cache' directory.
tokenizer = T5Tokenizer.from_pretrained(
    'google/t5-v1_1-xxl',
    cache_dir='cache',
)

In [None]:
# Load the DIPPER paraphrasing checkpoint (cached under 'cache'), then move it to the GPU.
model = T5ForConditionalGeneration.from_pretrained(
    'kalpeshk2011/dipper-paraphraser-xxl',
    cache_dir='cache',
)
device = torch.device('cuda')
model = model.to(device)

In [None]:
## Dipper Paraphraser Model 
## Code adapted from https://huggingface.co/kalpeshk2011/dipper-paraphraser-xxl
## input_text: input text that is paraphrased
## lex_diversity: Lexical diversity of paraphrased output. See https://arxiv.org/pdf/2303.13408.pdf
## order_diversity: Order diversity of paraphrased output. See https://arxiv.org/pdf/2303.13408.pdf
## top_p: cumulative-probability threshold for nucleus (top-p) sampling
## max_length: maximum length (in tokens) of each generated paraphrase
## sent_interval: the number of sentences to paraphrase per iteration
def create_paraphrase(input_text,lex_diversity=60,order_diversity=0,top_p=0.75,max_length=512,sent_interval = 3):
    """Paraphrase ``input_text`` with the module-level DIPPER ``model`` and ``tokenizer``.

    The text is split into sentences and paraphrased ``sent_interval`` sentences at
    a time using nucleus sampling. Each window is paraphrased independently: the
    paraphrase of one window is not fed to the model as context for the next.

    Args:
        input_text (str): Text to paraphrase. Whitespace runs are collapsed to single spaces.
        lex_diversity (int): Lexical diversity of the output; the control code sent
            to the model is ``100 - lex_diversity``.
        order_diversity (int): Order diversity of the output; the control code sent
            to the model is ``100 - order_diversity``.
        top_p (float): ``top_p`` value passed to ``model.generate``.
        max_length (int): ``max_length`` value passed to ``model.generate``.
        sent_interval (int): Number of sentences paraphrased per generation call; must be >= 1.

    Returns:
        str: The paraphrased windows, each preceded by a single space (so a
        non-empty result starts with a space); ``""`` when there are no sentences.

    Raises:
        ValueError: If ``sent_interval`` is smaller than 1.
    """
    # A negative step would yield an empty range and silently return "".
    if sent_interval < 1:
        raise ValueError("sent_interval must be a positive integer.")

    lex_code = int(100 - lex_diversity)
    order_code = int(100 - order_diversity)
    # The control prefix is the same for every window, so build it once.
    control_prefix = f"lexical = {lex_code}, order = {order_code}"

    input_text = " ".join(input_text.split())
    sentences = sent_tokenize(input_text)
    # Send inputs to the device the model actually lives on.
    target_device = model.device
    paraphrased_windows = []

    for sent_idx in range(0, len(sentences), sent_interval):
        curr_sent_window = " ".join(sentences[sent_idx:sent_idx + sent_interval])
        final_input_text = f"{control_prefix} <sent> {curr_sent_window} </sent>"

        final_input = tokenizer([final_input_text], return_tensors="pt")
        final_input = {k: v.to(target_device) for k, v in final_input.items()}

        with torch.inference_mode():
            outputs = model.generate(**final_input,do_sample=True,top_p=top_p,max_length=max_length)
        outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        paraphrased_windows.append(outputs[0])

    return "".join(f" {window}" for window in paraphrased_windows)