In [9]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from sentence_splitter import SentenceSplitter, split_text_into_sentences

In [10]:
model_name = "tuner007/pegasus_paraphrase"
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
text = "Cats are omnivores. They eat vegetative items such as rice, milk, pulses, etc, as well as fish, meat, birds, mice, etc. Therefore, cats can feed on both types of food. It is worth mentioning in this my pet cat essay for Class 6 that cats are considered sacred in several cultures such as the Japanese culture. Cats are often depicted as symbols of wit and honour. Several folklores include stories about the intelligence of cats. "

In [11]:
splitter = SentenceSplitter(language="en")
sentence_list = splitter.split(text)

sentence = sentence_list[1]
sentence

'They eat vegetative items such as rice, milk, pulses, etc, as well as fish, meat, birds, mice, etc. Therefore, cats can feed on both types of food.'

In [13]:
inputs = tokenizer([sentence], return_tensors="pt", truncation=True)
inputs

{'input_ids': tensor([[  322,  1461, 66948,   843,   253,   130,  3484,   108,  2612,   108,
         31478,   108,   733,   108,   130,   210,   130,  1731,   108,  2804,
           108,  3806,   108, 10378,   108,   733,   107,  3272,   108,  4901,
           137,  2529,   124,   302,  1020,   113,   425,   107,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [14]:
paraphrase = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    num_beams=4,
    max_length=128,
    # min_length=min_length,
    early_stopping=True,
)

In [16]:
paraphrased_text = tokenizer.decode(paraphrase[0], skip_special_tokens=True)

In [17]:
paraphrased_text

'Cats can eat both types of food, as well as vegetative items.'

In [8]:
# Split the text into sentences
splitter = SentenceSplitter(language="en")
sentence_list = splitter.split(text)

for sentence in sentence_list:
    sentence = sentence.strip()

    inputs = tokenizer([sentence], return_tensors="pt", truncation=True)

    batch = tokenizer(
        [sentence],
        truncation=True,
        padding="longest",
        max_length=60,
        return_tensors="pt",
    ).to(torch_device)

['Cats are omnivores.',
 'They eat vegetative items such as rice, milk, pulses, etc, as well as fish, meat, birds, mice, etc. Therefore, cats can feed on both types of food.',
 'It is worth mentioning in this my pet cat essay for Class 6 that cats are considered sacred in several cultures such as the Japanese culture.',
 'Cats are often depicted as symbols of wit and honour.',
 'Several folklores include stories about the intelligence of cats.']