
> # ***Pegasus Paraphrase***

In [None]:
pip install --upgrade transformers sentencepiece

In [12]:
import pandas as pd
# Read the labelled training set from train.csv in the working directory.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a row index
# that was saved into the CSV — confirm before relying on it.
train_data = pd.read_csv("train.csv")
# Preview the first five rows (head() defaults to 5).
train_data.head()

Unnamed: 0,Y,text
0,1,"I don't know man, that Jeep looks really cool ..."
1,0,Bring back old wits end and lets see how long ...
2,0,"I don't think he's listening, he's more focuse..."
3,1,Either that or we are all being tricked by the...
4,1,You forgot the


In [13]:
# Number of rows in the full training set.
train_data.shape[0]

52061

In [14]:
# Work on a small subset while the pipeline is being implemented: keep the first 1000 rows.
train_data = train_data.iloc[:1000]

In [15]:
# Split the subset by label so that each class is paraphrased on its own and every
# augmented text can keep the label of the text it was generated from.
grouped = train_data.groupby("Y")
one_labeled_texts, zero_labeled_texts = (grouped.get_group(label) for label in (1, 0))

In [18]:
# Renumber both subsets from 0 and discard the old index, so index labels match row positions.
one_labeled_texts, zero_labeled_texts = (
    one_labeled_texts.reset_index(drop=True),
    zero_labeled_texts.reset_index(drop=True),
)

In [20]:
# Class sizes as (label 1, label 0); the two classes are roughly balanced.
tuple(len(frame) for frame in (one_labeled_texts, zero_labeled_texts))

(483, 517)

In [23]:
# The paraphrasing helper works on one text at a time, so flatten the label-1
# "text" column into a plain Python list.
data = one_labeled_texts["text"]
# tolist() walks the values in row order and does not depend on the index labels,
# unlike the previous data[j] lookup, which only worked on a freshly reset index.
seperated_data_one = data.tolist()

print(seperated_data_one[0])

I don't know man, that Jeep looks really cool too...


In [24]:
# Same flattening for the label-0 texts: a plain Python list in row order.
data = zero_labeled_texts["text"]
# tolist() does not depend on the index labels, unlike the previous data[j] lookup,
# which only worked on a freshly reset index.
seperated_data_zero = data.tolist()

print(seperated_data_zero[0])

Bring back old wits end and lets see how long your mana lasts :)


In [25]:
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration

# Load the TensorFlow Pegasus model and its tokenizer from the "google/pegasus-xsum" checkpoint.
# NOTE(review): `model` and `tokenizer` are both reassigned by the next code cell
# (tuner007/pegasus_paraphrase, PyTorch) and nothing in between uses them, so this
# load — a 2.12G download per the recorded output — looks redundant; confirm and
# consider removing this cell.
model = TFPegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum") 
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

Downloading:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFPegasusForConditionalGeneration.

All the layers of TFPegasusForConditionalGeneration were initialized from the model checkpoint at google/pegasus-xsum.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFPegasusForConditionalGeneration for predictions without further training.


Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.36M [00:00<?, ?B/s]

In [26]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
# Checkpoint used for paraphrasing in the rest of this notebook.
model_name = 'tuner007/pegasus_paraphrase'

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
  torch_device = 'cuda'
else:
  torch_device = 'cpu'

# Tokenizer and model come from the same checkpoint; the model is moved to the chosen device.
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model = model.to(torch_device)
 
def get_response(input_text, num_return_sequences, max_length=60, temperature=1.5):
  """Paraphrase one text with the module-level `tokenizer` and `model`.

  Args:
    input_text: the text to paraphrase. It is tokenized as a one-element batch
      and truncated to `max_length` tokens.
    num_return_sequences: how many candidates to request from `model.generate`.
    max_length: token limit used both for input truncation and for the
      generated output. Defaults to 60, the value that used to be hard-coded.
    temperature: forwarded unchanged to `model.generate`. Defaults to 1.5, the
      value that used to be hard-coded.
      NOTE(review): no `do_sample` flag is passed here, so this presumably has
      no effect unless the checkpoint's own config enables sampling — confirm.

  Returns:
    A list of decoded strings with special tokens removed.
  """
  # Tokenize as a batch of one and move the tensors to the same device as the model.
  batch = tokenizer(
      [input_text],
      truncation=True,
      padding='longest',
      max_length=max_length,
      return_tensors="pt",
  ).to(torch_device)
  translated = model.generate(
      **batch,
      max_length=max_length,
      num_return_sequences=num_return_sequences,
      temperature=temperature,
  )
  return tokenizer.batch_decode(translated, skip_special_tokens=True)

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

In [27]:
# Takes the input paragraph and splits it into a list of sentences
!pip install sentence_splitter
from sentence_splitter import SentenceSplitter, split_text_into_sentences
splitter = SentenceSplitter(language='en')

Collecting sentence_splitter
  Downloading sentence_splitter-1.4-py2.py3-none-any.whl (44 kB)
[K     |████████████████████████████████| 44 kB 1.5 MB/s 
Installing collected packages: sentence-splitter
Successfully installed sentence-splitter-1.4


In [36]:
#Now let's define a function that implements all the process to a given context(sentence)
def augment(context):
  """Paraphrase `context` sentence by sentence and return one string.

  The text is split into sentences with the module-level `splitter`, each
  sentence is paraphrased with `get_response(sentence, 1)`, and the results
  are joined with single spaces in the original sentence order.

  Args:
    context: the text to paraphrase.

  Returns:
    The paraphrased text as a plain string. It is empty when the splitter
    yields no sentences.
  """
  rewritten = []
  for sentence in splitter.split(context):
    # get_response returns a list of candidates; join them into one string.
    rewritten.append(' '.join(get_response(sentence, 1)))
  # Build the result with plain string joins. The previous version went through
  # str(list) and strip(), which returned the list repr: any text containing an
  # apostrophe came back wrapped in double quotes, quotes and backslashes were
  # escaped, and real leading or trailing brackets and quotes were stripped.
  return ' '.join(rewritten)

In [39]:
# Original text at position 4 of the label-0 list, shown for comparison with its paraphrase.
seperated_data_zero[4]

'Great deal, I still use my Volcano more than any of my vapes, so I think you should grab it.'

In [40]:
# Spot-check augment() on that same label-0 text.
augment(seperated_data_zero[4])

'I think you should grab it, I use my Volcano more than any of the others.'

In [41]:
# Paraphrase every label-0 text, keeping the results in the same order as the inputs.
augmented_0_data = [augment(text) for text in seperated_data_zero]

In [42]:
# Paraphrase every label-1 text, keeping the results in the same order as the inputs.
augmented_1_data = [augment(text) for text in seperated_data_one]

In [43]:
# Wrap each list of paraphrased texts in a single-column ("text") DataFrame.
df_1_augmented = pd.DataFrame({'text': augmented_1_data})
df_0_augmented = pd.DataFrame({'text': augmented_0_data})

In [46]:
# Give every augmented text the label of the class it was generated from.
df_1_augmented = df_1_augmented.assign(Y=1)
df_0_augmented = df_0_augmented.assign(Y=0)

In [None]:
# Inspect the label-1 augmented frame after the Y column has been added.
df_1_augmented

In [48]:
# Put the label first so the column order is Y, text (the order shown in the train.csv preview).
column_order = ['Y', 'text']
df_1_augmented = df_1_augmented.loc[:, column_order]
df_0_augmented = df_0_augmented.loc[:, column_order]

In [49]:
# Write each augmented frame to its own CSV file in the working directory.
# to_csv keeps its default of writing the row index as the first column.
for frame, csv_path in (
    (df_1_augmented, 'df_1_augmented.csv'),
    (df_0_augmented, 'df_0_augmented.csv'),
):
  frame.to_csv(csv_path)