In [105]:
import pandas as pd
import os
from parrot import Parrot
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [106]:
os.chdir("/home/573/rh2942/WASSA-2023-EMP")

In [107]:
train_dev = pd.read_csv("./processed_data/preprocessed_train_dev.csv", index_col=0)

In [108]:
train_dev.columns

Index(['conversation_id', 'article_id', 'essay', 'speaker_id', 'gender',
       'education', 'race', 'age', 'income', 'speaker_number', 'split',
       'essay_id', 'empathy', 'distress', 'emotion',
       'personality_conscientiousness', 'personality_openess',
       'personality_extraversion', 'personality_agreeableness',
       'personality_stability', 'iri_perspective_taking',
       'iri_personal_distress', 'iri_fantasy', 'iri_empathatic_concern',
       'article', 'article_id_text', 'demographic', 'demographic_essay'],
      dtype='object')

In [109]:
# keeping chosen columns only
train_dev = train_dev[['essay', 'empathy', 'distress', 'emotion',
       'personality_conscientiousness', 'personality_openess',
       'personality_extraversion', 'personality_agreeableness',
       'personality_stability', 'iri_perspective_taking',
       'iri_personal_distress', 'iri_fantasy', 'iri_empathatic_concern', 'demographic', 'demographic_essay']]

In [110]:
train_dev.sample(2)

Unnamed: 0,essay,empathy,distress,emotion,personality_conscientiousness,personality_openess,personality_extraversion,personality_agreeableness,personality_stability,iri_perspective_taking,iri_personal_distress,iri_fantasy,iri_empathatic_concern,demographic,demographic_essay
323,I feel so bad for these children after reading...,1.0,1.625,Sadness,6.75,6.75,6.75,6.75,7.0,4.643,2.0715,4.143,4.643,I am male. My age is 29.0 years. My education ...,I am male. My age is 29.0 years. My education ...
507,This is just disgusting and makes me really do...,5.833333,6.5,Disgust,5.5,5.0,2.0,5.5,4.5,3.429,2.857,2.857,2.714,I am male. My age is 32.0 years. My education ...,I am male. My age is 32.0 years. My education ...


In [111]:
paraphrased = train_dev.copy()

In [112]:
paraphrased.sample(2)

Unnamed: 0,essay,empathy,distress,emotion,personality_conscientiousness,personality_openess,personality_extraversion,personality_agreeableness,personality_stability,iri_perspective_taking,iri_personal_distress,iri_fantasy,iri_empathatic_concern,demographic,demographic_essay
772,I just read an article about lead levels in wa...,5.333333,4.75,Neutral,6.5,5.5,5.0,6.5,6.5,3.714,2.286,3.286,4.571,I am male. My age is 33.0 years. My education ...,I am male. My age is 33.0 years. My education ...
323,I feel so bad for these children after reading...,1.0,1.625,Sadness,6.75,6.75,6.75,6.75,7.0,4.643,2.0715,4.143,4.643,I am male. My age is 29.0 years. My education ...,I am male. My age is 29.0 years. My education ...


In [113]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base").to(device)

In [114]:
def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=1,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7
):
    max_length=len(question) #length of existing sentence is the limit
    input_ids = tokenizer(
        f'paraphrase: {question}',
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    ).input_ids
    
    outputs = model.generate(
        input_ids.to(device), temperature=temperature, repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams, num_beam_groups=num_beam_groups,
        max_length=max_length, diversity_penalty=diversity_penalty
    )

    res = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return res

In [115]:
for index, row in paraphrased.iterrows():
    paraphrased.loc[index, "demographic"] = paraphrase(paraphrased.loc[index, "demographic"])[0] # 0-index to take the one and only first paraphrased item
    paraphrased.loc[index, "essay"] = paraphrase(paraphrased.loc[index, "essay"])[0]



In [116]:
paraphrased["demographic_essay"] = paraphrased["demographic"] + paraphrased["essay"]

In [117]:
paraphrased

Unnamed: 0,essay,empathy,distress,emotion,personality_conscientiousness,personality_openess,personality_extraversion,personality_agreeableness,personality_stability,iri_perspective_taking,iri_personal_distress,iri_fantasy,iri_empathatic_concern,demographic,demographic_essay
0,I am filled with sadness seeing people living ...,6.833333,6.625,Hope/Sadness,7.00,5.50,1.00,6.50,6.0,4.857,2.0000,3.429,5.000,"With my age of 37.0 years, four year bachelor'...","With my age of 37.0 years, four year bachelor'..."
1,Why isn't there more effort to assist the peop...,5.833333,6.000,Anger,5.50,5.00,2.00,5.50,4.5,3.429,2.8570,2.857,2.714,"My name is a male with an age of 32.0 years, f...","My name is a male with an age of 32.0 years, f..."
2,"After reading the article, I am filled with sa...",1.000000,1.375,Sadness,6.75,6.75,6.75,6.75,7.0,4.643,2.0715,4.143,4.643,"As a male with 290 years of age, four year bac...","As a male with 290 years of age, four year bac..."
3,"Despite having an amazing story to tell, someo...",6.166667,6.625,Sadness,6.00,6.00,5.00,4.50,3.5,5.000,4.1430,4.857,5.000,"My status is female, I have a two-year associa...","My status is female, I have a two-year associa..."
4,It seems as though an all-around kind and gene...,6.833333,1.000,Neutral,7.00,5.50,1.00,6.50,6.0,4.857,2.0000,3.429,5.000,"With my age of 37.0 years, four year bachelor'...","With my age of 37.0 years, four year bachelor'..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
982,I'm feeling apprehensive about the individuals...,1.000000,1.875,Fear/Sadness,6.75,6.75,6.75,6.75,7.0,4.643,2.0715,4.143,4.643,"As a male with 290 years of age, four year bac...","As a male with 290 years of age, four year bac..."
983,Despite my best efforts to buy food with purpo...,3.500000,3.625,Sadness,5.00,3.00,5.00,4.00,3.5,2.714,3.0000,3.143,3.286,"I am a female, 21.0 years old, with 0 educatio...","I am a female, 21.0 years old, with 0 educatio..."
984,I wish the article had provided more informati...,4.000000,4.375,Neutral,5.00,3.00,5.00,4.00,3.5,2.714,3.0000,3.143,3.286,"With 21.0 years of age, a High School Diploma,...","With 21.0 years of age, a High School Diploma,..."
985,This is a truly shocking story. I find it asto...,6.000000,6.000,Fear,6.00,6.00,6.00,6.00,6.0,4.000,2.2860,3.571,3.714,"I am a female, 27.0 years old, with an educati...","I am a female, 27.0 years old, with an educati..."


In [118]:
train_dev_paraphrased = pd.concat([train_dev, paraphrased], axis=0, ignore_index=True)

In [119]:
train_dev_paraphrased.to_csv("./processed_data/train_dev_paraphrased.csv")

# Parrot

In [20]:
paraphrased.loc[429,'essay']

'I am not surprised that the nations of Africa are having difficulty agreeing on conservation efforts for elephants. There is much disharmony politically between them. Africa is home to a number of endangered and threatened animals. I think the global community has to be involved in these conservation efforts.'

In [22]:
parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5", use_gpu=True)

In [29]:
parrot.augment(input_phrase=paraphrased.loc[429,'essay'], use_gpu=True, do_diverse=True, max_return_phrases=5)

[('I am not surprised that the nations of Africa are having difficulty agreeing on conservation efforts for elephants. There is much disharmony politically between them. Africa is home to a number of endangered and threatened animals. I think the global community has to be involved in these conservation efforts.',
  0)]

Current problem: the paraphrased sentence is pretty much the same as original

# Pegasus

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
model_name = 'tuner007/pegasus_paraphrase'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

In [32]:
def get_response(input_text,num_return_sequences,num_beams):
    batch = tokenizer([input_text],truncation=True,padding='longest',max_length=60, return_tensors="pt").to(torch_device)
    translated = model.generate(**batch,max_length=60,num_beams=num_beams, num_return_sequences=num_return_sequences, temperature=1.5)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text

In [33]:
get_response(input_text = paraphrased.loc[429,'essay'], num_return_sequences=1 ,num_beams=10)

["I am not surprised that the nations of Africa don't agree on elephants."]

Current problem: only the first sentence is paraphrased