In [1]:
import pandas as pd
import os
from parrot import Parrot
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [2]:
# NOTE(review): hard-coded absolute path to the project root. Every relative
# "./processed_data/..." path read or written in this notebook resolves against it.
os.chdir("/home/573/rh2942/WASSA-2023-EMP")

In [3]:
train_dev = pd.read_csv("./processed_data/preprocessed_train_dev.csv", index_col=0)

In [4]:
train_dev.columns

Index(['conversation_id', 'article_id', 'essay', 'speaker_id', 'gender',
       'education', 'race', 'age', 'income', 'speaker_number', 'split',
       'essay_id', 'empathy', 'distress', 'emotion',
       'personality_conscientiousness', 'personality_openess',
       'personality_extraversion', 'personality_agreeableness',
       'personality_stability', 'iri_perspective_taking',
       'iri_personal_distress', 'iri_fantasy', 'iri_empathatic_concern',
       'article', 'article_id_text', 'demographic', 'demographic_essay'],
      dtype='object')

In [5]:
# Keep only the chosen columns.
# The explicit .copy() makes train_dev an independent frame, so the later in-place
# writes (train_dev.loc[index, "article"] = ...) do not act on a column-subset
# selection and do not raise pandas' SettingWithCopyWarning.
train_dev = train_dev[['essay', 'empathy', 'distress', 'emotion',
       'personality_conscientiousness', 'personality_openess',
       'personality_extraversion', 'personality_agreeableness',
       'personality_stability', 'iri_perspective_taking',
       'iri_personal_distress', 'iri_fantasy', 'iri_empathatic_concern', 'article', 'demographic', 'demographic_essay']].copy()

In [6]:
train_dev.sample(2)

Unnamed: 0,essay,empathy,distress,emotion,personality_conscientiousness,personality_openess,personality_extraversion,personality_agreeableness,personality_stability,iri_perspective_taking,iri_personal_distress,iri_fantasy,iri_empathatic_concern,article,demographic,demographic_essay
642,"After reading the article, i can't help but fe...",1.0,1.625,Sadness,6.75,6.75,6.75,6.75,7.0,4.643,2.0715,4.143,4.643,"Celebrity jeweler's surrogate son, second man ...",I am male. My age is 29.0 years. My education ...,I am male. My age is 29.0 years. My education ...
768,I just read a story about an apparent murder s...,4.166667,1.25,Neutral,6.0,6.5,1.0,6.0,5.5,5.0,2.286,3.286,4.143,"Wife Who Died Alongside Husband, Children in M...",I am male. My age is 29.0 years. My education ...,I am male. My age is 29.0 years. My education ...


# Replacing each long article with its summary

In [1]:
import transformers as trf

In [81]:
checkpoint = "philschmid/flan-t5-base-samsum"
# model_max_length is counted in tokens, not characters. 25K is a deliberately
# generous ceiling: the longest article is ~21K characters.
tokeniser = trf.AutoTokenizer.from_pretrained(checkpoint, model_max_length=25000)

# Use the first GPU when one is visible, otherwise fall back to the CPU
# (same availability check as the other model-loading cells in this notebook).
summariser = trf.pipeline(
    task="summarization",
    model=checkpoint,
    tokenizer=tokeniser,
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)

In [None]:
# Several essays respond to the same article, so summarise each distinct article
# once and map the result back onto the rows instead of calling the model per row.
article_summaries = {
    # [0]: the pipeline returns a one-item list for a single input text
    article: summariser(article)[0]["summary_text"]
    for article in train_dev["article"].unique()
}
train_dev["article"] = train_dev["article"].map(article_summaries)

# Paraphrase

In [95]:
paraphrased = train_dev.copy()

In [96]:
paraphrased.sample(2)

Unnamed: 0,essay,empathy,distress,emotion,personality_conscientiousness,personality_openess,personality_extraversion,personality_agreeableness,personality_stability,iri_perspective_taking,iri_personal_distress,iri_fantasy,iri_empathatic_concern,article,demographic,demographic_essay
145,I'm mostly bothered by this story because of h...,1.5,3.5,Disgust/Sadness,2.5,2.0,1.5,2.0,1.0,3.286,4.429,2.714,2.571,"The Sungai Putri, meaning River of the Princes...",I am male. My age is 41.0 years. My education ...,I am male. My age is 41.0 years. My education ...
801,So I just read an article about how a father a...,5.0,4.75,Anger,6.0,6.0,5.5,6.5,3.0,4.857,3.143,2.571,4.857,A father and son have died after falling off a...,I am male. My age is 25.0 years. My education ...,I am male. My age is 25.0 years. My education ...


In [97]:
# Run the paraphraser on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The seq2seq model and its tokenizer are loaded from the same checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base").to(device)
tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")

In [98]:
def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=1,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7
):
    """Paraphrase ``question`` with the module-level ``tokenizer`` and ``model``.

    The text is prefixed with ``paraphrase: `` before encoding, and the
    generated sequences are decoded with special tokens stripped.

    Args:
        question: Text to paraphrase.
        num_beams: Forwarded to ``model.generate``.
        num_beam_groups: Forwarded to ``model.generate``.
        num_return_sequences: Number of sequences requested from ``model.generate``.
        repetition_penalty: Forwarded to ``model.generate``.
        diversity_penalty: Forwarded to ``model.generate``.
        no_repeat_ngram_size: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
            NOTE(review): no sampling flag is passed alongside it — confirm
            that it has any effect with this beam configuration.

    Returns:
        A list of decoded strings. An empty ``question`` is returned unchanged,
        repeated ``num_return_sequences`` times, without calling the model.
    """
    # The generation limit is the character length of the input text, so an
    # empty text would leave a zero-length budget: there is nothing to rewrite.
    max_length = len(question)
    if max_length == 0:
        return [question] * num_return_sequences

    prompt = f'paraphrase: {question}'
    input_ids = tokenizer(
        prompt,
        return_tensors="pt", padding="longest",
        # Budget the encoder input on the whole prompt, so the task prefix
        # cannot use up the truncation allowance of a short text.
        max_length=len(prompt),
        truncation=True,
    ).input_ids

    outputs = model.generate(
        input_ids.to(device), temperature=temperature, repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams, num_beam_groups=num_beam_groups,
        max_length=max_length, diversity_penalty=diversity_penalty
    )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [99]:
# Rewrite the three free-text fields of every row in place.
# paraphrase() returns a list; [0] takes its one and only item.
for index in paraphrased.index:
    for column in ("demographic", "essay", "article"):
        paraphrased.loc[index, column] = paraphrase(paraphrased.loc[index, column])[0]



In [100]:
# Rebuild the combined field from the paraphrased parts so it matches the rewritten text.
# NOTE(review): the two strings are joined with no separator — confirm this matches
# how the original demographic_essay column was built during preprocessing.
paraphrased["demographic_essay"] = paraphrased["demographic"] + paraphrased["essay"]

In [101]:
paraphrased

Unnamed: 0,essay,empathy,distress,emotion,personality_conscientiousness,personality_openess,personality_extraversion,personality_agreeableness,personality_stability,iri_perspective_taking,iri_personal_distress,iri_fantasy,iri_empathatic_concern,article,demographic,demographic_essay
0,I am filled with sadness seeing people living ...,6.833333,6.625,Hope/Sadness,7.00,5.50,1.00,6.50,6.0,4.857,2.0000,3.429,5.000,The absence of food in the mountains is causin...,My profile details a black or African-American...,My profile details a black or African-American...
1,Why isn't there more effort to assist the peop...,5.833333,6.000,Anger,5.50,5.00,2.00,5.50,4.5,3.429,2.8570,2.857,2.714,The absence of food in the mountains is causin...,"I am a male with an age of 32.0 years, accompa...","I am a male with an age of 32.0 years, accompa..."
2,"After reading the article, I am filled with sa...",1.000000,1.375,Sadness,6.75,6.75,6.75,6.75,7.0,4.643,2.0715,4.143,4.643,The absence of food in the mountains is causin...,"I am a male with 290 years of age, four year b...","I am a male with 290 years of age, four year b..."
3,"Despite having an amazing story to tell, someo...",6.166667,6.625,Sadness,6.00,6.00,5.00,4.50,3.5,5.000,4.1430,4.857,5.000,"The Miami Marlins pitcher, Jose Fernández, who...","I am a female, two-year associate degree, whit...","I am a female, two-year associate degree, whit..."
4,It seems as though an all-around kind and gene...,6.833333,1.000,Neutral,7.00,5.50,1.00,6.50,6.0,4.857,2.0000,3.429,5.000,"The Miami Marlins pitcher, Jose Fernández, who...","I am a male with an age of 37.0 years, four ye...","I am a male with an age of 37.0 years, four ye..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
982,I'm feeling apprehensive about the individuals...,1.000000,1.875,Fear/Sadness,6.75,6.75,6.75,6.75,7.0,4.643,2.0715,4.143,4.643,The Portugal-bound train that derailed in Spai...,"My name is Male, I am 29.0 years old and I hav...","My name is Male, I am 29.0 years old and I hav..."
983,Despite my best efforts to buy food with purpo...,3.500000,3.625,Sadness,5.00,3.00,5.00,4.00,3.5,2.714,3.0000,3.143,3.286,The paper published in Plos One highlights the...,"As a female with XSW degrees and White skin, I...","As a female with XSW degrees and White skin, I..."
984,I wish the article had provided more informati...,4.000000,4.375,Neutral,5.00,3.00,5.00,4.00,3.5,2.714,3.0000,3.143,3.286,The police have arrived at the Azusa location ...,"My background is White, my age is 21.0 years, ...","My background is White, my age is 21.0 years, ..."
985,This is a truly shocking story. I find it asto...,6.000000,6.000,Fear,6.00,6.00,6.00,6.00,6.0,4.000,2.2860,3.571,3.714,A recent report has revealed that providers wh...,"I am a female, 27.0 years old, with an educati...","I am a female, 27.0 years old, with an educati..."


In [102]:
# Stack the original rows first, then their paraphrases. ignore_index=True renumbers
# the rows 0..N-1, so the paraphrase of original row i sits at row i + len(train_dev).
train_dev_paraphrased = pd.concat([train_dev, paraphrased], axis=0, ignore_index=True)

In [103]:
train_dev_paraphrased.to_csv("./processed_data/train_dev_paraphrased.csv")

## Separate the train rows so that dev performance can be checked on its own

In [13]:
# train_dev_paraphrased = pd.read_csv("./processed_data/train_dev_paraphrased.csv", index_col=0, header=0)

In [104]:
# Rows 0-778 of the original (non-paraphrased) half.
# NOTE(review): presumably 779 is the number of train rows, with dev rows following — confirm.
train = train_dev_paraphrased.iloc[0:779,:]

In [108]:
# The paraphrased half starts at row 987 (train_dev has 987 rows), so 987:1766
# selects the paraphrased counterparts of rows 0:779 (987 + 779 = 1766).
train_paraphrased = train_dev_paraphrased.iloc[987:1766,:]

In [109]:
train_train_paraphrased = pd.concat([train, train_paraphrased], ignore_index=True, axis=0)

In [110]:
train_train_paraphrased.to_csv("./processed_data/train_train_paraphrased.csv")

In [111]:
# Rows 779:987 lie in the first (non-paraphrased) half of the concatenation, so despite
# the variable name these are not paraphrases; the output file name suggests they are
# the dev rows with summarised articles — TODO confirm.
dev_paraphrased = train_dev_paraphrased.iloc[779:987,:]
dev_paraphrased.to_csv("./processed_data/dev_summarised.csv")

## Check lengths

In [52]:
from datasets import Dataset

In [54]:
tokeniser = trf.AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenise(sentence):
    """Tokenise the ``demographic_essay`` field of ``sentence`` with truncation enabled.

    No padding is requested here: padding="longest" can be deferred to do
    dynamic padding.
    """
    texts = sentence["demographic_essay"]
    return tokeniser(texts, truncation=True)


# Build a Hugging Face dataset from the frame, then tokenise it in batches.
hugging_dataset = Dataset.from_pandas(train_dev, preserve_index=False)
tokenised_hugging_dataset = hugging_dataset.map(tokenise, batched=True)

Map:   0%|          | 0/987 [00:00<?, ? examples/s]

In [58]:
# Checking length after tokenisation.
# The input_ids column is fetched once; indexing it inside a per-row loop would
# re-fetch the whole column on every iteration.
length = [len(input_ids) for input_ids in tokenised_hugging_dataset['input_ids']]

print(f"Max length: {max(length)}")

Max lengths: 236


In [61]:
train_dev["demographic_essay"].str.len().max()

956

In [63]:
train_dev["article"].str.len().min()

284

In [10]:
train_dev["article"].str.len().max()

20047

In [8]:
train_dev["demographic_essay"].str.len().max()

956

In [14]:
train_dev_paraphrased["article"].str.len().max()

987

# Parrot

In [20]:
paraphrased.loc[429,'essay']

'I am not surprised that the nations of Africa are having difficulty agreeing on conservation efforts for elephants. There is much disharmony politically between them. Africa is home to a number of endangered and threatened animals. I think the global community has to be involved in these conservation efforts.'

In [22]:
parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5", use_gpu=True)

In [29]:
parrot.augment(input_phrase=paraphrased.loc[429,'essay'], use_gpu=True, do_diverse=True, max_return_phrases=5)

[('I am not surprised that the nations of Africa are having difficulty agreeing on conservation efforts for elephants. There is much disharmony politically between them. Africa is home to a number of endangered and threatened animals. I think the global community has to be involved in these conservation efforts.',
  0)]

Current problem: the paraphrased sentence is pretty much the same as the original

# Pegasus

In [None]:
# Load the Pegasus paraphrasing checkpoint, on the GPU when one is available.
# NOTE(review): this rebinds the module-level names `tokenizer` and `model`, which
# paraphrase() also reads; once this cell has run, paraphrase() will use the Pegasus
# objects until the earlier T5 loading cell is re-run.
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
model_name = 'tuner007/pegasus_paraphrase'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

In [32]:
def get_response(input_text,num_return_sequences,num_beams):
    """Generate paraphrases of ``input_text`` with the module-level ``tokenizer`` and ``model``.

    Args:
        input_text: Text to paraphrase; it is encoded as a single-item batch.
        num_return_sequences: Number of sequences requested from ``model.generate``.
        num_beams: Beam count forwarded to ``model.generate``.

    Returns:
        The decoded sequences as a list of strings, special tokens removed.
    """
    token_limit = 60  # applied to both the encoded input and the generated output
    encoded = tokenizer(
        [input_text],
        truncation=True,
        padding='longest',
        max_length=token_limit,
        return_tensors="pt",
    ).to(torch_device)
    generated = model.generate(
        **encoded,
        max_length=token_limit,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        temperature=1.5,
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

In [33]:
get_response(input_text = paraphrased.loc[429,'essay'], num_return_sequences=1 ,num_beams=10)

["I am not surprised that the nations of Africa don't agree on elephants."]

Current problem: only the first sentence is paraphrased