In [1]:
import pandas as pd
import numpy as np

In [2]:
import os
# NOTE(review): absolute, user-specific path. Every relative path used later in
# this notebook ('./dataset/...', './processed_data/...') is resolved against it.
os.chdir("/home/573/rh2942/WASSA-2023-EMP") # changing dir for evaluation file

In [3]:
raw_test = pd.read_csv('./dataset/test/WASSA23_conv_level_test.tsv', sep='\t', header=0)
raw_test.sample(3)

Unnamed: 0,conversation_id,turn_id,text,speaker_number,article_id,speaker_id,essay_id
626,1114,8,"yeah thats true, I would be so angry if I were...",2,14,77,47
282,1094,24,"I agree, this was one of the things that bothe...",2,11,64,23
855,1150,18,"I agree, I wish we could",2,16,72,74


# Combine (1) dev sample with label and (2) train and dev set

## Dev sample with corresponding output labels

In [81]:
raw_dev = pd.read_csv('./dataset/dev/WASSA23_conv_level_dev.tsv', sep='\t', header=0)
raw_dev.sample(2)

Unnamed: 0,conversation_id,turn_id,text,speaker_number,article_id,speaker_id,essay_id
2093,462,13,How do you know so much about all this? It's i...,2,18.0,68.0,961.0
1921,425,11,"School shootings are interpreted as ""oh anothe...",2,133.0,65.0,924.0


In [82]:
goldstandard_dev = pd.read_csv('./dataset/dev/goldstandard_CONV_dev.tsv', sep='\t', header=None) # no header

In [83]:
# The gold-standard file is read without a header row, so these names are an
# educated guess at what each positional column holds.
label_columns = {0: 'EmotionalPolarity', 1: 'Emotion', 2: 'Empathy'}
goldstandard_dev = goldstandard_dev.rename(columns=label_columns)

In [84]:
# Attach the gold labels to the dev turns column-wise. pd.concat(axis=1) aligns
# on the index and pads with NaN where the two frames differ, which would
# silently mislabel or drop turns, so check the alignment before joining.
assert raw_dev.index.equals(goldstandard_dev.index), (
    f"dev turns ({len(raw_dev)}) and gold labels ({len(goldstandard_dev)}) do not line up"
)
complete_dev = pd.concat([raw_dev, goldstandard_dev], axis=1)
complete_dev.sample(2)

Unnamed: 0,conversation_id,turn_id,text,speaker_number,article_id,speaker_id,essay_id,EmotionalPolarity,Emotion,Empathy
2253,26,29,Back at you!,2,171.0,74.0,525.0,0.0,0.0,1.0
1822,406,13,"I agree, but this system was not built was eve...",2,331.0,75.0,905.0,1.3333,1.6667,2.6667


## Combine with train

In [85]:
raw_train = pd.read_csv('./dataset/WASSA23_conv_level_with_labels_train.tsv', sep='\t', na_values='unknown', header=0) # raw csv file consists of 'unknown' values
raw_train.sample(2)

Unnamed: 0,conversation_id,turn_id,text,EmotionalPolarity,Emotion,Empathy,speaker_number,article_id,speaker_id,essay_id
6120,359,17,how so?,1.0,0.6667,1.0,2,44.0,24.0,858.0
2662,169,8,Oo I guess I could understand that. It's easy...,0.6667,2.0,3.0,1,228.0,49.0,168.0


In [86]:
# Restrict the training frame to the columns of the labelled dev frame,
# in the dev frame's column order.
shared_columns = complete_dev.columns.tolist()
train = raw_train[shared_columns]
train.sample(2)

In [88]:
train_dev = pd.concat([train, complete_dev], axis=0, ignore_index=True)
train_dev.sample(2)

Unnamed: 0,conversation_id,turn_id,text,speaker_number,article_id,speaker_id,essay_id,EmotionalPolarity,Emotion,Empathy
6114,359,11,http://worldpopulationreview.com/countries/mas...,2,44.0,24.0,858.0,1.6667,0.6667,1.0
10467,387,27,"That's good, hopefully everyone involved will ...",2,66.0,67.0,886.0,0.6667,1.6667,2.0


# Numerical to textual conversion

In [4]:
# Summarised article
article = pd.read_csv('./processed_data/articles_adobe_AMT_summarised.csv', header=0, index_col=0)
article.sample(2)

Unnamed: 0_level_0,text
article_id,Unnamed: 1_level_1
334,Climate change impacts have now been documente...
19,More people have drowned in the Mediterranean ...


In [5]:
def num_to_text(raw_data, articles=None):
    """Drop incomplete rows and add the article text for each ``article_id``.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Frame containing an ``article_id`` column. It is not modified.
    articles : pandas.DataFrame, optional
        Lookup table indexed by article id with a ``text`` column. When
        omitted, the notebook-level ``article`` frame is used.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows of ``raw_data`` without missing
        values, with an extra ``article`` column containing the looked-up text.

    Raises
    ------
    KeyError
        If an ``article_id`` is absent from the lookup table's index.
    AssertionError
        If any missing value is left after the lookup.
    """
    if articles is None:
        articles = article  # fall back to the notebook-level lookup table

    # dropna() returns a new frame, so the caller's data is never touched.
    complete_rows = raw_data.dropna()
    # Just checking whether any non-numeric value exists in the numeric columns.
    print(f"Existing non-numeric columns:\n {complete_rows.select_dtypes(exclude=['number']).columns.tolist()}")

    # Convert each article id to the corresponding article text.
    article_text = complete_rows['article_id'].apply(lambda article_id: articles.loc[article_id, 'text'])
    # assign() builds a fresh frame, avoiding a write into a possible view.
    input_data = complete_rows.assign(article=article_text)

    # isna() and isnull() are aliases, so one check covers both.
    assert not input_data.isna().any().any(), "missing values remain after the article lookup"

    return input_data

In [6]:
def save_preprocessed(df, dataname):
    """Run ``num_to_text`` on a copy of ``df`` and write the result to CSV.

    The file is written to ``./processed_data/CONV_preprocessed_<dataname>.csv``.
    """
    processed_df = num_to_text(df.copy())
    target_path = "".join(("./processed_data/CONV_preprocessed_", dataname, ".csv"))
    processed_df.to_csv(target_path)

In [8]:
# save_preprocessed(complete_dev, dataname="complete_dev") #dev with output label
save_preprocessed(raw_test, dataname="test")

Existing non-numeric columns:
 ['text']


# Data augmentation (Paraphrasing) -- train and dev set

In [94]:
import torch
import transformers as trf

In [91]:
# train_dev is not saved, so not passed through numerical-to-textual conversion
# NOTE(review): this rebinds train_dev to the converted frame. num_to_text reads
# 'article_id', which a later cell removes from train_dev when it keeps only the
# chosen columns, so re-running this cell after that selection will fail.
train_dev = num_to_text(train_dev)

Existing non-numeric columns:
 ['text']


In [95]:
train_dev.columns

Index(['conversation_id', 'turn_id', 'text', 'speaker_number', 'article_id',
       'speaker_id', 'essay_id', 'EmotionalPolarity', 'Emotion', 'Empathy',
       'article'],
      dtype='object')

In [97]:
# keeping chosen columns only
train_dev = train_dev[['text','EmotionalPolarity', 'Emotion', 'Empathy', 'article']]

In [98]:
train_dev.sample(2)

Unnamed: 0,text,EmotionalPolarity,Emotion,Empathy,article
2141,It was nice talking to you! Thanks for sharing...,0.0,2.0,1.3333,Dakota Fanning's mom and dad are done after al...
2260,:(. That's all I have to say on this situation...,1.0,1.6667,1.3333,As fighting continued in and around Mosul on F...


In [99]:
paraphrased = train_dev.copy()

In [100]:
# Paraphrasing model: placed on the GPU when one is available, otherwise on the CPU.
# The checkpoint name is stated once so tokenizer and model cannot drift apart.
paraphraser_checkpoint = "humarin/chatgpt_paraphraser_on_T5_base"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = trf.AutoTokenizer.from_pretrained(paraphraser_checkpoint)
model = trf.AutoModelForSeq2SeqLM.from_pretrained(paraphraser_checkpoint).to(device)

In [106]:
def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=1,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7
):
    """Paraphrase ``question`` with the notebook-level ``tokenizer`` and ``model``.

    The input is prefixed with ``paraphrase: `` before tokenisation. The number
    of characters in ``question`` is used as ``max_length`` both when tokenising
    (with truncation) and when generating. All other keyword arguments are
    forwarded unchanged to ``model.generate``.

    Returns whatever ``tokenizer.batch_decode`` yields for the generated
    sequences, with special tokens skipped.
    """
    # The length of the existing sentence (in characters) is the limit.
    length_limit = len(question)

    encoded = tokenizer(
        f'paraphrase: {question}',
        return_tensors="pt",
        padding="longest",
        max_length=length_limit,
        truncation=True,
    )

    generation_options = dict(
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams,
        num_beam_groups=num_beam_groups,
        max_length=length_limit,
        diversity_penalty=diversity_penalty,
    )
    generated = model.generate(encoded.input_ids.to(device), **generation_options)

    return tokenizer.batch_decode(generated, skip_special_tokens=True)

In [None]:
# Paraphrase every turn and its article text in place.
# Rows with the same article carry identical article text, so each distinct
# article is paraphrased once and the result reused instead of re-running the
# model for every row. This assumes paraphrase() returns the same output for
# the same input (it is called without sampling) -- TODO confirm.
article_paraphrases = {}
for index in paraphrased.index:
    text = paraphrased.loc[index, "text"]
    # Single-word turns are kept verbatim.
    if len(text.split()) > 1:
        paraphrased.loc[index, "text"] = paraphrase(text)[0] # 0-index to take the one and only paraphrased item

    article_text = paraphrased.loc[index, "article"]
    if article_text not in article_paraphrases:
        article_paraphrases[article_text] = paraphrase(article_text)[0]
    paraphrased.loc[index, "article"] = article_paraphrases[article_text]



In [None]:
paraphrased.sample(5)

In [None]:
train_dev_paraphrased = pd.concat([train_dev, paraphrased], axis=0, ignore_index=True)

In [None]:
train_dev_paraphrased.to_csv("./processed_data/CONV_train_dev_paraphrased.csv")

## Separate augmented train to check dev performance separately

In [3]:
train_dev_paraphrased = pd.read_csv("./processed_data/CONV_train_dev_paraphrased.csv", index_col=0, header=0)

In [4]:
train_dev_paraphrased.sample(3)

Unnamed: 0,text,EmotionalPolarity,Emotion,Empathy,article
14490,"Today, just like in the past, we have all gone...",2.0,3.6667,1.6667,The police killing of a black man led to chaos...
7887,Your one of the few that have a farm and lives...,1.3333,1.6667,2.3333,At least three U.S. military trainers in Jorda...
22157,Those are good intentions. They could have tak...,0.6667,1.3333,1.6667,"U.S. Attorney Channing D."" said on Friday that..."


In [5]:
# First 8776 rows of the saved frame are taken as the original (non-paraphrased) train rows.
# NOTE(review): 8776 is a hard-coded row count -- confirm it still matches the saved CSV.
train = train_dev_paraphrased.iloc[0:8776,:]

In [6]:
# Positions 11176..19951 are taken as the paraphrased train rows: per the arithmetic
# in the trailing comment they follow 8776 train rows and 2400 dev rows.
# NOTE(review): both offsets are hard-coded -- confirm against the saved CSV.
train_paraphrased = train_dev_paraphrased.iloc[11176:19952,:] #2400-dev, so train will start from 8776+2400 = 11176 to 11176+8776=19952

In [7]:
train_train_paraphrased = pd.concat([train, train_paraphrased], ignore_index=True, axis=0)

In [8]:
train_train_paraphrased.to_csv("./processed_data/CONV_train_train_paraphrased.csv")

# Check lengths

In [52]:
from datasets import Dataset

In [54]:
tokeniser = trf.AutoTokenizer.from_pretrained("bert-base-uncased")

#padding="longest" can be deferred to do dynamic padding
def tokenise(sentence):
    """Tokenise the "demographic_essay" field of ``sentence`` with truncation enabled."""
    # NOTE(review): the train_dev frame built earlier in this notebook keeps only
    # text / EmotionalPolarity / Emotion / Empathy / article, so a
    # "demographic_essay" column must come from a differently prepared train_dev
    # -- confirm which frame this cell is meant to run on.
    return tokeniser(sentence["demographic_essay"], truncation=True)

# preserve_index=False: the DataFrame index is not carried over into the dataset.
hugging_dataset = Dataset.from_pandas(train_dev, preserve_index=False)

# batched=True: tokenise receives batches of rows rather than one row at a time.
tokenised_hugging_dataset = hugging_dataset.map(tokenise, batched=True)

Map:   0%|          | 0/987 [00:00<?, ? examples/s]

In [58]:
# checking length after tokenisation

# Read the 'input_ids' column once and measure each entry, instead of
# indexing the dataset column again on every loop iteration.
length = [len(input_ids) for input_ids in tokenised_hugging_dataset['input_ids']]

# default=0 keeps the cell from raising on an empty dataset.
print(f"Max length: {max(length, default=0)}")

Max lengths: 236


In [61]:
train_dev["demographic_essay"].str.len().max()

956

In [63]:
train_dev["article"].str.len().min()

284

In [10]:
train_dev["article"].str.len().max()

20047

In [8]:
train_dev["demographic_essay"].str.len().max()

956

In [14]:
train_dev_paraphrased["article"].str.len().max()

987

# Extra

## Mapping to speaker demographic and essay

In [50]:
# Load the preprocessed essay-level frame; the first CSV column becomes the index.
essay_train_dev = pd.read_csv("./processed_data/preprocessed_train_dev.csv", header=0, index_col=0)
# Not the last expression of the cell, so this preview is not displayed.
essay_train_dev.head(2)

# first duplicate speaker_id is removed, then index set as 'speaker_id' as our target is to extract demographics of speaker
# After this line the frame holds one row per speaker_id (the first occurrence).
essay_train_dev = essay_train_dev.drop_duplicates(subset='speaker_id', keep='first' ).set_index('speaker_id')

# following speaker id was missing
# Placeholder demographic text is written for these two ids.
essay_train_dev.loc[31,'demographic'] = "I am speaker 31."
essay_train_dev.loc[26,'demographic'] = "I am speaker 26."

# Look up the demographic text of every turn's speaker; an id absent from the index raises KeyError.
# NOTE(review): raw_train is read with na_values='unknown', so speaker_id may contain NaN -- confirm such rows are handled before this lookup.
raw_train['demographic'] = raw_train['speaker_id'].apply(lambda x: essay_train_dev.loc[x, 'demographic'])

In [54]:
# Re-index the essay frame by essay_id so essays can be looked up per turn.
# NOTE(review): if essay_train_dev is still the frame reduced to one row per
# speaker_id by the demographic-mapping cell, it holds at most one essay per
# speaker at this point -- confirm the lookup is meant to run on that reduced frame.
essay_train_dev = essay_train_dev.set_index('essay_id')

# Placeholder essay text is written for these two essay ids.
essay_train_dev.loc[501,'essay'] = "I wrote essay 501"
essay_train_dev.loc[502,'essay'] = "I wrote essay 502"

# Look up the essay text for every turn; an essay_id absent from the index raises KeyError.
raw_train['essay'] = raw_train['essay_id'].apply(lambda x: essay_train_dev.loc[x, 'essay'])