In [None]:
import pandas as pd
import numpy as np

In [None]:
import os

In [None]:
import openai
import time
# Load the OpenAI key from a local file so it never appears in the notebook.
# strip() removes the trailing newline most editors append; left in place it
# becomes part of the key and the Authorization header is rejected.
with open("./openai-api.txt", 'r') as f:
    openai.api_key = f.read().strip()

In [None]:
def chat(prompt, model="gpt-3.5-turbo-0613", temp=1.5):
    """
    Send `prompt` as a single user message and return the text of the first reply.

    prompt: text placed in the one user message of the conversation
    model: name of the chat model to query
    temp: sampling temperature, i.e. degree of randomness of the model's output
    """
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temp,
    )
    first_choice = completion.choices[0]
    return first_choice.message["content"]

# Numeric to text

In [None]:
def num_to_text(df):
    """
    Turn each speaker's numeric demographic codes into a short first-person text via ChatGPT.

    df: DataFrame with the columns 'speaker_id', 'gender', 'education', 'race', 'age'
        and 'income'. It is not modified.

    Returns a DataFrame with one row per unique speaker_id (first occurrence kept,
    original index labels preserved) and exactly two columns: 'speaker_id' and
    'demographic'. An empty input yields an empty frame with the same two columns.

    A request that raises is retried once after a 60-second pause; if the retry
    raises as well, that exception propagates to the caller.
    """
    print('Shape before dropping duplicates:', df.shape)
    id_demographic = df.drop_duplicates(subset='speaker_id')  # returns a new frame, df is untouched
    print('Shape after dropping duplicates:', id_demographic.shape)
    print('Starting access to ChatGPT ...')

    # Texts are collected positionally (one per de-duplicated row) and attached at the end,
    # so the result is well-formed for an empty input and for an index with repeated labels.
    texts = []
    for row in id_demographic.itertuples():
        prompt_convert = f"""
        Your task is to format five numerical data (individual's gender, education level, race, age, and income) into meaningful sentences.
        The numerical data are delimited by triple backticks.
        Write from a first-person point-of-view.
        Complete the task with no more than three sentences.
        
        Use the following mapping between the number and the corresponding text:
        
        Gender:
        1 = Male
        2 = Female
        5 = Other
        
        Education level:
        1 = Less than a high school diploma
        2 = High school diploma
        3 = Technical/Vocational school
        4 = Some college but no degree
        5 = Two-year associate degree
        6 = Four-year bachelor’s degree
        7 = Postgraduate or professional degree
        
        Race:
        1 = White        
        2 = Hispanic or Latino
        3 = Black or African American
        4 = Native American or American Indian
        5 = Asian/Pacific Islander
        6 = Other
        
        Age:
        <number> = <number> years
        
        Income:
        <number> = <number> USD
        
        For example, if the input numbers are: "Gender: 1, Education level: 5, Race: 1, Age: 25, Income: 40000"
        The output can be "I am a 25-year-old male of the White race. I completed a two-year associate degree and earn 40000 USD."
        
        Input numbers: ```Gender: {row.gender}, Education level: {row.education}, Race: {row.race}, Age: {row.age}, Income: {row.income}```
        """
        print('Working on row index:', row.Index)
        try:
            text = chat(prompt_convert)
        except Exception as e:
            print(e)
            print("\nFailed but we're trying again in 60 seconds...\n")
            time.sleep(60) # normally it asks to wait for 20s
            text = chat(prompt_convert)
        texts.append(text)

    # keeping only required columns; assign() builds a fresh frame, so no SettingWithCopyWarning
    id_demographic = id_demographic[['speaker_id']].assign(demographic=texts)

    print('Here are the converted texts:')
    print(id_demographic.loc[:,'demographic'].values)

    return id_demographic

In [None]:
def id_to_demographic(filename, WS22=False, save_as=None, access_gpt=True):
    '''
    Load an essay-level TSV, drop incomplete rows and get a demographic text per speaker.

    filename: path of a tab-separated file; the literal string 'unknown' is read as NaN
    WS22: set True for WASSA22 files, which have no speaker_id column; one is created by
          numbering the unique (gender, education, race, age, income) combinations
    save_as: file stem (without extension) under ./data/ used to store / reload the
             speaker_id-to-text table
    access_gpt: if False, it will just load existing saved id_demographic file using save_as as filename

    Returns (data, id_demographic): the cleaned main frame and the id-to-text table.
    Raises ValueError when access_gpt is False but no save_as is given.
    '''
    if not access_gpt and save_as is None:
        raise ValueError('save_as is required when access_gpt=False (it names the saved file to load)')

    data = pd.read_csv(filename, sep='\t', na_values='unknown')
    print('Columns having NaN values: ', data.columns[data.isna().any()].tolist())
    print('Initial data shape:', data.shape)
    data = data.dropna()
    print()
    print('After dropping NA values, data shape:', data.shape)
    print('Columns having NaN values: ', data.columns[data.isna().any()].tolist())
    print('Columns having object datatypes: ')
    print(data.dtypes[data.dtypes=='object'])

    assert not data.isna().any().any(), 'NA values remain after dropna'

    # WASSA22 has no speaker_id
    if WS22:
        # assign() returns a new frame, so no column is set on the dropna() result
        data = data.assign(
            speaker_id=data.groupby(['gender', 'education', 'race', 'age', 'income']).ngroup()
        )

    if access_gpt:
        print()
        id_demographic = num_to_text(data)
    else:
        saved_tsv = './data/' + save_as + '.tsv'
        legacy_csv = './data/' + save_as + '.csv'
        # This function always writes .tsv, so prefer it. Older WS22 tables were
        # unfortunately saved as comma-separated .csv; fall back to those when no .tsv exists.
        if WS22 and not os.path.exists(saved_tsv) and os.path.exists(legacy_csv):
            id_demographic = pd.read_csv(legacy_csv, index_col=0)
        else:
            id_demographic = pd.read_csv(saved_tsv, sep='\t', index_col=0)

    if access_gpt and (save_as is not None):
        id_demographic.to_csv('./data/' + save_as+'.tsv', sep='\t')
        print(f'\nSaved as {save_as}.tsv')
    return data, id_demographic

In [None]:
def map_id_to_main(id_demographic_df, main_df, save_as):
    """
    Attach each speaker's demographic text to the main frame.

    id_demographic_df: table with 'speaker_id' and 'demographic' columns
    main_df: frame with 'speaker_id' and 'essay' columns
    save_as: file stem under ./data/ for a TSV copy of the result, or None to skip saving

    Returns a copy of main_df with two extra columns: 'demographic' and
    'demographic_essay' (demographic text, a space, then the essay). Neither input is modified.
    """
    # speaker_id becomes the index so each id can be looked up directly
    lookup = id_demographic_df.set_index('speaker_id')
    main = main_df.copy()

    main['demographic'] = main['speaker_id'].apply(lambda speaker: lookup.loc[speaker, 'demographic'])
    main['demographic_essay'] = main['demographic'] + ' ' + main['essay']

    if save_as is None:
        return main

    main.to_csv('./data/' + save_as + '.tsv', sep='\t')
    print(f'Saved it as: {save_as}.tsv')
    return main

## WASSA23

In [None]:
train_file_ws23 = './data/WASSA23/WASSA23_essay_level_with_labels_train.tsv'
dev_file_ws23 = './data/WASSA23/WASSA23_essay_level_dev.tsv'
test_file_ws23 = './data/WASSA23/WASSA23_essay_level_test.tsv'

In [None]:
train_ws23, id_demographic_train_ws23 = id_to_demographic(
    filename=train_file_ws23,
    save_as='WS23-train-id_demographic-text',
    access_gpt=True
)

In [None]:
train_ws23 = map_id_to_main(id_demographic_df=id_demographic_train_ws23, main_df=train_ws23, save_as=None)

In [None]:
dev_ws23, id_demographic_dev_ws23 = id_to_demographic(
    filename=dev_file_ws23,
    save_as='WS23-dev-id_demographic-text',
    access_gpt=True
)

In [None]:
dev_ws23 = map_id_to_main(id_demographic_df=id_demographic_dev_ws23, main_df=dev_ws23, save_as=None)

In [None]:
test_ws23, id_demographic_test_ws23 = id_to_demographic(
    filename=test_file_ws23,
    save_as='WS23-test-id_demographic-text',
    access_gpt=True
)

In [None]:
test_ws23 = map_id_to_main(id_demographic_df=id_demographic_test_ws23, main_df=test_ws23, save_as=None)

## WASSA22

In [None]:
train_file = './data/WASSA22/messages_train_ready_for_WS.tsv'
dev_file = './data/WASSA22/messages_dev_features_ready_for_WS_2022.tsv'
test_file = './data/WASSA22/messages_test_features_ready_for_WS_2022.tsv'

In [None]:
train, id_demographic_train = id_to_demographic(
    filename=train_file,
    WS22=True,
    save_as='WS22-train-id_demographic-text',
    access_gpt=True
)

In [None]:
train = map_id_to_main(id_demographic_df=id_demographic_train, main_df=train, save_as=None)

In [None]:
dev, id_demographic_dev = id_to_demographic(
    filename=dev_file,
    WS22=True,
    save_as='WS22-dev-id_demographic-text',
    access_gpt=True
)

In [None]:
dev = map_id_to_main(id_demographic_df=id_demographic_dev, main_df=dev, save_as=None)

In [None]:
test, id_demographic_test = id_to_demographic(
    filename=test_file,
    WS22=True,
    save_as='WS22-test-id_demographic-text',
    access_gpt=True
)

In [None]:
test = map_id_to_main(id_demographic_df=id_demographic_test, main_df=test, save_as=None)

# Article summary

In [None]:
article = pd.read_csv('./data/WASSA23/articles_adobe_AMT.csv', header=0)
article.sample(2)

In [None]:
# Summarise every article with ChatGPT and store the result in article['summary_text'].
long_index = []  # row labels whose first attempt failed and were summarised by the 16k model
for row in article.itertuples():
    prompt_summary = f"""
    Your task is to summarize given text delimited by triple backticks.
    Use at most 1000 characters.
    Do not add any additional information not contained in the input text.
    
    Input text: ```{row.text}```
    """
    # we don't need variation, so, temp=0
    try:
        summary = chat(prompt=prompt_summary, temp=0)
    except Exception as e:
        # `except Exception` (not a bare except) so Ctrl-C / kernel interrupt still stops the loop.
        # base gpt3.5: "This model's maximum context length is 4097 tokens. However, your messages resulted in 4536 tokens.", so 16K context is used
        print(e)  # show why the first attempt failed instead of hiding it
        summary = chat(prompt=prompt_summary, model="gpt-3.5-turbo-16k", temp=0)
        long_index.append(row.Index)

    article.loc[row.Index, 'summary_text'] = summary

    # progress output
    print(summary)
    print("\n")

In [None]:
article.to_csv("./data/article-summarised.csv")

In [None]:
print(f"Longer texts (index) summarised by 16k model: {long_index}")

## \# of characters

In [None]:
# Character counts of the original article texts, computed once and reused.
raw_lengths = article['text'].str.len()
print("Minimum, mean and maximum of raw articles:",
      raw_lengths.min(), raw_lengths.mean(), raw_lengths.max())

In [None]:
# Character counts of the generated summaries, computed once and reused.
summary_lengths = article['summary_text'].str.len()
print("Minimum, mean and maximum of summarised articles:",
      summary_lengths.min(), summary_lengths.mean(), summary_lengths.max())

# Combine article to the main dataset

In [None]:
article = pd.read_csv("./data/article-summarised.csv", index_col=1) #article_id as index so that I can map to it easily

In [None]:
article.sample(2)

In [None]:
article.loc[63, 'text']

In [None]:
def make_main_data(main_df, dataname, test=False):
    """
    Add the summarised article text to every essay row and write the result to
    ./data/PREPROCESSED-<dataname>.tsv.

    main_df: frame with at least 'article_id', 'demographic' and 'essay' columns; not modified
    dataname: suffix used in the output file name
    test: when False, rows whose article_id is 63 are removed first; when True no row is dropped

    Uses the notebook-level `article` frame, looked up by article id for its 'summary_text'.
    Returns None; the output is the written file.
    Raises AssertionError if any value is missing after the article text is attached.
    """
    input_data = main_df.copy() #mandatory step as dataframe is mutable

    if not test: #can't remove anything on test set
        print('Initial shape:', input_data.shape)
        print('Rows having invalid article 63:')
        is_article_63 = input_data['article_id'] == 63
        print(input_data[is_article_63].values)

        # Explicit copy: columns are added below, and doing that on a filtered
        # slice raises SettingWithCopyWarning on pandas versions before 3.
        input_data = input_data[~is_article_63].copy()
        print('\nCurrent shape after removing artile 63, if any:', input_data.shape)

    #converting article id to corresponding article texts
    input_data['article'] = input_data['article_id'].apply(lambda x: article.loc[x, 'summary_text'])

    assert not input_data.isna().any().any(), 'NA values found after attaching article text'

    input_data['demographic_essay'] = input_data['demographic'] + ' ' + input_data['essay']

    input_data.to_csv("./data/PREPROCESSED-" + dataname + ".tsv", sep='\t')

In [None]:
make_main_data(main_df=train_ws23, dataname="WS23-train")

In [None]:
make_main_data(main_df=dev_ws23, dataname="WS23-dev", test=True) # will use as test set while validation, so no need to drop

In [None]:
# Test split: nothing may be removed, so pass test=True (same as the WS22 test call);
# without it the article-63 rows would be dropped from the test set.
make_main_data(main_df=test_ws23, dataname="WS23-test", test=True)

In [None]:
make_main_data(main_df=train, dataname="WS22-train")

In [None]:
make_main_data(main_df=dev, dataname="WS22-dev", test=True) # will use as test set while validation, so no need to drop

In [None]:
make_main_data(main_df=test, dataname="WS22-test", test=True)

# Combine WS22 and WS23

## Preprocessed

### Train

In [None]:
ws22 = pd.read_csv('./data/PREPROCESSED-WS22-train.tsv', sep='\t', index_col=0)
ws23 = pd.read_csv('./data/PREPROCESSED-WS23-train.tsv', sep='\t', index_col=0)

In [None]:
ws22.head(2)

In [None]:
ws23.head(2)

In [None]:
column_ws22 = ws22.columns.tolist()

In [None]:
column_ws23 = ws23.columns.tolist()

In [None]:
common_columns = [i for i in column_ws22 if i in column_ws23]

In [None]:
common_columns

In [None]:
ws22 = ws22[common_columns]
ws23 = ws23[common_columns]

In [None]:
print(ws22.shape, ws23.shape)

In [None]:
ws = pd.concat([ws22, ws23], ignore_index=True)

In [None]:
ws.shape

In [None]:
ws.to_csv('./data/PREPROCESSED-WS22-WS23-train.tsv', sep='\t', index=False)

### Dev

In [None]:
ws22 = pd.read_csv('./data/PREPROCESSED-WS22-dev.tsv', sep='\t', index_col=0)
ws23 = pd.read_csv('./data/PREPROCESSED-WS23-dev.tsv', sep='\t', index_col=0)

In [None]:
ws22.head(2)

In [None]:
ws23.head(2)

In [None]:
column_ws22 = ws22.columns.tolist()

In [None]:
column_ws23 = ws23.columns.tolist()

In [None]:
common_columns = [i for i in column_ws22 if i in column_ws23]

In [None]:
common_columns

In [None]:
ws22 = ws22[common_columns]
ws23 = ws23[common_columns]

In [None]:
print(ws22.shape, ws23.shape)

In [None]:
ws = pd.concat([ws22, ws23], ignore_index=True)

In [None]:
ws.shape

In [None]:
ws.to_csv('./data/PREPROCESSED-WS22-WS23-dev.tsv', sep='\t', index=False)

### Dev labels

In [None]:
ws22 = pd.read_csv('./data/WASSA22/goldstandard_dev_2022.tsv', sep='\t', header=None)
ws23 = pd.read_csv('./data/WASSA23/goldstandard_dev.tsv', sep='\t', header=None)

In [None]:
ws22.head(2)

In [None]:
ws23.head(2)

In [None]:
print(ws22.shape, ws23.shape)

In [None]:
ws = pd.concat([ws22, ws23], ignore_index=True)

In [None]:
ws.shape

In [None]:
ws.to_csv('./data/goldstandard-WS22-WS23-dev.tsv', sep='\t', index=False, header=None)

## Raw

In [None]:
train_WS22 = './data/WASSA22/messages_train_ready_for_WS.tsv'
train_WS23 = './data/WASSA23/WASSA23_essay_level_with_labels_train.tsv'

In [None]:
# The paths are spelled out here instead of being read from train_WS22 / train_WS23:
# this cell rebinds those names to DataFrames, so reading the path from them would
# make a second run of the cell fail.
train_WS22 = pd.read_csv('./data/WASSA22/messages_train_ready_for_WS.tsv', sep='\t', header=0)
train_WS23 = pd.read_csv('./data/WASSA23/WASSA23_essay_level_with_labels_train.tsv', sep='\t', na_values='unknown', header=0)

In [None]:
train_WS22.dropna(inplace=True)
train_WS23.dropna(inplace=True)

In [None]:
train_WS22.select_dtypes(exclude='number').columns.tolist()

In [None]:
train_WS23.select_dtypes(exclude='number').columns.tolist()

In [None]:
column_WS22 = train_WS22.columns.tolist()

In [None]:
column_WS23 = train_WS23.columns.tolist()

In [None]:
common_columns = [i for i in column_WS22 if i in column_WS23]

In [None]:
common_columns

In [None]:
train_WS22 = train_WS22[common_columns]
train_WS23 = train_WS23[common_columns]

In [None]:
train_WS22.shape

In [None]:
train_WS23.shape

In [None]:
train_WS = pd.concat([train_WS22, train_WS23], ignore_index=True)

In [None]:
train_WS.shape

In [None]:
train_WS.to_csv('./data/essay-train-ws22-ws23.tsv', sep='\t')

## GPT-annotated

### Train

In [None]:
ws22 = pd.read_csv('./data/WS22-train-gpt.tsv', sep='\t')
ws23 = pd.read_csv('./data/WS23-train-gpt.tsv', sep='\t')

In [None]:
ws22.head(2)

In [None]:
ws23.head(2)

In [None]:
column_ws22 = ws22.columns.tolist()

In [None]:
column_ws23 = ws23.columns.tolist()

In [None]:
common_columns = [i for i in column_ws22 if i in column_ws23]

In [None]:
common_columns

In [None]:
ws22 = ws22[common_columns]
ws23 = ws23[common_columns]

In [None]:
print(ws22.shape, ws23.shape)

In [None]:
ws = pd.concat([ws22, ws23], ignore_index=True)

In [None]:
ws.shape

In [None]:
ws.dropna(inplace=True)

In [None]:
ws.shape

In [None]:
ws.to_csv('./data/WS22-WS23-train-gpt.tsv', sep='\t', index=False)

In [None]:
ws22.shape

In [None]:
ws22.dropna(inplace=True)

In [None]:
ws22.shape

In [None]:
ws22.to_csv('./data/WS22-train-gpt.tsv', sep='\t', index=False)

# Augmentation

In [None]:
data = pd.read_csv('./data/PREPROCESSED-WS22-WS23-train.tsv', sep='\t')

In [None]:
data.head()

In [None]:
# Paraphrase every training row with ChatGPT: once for the article text and once for the
# combined demographic+essay text. Results are written into `paraphrased`, a copy of `data`
# whose two text columns are blanked first.

def _chat_with_fallback(prompt):
    """Ask the default model; on any error wait 60 s and retry once with the 16k-context model."""
    try:
        return chat(prompt=prompt, temp=1)
    except Exception as e:
        print(e)
        print("\nFailed but we're trying again in 60 seconds with a different model...\n")
        time.sleep(60) # normally it asks to wait for 20s
        # base gpt3.5: "This model's maximum context length is 4097 tokens. However, your messages resulted in 4536 tokens.", so 16K context is used
        return chat(prompt=prompt, model="gpt-3.5-turbo-16k", temp=1)


# The essay-only and demographic-only columns are not paraphrased on their own, so drop them.
# errors='ignore' keeps this cell re-runnable once the columns are already gone.
data = data.drop(columns=['essay', 'demographic'], errors='ignore')
paraphrased = data.copy()
paraphrased.loc[:, ['demographic_essay', 'article']] = np.nan # paraphrased texts will be placed there

for row in data.itertuples():
    prompt = f"""
    As a data augmentation tool for NLP, your task is to paraphrase the newspaper article delimited by triple backticks.

    Do not add any additional information not contained in the input texts.

    Your response must not have any backticks or any additional symbols.
    
    Input newspaper article: ```{row.article}```
    """
    response = _chat_with_fallback(prompt)

    # demographic_essay
    prompt_essay = f"""
    In a data collection experiment for empathy detection, the study participant writes essay to describe their feeling after reading a newspaper article involving harm to individuals, groups or other entities.
    
    The participant's demographic information are also available within the essay.
    
    As a data augmentation tool for NLP, your task is to paraphrase the demographic and essay information delimited by triple backticks.

    Do not add any additional information not contained in the input texts.

    Overall, the participant expressed {row.emotion} emotion. Do not change this overall emotion of the participant's essay.

    Your response must not have any backticks or any additional symbols.
    
    Input demographic and essay: ```{row.demographic_essay}```
    """
    response_essay = _chat_with_fallback(prompt_essay)

    # store both paraphrases for this row
    paraphrased.loc[row.Index, 'demographic_essay'] = response_essay
    paraphrased.loc[row.Index, 'article'] = response

    print('Completed row index:', row.Index)
    # saving per 10 new paraphrase, so an interrupted run keeps what is already done
    if row.Index % 10 == 0:
        paraphrased.to_csv('./data/paraphrased-preprocessed-WS22-WS23-train.tsv', sep='\t', index=None)

In [None]:
paraphrased.to_csv('./data/paraphrased-preprocessed-WS22-WS23-train.tsv', sep='\t', index=None)

In [None]:
data.columns

In [None]:
paraphrased.columns

In [None]:
# Stack the original rows on top of their paraphrased counterparts (original rows first).
# NOTE(review): the name `all` shadows Python's built-in all() from here on; later cells refer to this name.
all = pd.concat([data, paraphrased], ignore_index=True)

In [None]:
all.shape

In [None]:
all.to_csv('./data/COMBINED-PREPROCESSED-PARAPHRASED-WS22-WS23-train.tsv', sep='\t', index=None)

## Changing original annotations by GPT annotations

In [None]:
ws['empathy'].shape

In [None]:
original_anno = pd.read_csv('./data/COMBINED-PREPROCESSED-PARAPHRASED-WS22-WS23-train.tsv', sep='\t')

In [None]:
original_anno.shape

In [None]:
original_anno.rename(columns={'empathy': 'wrong_empathy'}, inplace=True)

In [None]:
original_anno.columns

In [None]:
# The label list is repeated twice so both halves of original_anno get the same values, assigned by position.
# NOTE(review): this needs 2 * len(ws) == len(original_anno). `ws` is the frame built in the
# GPT-annotated section, where dropna was already applied - confirm the lengths and row order
# still line up when the notebook is run top to bottom.
original_anno['empathy'] = ws['empathy'].tolist() + ws['empathy'].tolist()

In [None]:
original_anno.shape

In [None]:
original_anno.dropna(inplace=True) #there should be two+two NA values, as those were unable to annotate

In [None]:
original_anno.shape

In [None]:
original_anno.to_csv('./data/WS22-WS23-augmented-train-gpt.tsv', sep='\t', index=None)

#### Split

In [None]:
all = pd.read_csv('./data/WS22-WS23-augmented-train-gpt.tsv', sep='\t')

In [None]:
wo_aug = all[: (len(all)//2)]

In [None]:
len(wo_aug)

In [None]:
wo_aug.to_csv('./data/WS22-WS23-sep-from-aug-train-gpt.tsv', sep='\t', index=False)

### Further split

In [None]:
wo_aug = pd.read_csv('./data/WS22-WS23-sep-from-aug-train-gpt.tsv', sep='\t')

In [None]:
v1 = wo_aug.loc[:1854, :]

In [None]:
v1.shape

In [None]:
# Fixed seed so the same 185 rows (and therefore the same 10% / 90% split) come out on every run.
ten_percent = v1.sample(n=185, random_state=42)

In [None]:
ten_percent.shape

In [None]:
ten_percent.to_csv('./data/v1-10-percent.tsv', sep='\t', index=True)

In [None]:
ninety_percent = v1.drop(ten_percent.index) #removing 10% randomly as per the paper

In [None]:
ninety_percent.shape

In [None]:
ninety_percent.to_csv('./data/v1-90-percent.tsv', sep='\t', index=False)