In [1]:
import pandas as pd
import numpy as np
import openai
import time
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

In [2]:
import os

from utils.extra import num_tokens_from_messages
from evaluation import pearsonr

In [3]:
# Read the OpenAI API key from a local text file.
# .strip() removes the trailing newline (and any surrounding whitespace) that
# text editors usually leave at the end of the file; without it the newline
# becomes part of the key that is sent with every request.
with open("./openai-api.txt", 'r') as f:
    openai.api_key = f.read().strip()

In [4]:
# Chat model identifiers; see https://platform.openai.com/docs/models/gpt-3-5
# `model` is tried first by the annotation loops; `model_16k` is the one they
# fall back to when the call with `model` raises.
model_16k = "gpt-3.5-turbo-16k-0613"
model = "gpt-3.5-turbo-0613"

In [5]:
@retry(wait=wait_random_exponential(min=5, max=60), stop=stop_after_attempt(6))
def empath_gpt(history, model=model):
    """Send a chat history to the chat-completion API and return the reply text.

    The call is wrapped in a tenacity ``retry`` that stops after 6 attempts
    and waits according to ``wait_random_exponential(min=5, max=60)``
    between them.

    Args:
        history: list of chat messages (dicts with "role" and "content")
            passed to the API as ``messages``.
        model: model identifier to query; defaults to the notebook-level
            ``model`` as it was when this function was defined.

    Returns:
        The ``content`` of the message in the first returned choice.
    """
    completion = openai.ChatCompletion.create(
        messages=history,
        model=model,
        temperature=0,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content']

In [6]:
# Load the preprocessed WS23 training split; the first file column is read as
# the index and then discarded in favour of a fresh 0..n-1 index.
train = (
    pd.read_csv('./data/PREPROCESSED-WS23-train.tsv', sep='\t', index_col=0)
    .reset_index(drop=True)
)

# Annotation by GPT

In [7]:
# Few-shot prompt: one system instruction followed by (essay, score) example
# pairs taken from the training rows listed in `seed_index`.
essay_gpt = [
    {
        "role": "system",
        "content": "You are an AI model that annotates written essays to provide an empathy score between 1.0 to 7.0 based on the definition of empathy.\n The essays were written by human participants after reading a newspaper article involving harm to individuals, groups of people, nature, etc. The essay is provided to you within triple backticks. Your response must contain one and only empathy score."
    }
]
seed_index = [0, 7, 23]

for index in seed_index:
    # Each example is a user turn holding the essay and an assistant turn
    # holding its gold empathy score formatted to one decimal place.
    essay_gpt.extend([
        {
            "role": "user",
            "content": f"Essay: ```{train.loc[index, 'demographic_essay']}```"
        },
        {
            "role": "assistant",
            "content": f"{train.loc[index, 'empathy']:.1f}"
        },
    ])

In [8]:
essay_gpt

[{'role': 'system',
  'content': 'You are an AI model that annotates written essays to provide an empathy score between 1.0 to 7.0 based on the definition of empathy.\n The essays were written by human participants after reading a newspaper article involving harm to individuals, groups of people, nature, etc. The essay is provided to you within tripple backticks. Your response must containe one and only empathy score.'},
 {'role': 'user',
  'content': "Essay: ```I am a 37-year-old male of the African American race. I have a four-year bachelor's degree and earn 40000 USD. It breaks my heart to see people living in those conditions. I hope that all the aid that was sent to the island makes it to the people who need it the most. I do not know what I would do it that was my family and I. I would hope that I would do my best, but I can see how depressing and hopeless you could feel having your whole life changed because of a storm and not knowing where your next meal is coming from.```"},
 

## Train

In [47]:
# Work on a copy of `train` so its original empathy labels stay available
# (the annotation loop copies them over for the seed rows).
gpt_anno = train.copy()
gpt_anno.loc[:, 'empathy'] = np.nan # new annotations will be placed here

In [48]:
# Rows whose index is below this value are skipped by the annotation loop;
# 0 means every row is processed.
resume_from = 0

In [None]:
# Annotate every training essay with a GPT empathy score, checkpointing the
# whole `gpt_anno` frame to disk as the loop progresses.
# NOTE: each save writes the full frame, so resuming with `resume_from` > 0
# only keeps earlier annotations if `gpt_anno` still holds them in memory.
for row in train.itertuples():
    # Seed rows are used as few-shot examples in the prompt, so they keep
    # their original score instead of being sent to the model.
    if row.Index in seed_index:
        gpt_anno.loc[row.Index, 'empathy'] = train.loc[row.Index, 'empathy']
        continue

    if row.Index < resume_from:
        continue

    # Shallow copy is enough: the shared prompt is only appended to.
    seed = essay_gpt.copy()
    print(f"Working on row index: {row.Index}")
    # add a new user essay to predict
    seed.append({
        "role": "user",
        "content": f"Essay: ```{train.loc[row.Index, 'demographic_essay']}```"
    })

    # Generate a response from the model
    try:
        response = empath_gpt(history=seed, model=model)
    except Exception as e:
        print(e)
        print("\nFailed but we're trying again in 60 seconds with a different model...\n")
        time.sleep(60)
        response = empath_gpt(history=seed, model=model_16k)

    # The model sometimes answers with a sentence (e.g. a refusal) instead of
    # a number; record NaN and report it rather than aborting the whole run.
    try:
        gpt_anno.loc[row.Index, 'empathy'] = float(response.strip())
    except ValueError:
        print(f"Could not parse an empathy score for row index {row.Index}: {response!r}")
        gpt_anno.loc[row.Index, 'empathy'] = np.nan

    # save a checkpoint every 10th row index
    if row.Index % 10 == 0:
        gpt_anno.to_csv('./data/WS23-train-gpt.tsv', sep='\t', index=False)

# final save once all rows are done
gpt_anno.to_csv('./data/WS23-train-gpt.tsv', sep='\t', index=False)

## Other sets

In [20]:
# Choose the dataset to annotate and the file the annotations are saved to.
# The commented-out lines are alternative datasets; the reset_index line at
# the end applies to whichever `data` assignment is active.
# data = pd.read_csv('./data/PREPROCESSED-WS23-dev.tsv', sep='\t', index_col=0)
# data = pd.read_csv('./data/PREPROCESSED-WS22-train.tsv', sep='\t', index_col=0)
# save_as = './data/WS22-train-gpt.tsv'

data = pd.read_csv('./data/PREPROCESSED-WS22-dev.tsv', sep='\t', index_col=0)
save_as = './data/WS22-dev-gpt.tsv'

# Replace the index read from the file with a fresh 0..n-1 index.
data = data.reset_index(drop=True)

In [21]:
# Work on a copy of `data`; its empathy column is blanked (or created) so it
# holds only the GPT annotations written by the loop.
gpt_anno = data.copy()
gpt_anno.loc[:, 'empathy'] = np.nan # new annotations will be placed here

In [22]:
# Rows whose index is below this value are skipped by the annotation loop;
# 0 means every row is processed.
resume_from = 0

In [None]:
# Annotate every essay in `data` with a GPT empathy score, checkpointing the
# whole `gpt_anno` frame to `save_as` as the loop progresses.
# NOTE: each save writes the full frame, so resuming with `resume_from` > 0
# only keeps earlier annotations if `gpt_anno` still holds them in memory.
for row in data.itertuples():

    if row.Index < resume_from:
        continue

    # Shallow copy is enough: the shared prompt is only appended to.
    seed = essay_gpt.copy()
    print(f"Working on row index: {row.Index}")
    # add a new user essay to predict
    seed.append({
        "role": "user",
        "content": f"Essay: ```{data.loc[row.Index, 'demographic_essay']}```"
    })

    # Generate a response from the model
    try:
        response = empath_gpt(history=seed, model=model)
    except Exception as e:
        print(e)
        print("\nFailed but we're trying again in 60 seconds with a different model...\n")
        time.sleep(60)
        response = empath_gpt(history=seed, model=model_16k)

    # The model sometimes answers with a sentence (e.g. a refusal) instead of
    # a number; record NaN and report it rather than aborting the whole run.
    try:
        gpt_anno.loc[row.Index, 'empathy'] = float(response.strip())
    except ValueError:
        print(f"Could not parse an empathy score for row index {row.Index}: {response!r}")
        gpt_anno.loc[row.Index, 'empathy'] = np.nan

    # save a checkpoint every 10th row index
    if row.Index % 10 == 0:
        gpt_anno.to_csv(save_as, sep='\t', index=False)

# final save once all rows are done
gpt_anno.to_csv(save_as, sep='\t', index=False)

In [26]:
# Average cost of one annotation.
# NOTE(review): the numerator looks like the sum of two billing-balance
# differences and the denominator like three dataset sizes — TODO confirm
# the source and units of these numbers.
spend_delta = (1.86 - 1.67) + (3.86 - 1.86)
n_annotations = 208 + 1857 + 270
cost_per_anno = spend_delta / n_annotations
cost_per_anno

0.0009379014989293362

## Unable to provide scores
WS23-train, row 22: "I'm sorry, but I'm unable to provide an empathy score for this essay as it contains offensive and insensitive language."

WS22-train, rows 1272 and 1387: 'I apologize, but the essay you provided seems to be a mix of unrelated sentences and does not provide any coherent thoughts or feelings regarding the article. Could you please provide a new essay that clearly expresses your thoughts and feelings about the article?'

In [13]:
# WS22
data.loc[1272, 'demographic_essay']

'I am a 32-year-old male of Black or African American race. I have completed some college but no degree, and I earn 50000 USD. The driven into exile last year, and he is now based in the southern port city of Aden. Saudi Arabia’s Sunni Muslim monarchy entered the war in large part because of concerns of Iranian influence in the region. That Shiite theocracy is widely perceived to be backing the Shiite  rebels. The airstrikes in  come on the same day i rejected a new U.N. peace proposal that would have sidelined him and given the prominent roles in a new government. More than 10,000 people have died in the conflict, many of them civilians who were killed by Saudi-led coalition bombings, according to the United Nations. that you have read the first article, please write a message to a friend or friends about your feelings and thoughts regarding the article you just read. This could be a private message to a friendS.'

In [16]:
# WS22
data.loc[1387, 'demographic_essay']

"I am a 30-year-old male of the Black or African American race. I have a four-year bachelor's degree and my income is 100 USD. very bad act abut killing that happened Now that you have read the first article, please write a message to a friend or friends about your feelings and thoughts regarding the article you just read. This could be a private message to a friend or something you would post on social media. Please do not identify your intended friend(s) - just write your thoughts about the article as if you were communicating with them Now that you have read the first article, please write a message to a friend or friends about your feelings and thoughts regarding the article you just read. This could be a private message to a friend or something you would post on social media. Please do not identify your intended friend(s) - just write your thoughts about the article as if you were communicating with them ...."