# Re-prompting GPT - V3
The results from these are listed in the FT column in the paper


### Conda environment: transformers

In [2]:
!conda env list

# conda environments:
#
base                     /Users/kylehamilton/opt/anaconda3
annotation               /Users/kylehamilton/opt/anaconda3/envs/annotation
mapping                  /Users/kylehamilton/opt/anaconda3/envs/mapping
mlflow                   /Users/kylehamilton/opt/anaconda3/envs/mlflow
nlp                      /Users/kylehamilton/opt/anaconda3/envs/nlp
pyg                      /Users/kylehamilton/opt/anaconda3/envs/pyg
si                       /Users/kylehamilton/opt/anaconda3/envs/si
torch                    /Users/kylehamilton/opt/anaconda3/envs/torch
transformers          *  /Users/kylehamilton/opt/anaconda3/envs/transformers



In [3]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter, OrderedDict, defaultdict
import features
import ast
import tiktoken
import os

from tqdm import tqdm
import backoff
import logging
import requests
import re
import utils
import importlib
importlib.reload(utils)

import GPT_V2
import GPT_V3

In [4]:
from openai import OpenAI

# The key is taken from the environment so it never appears in the notebook.
# Passing it explicitly is optional: OPENAI_API_KEY is also the client's default.
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

In [5]:
fs = list(features.f_od.keys())

In [6]:
fs

['Aspect',
 'Emphasis',
 'Figures_of_argument',
 'Figures_of_word_choice',
 'Language_of_origin',
 'Language_varieties',
 'Lexical_and_semantic_fields',
 'Modifying_clauses',
 'Modifying_phrases',
 'Mood',
 'New_words_and_changing_uses',
 'Parallelism',
 'Phrases_built_on_nouns',
 'Phrases_built_on_verbs',
 'Predication',
 'Prosody_and_punctuation',
 'Sentence_architecture',
 'Series',
 'Subject_choices',
 'Tense',
 'Tropes',
 'Verb_choices']

In [275]:
# get_single_gpt_response

importlib.reload(utils)
importlib.reload(GPT_V3)

def parseRes(x):
    """Extract element ``[1][0]`` of ``gpt.parseResponse(x)``.

    ``gpt`` is the notebook-level ``GPT_V3.GPT`` instance; it must exist in the
    kernel before this function is called.

    Args:
        x: Raw model response to parse.

    Returns:
        The value found at ``[1][0]`` of the parsed response, or ``[]`` when
        parsing or indexing fails.
    """
    try:
        return gpt.parseResponse(x)[1][0]
    # `Exception` instead of a bare `except:` so that Ctrl-C (KeyboardInterrupt)
    # and SystemExit still stop a long-running batch.
    except Exception:
        return []


def fixProperties(s,feature):
    """Map every entry of ``s`` through ``gpt.mapToProperty(entry, feature)``.

    Args:
        s: An iterable of entries, or a ``str`` holding a Python literal
            (e.g. a list read back from a CSV), which is decoded with
            ``ast.literal_eval`` first.
        feature: Forwarded unchanged to ``gpt.mapToProperty``.

    Returns:
        A new list with one mapped value per entry, in the original order.
    """
    entries = ast.literal_eval(s) if type(s) == str else s
    return [gpt.mapToProperty(entry, feature) for entry in entries]




def run(data,FEATURE,temp,version):
    """Query the model three times per row and score run-to-run agreement.

    For each of rounds 1..3 a fresh ``GPT_V3.GPT(MODEL)`` is created, every row
    of ``data`` is sent through ``get_single_gpt_response``, and two columns are
    merged back into ``data`` on ``sentence_id``:

    * ``res_<temp>_<version>_<round>``      - the raw response
    * ``property_<temp>_<version>_<round>`` - ``fixProperties(parseRes(response), feature)``

    The growing frame is written to ``output_path + version + "_" + FEATURE + ".csv"``
    after every round and once more after the consistency column is added.

    Relies on the notebook globals ``MODEL`` and ``output_path``, and on the
    notebook-level ``parseRes`` / ``fixProperties`` helpers.
    NOTE(review): those helpers use the *global* ``gpt``, not the per-round
    instance created here - confirm that is intended.

    Args:
        data: DataFrame with ``text``, ``feature_id`` and ``sentence_id`` columns.
        FEATURE: Feature name; printed and used in the output file name.
        temp: Sampling temperature; used in column names as ``str(temp)`` and
            passed to the model as ``float``.
        version: Version tag used in column and file names.

    Returns:
        ``sum(consistency) / len(data)`` where consistency is
        ``utils.calcExactAgreement`` over the three property columns of a row.
    """
    print(FEATURE)
    print("="*100)
    temp = str(temp)
    out_file = output_path+version+"_"+FEATURE+".csv"

    for i in range(1,4):
        res_col = 'res_'+temp+'_'+version+'_'+str(i)
        prop_col = 'property_'+temp+'_'+version+'_'+str(i)
        gpt = GPT_V3.GPT(MODEL)

        # One (sentence_id, response, feature) record per input row. The feature
        # is kept with the record so each response is mapped with its own
        # feature; previously the mapping used whatever `feature` the request
        # loop ended on.
        records = []
        for _, row in tqdm(data.iterrows()):
            sentence = row['text']
            feature = row['feature_id']
            sid = row['sentence_id']
            _prompt, response, _usage = gpt.get_single_gpt_response(sentence,feature,sid,float(temp))
            records.append((sid, response, feature))

        df = pd.DataFrame([(sid, response) for sid, response, _ in records],
                          columns=['sentence_id', res_col])
        properties = [fixProperties(parseRes(response), feature) for _, response, feature in records]
        # dtype=object keeps each list as a single cell value.
        df[prop_col] = pd.Series(properties, index=df.index, dtype=object)

        data = data.merge(df, how='outer',on='sentence_id')

        # Checkpoint after every round so a failure does not lose paid-for responses.
        data.to_csv(out_file,index=None)

        print(f"There were {len(gpt.errors)} errors in round {i}.")

    # get agreement
    g1, g2, g3 = ['property_'+temp+'_'+version+'_'+str(i) for i in range(1,4)]
    consistency_col = "property_"+temp+'_'+version+"_consistency"

    data[consistency_col] = data.apply(lambda x: utils.calcExactAgreement(x[g1],x[g2],x[g3]),axis=1)
    data.to_csv(out_file,index=None)

    return sum(data[consistency_col])/len(data)

In [7]:
# Fine tuned models:
# OpenAI fine-tuned model identifiers. The trailing comment on each line names
# the feature that model is paired with; the same ID -> feature pairing is
# repeated in `models_dict` below.
# FT_1, FT_B and FT_25 do not appear in `models_dict`.
# NOTE(review): there is no FT_M_5, and `models_dict` has no entry for
# Language_of_origin, Modifying_phrases or Prosody_and_punctuation - presumably
# no model was trained for them; confirm.
FT_1 = "ft:gpt-3.5-turbo-1106:personal::8jtK1ntl"
FT_B = "ft:gpt-3.5-turbo-1106:personal::8kCGsHib" # only 'perfect' examples
FT_25 = "ft:gpt-3.5-turbo-1106:personal::8kCqqZMf"
FT_Maj = "ft:gpt-3.5-turbo-1106:personal::8kHQUVJr" # aspect
FT_M_2 = "ft:gpt-3.5-turbo-1106:personal::8kJH3MlY" # emphasis
FT_M_3 = "ft:gpt-3.5-turbo-1106:personal::8kJNlyTL" # figures of argument
FT_M_4 = "ft:gpt-3.5-turbo-1106:personal::8kJHdiWj" # Figures_of_word_choice
FT_M_6 = "ft:gpt-3.5-turbo-1106:personal::8kSbYB5Z" # Language_varieties
FT_M_7 = "ft:gpt-3.5-turbo-1106:personal::8kSXEMmV" # Lexical_and_semantic_fields
FT_M_8 = "ft:gpt-3.5-turbo-1106:personal::8kSZGelp" # Modifying_clauses
# Earlier Mood model ID, superseded by the assignment on the next line.
# FT_M_9 = "ft:gpt-3.5-turbo-1106:personal::8kTpo8tH" # Mood
FT_M_9 = "ft:gpt-3.5-turbo-1106:personal::8krH6MwS" # Mood
FT_M_10 = "ft:gpt-3.5-turbo-1106:personal::8kTWojhH" # New_words_and_changing_uses
FT_M_11 = "ft:gpt-3.5-turbo-1106:personal::8kTceGAO" # Parallelism
FT_M_12 = "ft:gpt-3.5-turbo-1106:personal::8kVWQWJW" # Phrases_built_on_nouns
FT_M_13 = "ft:gpt-3.5-turbo-1106:personal::8kVFi1Q9" # Phrases_built_on_verbs
FT_M_14 = "ft:gpt-3.5-turbo-1106:personal::8kVdlK5k" # Predication
FT_M_15 = "ft:gpt-3.5-turbo-1106:personal::8kYKKQWJ" # Sentence_architecture
FT_M_16 = "ft:gpt-3.5-turbo-1106:personal::8kYKGlP0" # Series
FT_M_17 = "ft:gpt-3.5-turbo-1106:personal::8kYdJ0eq" # Subject_choices

FT_M_18 = "ft:gpt-3.5-turbo-1106:personal::8ka5hSbD" # Tense
FT_M_19 = "ft:gpt-3.5-turbo-1106:personal::8kaunpHX" # Tropes
FT_M_20 = "ft:gpt-3.5-turbo-1106:personal::8kZhtRxY" # Verb_choices

In [10]:
# Fine-tuned model ID -> feature it is paired with. Written feature-first so the
# list is easy to scan; the resulting dict is keyed by model ID, in this order.
models_dict = {
    model_id: feature_name
    for feature_name, model_id in [
        ("Aspect", "ft:gpt-3.5-turbo-1106:personal::8kHQUVJr"),
        ("Emphasis", "ft:gpt-3.5-turbo-1106:personal::8kJH3MlY"),
        ("Figures_of_argument", "ft:gpt-3.5-turbo-1106:personal::8kJNlyTL"),
        ("Figures_of_word_choice", "ft:gpt-3.5-turbo-1106:personal::8kJHdiWj"),
        ("Language_varieties", "ft:gpt-3.5-turbo-1106:personal::8kSbYB5Z"),
        ("Lexical_and_semantic_fields", "ft:gpt-3.5-turbo-1106:personal::8kSXEMmV"),
        ("Modifying_clauses", "ft:gpt-3.5-turbo-1106:personal::8kSZGelp"),
        ("Mood", "ft:gpt-3.5-turbo-1106:personal::8krH6MwS"),
        ("New_words_and_changing_uses", "ft:gpt-3.5-turbo-1106:personal::8kTWojhH"),
        ("Parallelism", "ft:gpt-3.5-turbo-1106:personal::8kTceGAO"),
        ("Phrases_built_on_nouns", "ft:gpt-3.5-turbo-1106:personal::8kVWQWJW"),
        ("Phrases_built_on_verbs", "ft:gpt-3.5-turbo-1106:personal::8kVFi1Q9"),
        ("Predication", "ft:gpt-3.5-turbo-1106:personal::8kVdlK5k"),
        ("Sentence_architecture", "ft:gpt-3.5-turbo-1106:personal::8kYKKQWJ"),
        ("Series", "ft:gpt-3.5-turbo-1106:personal::8kYKGlP0"),
        ("Subject_choices", "ft:gpt-3.5-turbo-1106:personal::8kYdJ0eq"),
        ("Tense", "ft:gpt-3.5-turbo-1106:personal::8ka5hSbD"),
        ("Tropes", "ft:gpt-3.5-turbo-1106:personal::8kaunpHX"),
        ("Verb_choices", "ft:gpt-3.5-turbo-1106:personal::8kZhtRxY"),
    ]
}

In [15]:
# estimate cost of 10 sentences
import GPT_V3
importlib.reload(utils)
importlib.reload(GPT_V3)

# Sample used for the cost estimate: the first ten rows of the Aspect file.
# The same ten sentences are later sent to every model in `models_dict`.
df = pd.read_csv("data/human_gpt_verified/Aspect.csv")
DATA = df[:10]

In [17]:
# Send the ten sample sentences to every fine-tuned model and keep whatever
# get_gpt_response returns, so token usage can be summed afterwards.
# all_responses ends up with one entry per (model, sentence) call; each entry is
# itself a sequence of responses whose element [4] carries the token usage.
#
# Side effect: FEATURE, MODEL and gpt are notebook globals and stay bound to the
# LAST entry of models_dict once the loop finishes. Cells that use `gpt` or
# `MODEL` without reassigning them will see those values.
all_responses = []
for k,v in models_dict.items():
    FEATURE = v
    MODEL = k
    gpt = GPT_V3.GPT(MODEL)

    for row in tqdm(DATA.iterrows()):
            # row is an (index, Series) pair, hence row[1][...]
            sentence = row[1]['text']
            sid = row[1]['sentence_id']
            
            # temperature 0.0; the last argument (model_version) is left empty here
            responses = gpt.get_gpt_response(sentence,FEATURE,sid,0.0,"")
            all_responses.append(responses)


10it [00:24,  2.46s/it]
10it [00:26,  2.65s/it]
10it [00:34,  3.47s/it]
10it [00:39,  3.96s/it]
10it [01:37,  9.72s/it]
10it [00:58,  5.88s/it]
10it [00:35,  3.53s/it]
10it [00:44,  4.50s/it]
10it [01:41, 10.11s/it]
10it [00:33,  3.35s/it]
10it [00:47,  4.76s/it]
10it [00:14,  1.46s/it]
10it [00:29,  2.94s/it]
10it [00:32,  3.27s/it]
10it [00:29,  2.92s/it]
10it [00:55,  5.54s/it]
10it [00:43,  4.31s/it]
10it [01:18,  7.86s/it]
10it [00:46,  4.64s/it]


In [23]:
timings = pd.read_csv("timings.csv",header=None)

In [24]:
timings

Unnamed: 0,0,1
0,00:24,2.46
1,00:26,2.65
2,00:34,3.47
3,00:39,3.96
4,01:37,9.72
5,00:58,5.88
6,00:35,3.53
7,00:44,4.5
8,01:41,10.11
9,00:33,3.35


In [44]:
all_responses[0][0][4]

CompletionUsage(completion_tokens=27, prompt_tokens=120, total_tokens=147)

In [None]:
# gpt-3.5-turbo	$0.0080 / 1K tokens;	prompt: $0.0030 / 1K tokens;	completion: $0.0060 / 1K tokens
all_responses[0][0][4].completion_tokens * 0.006/1000 + all_responses[0][0][4].prompt_tokens * 0.003/1000

In [40]:
# Total API cost of the sample run above.
# all_responses holds one entry per get_gpt_response call; each entry is a
# sequence of responses, and element [4] of a response is its usage object
# (completion_tokens / prompt_tokens).
# Prices are USD per 1K tokens: 0.006 for completion tokens, 0.003 for prompt tokens.
cost = 0
for responses in all_responses:
    for response in responses:
        cost += response[4].completion_tokens * 0.006/1000 + response[4].prompt_tokens * 0.003/1000
# The sample is 10 sentences, so x2000 extrapolates to 20,000 sentences.
# The total covers every model in models_dict, not a single model.
print("$cost for 10 sentences:",cost, "\n$cost for 20,000 sentences:", cost*2000)

$cost for 10 sentences: 0.7244309999999995 
$cost for 20,000 sentences: 1448.861999999999


In [None]:
# Experiment configuration read as globals by run() and the evaluation cells.
# FEATURE = "Aspect"
# MODEL is the model passed to GPT_V3.GPT; the trailing comment lists base-model
# names that can be used instead of a fine-tuned ID.
MODEL = FT_M_9 #"gpt-4-1106-preview, gpt-3.5-turbo-instruct, gpt-3.5-turbo-1106"
# model_version is spliced into result column names ('property'+model_version,
# 'res'+model_version+...), hence the leading underscore.
model_version = "_FT_Maj_gpt3.5"
version = "V3"
data_path = "data/clean-annotated-data/"
# Already ends with "/". Later cells build paths as output_path+"/"+..., which
# therefore contain a doubled slash.
output_path = "data/"+version+"/"+model_version+"/"

In [423]:
# for evaluating models trained on different features
# `alt` is appended to the CSV file names written by run() and read back by the
# evaluation loop. Only the LAST assignment takes effect: as written, alt is ""
# and the "_FT_Maj" line is dead. Swap or remove a line to change the suffix.
alt = "_FT_Maj"
alt = ""

In [429]:
fs = ['Mood']

In [430]:
# get_gpt_response

importlib.reload(utils)
importlib.reload(GPT_V3)
gpt = GPT_V3.GPT(MODEL)

def parseRes(x,_property):
    """Parse one yes/no model response for a given property.

    Thin wrapper around ``gpt.parseYNResponse`` (``gpt`` is the notebook-level
    ``GPT_V3.GPT`` instance) that turns a failure into an empty result, so one
    bad row does not abort a whole ``DataFrame.apply``.

    Args:
        x: Raw model response.
        _property: Property the response refers to; forwarded unchanged.

    Returns:
        Whatever ``gpt.parseYNResponse`` returns, or ``[]`` if it raises.
    """
    try:
        return gpt.parseYNResponse(x,_property)
    # The original `except():` named an EMPTY tuple of exception types, which
    # matches nothing, so the fallback below could never run.
    except Exception:
        return []


def fixProperties(s,feature):
    """Return ``[gpt.mapToProperty(item, feature) for item in s]``.

    A ``str`` argument is treated as a serialized Python literal and decoded
    with ``ast.literal_eval`` before mapping; anything else is iterated as-is.
    """
    if type(s) == str:
        s = ast.literal_eval(s)

    mapped = []
    for item in s:
        mapped += [gpt.mapToProperty(item, feature)]
    return mapped



def run(data,FEATURE,temp,version,model_version):
    """Collect per-property model responses for every row and save them to CSV.

    Only round 2 is executed (``range(2,3)``), so result columns carry the
    suffix ``_2``. For each row, ``gpt.get_gpt_response`` returns a sequence of
    responses; elements ``[1]`` and ``[2]`` of each are stored next to the
    sentence id, giving several output rows per sentence after the merge.

    Columns added to ``data``:
        ``'property'+model_version``                  - element [1] of a response
        ``'res'+model_version+'_<temp>_<version>_2'``  - element [2] of a response
        ``'property'+model_version+'_<temp>_<version>_2'`` - ``parseRes`` of the two above

    The frame is written to ``output_path+version+"_"+FEATURE+alt+".csv"`` once
    before and once after parsing. Reads the notebook globals ``MODEL``,
    ``output_path`` and ``alt``. Returns ``None``.
    """
    temp = str(temp)
    label_col = 'property'+model_version

    for i in range(2,3):
        round_suffix = model_version+'_'+temp+'_'+version+'_'+str(i)
        res_col = 'res'+round_suffix
        parsed_col = 'property'+round_suffix

        gpt = GPT_V3.GPT(MODEL)
        responses_data = []

        for _, record in tqdm(data.iterrows()):
            sid = record['sentence_id']
            answers = gpt.get_gpt_response(record['text'],record['feature_id'],sid,float(temp),model_version)
            responses_data.extend([sid, answer[1], answer[2]] for answer in answers)

        df = pd.DataFrame(responses_data, columns=['sentence_id', label_col, res_col])
        data = data.merge(df, how='outer',on='sentence_id')

        csv_path = output_path+version+"_"+FEATURE+alt+".csv"
        # Save the raw responses first so they survive a failure while parsing.
        data.to_csv(csv_path,index=None)

        data[parsed_col] = data.apply(lambda row: parseRes(row[res_col],row[label_col]),axis=1)
        data.to_csv(csv_path,index=None)

        print(f"There were {len(gpt.errors)} errors in round {i}.")

In [431]:
# ADDS THE PROPERTY NAME TO THE RESPONSE OBJECT FOR EASIER READING LATER.
def combine(prop, res):
    """Convert ``res`` with ``gpt.responseToJson`` and tag it with its property.

    The converted object gets a ``'Property'`` key set to ``prop`` and is returned.
    """
    tagged = gpt.responseToJson(res)
    tagged.update({'Property': prop}) if False else tagged.__setitem__('Property', prop)
    return tagged

def removeErrors(s):
    """Decode a serialized cell, treating parse-error markers as an empty list.

    If ``s`` contains the text ``"parse error"`` the result is ``[]``; otherwise
    ``s`` is decoded with ``ast.literal_eval``.
    """
    return ast.literal_eval("[]" if "parse error" in s else s)

# Columns that carry one value per sentence. run() leaves several rows per
# sentence, repeating these values, so the first row of each group is kept.
_per_sentence_cols = [
    'sentence_id', 'technique', 'text', 'feature_id',
    'props_a20', 'props_a21', 'props_a22', 'annotator_consistency',
    'props_gpt4_majority',
    'res_1.0_1', 'gpt_props_1.0_1', 'res_1.0_2', 'gpt_props_1.0_2',
    'res_1.0_3', 'gpt_props_1.0_3', 'gpt3.5_1.0_consistency',
    'res_0.2_1', 'gpt_props_0.2_1', 'res_0.2_2', 'gpt_props_0.2_2',
    'res_0.2_3', 'gpt_props_0.2_3', 'gpt3.5_0.2_consistency',
    'gpt3.5_0.2_majority',
    'humans isCorrect', 'gpt isCorrect', 'comments', 'ground truth',
]


def _first_row(x):
    """Return the first value of a group."""
    return x.iloc[0]


for FEATURE in fs:
    df = pd.read_csv("data/human_gpt_verified/"+FEATURE+".csv")
    df = df[df["humans isCorrect"]>=0]
    print(FEATURE, model_version, version, len(df))
    run(df,FEATURE,0.0,version,model_version)

    # Names of the columns run() produced (temperature 0.0, round 2).
    label_col = 'property'+model_version
    res_col = 'res'+model_version+'_0.0_'+version+'_2'
    parsed_col = 'property'+model_version+'_0.0_'+version+'_2'
    raw_csv = output_path+"/"+version+"_"+FEATURE+alt+".csv"
    grouped_csv = output_path+"/_"+version+"_"+FEATURE+alt+".csv"

    # Combine
    df = pd.read_csv(raw_csv)
    df[parsed_col] = df[parsed_col].apply(removeErrors)
    df[res_col] = df.apply(lambda x: combine(x[label_col],x[res_col]), axis=1)
    df['sentence_id'] = df['sentence_id'].apply(lambda x: int(x))

    # Collapse to one row per sentence: per-sentence columns keep their first
    # value, responses are gathered into lists, parsed lists are concatenated.
    agg_spec = {col: _first_row for col in _per_sentence_cols}
    agg_spec[label_col] = list
    agg_spec[res_col] = list
    agg_spec[parsed_col] = sum
    df = df.groupby(['sentence_id']).agg(agg_spec)

    df=df.drop(label_col,axis=1)
    # Round-trip through CSV, as the step-by-step cells below also do.
    df.to_csv(grouped_csv,index=None)
    df=pd.read_csv(grouped_csv)
    df = df[df["humans isCorrect"]>=0]
    df['agreement'] = df.apply(lambda x: utils.calcAgreement(x["ground truth"],x[parsed_col]), axis=1)

    # Share of sentences whose agreement value is exactly 1.
    print(Counter(df['agreement'])[1]/len(df))
    print("="*100)

Mood _FT_Maj_gpt3.5 V3 29


29it [01:30,  3.13s/it]

There were 0 errors in round 2.
0.13793103448275862



  df = df.groupby(['sentence_id']).agg({


In [407]:
df.columns

Index(['sentence_id', 'technique', 'text', 'feature_id', 'props_a20',
       'props_a21', 'props_a22', 'annotator_consistency',
       'props_gpt4_majority', 'res_1.0_1', 'gpt_props_1.0_1', 'res_1.0_2',
       'gpt_props_1.0_2', 'res_1.0_3', 'gpt_props_1.0_3',
       'gpt3.5_1.0_consistency', 'res_0.2_1', 'gpt_props_0.2_1', 'res_0.2_2',
       'gpt_props_0.2_2', 'res_0.2_3', 'gpt_props_0.2_3',
       'gpt3.5_0.2_consistency', 'gpt3.5_0.2_majority', 'humans isCorrect',
       'gpt isCorrect', 'comments', 'ground truth',
       'res_FT_Maj_gpt3.5_0.0_V3_2', 'property_FT_Maj_gpt3.5_0.0_V3_2',
       'agreement'],
      dtype='object')

In [227]:
FEATURE = "Tropes"

In [235]:
df = pd.read_csv(output_path+"/"+version+"_"+FEATURE+".csv")
df.columns

Index(['sentence_id', 'technique', 'text', 'feature_id', 'props_a20',
       'props_a21', 'props_a22', 'annotator_consistency',
       'props_gpt4_majority', 'res_1.0_1', 'gpt_props_1.0_1', 'res_1.0_2',
       'gpt_props_1.0_2', 'res_1.0_3', 'gpt_props_1.0_3',
       'gpt3.5_1.0_consistency', 'res_0.2_1', 'gpt_props_0.2_1', 'res_0.2_2',
       'gpt_props_0.2_2', 'res_0.2_3', 'gpt_props_0.2_3',
       'gpt3.5_0.2_consistency', 'gpt3.5_0.2_majority', 'humans isCorrect',
       'gpt isCorrect', 'comments', 'ground truth', 'property_gpt3.5',
       'res_gpt3.5_0.0_V3_2', 'property_gpt3.5_0.0_V3_2'],
      dtype='object')

# OPEN THE CSV TO CHECK FOR PARSE ERRORS FIRST

In [236]:
# ADDS THE PROPERTY NAME TO THE RESPONSE OBJECT FOR EASIER READING LATER.
def combine(prop, res):
    """Return ``gpt.responseToJson(res)`` with ``'Property'`` set to ``prop``."""
    response_obj = gpt.responseToJson(res)
    response_obj['Property'] = prop
    return response_obj

def removeErrors(s):
    """Decode ``s`` with ``ast.literal_eval``; cells mentioning "parse error" become ``[]``."""
    literal = s
    if "parse error" in literal:
        literal = "[]"
    return ast.literal_eval(literal)

In [237]:
len(df)

448

In [238]:
model_version

'_gpt3.5'

In [239]:
df['property'+model_version+'_0.0_'+version+'_2'] = df['property'+model_version+'_0.0_'+version+'_2'].apply(removeErrors)

In [240]:
df['res'+model_version+'_0.0_'+version+'_2'] = \
    df.apply(lambda x: combine(x['property'+model_version],x['res'+model_version+'_0.0_'+version+'_2']), axis=1)

In [241]:
df['sentence_id'] = df['sentence_id'].apply(lambda x: int(x))


def _first_row(x):
    """Return the first value of a group."""
    return x.iloc[0]


# Collapse to one row per sentence. The columns listed first hold one value per
# sentence (repeated on every row of its group), so the first row is kept.
# The three model columns at the end are gathered: labels and responses into
# lists, parsed property lists concatenated with sum.
_agg_spec = dict.fromkeys(
    [
        'sentence_id', 'technique', 'text', 'feature_id',
        'props_a20', 'props_a21', 'props_a22', 'annotator_consistency',
        'props_gpt4_majority',
        'res_1.0_1', 'gpt_props_1.0_1', 'res_1.0_2', 'gpt_props_1.0_2',
        'res_1.0_3', 'gpt_props_1.0_3', 'gpt3.5_1.0_consistency',
        'res_0.2_1', 'gpt_props_0.2_1', 'res_0.2_2', 'gpt_props_0.2_2',
        'res_0.2_3', 'gpt_props_0.2_3', 'gpt3.5_0.2_consistency',
        'gpt3.5_0.2_majority',
        'humans isCorrect', 'gpt isCorrect', 'comments', 'ground truth',
    ],
    _first_row,
)
_agg_spec['property'+model_version] = list
_agg_spec['res'+model_version+'_0.0_'+version+'_2'] = list
_agg_spec['property'+model_version+'_0.0_'+version+'_2'] = sum

df = df.groupby(['sentence_id']).agg(_agg_spec)

  df = df.groupby(['sentence_id']).agg({


In [242]:
df=df.drop('property'+model_version,axis=1)

In [243]:
df.to_csv(output_path+"/_"+version+"_"+FEATURE+".csv",index=None)

In [244]:
df=pd.read_csv(output_path+"/_"+version+"_"+FEATURE+".csv")

In [245]:
# Keep only rows whose "humans isCorrect" value is >= 0; rows with a negative
# or missing value are dropped.
# NOTE(review): presumably these are the human-verified sentences - confirm.
df = df[df["humans isCorrect"]>=0]
print(len(df))

32


In [246]:
# Reload utils so edits to calcAgreement are picked up without a kernel restart.
importlib.reload(utils)
# Row-wise comparison of the "ground truth" cell with the model's parsed
# properties (temperature 0.0, round 2). The cells below count how many rows
# received an agreement value of 1.
df['agreement'] = df.apply(lambda x: utils.calcAgreement(x["ground truth"],x["property"+model_version+"_0.0_"+version+"_2"]), axis=1)

In [247]:
print(FEATURE, model_version, version)
Counter(df['agreement'])[1]/len(df)

Tropes _gpt3.5 V3


0.15625

In [None]:
# Figures_of_argument GPT4
print(FEATURE, model_version, version)
Counter(df['agreement'])[1]/len(df)

In [96]:
# Subject Predication GPT3.5
print(FEATURE, model_version, version)
Counter(df['agreement'])[1]/len(df)

Predication _gpt3.5 V3


0.16666666666666666

In [48]:
# Subject choices GPT4
Counter(df['agreement'])[1]/len(df)

0.4666666666666667

In [83]:
# Figures_of_argument GPT4
Counter(df['agreement'])[1]/len(df)

0.5666666666666667