# Cleanup GPT
Sometimes the response does not match the prompt. This is probably due to a race condition in the annotation app code. For example, when prompted to provide the 'mood', the response has to do with 'aspect'.

So for every feature, let's remove the responses that do not correspond to the given feature. Let's store the sentence id and nuber of incorrect responses, so we can prompt GPT again and correc the dataset.

In [104]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter, OrderedDict, defaultdict
import utils
import features
import importlib
import ast
import tiktoken
import os
import openai
from tqdm import tqdm
import backoff
import logging
import requests

In [90]:
importlib.reload(utils)
connection = utils.connectDB()

query = 'SELECT * FROM ebdb.ChatGPT where annotator_id in (20,21,22);'
gpt = pd.read_sql(query, connection)
# gpt = gpt[~gpt["response"].isna()]
# gpt = gpt[gpt["response"]!=""]



In [371]:
gpt.head() #.groupby('sentence_id').count()

Unnamed: 0,id,sentence_id,annotator_id,feature_id,prompt,response,date_updated
0,478,265,21,Aspect,\n You are a rhetoretician and linguist...,"{""Properties"":[""simple""],""Explanation"":""The te...",2023-08-14 11:01:22
1,479,265,21,Aspect,\n You are a rhetoretician and linguist...,"{""Properties"":[""simple""],""Explanation"":""The te...",2023-08-14 11:01:25
2,480,265,21,Aspect,\n You are a rhetoretician and linguist...,"{""Properties"":[""simple""],""Explanation"":""The te...",2023-08-14 11:01:28
3,481,265,21,Aspect,\n You are a rhetoretician and linguist...,"{""Properties"":[""simple""],""Explanation"":""The te...",2023-08-14 11:01:32
4,482,265,21,Aspect,\n You are a rhetoretician and linguist...,"{""Properties"":[""simple""],""Explanation"":""The te...",2023-08-14 11:01:36


In [362]:
importlib.reload(utils)
gpt_df = utils.getGPTFeatures(gpt)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 324/324 [00:42<00:00,  7.70it/s]


In [363]:
print(len(gpt_df))
gpt_df.head()

Unnamed: 0,sentence_id,count,properties,all_properties,all_count,feature_id
0,265,1,[{simple}],"[{simple}, {simple}, {simple}, {simple}, {simp...",5,Aspect
1,265,0,[],[],0,Mood
2,265,0,[],[],0,Verb_choices
3,265,1,[{subordinate}],[{subordinate}],1,Modifying_clauses
4,265,1,"[{multiplying and embedding modifiers, single ...","[{multiplying and embedding modifiers, single ...",5,Modifying_phrases
...,...,...,...,...,...,...
4986,15233,1,[{appositives}],"[{appositives}, {appositives}, {appositives}]",3,Phrases_built_on_nouns
4987,15233,1,"[{multiplying and embedding modifiers, single ...","[{multiplying and embedding modifiers, single ...",22,Modifying_phrases
4988,15233,2,"[{multiplying and embedding modifiers, single ...","[{left branching, right branching}, {multiplyi...",3,Sentence_architecture
4989,15233,2,"[{by sentence role, by position}, {left branch...","[{by sentence role, by position}, {left branch...",3,Emphasis


# Figure out which features/properties were incorrectly recorded, and make a list to re-prompt GPT

In [96]:
importlib.reload(utils)
importlib.reload(features)

to_reprompt = []
_features = list(features.f_od.keys())
_num_prompts = 3

for f in tqdm(_features):
    idx = _features.index(f)
    num_properties = len(features.f_od[f])
    
    try:
        properties_gpt = gpt_df.groupby('feature_id').agg(list).reset_index().loc[idx][['all_properties','all_count','sentence_id']]
        prop_list = list(zip(properties_gpt['all_properties'],properties_gpt['sentence_id'],properties_gpt['all_count']))

        for tup in prop_list:
            props = tup[0]
            sid = tup[1]
            count = tup[2]
            c = 0
            for _set in props:
                if len(_set.intersection(set(features.f_od[f]))) == 0:
                    c+=1
            times_to_reprompt = _num_prompts-(count-c)
            if times_to_reprompt > 0:
                to_reprompt.append([count, c, f, sid, props, times_to_reprompt])
    except Exception as err:
        print(f"Unexpected {err=}, {type(err)=}")
        pass
        

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 71.16it/s]

Unexpected err=KeyError(21), type(err)=<class 'KeyError'>





In [97]:
len(to_reprompt)

3380

In [98]:
to_reprompt[0:2]

[[0, 0, 'Aspect', 311, [], 3], [1, 0, 'Aspect', 313, [{'simple'}], 2]]

# Get new GPT responses

In [24]:
################################################################################################################################

# load features for ChatGPT
sentenceFeaturesDict = defaultdict(list)
sentenceFeaturesList = []

with open('../APP/features-sentence.json') as f:
    for jsonObj in f:
        featureDict = json.loads(jsonObj)
        sentenceFeaturesList.append(featureDict)
        sentenceFeaturesDict[featureDict['key']] = [val+" : "+desc for val, desc in zip(featureDict['values'],featureDict['descriptions'])]


wordFeaturesDict = defaultdict(list)
wordFeaturesList = []

with open('../APP/features-word.json') as f:
    for jsonObj in f:
        featureDict = json.loads(jsonObj)
        wordFeaturesList.append(featureDict)
        wordFeaturesDict[featureDict['key']] = [val+" : "+desc for val, desc in zip(featureDict['values'],featureDict['descriptions'])]


featuresDict = sentenceFeaturesDict | wordFeaturesDict

In [138]:
logging.getLogger('backoff').handlers.clear()

In [178]:
errors = []
logging.basicConfig(filename='gpt_reponse.log', filemode='w', format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logging.getLogger('backoff').addHandler(logging.FileHandler('gpt_reponse.log'))
    
def get_gpt_response(sentence,feature):
    feature = feature.replace("_", " ")
    properties = "\n".join(featuresDict[feature])

    global prompt
    prompt = f"""
    You are a rhetoretician and linguist specializing in news text. 

    Your task is to identify which, if any, of the following properties of {feature} are used in the example text. 
    You may select multiple properties.
    Each line contains a property followed by a colon, followed by a brief definition and example(s):

    {properties}


    Format your response as a JSON object with "Properties" and "Explanation" as the keys. 
    The value of "Properties" should be a list. If none of the properties are present, return an empty list. 
    Explain your choice in the "Explanation". Make your response as short as possible.
    The example text is delimited with triple backticks. 


    Example text: ```{sentence}```
    """

    openai.organization = os.getenv("OPENAI_ORG_ID")
    openai.api_key = os.getenv("OPENAI_API_KEY")

    # model = "gpt-3.5-turbo-0301"
    # model = "gpt-3.5-turbo-0613"
    model = "gpt-4"


    messages=[{"role": "user", "content": prompt}]
    
    

    @backoff.on_exception(backoff.expo, openai.error.RateLimitError)
    @backoff.on_exception(backoff.expo, openai.error.APIError)
    @backoff.on_exception(backoff.expo, openai.error.APIConnectionError)
    @backoff.on_exception(backoff.expo, openai.error.Timeout)
    @backoff.on_exception(backoff.expo, openai.error.ServiceUnavailableError)
    @backoff.on_exception(backoff.expo, requests.exceptions.Timeout)
    def completions_with_backoff(**kwargs):
        return openai.ChatCompletion.create(**kwargs)

    res = completions_with_backoff(model=model, messages=messages)
    
    
    '''
    try:
      #Make your OpenAI API request here
      res = openai.ChatCompletion.create(
          model=model,
          messages=messages
        )
    except openai.error.APIError as e:
        #Handle API error here, e.g. retry or log
        print(f"OpenAI API returned an API Error: {e}")
        errors.append(f"OpenAI API returned an API Error: {e}")
        pass
    except openai.error.APIConnectionError as e:
        #Handle connection error here
        print(f"Failed to connect to OpenAI API: {e}")
        errors.append(f"Failed to connect to OpenAI API: {e}")
        pass
    except openai.error.RateLimitError as e:
        #Handle rate limit error (we recommend using exponential backoff)
        print(f"OpenAI API request exceeded rate limit: {e}")
        errors.append(f"OpenAI API request exceeded rate limit: {e}")
        pass
    except openai.error.Timeout as e:
        #Handle timeout error (we recommend using exponential backoff)
        print(f"OpenAI API request timed out: {e}")
        errors.append(f"OpenAI API request timed out: {e}")
        pass
    except requests.exceptions.Timeout as e:
        print(f"Request timed out: {e}")
        errors.append(f"Request timed out: {e}")
        pass
    except Exception as e:
        print(f"Unexpected {e=}, {type(e)=}")
        errors.append(f"Unexpected {e=}, {type(e)=}")
        pass
    '''

    response = res["choices"][0]["message"]["content"]

    return (prompt,response)

In [10]:
importlib.reload(utils)
connection = utils.connectDB()

query = 'SELECT id, text FROM ebdb.sample_sentences;'
sentences = pd.read_sql(query, connection)



In [11]:
len(sentences)

357

In [101]:
sentences[sentences['id']==4].iloc[0]['text']

'Muhammad advises the same thing, according to the tradition of the Muslim doctors: wherefore the great Temur always strove to exterminate the infidels, as much to acquire that glory, as to signalise himself by the greatness of his conquests.”'

In [102]:
new_responses = []

In [179]:
items = tqdm(to_reprompt[3155:]) # because an exception was raised at this item, we just need to get from this item on...

for item in items:
    sid = item[3]
    text = sentences[sentences['id']==sid].iloc[0]['text']
    feature = item[2]
    times_to_reprompt = item[5]
    
    for i in range(times_to_reprompt):
        promt, response = get_gpt_response(text,feature)
        new_responses.append([sid, feature, promt, response])
    
    

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 225/225 [45:12<00:00, 12.05s/it]


In [113]:
len(to_reprompt)

3380

In [180]:
len(new_responses)
#10303

10978

In [110]:
to_reprompt.index([2,
  0,
  'Figures_of_argument',
  3374,
  [{'strategic repetition'}, {'strategic repetition'}],
  1])

382

In [140]:
!ulimit -a

-t: cpu time (seconds)              unlimited
-f: file size (blocks)              unlimited
-d: data seg size (kbytes)          unlimited
-s: stack size (kbytes)             8192
-c: core file size (blocks)         0
-v: address space (kbytes)          unlimited
-l: locked-in-memory size (kbytes)  unlimited
-u: processes                       2784
-n: file descriptors                7168


In [181]:
ndf = pd.DataFrame(new_responses)

In [182]:
ndf

Unnamed: 0,0,1,2,3
0,313,Aspect,\n You are a rhetoretician and linguist spe...,"{""Properties"": [""simple""], ""Explanation"": ""The..."
1,313,Aspect,\n You are a rhetoretician and linguist spe...,"{""Properties"": [""simple""], ""Explanation"": ""The..."
2,3387,Aspect,\n You are a rhetoretician and linguist spe...,"{""Properties"": [""simple""], ""Explanation"": ""The..."
3,3387,Aspect,\n You are a rhetoretician and linguist spe...,"{""Properties"": [""simple""], ""Explanation"": ""The..."
4,3387,Aspect,\n You are a rhetoretician and linguist spe...,"{""Properties"": [""simple""], ""Explanation"": ""The..."
...,...,...,...,...
10973,14356,Tropes,\n You are a rhetoretician and linguist spe...,"{""Properties"": [""synecdoche"", ""metonymy"", ""eup..."
10974,14356,Tropes,\n You are a rhetoretician and linguist spe...,"{\n""Properties"": [""metonymy""],\n""Explanation"":..."
10975,14506,Tropes,\n You are a rhetoretician and linguist spe...,"{""Properties"": [""hyperbole""], ""Explanation"": ""..."
10976,14506,Tropes,\n You are a rhetoretician and linguist spe...,"{""Properties"": [""hyperbole""], ""Explanation"": ""..."


In [183]:
ndf.to_csv("newGPTresponses-2.csv",index=None)

In [184]:
# rows up to: 3260 are duplicates
ndf2 = pd.read_csv("newGPTresponses-2.csv",skiprows=3260,header=None)

In [185]:
ndf2.to_csv("newGPTresponses-3.csv",index=None) #.iloc[3][2]

In [186]:
ndf3 = pd.read_csv("newGPTresponses-3.csv")

In [191]:
ndf3.columns = ['sentence_id','feature_id','prompt','response']

In [373]:
ndf3.head()

Unnamed: 0,sentence_id,feature_id,prompt,response
0,311,Aspect,\n You are a rhetoretician and linguist spe...,"{""Properties"": [""simple""], ""Explanation"": ""The..."
1,311,Aspect,\n You are a rhetoretician and linguist spe...,"{\n""Properties"": [""simple""],\n""Explanation"": ""..."
2,311,Aspect,\n You are a rhetoretician and linguist spe...,"{""Properties"": [""simple""], ""Explanation"": ""The..."
3,313,Aspect,\n You are a rhetoretician and linguist spe...,"{""Properties"": [""simple""], ""Explanation"": ""The..."
4,313,Aspect,\n You are a rhetoretician and linguist spe...,"{""Properties"": [""simple""], ""Explanation"": ""The..."


In [277]:
importlib.reload(utils)
new_gpt_df = utils.getGPTFeatures(ndf3)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 324/324 [00:14<00:00, 21.65it/s]


In [278]:
new_gpt_df

Unnamed: 0,sentence_id,count,properties,all_properties,all_count,feature_id
0,311,1,[{simple}],"[{simple}, {simple}, {simple}]",3,Aspect
1,311,1,"[{by sentence role, by position}]","[{by sentence role, by position}, {by sentence...",2,Emphasis
2,311,1,[{antithesis}],"[{antithesis}, {antithesis}, {antithesis}]",3,Figures_of_argument
3,311,2,"[{agnominatio, emphasis}, {agnominatio, emphas...","[{agnominatio, emphasis}, {agnominatio, emphas...",2,Figures_of_word_choice
4,311,1,"[{Old English Core, Latin/Greek}]","[{Old English Core, Latin/Greek}]",1,Language_of_origin
...,...,...,...,...,...,...
3375,47,1,[{doubling}],"[{doubling}, {doubling}, {doubling}]",3,New_words_and_changing_uses
3376,47,1,[{polysyndenton}],"[{polysyndenton}, {polysyndenton}, {polysynden...",3,Series
3377,6125,1,[{stative}],[{stative}],1,Predication
3378,6125,1,[{}],"[{}, {}, {}]",3,Series


# Merge new GPT responses merge(new_gpt_df, gpt_df)

Once we have the new responses, merge into the GPT dataframe by removing the incorrect ones, and appending the new ones.

In [283]:
importlib.reload(utils)
connection = utils.connectDB()

query = 'SELECT * FROM ebdb.ChatGPT where annotator_id in (20,21,22);'
gpt = pd.read_sql(query, connection)

In [283]:
importlib.reload(utils)
gpt_df = utils.getGPTFeatures(gpt)

In [364]:
len(gpt_df)

4991

In [365]:
gpt_df.head()

Unnamed: 0,sentence_id,count,properties,all_properties,all_count,feature_id
0,265,1,[{simple}],"[{simple}, {simple}, {simple}, {simple}, {simp...",5,Aspect
1,265,0,[],[],0,Mood
2,265,0,[],[],0,Verb_choices
3,265,1,[{subordinate}],[{subordinate}],1,Modifying_clauses
4,265,1,"[{multiplying and embedding modifiers, single ...","[{multiplying and embedding modifiers, single ...",5,Modifying_phrases


In [366]:
new_gpt_df.head()

Unnamed: 0,sentence_id,count,properties,all_properties,all_count,feature_id
0,311,1,[{simple}],"[{simple}, {simple}, {simple}]",3,Aspect
1,311,1,"[{by sentence role, by position}]","[{by sentence role, by position}, {by sentence...",2,Emphasis
2,311,1,[{antithesis}],"[{antithesis}, {antithesis}, {antithesis}]",3,Figures_of_argument
3,311,2,"[{agnominatio, emphasis}, {agnominatio, emphas...","[{agnominatio, emphasis}, {agnominatio, emphas...",2,Figures_of_word_choice
4,311,1,"[{Old English Core, Latin/Greek}]","[{Old English Core, Latin/Greek}]",1,Language_of_origin


In [300]:
type(new_gpt_df['sentence_id'].unique())

numpy.ndarray

In [367]:
for i,row in enumerate(list(gpt_df.iterrows())):
    sid = row[1]['sentence_id']
    fid = row[1]['feature_id']
    
    _nd = new_gpt_df[new_gpt_df['sentence_id']==sid]
    
    _nd_features = _nd['feature_id'].unique()
    
    if fid in _nd_features:
        props = row[1]['all_properties']
        _nd_props = _nd[_nd['feature_id']==fid].iloc[0]['all_properties']
        
        
        # remove incorrect properties from props
        # add the new properties from _nd_props
        new_list = []
        for _set in props:
            if len(_set.intersection(set(features.f_od[fid]))) == 0:
                pass
            else:
                new_list.append(_set)
        
        if len(_nd_props[0]) > 0:
            new_list += _nd_props
        
        gpt_df.loc[((gpt_df['sentence_id'] == sid) & (gpt_df['feature_id'] == fid)),'all_properties'] = \
            gpt_df.loc[((gpt_df['sentence_id'] == sid) & (gpt_df['feature_id'] == fid)),'all_properties'].apply(lambda x: new_list) 


In [368]:
gpt_df[gpt_df['sentence_id']==265]

Unnamed: 0,sentence_id,count,properties,all_properties,all_count,feature_id
0,265,1,[{simple}],"[{simple}, {simple}, {simple}, {simple}, {simp...",5,Aspect
1,265,0,[],"[{indicative}, {indicative}, {indicative}]",0,Mood
2,265,0,[],[],0,Verb_choices
3,265,1,[{subordinate}],"[{subordinate}, {subordinate, adjective}, {sub...",1,Modifying_clauses
4,265,1,"[{multiplying and embedding modifiers, single ...","[{multiplying and embedding modifiers, single ...",5,Modifying_phrases
5,265,1,"[{left branching, middle branching, Periodic s...","[{left branching, middle branching, Periodic s...",3,Sentence_architecture
6,265,1,"[{by sentence role, by position}]","[{by sentence role, by position}, {by sentence...",1,Emphasis


# Save GPT dataframe for downstream tasks

In [375]:
gpt_df[["sentence_id","all_properties","feature_id"]].to_csv("corrected_GPT_min.csv",index=None)

In [370]:
len(gpt_df)

4991

In [374]:
gpt_df.head()

Unnamed: 0,sentence_id,count,properties,all_properties,all_count,feature_id
0,265,1,[{simple}],"[{simple}, {simple}, {simple}, {simple}, {simp...",5,Aspect
1,265,0,[],"[{indicative}, {indicative}, {indicative}]",0,Mood
2,265,0,[],[],0,Verb_choices
3,265,1,[{subordinate}],"[{subordinate}, {subordinate, adjective}, {sub...",1,Modifying_clauses
4,265,1,"[{multiplying and embedding modifiers, single ...","[{multiplying and embedding modifiers, single ...",5,Modifying_phrases
