# Annotation Results

In [1]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from collections import defaultdict
import os
import pymysql
import scipy
from tqdm import tqdm
import ast
import utils
import importlib

# Read data

In [95]:
# Directory that holds the verified annotation CSVs.
# NOTE(review): the read_csv calls further down spell this path out literally
# instead of using data_path — confirm whether this constant is still needed.
data_path = "data/human_gpt_verified/"

In [96]:
# Names of the annotated features, in the order used by this notebook.
features = [
    "Aspect",
    "Emphasis",
    "Figures_of_argument",
    "Figures_of_word_choice",
    "Language_of_origin",
    "Language_varieties",
    "Lexical_and_semantic_fields",
    "Modifying_clauses",
    "Modifying_phrases",
    "Mood",
    "New_words_and_changing_uses",
    "Parallelism",
    "Phrases_built_on_nouns",
    "Phrases_built_on_verbs",
    "Predication",
    "Prosody_and_punctuation",
    "Sentence_architecture",
    "Series",
    "Subject_choices",
    "Tense",
    "Tropes",
    "Verb_choices",
]

# Manual error analysis

## Aspect

In [240]:
# Aspect: load the verified annotations and keep only rows whose
# 'humans isCorrect' value is >= 0 (missing values fail the comparison).
aspect_raw = pd.read_csv("data/human_gpt_verified/Aspect.csv")
has_verdict = aspect_raw["humans isCorrect"] >= 0
verified_df = aspect_raw[has_verdict]
l = len(verified_df)
print(l)

# Accuracy = summed correctness values divided by the number of retained rows.
for label, column in (("human accuracy:", 'humans isCorrect'),
                      ("gpt accuracy:", 'gpt isCorrect')):
    print(label, sum(verified_df[column]) / l)

# Frequency of each distinct value in the 'comments' column.
comments = Counter(verified_df['comments'])

print("-" * 100)
for k, v in comments.items():
    print(v, "\t", k)
    


35
human accuracy: 0.9428571428571428
gpt accuracy: 0.14285714285714285
----------------------------------------------------------------------------------------------------
1 	 gpt inserted its own property
1 	 gtp mistook adjective for verb (growing)
1 	 the correct answer is simple  (email begins with) and progressinve (admitting)
17 	 Seems to be relying on the definition provided in a looser way
6 	 nan
1 	 gpt missed the part of  sentence
7 	 this should be an empty list
1 	 explanation is just wrong


### Main types of errors

1. GPT is not using the definitions provided in the context of grammar. 
2. GPT often provides a property when no property is applicable.

We only verified rows where the GPT majority response differed from the human annotators. This does not preclude error 1 from occurring where there is agreement; in those cases the agreement may simply be by chance. This is particularly true for the 'simple' property, which is by far the majority property present in the Aspect feature.

### Next steps:

* redesign the prompt to clarify that the answer sought is in the context of grammar.
* verify again.

# Emphasis

In [241]:
# Emphasis: load the verified annotations and keep only rows whose
# 'humans isCorrect' value is >= 0 (missing values fail the comparison).
emphasis_raw = pd.read_csv("data/human_gpt_verified/Emphasis.csv")
has_verdict = emphasis_raw["humans isCorrect"] >= 0
verified_df = emphasis_raw[has_verdict]
n_rows = len(verified_df)
print(n_rows)

# Accuracy = summed correctness values divided by the number of retained rows.
for label, column in (("human accuracy:", 'humans isCorrect'),
                      ("gpt accuracy:", 'gpt isCorrect')):
    print(label, sum(verified_df[column]) / n_rows)

# Frequency of each distinct value in the 'comments' column.
comments = Counter(verified_df['comments'])

print("-" * 100)
for k, v in comments.items():
    print(v, "\t", k)
    


15
human accuracy: 0.6
gpt accuracy: 0.4
----------------------------------------------------------------------------------------------------
4 	 GPT is misinterpreting the property. Using it out of context of grammar.
2 	 Annotaors missed 'fron inversion'
1 	 GPT classification and explanation both wrong
8 	 nan


It's hard to know the intended emphasis, so it's hard to know which property is being employed. I'm not sure this can be annotated without additional context.

# Figures of Argument

In [242]:
verified_df = pd.read_csv("data/human_gpt_verified/Figures_of_argument.csv")
# Keep rows whose 'humans isCorrect' is >= 0 (NaN fails the comparison and is dropped).
# .copy() detaches the filtered frame so that adding 'n_cor' below does not
# trigger pandas' SettingWithCopyWarning.
verified_df = verified_df[verified_df["humans isCorrect"]>=0].copy()
print(len(verified_df))

# what percentage of the fully agreed examples are correct?

total_agree = sum(verified_df['annotator_consistency'])

def getNumCorrect(x):
    """Return True when the three props_a* values and 'ground truth' of row x are all equal."""
    return x['props_a20'] == x['props_a21'] == x['props_a22'] == x['ground truth']

verified_df['n_cor'] = verified_df.apply(getNumCorrect, axis=1)

# Guard the ratio: with no fully-agreed rows the denominator would be 0.
agree_accuracy = sum(verified_df['n_cor'])/total_agree if total_agree else float("nan")
print("human accuracy where all annotators agree:", agree_accuracy)
print("human accuracy all examples:",sum(verified_df['humans isCorrect'])/len(verified_df))
print("gpt accuracy all examples:",sum(verified_df['gpt isCorrect'])/len(verified_df))

# Frequency of each distinct value in the 'comments' column.
comments = Counter(verified_df['comments'])

print("-"*100)
for k,v in comments.items():
    print(v,"\t",k)
    
    


30
human accuracy where all annotators agree: 0.8
human accuracy all examples: 0.6666666666666666
gpt accuracy all examples: 0.16666666666666666
----------------------------------------------------------------------------------------------------
4 	 GPT is misinterpreting the property. Using it out of context of grammar.
2 	 Annotaors missed 'fron inversion'
1 	 classification and explanation both wrong
14 	 nan
1 	 GPT is making stuff up
7 	 GPT classification and explanation both wrong
1 	 GPT is pertially correct


### Main types of errors
1. GPT incorrectly assigns 'antithesis' to almost all examples.
2. GPT often provides a property when no property is applicable.

### Next steps
1. Instead of instructing GPT to return an empty list when no properties are applicable, create an explicit option of "none of the above properties apply to the given sentence".  

## V3
* Figures of argument are classified individually
* GPT3.5 Accuracy: 0.53
* GPT4 Accuracy: 0.57

# Figures of word choice

In [243]:
verified_df = pd.read_csv("data/human_gpt_verified/Figures_of_word_choice.csv")
# Keep rows whose 'humans isCorrect' is >= 0 (NaN fails the comparison and is dropped).
# .copy() detaches the filtered frame so that adding 'n_cor' below does not
# trigger pandas' SettingWithCopyWarning.
verified_df = verified_df[verified_df["humans isCorrect"]>=0].copy()
print(len(verified_df))
# what percentage of the fully agreed examples are correct?

total_agree = sum(verified_df['annotator_consistency'])

def getNumCorrect(x):
    """Return True when the three props_a* values and 'ground truth' of row x are all equal."""
    return x['props_a20'] == x['props_a21'] == x['props_a22'] == x['ground truth']

verified_df['n_cor'] = verified_df.apply(getNumCorrect, axis=1)

# Guard the ratio: with no fully-agreed rows the denominator would be 0.
agree_accuracy = sum(verified_df['n_cor'])/total_agree if total_agree else float("nan")
print("human accuracy where all annotators agree:", agree_accuracy)
print("human accuracy all examples:",sum(verified_df['humans isCorrect'])/len(verified_df))
print("gpt accuracy all examples:",sum(verified_df['gpt isCorrect'])/len(verified_df))

# Frequency of each distinct value in the 'comments' column.
comments = Counter(verified_df['comments'])

print("-"*100)
for k,v in comments.items():
    print(v,"\t",k)
    


30
human accuracy where all annotators agree: 0.3
human accuracy all examples: 0.1
gpt accuracy all examples: 0.06666666666666667
----------------------------------------------------------------------------------------------------
17 	 nan
8 	 GPT explanation doesn't reflect given sentence
5 	 GPT is misinterpreting the property. Using it out of context of grammar.


### Main types of errors
Emphasis is the most common property assignment by humans. However, this is not correct.

# Language of origin
This feature should be assigned using an etymological dictionary rather than human annotation.

# Language varieties

In [244]:
# Language varieties: load the verified annotations and keep only rows whose
# 'humans isCorrect' value is >= 0 (missing values fail the comparison).
language_varieties_raw = pd.read_csv("data/human_gpt_verified/Language_varieties.csv")
has_verdict = language_varieties_raw["humans isCorrect"] >= 0
verified_df = language_varieties_raw[has_verdict]
n_rows = len(verified_df)
print(n_rows)

# Accuracy = summed correctness values divided by the number of retained rows.
for label, column in (("human accuracy all examples:", 'humans isCorrect'),
                      ("gpt accuracy all examples:", 'gpt isCorrect')):
    print(label, sum(verified_df[column]) / n_rows)

# Frequency of each distinct value in the 'comments' column.
comments = Counter(verified_df['comments'])

print("-" * 100)
for k, v in comments.items():
    print(v, "\t", k)
    


46
human accuracy all examples: 0.7391304347826086
gpt accuracy all examples: 0.010869565217391304
----------------------------------------------------------------------------------------------------
45 	 nan
1 	 it's possible GPT picked up the word 'formally' and assinged this property on that basis rather than reasoning through the instructions.


### Main types of errors

The majority of cases should include 'correctness', 'clarity', and 'middle'. Since all of the sentences are taken from news articles, and the intended audience is the general public, this is expected.

Often, GPT was partially correct by including one or more of these properties. 



# Lexical and semantic fields

In [245]:
verified_df = pd.read_csv("data/human_gpt_verified/Lexical_and_semantic_fields.csv")
# Keep rows whose 'humans isCorrect' is >= 0 (NaN fails the comparison and is dropped).
# .copy() detaches the filtered frame so that adding 'n_cor' below does not
# trigger pandas' SettingWithCopyWarning.
verified_df = verified_df[verified_df["humans isCorrect"]>=0].copy()
print(len(verified_df))
# what percentage of the fully agreed examples are correct?

total_agree = sum(verified_df['annotator_consistency'])

def getNumCorrect(x):
    """Return True when the three props_a* values and 'ground truth' of row x are all equal."""
    return x['props_a20'] == x['props_a21'] == x['props_a22'] == x['ground truth']

verified_df['n_cor'] = verified_df.apply(getNumCorrect, axis=1)

# Guard the ratio: with no fully-agreed rows the denominator would be 0.
agree_accuracy = sum(verified_df['n_cor'])/total_agree if total_agree else float("nan")
print("human accuracy where all annotators agree:", agree_accuracy)
print("human accuracy all examples:",sum(verified_df['humans isCorrect'])/len(verified_df))
print("gpt accuracy all examples:",sum(verified_df['gpt isCorrect'])/len(verified_df))

# Frequency of each distinct value in the 'comments' column.
comments = Counter(verified_df['comments'])

print("-"*100)
for k,v in comments.items():
    print(v,"\t",k)
    


30
human accuracy where all annotators agree: 0.375
human accuracy all examples: 0.1
gpt accuracy all examples: 0.2
----------------------------------------------------------------------------------------------------
1 	 humans missed one
29 	 nan


### Main types of errors

This one seems particularly hard for both humans and GPT. The differences between the properties can be very subtle, and there is room for subjectivity. Ideally, this would be annotated and discussed to reach consensus. 

The GPT explanations sometimes re-word a property definition but assign a different property to that definition. It is mixing up the properties and their definitions. 

To improve GPT, we need to think about rewriting the instructions. We could also think about consolidating some of the properties. Examples would probably go a long way here.

# Modifying clauses

In [246]:
# Modifying clauses: load the verified annotations and keep only rows whose
# 'humans isCorrect' value is >= 0 (missing values fail the comparison).
modifying_clauses_raw = pd.read_csv("data/human_gpt_verified/Modifying_clauses.csv")
has_verdict = modifying_clauses_raw["humans isCorrect"] >= 0
verified_df = modifying_clauses_raw[has_verdict]
n_rows = len(verified_df)
print(n_rows)

# Accuracy = summed correctness values divided by the number of retained rows.
for label, column in (("human accuracy all examples:", 'humans isCorrect'),
                      ("gpt accuracy all examples:", 'gpt isCorrect')):
    print(label, sum(verified_df[column]) / n_rows)

# Frequency of each distinct value in the 'comments' column.
comments = Counter(verified_df['comments'])

print("-" * 100)
for k, v in comments.items():
    print(v, "\t", k)
    


29
human accuracy all examples: 0.6896551724137931
gpt accuracy all examples: 0.10344827586206896
----------------------------------------------------------------------------------------------------
1 	 gpt picked up the second clause but did not assing the right porperty
1 	 gpt identifies wrong propety, and gpt attributes characteristics to sentence which are incorrect
10 	 gpt finding things which aren't there
1 	 gpt got it right once
2 	 gpt is just completely wrong. It's made a soup of the sentence and property definitions
2 	 gpt misinterprets the property
2 	 gpt misaligned between property it assigns and explanation it gives.
3 	 gpt doesn't provide explanation
6 	 gpt explanation is wrong
1 	 gpt explanation is correct


# Modifying phrases
Can be extracted from parse tree. no need for annotators.

# Mood

In [247]:
verified_df = pd.read_csv("data/human_gpt_verified/Mood_02.csv")
# Keep rows whose 'humans isCorrect' is >= 0 (NaN fails the comparison and is dropped).
# .copy() detaches the filtered frame so that adding 'n_cor' below does not
# trigger pandas' SettingWithCopyWarning.
verified_df = verified_df[verified_df["humans isCorrect"]>=0].copy()
print(len(verified_df))
# what percentage of the fully agreed examples are correct?

# NOTE(review): this cell reads 'annotator_consitency' while the other feature
# cells read 'annotator_consistency' — presumably Mood_02.csv carries the
# misspelled header; confirm before renaming.
total_agree = sum(verified_df['annotator_consitency'])

def getNumCorrect(x):
    """Return True when the three props_a* values and 'ground truth' of row x are all equal."""
    return x['props_a20'] == x['props_a21'] == x['props_a22'] == x['ground truth']

verified_df['n_cor'] = verified_df.apply(getNumCorrect, axis=1)

# Guard the ratio: with no fully-agreed rows the denominator would be 0.
agree_accuracy = sum(verified_df['n_cor'])/total_agree if total_agree else float("nan")
print("human accuracy where all annotators agree:", agree_accuracy)
print("human accuracy all examples:",sum(verified_df['humans isCorrect'])/len(verified_df))
print("gpt accuracy all examples:",sum(verified_df['gpt isCorrect'])/len(verified_df))

# Frequency of each distinct value in the 'comments' column.
comments = Counter(verified_df['comments'])

print("-"*100)
for k,v in comments.items():
    print(v,"\t",k)
    


29
human accuracy where all annotators agree: 1.0
human accuracy all examples: 0.7241379310344828
gpt accuracy all examples: 0.7586206896551724
----------------------------------------------------------------------------------------------------
28 	 nan
1 	 gpt is correct but the explanation is wrong.


# New_words_and_changing_uses

In [248]:
verified_df = pd.read_csv("data/human_gpt_verified/New_words_and_changing_uses.csv")
# Keep rows whose 'humans isCorrect' is >= 0 (NaN fails the comparison and is dropped).
# .copy() detaches the filtered frame so that adding 'n_cor' below does not
# trigger pandas' SettingWithCopyWarning.
verified_df = verified_df[verified_df["humans isCorrect"]>=0].copy()
print(len(verified_df))

# what percentage of the fully agreed examples are correct?

total_agree = sum(verified_df['annotator_consistency'])

def getNumCorrect(x):
    """Return True when the three props_a* values and 'ground truth' of row x are all equal."""
    return x['props_a20'] == x['props_a21'] == x['props_a22'] == x['ground truth']

verified_df['n_cor'] = verified_df.apply(getNumCorrect, axis=1)

# Guard the ratio: with no fully-agreed rows the denominator would be 0.
agree_accuracy = sum(verified_df['n_cor'])/total_agree if total_agree else float("nan")
print("human accuracy where all annotators agree:", agree_accuracy)
print("human accuracy all examples:",sum(verified_df['humans isCorrect'])/len(verified_df))
print("gpt accuracy all examples:",sum(verified_df['gpt isCorrect'])/len(verified_df))

# Frequency of each distinct value in the 'comments' column.
comments = Counter(verified_df['comments'])

print("-"*100)
for k,v in comments.items():
    print(v,"\t",k)
    


25
human accuracy where all annotators agree: 0.8421052631578947
human accuracy all examples: 0.72
gpt accuracy all examples: 0.16
----------------------------------------------------------------------------------------------------
7 	 GPT is misinterpreting the property. Using it out of context of grammar.
4 	 GPT explanation does not match classification
1 	 GPT makde upo its own property
1 	 GPT is wrong about the part of speech
6 	 nan
4 	 GPT is only partially correct
2 	 GPT explanation is soup


# Parallelism

In [249]:
verified_df = pd.read_csv("data/human_gpt_verified/Parallelism.csv")
# Keep rows whose 'humans isCorrect' is >= 0 (NaN fails the comparison and is dropped).
# .copy() detaches the filtered frame so that adding 'n_cor' below does not
# trigger pandas' SettingWithCopyWarning.
verified_df = verified_df[verified_df["humans isCorrect"]>=0].copy()
print(len(verified_df))

# what percentage of the fully agreed examples are correct?

total_agree = sum(verified_df['annotator_consistency'])

def getNumCorrect(x):
    """Return True when the three props_a* values and 'ground truth' of row x are all equal."""
    return x['props_a20'] == x['props_a21'] == x['props_a22'] == x['ground truth']

verified_df['n_cor'] = verified_df.apply(getNumCorrect, axis=1)

# Guard the ratio: with no fully-agreed rows the denominator would be 0.
agree_accuracy = sum(verified_df['n_cor'])/total_agree if total_agree else float("nan")
print("human accuracy where all annotators agree:", agree_accuracy)
print("human accuracy all examples:",sum(verified_df['humans isCorrect'])/len(verified_df))
print("gpt accuracy all examples:",sum(verified_df['gpt isCorrect'])/len(verified_df))

# Frequency of each distinct value in the 'comments' column.
comments = Counter(verified_df['comments'])

print("-"*100)
for k,v in comments.items():
    print(v,"\t",k)
    


31
human accuracy where all annotators agree: 0.9545454545454546
human accuracy all examples: 0.7096774193548387
gpt accuracy all examples: 0.0967741935483871
----------------------------------------------------------------------------------------------------
21 	 GPT explanation is incorrect
10 	 nan


### Main types of errors

GPT assigns the 'grammatical_structure_(parison)' property to almost all examples, which is incorrect. Humans correctly assign no property in most cases. The feature is so rarely found that it may not be useful. Or, it may be that it is difficult for humans to detect.

# Phrases built on nouns

In [250]:
verified_df = pd.read_csv("data/human_gpt_verified/Phrases_built_on_nouns.csv")
# Keep rows whose 'humans isCorrect' is >= 0 (NaN fails the comparison and is dropped).
# .copy() detaches the filtered frame so that adding 'n_cor' below does not
# trigger pandas' SettingWithCopyWarning.
verified_df = verified_df[verified_df["humans isCorrect"]>=0].copy()
print(len(verified_df))

# what percentage of the fully agreed examples are correct?

total_agree = sum(verified_df['annotator_consistency'])

def getNumCorrect(x):
    """Return True when the three props_a* values and 'ground truth' of row x are all equal."""
    return x['props_a20'] == x['props_a21'] == x['props_a22'] == x['ground truth']

verified_df['n_cor'] = verified_df.apply(getNumCorrect, axis=1)

# Guard the ratio: with no fully-agreed rows the denominator would be 0.
agree_accuracy = sum(verified_df['n_cor'])/total_agree if total_agree else float("nan")
print("human accuracy where all annotators agree:", agree_accuracy)
print("human accuracy all examples:",sum(verified_df['humans isCorrect'])/len(verified_df))
print("gpt accuracy all examples:",sum(verified_df['gpt isCorrect'])/len(verified_df))

# Frequency of each distinct value in the 'comments' column.
comments = Counter(verified_df['comments'])

print("-"*100)
for k,v in comments.items():
    print(v,"\t",k)
    


31
human accuracy where all annotators agree: 1.0
human accuracy all examples: 0.9032258064516129
gpt accuracy all examples: 0.03225806451612903
----------------------------------------------------------------------------------------------------
27 	 nan
2 	 GPT explanation males no sense
2 	 GPT mixes up appositives with summative modifiers


### Main types of errors

GPT assigns the 'appositives' property for almost all examples which is incorrect. The explanations are not logical. There are very few examples where this feature is present.

# Phrases built on verbs


In [251]:
verified_df = pd.read_csv("data/human_gpt_verified/Phrases_built_on_verbs.csv")
# Keep rows whose 'humans isCorrect' is >= 0 (NaN fails the comparison and is dropped).
# .copy() detaches the filtered frame so that adding 'n_cor' below does not
# trigger pandas' SettingWithCopyWarning.
verified_df = verified_df[verified_df["humans isCorrect"]>=0].copy()
print(len(verified_df))

# what percentage of the fully agreed examples are correct?

total_agree = sum(verified_df['annotator_consistency'])

def getNumCorrect(x):
    """Return True when the three props_a* values and 'ground truth' of row x are all equal."""
    return x['props_a20'] == x['props_a21'] == x['props_a22'] == x['ground truth']

verified_df['n_cor'] = verified_df.apply(getNumCorrect, axis=1)

# Guard the ratio: with no fully-agreed rows the denominator would be 0.
agree_accuracy = sum(verified_df['n_cor'])/total_agree if total_agree else float("nan")
print("human accuracy where all annotators agree:", agree_accuracy)
print("human accuracy all examples:",sum(verified_df['humans isCorrect'])/len(verified_df))
print("gpt accuracy all examples:",sum(verified_df['gpt isCorrect'])/len(verified_df))

# Frequency of each distinct value in the 'comments' column.
comments = Counter(verified_df['comments'])

print("-"*100)
for k,v in comments.items():
    print(v,"\t",k)
    


30
human accuracy where all annotators agree: 0.7222222222222222
human accuracy all examples: 0.43333333333333335
gpt accuracy all examples: 0.0
----------------------------------------------------------------------------------------------------
17 	 nan
13 	 GPT is making stuff up


### Main types of errors

GPT assigns 'participal_phrases' to most examples, and the explanations are illogical. Most examples do not contain this feature.

# Predication

In [252]:
verified_df = pd.read_csv("data/human_gpt_verified/Predication.csv")
# Keep rows whose 'humans isCorrect' is >= 0 (NaN fails the comparison and is dropped).
# .copy() detaches the filtered frame so that adding 'n_cor' below does not
# trigger pandas' SettingWithCopyWarning.
verified_df = verified_df[verified_df["humans isCorrect"]>=0].copy()
print(len(verified_df))

# what percentage of the fully agreed examples are correct?

total_agree = sum(verified_df['annotator_consistency'])

def getNumCorrect(x):
    """Return True when the three props_a* values and 'ground truth' of row x are all equal."""
    return x['props_a20'] == x['props_a21'] == x['props_a22'] == x['ground truth']

verified_df['n_cor'] = verified_df.apply(getNumCorrect, axis=1)

# Guard the ratio: with no fully-agreed rows the denominator would be 0.
agree_accuracy = sum(verified_df['n_cor'])/total_agree if total_agree else float("nan")
print("human accuracy where all annotators agree:", agree_accuracy)
print("human accuracy all examples:",sum(verified_df['humans isCorrect'])/len(verified_df))
print("gpt accuracy all examples:",sum(verified_df['gpt isCorrect'])/len(verified_df))

# Frequency of each distinct value in the 'comments' column.
comments = Counter(verified_df['comments'])

print("-"*100)
for k,v in comments.items():
    print(v,"\t",k)
    


30
human accuracy where all annotators agree: 0.9
human accuracy all examples: 0.9
gpt accuracy all examples: 0.13333333333333333
----------------------------------------------------------------------------------------------------
6 	 GPT is mixing up the properties and definitions
1 	 GPT is misinterpreting a property
23 	 nan


## V3

* Predication properties are classified individually
* GPT3.5 V3 Accuracy:  0.17
---------
* Predication is run as a single prompt for all properties. `get_single_gpt_response()`
* When asked about 'predication', GPT is interpreting the term outside of the grammar context of active/passive voice

* GPT3.5 V3 accuracy: 0.23

# Prosody and Punctuation

Human annotators did not find any properties of prosody and punctuation in the corpus. This may be another one of those features that is subjective and nuanced and therefore very difficult for both humans and machines.

# Sentence architecture

In [253]:
verified_df = pd.read_csv("data/human_gpt_verified/Sentence_architecture.csv")
# Keep rows whose 'humans isCorrect' is >= 0 (NaN fails the comparison and is dropped).
# .copy() detaches the filtered frame so that adding 'n_cor' below does not
# trigger pandas' SettingWithCopyWarning.
verified_df = verified_df[verified_df["humans isCorrect"]>=0].copy()
print(len(verified_df))

# what percentage of the fully agreed examples are correct?

total_agree = sum(verified_df['annotator_consistency'])

def getNumCorrect(x):
    """Return True when the three props_a* values and 'ground truth' of row x are all equal."""
    return x['props_a20'] == x['props_a21'] == x['props_a22'] == x['ground truth']

verified_df['n_cor'] = verified_df.apply(getNumCorrect, axis=1)

# Guard the ratio: with no fully-agreed rows the denominator would be 0.
agree_accuracy = sum(verified_df['n_cor'])/total_agree if total_agree else float("nan")
print("human accuracy where all annotators agree:", agree_accuracy)
print("human accuracy all examples:",sum(verified_df['humans isCorrect'])/len(verified_df))
print("gpt accuracy all examples:",sum(verified_df['gpt isCorrect'])/len(verified_df))

# Frequency of each distinct value in the 'comments' column.
comments = Counter(verified_df['comments'])

print("-"*100)
for k,v in comments.items():
    print(v,"\t",k)
    


30
human accuracy where all annotators agree: 0.9666666666666667
human accuracy all examples: 0.9666666666666667
gpt accuracy all examples: 0.0
----------------------------------------------------------------------------------------------------
24 	 GPT defaults to right-branching. Explanation is incorrect.
1 	 GPT explanation is not logical
4 	 GPT misinterpreting the 'loose' feature
1 	 GPT misinterpreting the 'periodic' feature


# Series

In [254]:
verified_df = pd.read_csv("data/human_gpt_verified/Series.csv")
# Keep rows whose 'humans isCorrect' is >= 0 (NaN fails the comparison and is dropped).
# .copy() detaches the filtered frame so that adding 'n_cor' below does not
# trigger pandas' SettingWithCopyWarning.
verified_df = verified_df[verified_df["humans isCorrect"]>=0].copy()
print(len(verified_df))

# what percentage of the fully agreed examples are correct?

total_agree = sum(verified_df['annotator_consistency'])

def getNumCorrect(x):
    """Return True when the three props_a* values and 'ground truth' of row x are all equal."""
    return x['props_a20'] == x['props_a21'] == x['props_a22'] == x['ground truth']

verified_df['n_cor'] = verified_df.apply(getNumCorrect, axis=1)

# Guard the ratio: with no fully-agreed rows the denominator would be 0.
agree_accuracy = sum(verified_df['n_cor'])/total_agree if total_agree else float("nan")
print("human accuracy where all annotators agree:", agree_accuracy)
print("human accuracy all examples:",sum(verified_df['humans isCorrect'])/len(verified_df))
print("gpt accuracy all examples:",sum(verified_df['gpt isCorrect'])/len(verified_df))

# Frequency of each distinct value in the 'comments' column.
comments = Counter(verified_df['comments'])

print("-"*100)
for k,v in comments.items():
    print(v,"\t",k)
    


30
human accuracy where all annotators agree: 1.0
human accuracy all examples: 0.9
gpt accuracy all examples: 0.0
----------------------------------------------------------------------------------------------------
9 	 GPT misinterprets the feature
14 	 nan
4 	 GPT explanation is not logical
3 	 GPT makes stuff up


### Main types of errors

GPT interprets any conjunctions as polysyndeton, and any commas as asyndeton, regardless of whether a series is present or not.

# Subject choices

In [255]:
verified_df = pd.read_csv("data/human_gpt_verified/Subject_choices.csv")
# Keep rows whose 'humans isCorrect' is >= 0 (NaN fails the comparison and is dropped).
# .copy() detaches the filtered frame so that adding 'n_cor' below does not
# trigger pandas' SettingWithCopyWarning.
verified_df = verified_df[verified_df["humans isCorrect"]>=0].copy()
print(len(verified_df))

# what percentage of the fully agreed examples are correct?

total_agree = sum(verified_df['annotator_consistency'])

def getNumCorrect(x):
    """Return True when the three props_a* values and 'ground truth' of row x are all equal."""
    return x['props_a20'] == x['props_a21'] == x['props_a22'] == x['ground truth']

verified_df['n_cor'] = verified_df.apply(getNumCorrect, axis=1)

# Guard the ratio: with no fully-agreed rows the denominator would be 0.
agree_accuracy = sum(verified_df['n_cor'])/total_agree if total_agree else float("nan")
print("human accuracy where all annotators agree:", agree_accuracy)
print("human accuracy all examples:",sum(verified_df['humans isCorrect'])/len(verified_df))
print("gpt accuracy all examples:",sum(verified_df['gpt isCorrect'])/len(verified_df))

# Frequency of each distinct value in the 'comments' column.
comments = Counter(verified_df['comments'])

print("-"*100)
for k,v in comments.items():
    print(v,"\t",k)
    


30
human accuracy where all annotators agree: 0.8666666666666667
human accuracy all examples: 0.8666666666666667
gpt accuracy all examples: 0.13333333333333333
----------------------------------------------------------------------------------------------------
5 	 GPT did not adhere to definitions
5 	 nan
4 	 GPT is mangling the definitions
1 	 GPT is misinterpreting the property. Using it out of context of grammar.
7 	 GPT explanation is only partially correct
6 	 GPT is assinging the poperty to the object not the subject
2 	 GPT classification and explanation is correct


### Main types of errors

GPT assigns 'rhetorical participants' to the majority of examples. It is also assigning properties to any nouns within the sentence, not just the subjects. It's also not sticking to the definitions provided. 

Tell gpt to identify the subjects first, then assign properties.

## GPT3.5 - V3
* Subject choices are classified individually
* Accuracy: 0.5

## GPT4 - V3
* Subject choices are classified individually
* Accuracy: 0.47

# Tense

In [256]:
# Tense: load the verified annotations and keep only rows whose
# 'humans isCorrect' value is >= 0 (missing values fail the comparison).
tense_raw = pd.read_csv("data/human_gpt_verified/Tense.csv")
has_verdict = tense_raw["humans isCorrect"] >= 0
verified_df = tense_raw[has_verdict]
n_rows = len(verified_df)
print(n_rows)

# The correctness columns are cast to float before being summed and
# divided by the number of retained rows.
for label, column in (("human accuracy:", 'humans isCorrect'),
                      ("gpt accuracy:", 'gpt isCorrect')):
    print(label, sum(verified_df[column].astype(float)) / n_rows)

# Frequency of each distinct value in the 'comments' column.
comments = Counter(verified_df['comments'])

print("-" * 100)
for k, v in comments.items():
    print(v, "\t", k)
    


30
human accuracy: 0.7333333333333333
gpt accuracy: 0.1
----------------------------------------------------------------------------------------------------
28 	 nan
2 	 GPT misinterprets the definition


### Main types of errors

The text about news headlines being in the present confuses gpt to assign present tense even when the tense is not present.

Much of the time GPT is just plain wrong, assigning 'present' when it's 'past' and vice versa. This is also seen in the explanations. GPT also tends to assign 'progression' to most examples.

What seems strange is that the human annotators are sometimes in full agreement on the wrong tense when the answer is seemingly obvious.

# Tropes

In [257]:
verified_df = pd.read_csv("data/human_gpt_verified/Tropes.csv")
# Keep rows whose 'humans isCorrect' is >= 0 (NaN fails the comparison and is dropped).
# .copy() detaches the filtered frame so that adding 'n_cor' below does not
# trigger pandas' SettingWithCopyWarning.
verified_df = verified_df[verified_df["humans isCorrect"]>=0].copy()
print(len(verified_df))

# what percentage of the fully agreed examples are correct?

total_agree = sum(verified_df['annotator_consistency'])

def getNumCorrect(x):
    """Return True when the three props_a* values and 'ground truth' of row x are all equal."""
    return x['props_a20'] == x['props_a21'] == x['props_a22'] == x['ground truth']

verified_df['n_cor'] = verified_df.apply(getNumCorrect, axis=1)

# Guard the ratio: with no fully-agreed rows the denominator would be 0.
agree_accuracy = sum(verified_df['n_cor'])/total_agree if total_agree else float("nan")
print("human accuracy where all annotators agree:", agree_accuracy)
print("human accuracy all examples:",sum(verified_df['humans isCorrect'])/len(verified_df))
print("gpt accuracy all examples:",sum(verified_df['gpt isCorrect'])/len(verified_df))

# Frequency of each distinct value in the 'comments' column.
comments = Counter(verified_df['comments'])

print("-"*100)
for k,v in comments.items():
    print(v,"\t",k)
    


32
human accuracy where all annotators agree: 0.7
human accuracy all examples: 0.46875
gpt accuracy all examples: 0.09375
----------------------------------------------------------------------------------------------------
31 	 nan
1 	 GPT is partially correct


### Main types of errors

This one is clearly hard for both humans and GPT, probably because it can be subjective and very nuanced.

GPT's explanations are very poor - neither correct, nor logical.

It would be nice to get more examples. Most sentences do not use tropes, and the ones that do, use hyperbole.

# Verb choices

In [258]:
# Verb choices: load the verified annotations and keep only rows whose
# 'humans isCorrect' value is >= 0 (missing values fail the comparison).
verb_choices_raw = pd.read_csv("data/human_gpt_verified/Verb_choices.csv")
has_verdict = verb_choices_raw["humans isCorrect"] >= 0
verified_df = verb_choices_raw[has_verdict]
n_rows = len(verified_df)
print(n_rows)

# The correctness columns are cast to float before being summed and
# divided by the number of retained rows.
for label, column in (("human accuracy:", 'humans isCorrect'),
                      ("gpt accuracy:", 'gpt isCorrect')):
    print(label, sum(verified_df[column].astype(float)) / n_rows)

# Frequency of each distinct value in the 'comments' column.
comments = Counter(verified_df['comments'])

print("-" * 100)
for k, v in comments.items():
    print(v, "\t", k)


30
human accuracy: 0.7333333333333333
gpt accuracy: 0.23333333333333334
----------------------------------------------------------------------------------------------------
15 	 GPT wrongly assings negation
15 	 nan


GPT tends to assign negation to everything. The explanations make up stuff that's not there to justify this.

In [285]:
# Load the combined verification table that the remaining cells aggregate over.
all_verified_csv = "data/human_gpt_verified/all_verified.csv"
all_features_df = pd.read_csv(all_verified_csv)

In [286]:
# Subset of rows whose 'annotator_consistency' value equals True.
is_consistent = all_features_df['annotator_consistency'] == True
ann_consistent = all_features_df[is_consistent]

In [287]:
# Size of the consistent subset, followed by the number of its rows where each
# '* isCorrect' column equals True (GPT3.5, GPT4, humans — in that order).
print(len(ann_consistent))
for correctness_col in ('gpt3.5 isCorrect', 'gpt4 isCorrect', 'humans isCorrect'):
    is_correct = ann_consistent[correctness_col] == True
    print(len(ann_consistent[is_correct]))

433
49
244
361


In [271]:
# GPT3.5 - where annotator agreement is 100%
# Derived from ann_consistent rather than hard-coding 49/433, so the ratio
# stays in sync with the data when the CSV changes.
len(ann_consistent[ann_consistent['gpt3.5 isCorrect']==True])/len(ann_consistent)

0.11316397228637413

In [269]:
# GPT4 - where annotator agreement is 100%
# Derived from ann_consistent rather than hard-coding 244/433, so the ratio
# stays in sync with the data when the CSV changes.
len(ann_consistent[ann_consistent['gpt4 isCorrect']==True])/len(ann_consistent)

0.5635103926096998

In [281]:
# Humans - where annotator agreement is 100%
# Derived from ann_consistent rather than hard-coding 361/433, so the ratio
# stays in sync with the data when the CSV changes.
len(ann_consistent[ann_consistent['humans isCorrect']==True])/len(ann_consistent)

0.8337182448036952

In [272]:
# GPT3.5 - where we have gold labels
# NOTE(review): the agreement counts read 'gpt3.5 isCorrect' from this same
# frame, while this cell reads 'gpt isCorrect' — confirm the column still
# exists in all_verified.csv and is the intended one.
gpt35_correct_total = sum(all_features_df['gpt isCorrect'])
gpt35_correct_total / len(all_features_df)

0.12903225806451613

In [273]:
# GPT4 - where we have gold labels
# Summed 'gpt4 isCorrect' values divided by the total number of rows.
gpt4_correct_total = sum(all_features_df['gpt4 isCorrect'])
gpt4_correct_total / len(all_features_df)

0.5053763440860215

In [277]:
# Distinct feature ids present in the combined table, in order of first appearance.
feature_id_column = all_features_df['feature_id']
feature_id_column.unique()

array(['Aspect', 'Figures_of_argument', 'Figures_of_word_choice',
       'Language_varieties', 'Lexical_and_semantic_fields',
       'Modifying_clauses', 'Mood', 'New_words_and_changing_uses',
       'Parallelism', 'Phrases_built_on_nouns', 'Phrases_built_on_verbs',
       'Predication', 'Sentence_architecture', 'Series',
       'Subject_choices', 'Tense', 'Tropes', 'Verb_choices'], dtype=object)

In [279]:
# Per-feature GPT4 accuracy: summed 'gpt4 isCorrect' over the number of rows
# belonging to each feature id.
for f in all_features_df['feature_id'].unique():
    belongs_to_feature = all_features_df['feature_id'] == f
    df = all_features_df[belongs_to_feature]
    gpt4_accuracy = sum(df['gpt4 isCorrect']) / len(df)
    print(f, "\t", gpt4_accuracy)

Aspect 	 0.45714285714285713
Figures_of_argument 	 0.6666666666666666
Figures_of_word_choice 	 0.4
Language_varieties 	 0.5434782608695652
Lexical_and_semantic_fields 	 0.2
Modifying_clauses 	 0.6206896551724138
Mood 	 0.4827586206896552
New_words_and_changing_uses 	 0.08
Parallelism 	 0.8064516129032258
Phrases_built_on_nouns 	 0.967741935483871
Phrases_built_on_verbs 	 0.6666666666666666
Predication 	 0.4
Sentence_architecture 	 0.7333333333333333
Series 	 0.8333333333333334
Subject_choices 	 0.23333333333333334
Tense 	 0.4
Tropes 	 0.3125
Verb_choices 	 0.2
