# Result Analysis

In [1]:
# GPT model identifiers used for prompting; the first entry is the default model
# (its result files carry no model suffix).
models = [
    "gpt-3.5-turbo-0125",
    "gpt-4-0125-preview",
    "gpt-4o-2024-05-13",
    "gpt-4-turbo-preview",
]

In [2]:
import pandas as pd

def load_df(model_type, embedding, no_prev_chunking, gpt_model, batched):
    """Load the pickled results DataFrame of one experiment configuration.

    Args:
        model_type: Name of the sub-directory below the embedding directory.
        embedding: Embedding key that names the directory below ``../data/dfs``.
        no_prev_chunking: If truthy, ``_no_prev_chunking`` is appended to the
            embedding directory name.
        gpt_model: Model identifier; appended to the file name (prefixed with
            ``_``) unless it equals ``models[0]``.
        batched: If truthy, ``_batched`` is appended to the file name.

    Returns:
        Whatever ``pd.read_pickle`` returns for the resolved path.
    """
    chunking_suffix = '_no_prev_chunking' if no_prev_chunking else ''
    batched_suffix = '_batched' if batched else ''
    # Result files of the default model (first entry of `models`) have no model suffix.
    model_suffix = '' if gpt_model == models[0] else '_' + gpt_model

    directory = f"../data/dfs/{embedding}{chunking_suffix}/{model_type}"
    file_name = f"ReferenceErrorDetection_data_with_prompt_results{batched_suffix}{model_suffix}.pkl"

    # NOTE: unpickling executes arbitrary code; only load files from a trusted source.
    return pd.read_pickle(f"{directory}/{file_name}")

In [3]:
def remove_json_colons(json_text):
    """Strip a surrounding Markdown ```json ... ``` code fence from a string.

    Surrounding whitespace is ignored when detecting the fence, so a response
    that ends with a newline after the closing fence is unwrapped correctly
    instead of being left with a stray backtick.

    Args:
        json_text: Text that may be wrapped in a ```json fence. Values that are
            not strings (e.g. ``None`` or NaN) are returned unchanged.

    Returns:
        The text between the opening ```json marker and the closing ``` marker
        if the (whitespace-trimmed) input is fenced; otherwise the input
        unchanged.
    """
    opening_fence = "```json"
    closing_fence = "```"

    # Missing cells can be None or a float NaN; there is nothing to unwrap then.
    if not isinstance(json_text, str):
        return json_text

    trimmed = json_text.strip()
    if trimmed.startswith(opening_fence) and trimmed.endswith(closing_fence):
        # Slice the trimmed text so the offsets line up with the detected fences.
        return trimmed[len(opening_fence):-len(closing_fence)]
    return json_text

In [4]:
import json

def _normalize_label(label):
    """Lower-case a label, trim surrounding whitespace and drop one trailing 'd'."""
    label = label.strip().lower()
    return label[:-1] if label.endswith('d') else label


def reshape_model_classification(df):
    """Extract the model's label and explanation from its JSON answer.

    Adds (or overwrites) the columns 'Model Classification Label' and
    'Model Classification Explanation' on ``df`` and normalizes the 'Label'
    column. ``df`` is modified in place and also returned.

    For rows whose 'Reference Article Downloaded' value is 'Yes', the
    'Model Classification' cell is passed through ``remove_json_colons`` (to
    drop a surrounding ```json fence) and parsed as JSON. If the cell cannot
    be parsed, or does not contain usable 'label' / 'explanation' entries,
    a message is printed and both new columns are set to None for that row.
    All other rows also get None in both new columns.

    Labels are normalized to lower case without a trailing 'd'
    (e.g. 'Unsubstantiated' -> 'unsubstantiate'); this is applied to the model
    label and to string values of the 'Label' column.

    Args:
        df: DataFrame with the columns 'Reference Article Downloaded',
            'Model Classification' and 'Label'.

    Returns:
        The same DataFrame object with the added / normalized columns.
    """
    # Create the result columns up front with object dtype so that assigning
    # strings and None per row never depends on dtype upcasting.
    for column in ('Model Classification Label', 'Model Classification Explanation'):
        if column not in df.columns:
            df[column] = None

    for index, row in df.iterrows():
        label = None
        explanation = None

        if row['Reference Article Downloaded'] == 'Yes':
            raw_classification = row['Model Classification']
            try:
                raw_classification = remove_json_colons(raw_classification)
                model_classification = json.loads(raw_classification)
                label = _normalize_label(model_classification['label'])
                explanation = model_classification['explanation']
            except (json.JSONDecodeError, TypeError, KeyError, AttributeError) as e:
                # JSONDecodeError: malformed JSON; TypeError: missing cell or non-dict JSON;
                # KeyError: required entry absent; AttributeError: non-string cell or label.
                print(f"Row {index} Model Classification could not be decoded: {e}")
                print(raw_classification)
                label = None
                explanation = None

        df.at[index, 'Model Classification Label'] = label
        df.at[index, 'Model Classification Explanation'] = explanation

        # Only string labels can be normalized; leave missing values as they are.
        target_label = df.at[index, 'Label']
        if isinstance(target_label, str):
            df.at[index, 'Label'] = _normalize_label(target_label)
    return df

In [5]:
def eval_predictions_all_labels(df, include_not_originally_downloaded=True):
    """Score the model's predictions against the target labels (three classes).

    Only rows whose 'Reference Article Downloaded' value is 'Yes' are
    evaluated. When ``include_not_originally_downloaded`` is False, rows whose
    'Reference Article PDF Available' value is not 'Yes' are skipped as well.

    Args:
        df: DataFrame with the columns 'Reference Article Downloaded',
            'Label', 'Model Classification Label', 'Model Classification' and,
            if ``include_not_originally_downloaded`` is False,
            'Reference Article PDF Available'.
        include_not_originally_downloaded: Whether to also evaluate rows whose
            'Reference Article PDF Available' value is not 'Yes'.

    Returns:
        dict with the keys
        - 'accuracy': correct / total rounded to 4 digits (0.0 if no row was evaluated),
        - 'total', 'correct', 'false_predictions': counters,
        - 'type_false_predictions': nested dict ``[target label][model label]``
          counting misclassifications between valid labels.
    """
    valid_labels = ('unsubstantiate', 'partially substantiate', 'fully substantiate')

    total = 0
    correct = 0
    false_predictions = 0

    # Confusion counts: first level is the target label, second level the model label.
    type_false_predictions = {
        target: {predicted: 0 for predicted in valid_labels if predicted != target}
        for target in valid_labels
    }

    for index, row in df.iterrows():
        if row['Reference Article Downloaded'] != 'Yes':
            continue
        if not include_not_originally_downloaded and row['Reference Article PDF Available'] != 'Yes':
            continue

        total += 1
        target_label = row['Label']
        model_label = row['Model Classification Label']

        if target_label == model_label:
            correct += 1
            continue

        false_predictions += 1
        if model_label not in valid_labels:
            print(f"Row {index} Model Classification Label is not a valid label: {model_label}")
            print(row['Model Classification'])
        elif target_label not in valid_labels:
            # An unexpected target label has no bucket in the confusion counts;
            # report it instead of failing with a KeyError.
            print(f"Row {index} Label is not a valid label: {target_label}")
        else:
            type_false_predictions[target_label][model_label] += 1

    evaluation_results = {
        # Guard against an empty selection, which would otherwise divide by zero.
        'accuracy': round(correct / total, 4) if total else 0.0,
        'total': total,
        'correct': correct,
        'false_predictions': false_predictions,
        'type_false_predictions': type_false_predictions
    }

    return evaluation_results

In [6]:
def replace_substantiate_label(label):
    """Collapse the two 'substantiate' variants into the single label 'substantiate'.

    Returns 'substantiate' for 'partially substantiate' and 'fully substantiate';
    every other value is returned unchanged.
    """
    return 'substantiate' if label in ('partially substantiate', 'fully substantiate') else label

In [7]:
def eval_predictions_two_labels(df, include_not_originally_downloaded=True):
    """Score the model's predictions after merging the labels into two classes.

    Target and model labels are first mapped with ``replace_substantiate_label``
    and then compared. Only rows whose 'Reference Article Downloaded' value is
    'Yes' are evaluated. When ``include_not_originally_downloaded`` is False,
    rows whose 'Reference Article PDF Available' value is not 'Yes' are
    skipped as well.

    Args:
        df: DataFrame with the columns 'Reference Article Downloaded',
            'Label', 'Model Classification Label' and, if
            ``include_not_originally_downloaded`` is False,
            'Reference Article PDF Available'.
        include_not_originally_downloaded: Whether to also evaluate rows whose
            'Reference Article PDF Available' value is not 'Yes'.

    Returns:
        dict with the keys
        - 'accuracy': correct / total rounded to 4 digits (0.0 if no row was evaluated),
        - 'total', 'correct', 'false_predictions': counters,
        - 'false_positives': target 'unsubstantiate' predicted as 'substantiate',
        - 'false_negatives': target 'substantiate' predicted as 'unsubstantiate'.
    """
    total = 0
    correct = 0
    false_predictions = 0
    false_positives = 0
    false_negatives = 0

    for index, row in df.iterrows():
        if row['Reference Article Downloaded'] != 'Yes':
            continue
        if not include_not_originally_downloaded and row['Reference Article PDF Available'] != 'Yes':
            continue

        total += 1
        target_label = replace_substantiate_label(row['Label'])
        model_label = replace_substantiate_label(row['Model Classification Label'])

        if target_label == model_label:
            correct += 1
            continue

        false_predictions += 1
        if model_label not in ['unsubstantiate', 'substantiate']:
            # Invalid model labels count as false predictions but as neither FP nor FN.
            print(f"Row {index} Model Classification Label is not a valid label: {model_label}")
        elif target_label == 'unsubstantiate' and model_label == 'substantiate':
            false_positives += 1
        elif target_label == 'substantiate' and model_label == 'unsubstantiate':
            false_negatives += 1

    evaluation_results = {
        # Guard against an empty selection, which would otherwise divide by zero.
        'accuracy': round(correct / total, 4) if total else 0.0,
        'total': total,
        'correct': correct,
        'false_predictions': false_predictions,
        'false_positives': false_positives,
        'false_negatives': false_negatives
    }

    return evaluation_results

## Gather Results: Model Comparison

In [8]:
# Collected evaluation results, keyed by embedding configuration
# (embedding key plus optional "_no_prev_chunking" / "_batched" suffixes).
all_results = {
    configuration: {}
    for configuration in (
        "te3s",
        "te3s_no_prev_chunking",
        "te3s_no_prev_chunking_batched",
        "te3l",
        "te3l_no_prev_chunking",
        "te3l_no_prev_chunking_batched",
    )
}

In [9]:
def gather_results(model_type, embedding, no_prev_chunking, batched, gpt_model=models[0]):
    """Load one experiment's results, evaluate them and store the scores in ``all_results``.

    The DataFrame is loaded with ``load_df``, reshaped with
    ``reshape_model_classification`` and evaluated four times: with all labels
    and with two labels, each with and without rows whose PDF was not
    originally available. The scores are stored under
    ``all_results[embedding_string][model_type][gpt_model]``, where
    ``embedding_string`` is ``embedding`` plus the optional suffixes
    '_no_prev_chunking' and '_batched'.

    Args:
        model_type: Forwarded to ``load_df``; second-level key in ``all_results``.
        embedding: Forwarded to ``load_df``; base of the first-level key.
        no_prev_chunking: Forwarded to ``load_df``; adds '_no_prev_chunking' to the key.
        batched: Forwarded to ``load_df``; adds '_batched' to the key.
        gpt_model: Forwarded to ``load_df``; third-level key. Defaults to ``models[0]``.

    Returns:
        None. ``all_results`` is updated in place.
    """
    df = load_df(model_type, embedding, no_prev_chunking, gpt_model, batched)
    df = reshape_model_classification(df)

    # Evaluation order is kept fixed because the evaluators print diagnostics.
    results = {
        'all_labels': eval_predictions_all_labels(df),
        'all_labels_exclude_not_available': eval_predictions_all_labels(df, False),
        'two_labels': eval_predictions_two_labels(df),
        'two_labels_exclude_not_available': eval_predictions_two_labels(df, False)
    }

    embedding_string = embedding + ('_no_prev_chunking' if no_prev_chunking else '') + ('_batched' if batched else '')

    # Create the bucket on demand so configurations that were not pre-registered
    # in `all_results` do not fail with a KeyError.
    results_for_embedding = all_results.setdefault(embedding_string, {})
    if not isinstance(results_for_embedding.get(model_type), dict):
        results_for_embedding[model_type] = {}
    results_for_embedding[model_type][gpt_model] = results
    

### Small GROBID Model
- PDF text extracted with smaller GROBID model 
- full text from TEI document directly fed into index generation 
- Model temperature set to 0 (for top 3 excerpts retrieval and for classification via model prompting)
- Used embedding for index: text-embedding-3-small
- Used model for prompting: gpt-3.5-turbo-0125

#### Small Text Embedding

In [10]:
# Small GROBID model, "te3s" embedding, no "_no_prev_chunking" variant, non-batched results file.
model_type, embedding = "small_model", "te3s"
no_prev_chunking, batched = False, False
gather_results(model_type, embedding, no_prev_chunking, batched)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
{

Row 22 Model Classification Label is not a valid label: None
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


### Full GROBID Model
- PDF text extracted with full GROBID model
- full text from TEI document directly fed into index generation 
- Model temperature set to 0 (for top 3 excerpts retrieval and for classification via model prompting)
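- Used embedding for index: text-embedding-3-small (the "Large Text Embedding" subsections below use the large embedding, `te3l`, instead)
- Used model for prompting: gpt-3.5-turbo-0125 (unless a subsection heading names a different GPT model)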

#### Small Text Embedding

In [11]:
# Full GROBID model, "te3s" embedding, no "_no_prev_chunking" variant, non-batched results file.
model_type, embedding = "full_model", "te3s"
no_prev_chunking, batched = False, False
gather_results(model_type, embedding, no_prev_chunking, batched)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
{

Row 22 Model Classification Label is not a valid label: None
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


#### Small Text Embedding - no previous chunking

In [12]:
# Full GROBID model, "te3s" embedding, "_no_prev_chunking" variant, non-batched results file.
model_type, embedding = "full_model", "te3s"
no_prev_chunking, batched = True, False
gather_results(model_type, embedding, no_prev_chunking, batched)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
{

Row 22 Model Classification Label is not a valid label: None
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


#### Small Text Embedding - no previous chunking - batched

In [13]:
# Full GROBID model, "te3s" embedding, "_no_prev_chunking" variant, batched results file.
model_type, embedding = "full_model", "te3s"
no_prev_chunking, batched = True, True
gather_results(model_type, embedding, no_prev_chunking, batched)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
{

Row 22 Model Classification Label is not a valid label: None
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


#### Large Text Embedding

In [14]:
# Full GROBID model, "te3l" embedding, no "_no_prev_chunking" variant, non-batched results file.
model_type, embedding = "full_model", "te3l"
no_prev_chunking, batched = False, False
gather_results(model_type, embedding, no_prev_chunking, batched)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{



Row 22 Model Classification Label is not a valid label: None
{

Row 22 Model Classification Label is not a valid label: None
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


#### Large Text Embedding - no previous chunking

##### GPT 3.5 Turbo

In [15]:
# Full GROBID model, "te3l" embedding, "_no_prev_chunking" variant, non-batched results file,
# evaluated for the first entry of `models`.
model_type, embedding = "full_model", "te3l"
no_prev_chunking, batched = True, False
model = models[0]
# Pass the selected model explicitly (as the sibling GPT-4 / GPT-4o cells do) instead of
# relying on it coinciding with the default of `gather_results`.
gather_results(model_type, embedding, no_prev_chunking, batched, model)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
{

Row 22 Model Classification Label is not a valid label: None
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


##### GPT 4

In [16]:
# Full GROBID model, "te3l" embedding, "_no_prev_chunking" variant, non-batched results file,
# evaluated for the second entry of `models`.
model_type, embedding = "full_model", "te3l"
no_prev_chunking, batched = True, False
model = models[1]
gather_results(model_type, embedding, no_prev_chunking, batched, gpt_model=model)

##### GPT 4o

In [17]:
# Full GROBID model, "te3l" embedding, "_no_prev_chunking" variant, non-batched results file,
# evaluated for the third entry of `models`.
model_type, embedding = "full_model", "te3l"
no_prev_chunking, batched = True, False
model = models[2]
gather_results(model_type, embedding, no_prev_chunking, batched, gpt_model=model)

#### Large Text Embedding - no previous chunking - batched

In [18]:
# Full GROBID model, "te3l" embedding, "_no_prev_chunking" variant, batched results file.
model_type, embedding = "full_model", "te3l"
no_prev_chunking, batched = True, True
gather_results(model_type, embedding, no_prev_chunking, batched)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
{

Row 22 Model Classification Label is not a valid label: None
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


### Full GROBID Model + TEI document text refactoring
- PDF text extracted with full GROBID model
- Only text from body of papers (actual content) extracted from TEI documents
- Model temperature set to 0 (for top 3 excerpts retrieval and for classification via model prompting)
- Used embedding for index: text-embedding-3-small (the "Large Text Embedding" subsections below use the large embedding, `te3l`, instead)
- Used model for prompting: gpt-3.5-turbo-0125

#### Small Text Embedding

In [19]:
# "full_model_texts" run, "te3s" embedding, no "_no_prev_chunking" variant, non-batched results file.
model_type, embedding = "full_model_texts", "te3s"
no_prev_chunking, batched = False, False
gather_results(model_type, embedding, no_prev_chunking, batched)

#### Small Text Embedding - no previous chunking

In [20]:
# "full_model_texts" run, "te3s" embedding, "_no_prev_chunking" variant, non-batched results file.
model_type, embedding = "full_model_texts", "te3s"
no_prev_chunking, batched = True, False
gather_results(model_type, embedding, no_prev_chunking, batched)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
{

Row 22 Model Classification Label is not a valid label: None
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


#### Small Text Embedding - no previous chunking - batched

In [21]:
# "full_model_texts" run, "te3s" embedding, "_no_prev_chunking" variant, batched results file.
model_type, embedding = "full_model_texts", "te3s"
no_prev_chunking, batched = True, True
gather_results(model_type, embedding, no_prev_chunking, batched)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
{

Row 22 Model Classification Label is not a valid label: None
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


#### Large Text Embedding

In [22]:
# "full_model_texts" run, "te3l" embedding, no "_no_prev_chunking" variant, non-batched results file.
model_type, embedding = "full_model_texts", "te3l"
no_prev_chunking, batched = False, False
gather_results(model_type, embedding, no_prev_chunking, batched)

#### Large Text Embedding - no previous chunking

In [23]:
# "full_model_texts" run, "te3l" embedding, "_no_prev_chunking" variant, non-batched results file.
model_type, embedding = "full_model_texts", "te3l"
no_prev_chunking, batched = True, False
gather_results(model_type, embedding, no_prev_chunking, batched)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
{

Row 22 Model Classification Label is not a valid label: None
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


#### Large Text Embedding - no previous chunking - batched

In [24]:
# "full_model_texts" run, "te3l" embedding, "_no_prev_chunking" variant, batched results file.
model_type, embedding = "full_model_texts", "te3l"
no_prev_chunking, batched = True, True
gather_results(model_type, embedding, no_prev_chunking, batched)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
{

Row 22 Model Classification Label is not a valid label: None
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


### Save results

In [25]:
import json

# Write every gathered evaluation result to a single, indented JSON file.
results_path = "../data/all_results.json"
with open(results_path, "w") as results_file:
    json.dump(all_results, results_file, indent=4)