# Result Analysis

In [221]:
# Prompting models compared in this notebook; models[0] is used as the default model.
models = [
    "gpt-3.5-turbo-0125",
    "gpt-4-0125-preview",
    "gpt-4o-2024-05-13",
    "llama3.1:70b",
]

In [222]:
import pandas as pd

def load_df(model_type, embedding, no_prev_chunking, gpt_model, batched):
    """Read the pickled results DataFrame for one experiment configuration.

    The file location is assembled from the arguments:
    ``../data/dfs/<embedding>[_no_prev_chunking]/<model_type>/``
    ``ReferenceErrorDetection_data_with_prompt_results[_batched][_<gpt_model>].pkl``

    Args:
        model_type: Name of the sub-directory below the embedding directory.
        embedding: Embedding identifier forming the first directory component.
        no_prev_chunking: If truthy, ``_no_prev_chunking`` is appended to the embedding directory.
        gpt_model: Prompting model name; appended to the file name unless it equals ``models[0]``.
        batched: If truthy, ``_batched`` is appended to the file name.

    Returns:
        Whatever ``pd.read_pickle`` yields for that path.
    """
    chunking_suffix = '_no_prev_chunking' if no_prev_chunking else ''
    batched_suffix = '_batched' if batched else ''
    # Files produced with the default model (models[0]) carry no model suffix.
    model_suffix = '' if not gpt_model != models[0] else '_'+gpt_model
    file_name = f"ReferenceErrorDetection_data_with_prompt_results{batched_suffix}{model_suffix}.pkl"
    # NOTE(review): unpickling executes arbitrary code; only load files from a trusted source.
    return pd.read_pickle(f"../data/dfs/{embedding}{chunking_suffix}/{model_type}/{file_name}")

In [223]:
def remove_json_colons(json_text):
    """Trim everything outside the outermost ``{...}`` span of a text.

    Despite its name, this helper does not touch colons: it drops whatever
    precedes the first ``{`` and whatever follows the last ``}`` (for example
    Markdown code fences wrapped around a JSON object).

    Args:
        json_text: Text to trim. Values that are not ``str`` (such as ``None``
            or a NaN float) are returned unchanged.

    Returns:
        The substring from the first ``{`` to the last ``}`` inclusive, or the
        input unchanged when it holds no such pair in that order.
    """
    if not isinstance(json_text, str):
        # A substring test on a non-string (e.g. NaN) would raise TypeError.
        return json_text
    start = json_text.find('{')
    end = json_text.rfind('}')
    if start == -1 or end < start:
        # No opening brace followed by a closing one: keep the text intact
        # instead of slicing it down to an empty string.
        return json_text
    return json_text[start:end + 1]

In [224]:
import json

def _normalize_label(label):
    """Lower-case a label and map a '...substantiated' spelling onto '...substantiate'."""
    label = label.lower()
    return label[:-1] if label.endswith('substantiated') else label


def reshape_model_classification(df):
    """Extract the model's label and explanation from its JSON answer into new columns.

    For every row whose 'Reference Article Downloaded' value is 'Yes', the
    'Model Classification' text is trimmed with ``remove_json_colons`` (which
    removes e.g. a leading ```json and a trailing ``` fence) and parsed as JSON.
    The parsed 'label' is normalised (lower case, 'unsubstantiated' ->
    'unsubstantiate') and stored in 'Model Classification Label'; the parsed
    'explanation' is stored in 'Model Classification Explanation'. Rows that
    were not downloaded, or whose answer cannot be parsed into a string label,
    get ``None`` in both columns. The target column 'Label' is normalised the
    same way so both labels are directly comparable.

    Args:
        df: DataFrame with the columns 'Reference Article Downloaded',
            'Model Classification' and 'Label'. It is modified in place.

    Returns:
        The same DataFrame object, for convenience.
    """
    model_labels = []
    model_explanations = []

    for index, row in df.iterrows():
        label = explanation = None
        if row['Reference Article Downloaded'] == 'Yes':
            raw_classification = remove_json_colons(row['Model Classification'])
            try:
                model_classification = json.loads(raw_classification)
                label = _normalize_label(model_classification['label'])
                explanation = model_classification.get('explanation')
            except (json.JSONDecodeError, TypeError, KeyError, AttributeError) as e:
                # Besides malformed JSON this also covers a missing payload (TypeError),
                # a missing 'label' key (KeyError) and a non-string label (AttributeError).
                print(f"Row {index} Model Classification could not be decoded: {e}")
                print(raw_classification)
                label = explanation = None
        model_labels.append(label)
        model_explanations.append(explanation)

    # Assign whole object-dtype columns at once so that None values are kept as None.
    df['Model Classification Label'] = pd.Series(model_labels, index=df.index, dtype=object)
    df['Model Classification Explanation'] = pd.Series(model_explanations, index=df.index, dtype=object)
    df['Label'] = pd.Series(
        [_normalize_label(value) if isinstance(value, str) else value for value in df['Label']],
        index=df.index,
        dtype=object,
    )
    return df

In [225]:
def eval_predictions_all_labels(df, include_not_originally_downloaded=True):
    """Score the model's three-way classification against the target labels.

    Only rows whose 'Reference Article Downloaded' value is 'Yes' are scored.
    When ``include_not_originally_downloaded`` is False, rows must additionally
    have 'Reference Article PDF Available' equal to 'Yes'.

    Args:
        df: DataFrame with the columns 'Reference Article Downloaded', 'Label',
            'Model Classification Label' and (only read when the flag is False)
            'Reference Article PDF Available'.
        include_not_originally_downloaded: Whether to skip the PDF-availability filter.

    Returns:
        Dict with 'accuracy' (rounded to 3 decimals; 0.0 when no row was
        scored), 'total', 'correct', 'false_predictions' and 'type_predictions',
        a nested count table whose outer key is the target label and whose
        inner key is the model's label (plus 'total' and 'invalid label').

    Raises:
        KeyError: If a scored row has a target label other than the three known labels.
    """
    valid_labels = ('unsubstantiate', 'partially substantiate', 'fully substantiate')

    # What was the target label and what did the model predict
    # (first hierarchy is target label, second is model label).
    type_predictions = {
        target: {'total': 0, **{predicted: 0 for predicted in valid_labels}, 'invalid label': 0}
        for target in valid_labels
    }
    total = 0
    correct = 0
    false_predictions = 0

    for index, row in df.iterrows():
        if row['Reference Article Downloaded'] != 'Yes':
            continue
        # The availability column is only consulted when the filter is active.
        if not (include_not_originally_downloaded or row['Reference Article PDF Available'] == 'Yes'):
            continue

        total += 1
        target_label = row['Label']
        type_predictions[target_label]['total'] += 1
        model_label = row['Model Classification Label']

        if model_label not in valid_labels:
            # Unparseable or unknown answers count as wrong predictions.
            false_predictions += 1
            type_predictions[target_label]['invalid label'] += 1
            print(f"Row {index} Model Classification Label is not a valid label: {model_label}")
            continue

        type_predictions[target_label][model_label] += 1
        if target_label == model_label:
            correct += 1
        else:
            false_predictions += 1

    return {
        # Guard against an empty selection instead of raising ZeroDivisionError.
        'accuracy': round(correct / total, 3) if total else 0.0,
        'total': total,
        'correct': correct,
        'false_predictions': false_predictions,
        'type_predictions': type_predictions
    }

In [226]:
def replace_substantiate_label(label):
    """Collapse both substantiating labels into 'substantiate'; return any other value as is."""
    merged_labels = ('partially substantiate', 'fully substantiate')
    return 'substantiate' if label in merged_labels else label

In [227]:
def eval_predictions_two_labels(df, include_not_originally_downloaded=True):
    """Score the classification after merging both substantiating labels into one.

    Target and model labels are passed through ``replace_substantiate_label``
    before comparison. Only rows whose 'Reference Article Downloaded' value is
    'Yes' are scored; when ``include_not_originally_downloaded`` is False, rows
    must additionally have 'Reference Article PDF Available' equal to 'Yes'.

    Args:
        df: DataFrame with the columns 'Reference Article Downloaded', 'Label',
            'Model Classification Label' and (only read when the flag is False)
            'Reference Article PDF Available'.
        include_not_originally_downloaded: Whether to skip the PDF-availability filter.

    Returns:
        Dict with 'accuracy' (rounded to 3 decimals; 0.0 when no row was
        scored), 'total', 'correct', 'false_predictions', 'false_positives'
        (target 'unsubstantiate', model 'substantiate') and 'false_negatives'
        (target 'substantiate', model 'unsubstantiate'). Invalid model labels
        count as false predictions but as neither false positive nor negative.
    """
    total = 0
    correct = 0
    false_predictions = 0
    false_positives = 0
    false_negatives = 0

    for index, row in df.iterrows():
        if row['Reference Article Downloaded'] != 'Yes':
            continue
        # The availability column is only consulted when the filter is active.
        if not (include_not_originally_downloaded or row['Reference Article PDF Available'] == 'Yes'):
            continue

        total += 1
        target_label = replace_substantiate_label(row['Label'])
        model_label = replace_substantiate_label(row['Model Classification Label'])

        if target_label == model_label:
            correct += 1
            continue

        false_predictions += 1
        if model_label not in ['unsubstantiate', 'substantiate']:
            print(f"Row {index} Model Classification Label is not a valid label: {model_label}")
        elif target_label == 'unsubstantiate' and model_label == 'substantiate':
            false_positives += 1
        elif target_label == 'substantiate' and model_label == 'unsubstantiate':
            false_negatives += 1

    return {
        # Guard against an empty selection instead of raising ZeroDivisionError.
        'accuracy': round(correct / total, 3) if total else 0.0,
        'total': total,
        'correct': correct,
        'false_predictions': false_predictions,
        'false_positives': false_positives,
        'false_negatives': false_negatives
    }

## Gather Results: Model Comparison

In [228]:
# Result store: one (initially empty) dictionary per index configuration key.
all_results = {
    f"{embedding_name}{variant}": {}
    for embedding_name in ("te3s", "te3l")
    for variant in ("", "_no_prev_chunking", "_no_prev_chunking_batched")
}

In [229]:
def gather_results(model_type, embedding, no_prev_chunking, batched, gpt_model=models[0]):
    """Load one experiment's results, evaluate them and record the scores in ``all_results``.

    The scores are stored at
    ``all_results[<embedding>[_no_prev_chunking][_batched]][model_type][gpt_model]``.

    Args:
        model_type: Forwarded to ``load_df``; second-level key in ``all_results``.
        embedding: Forwarded to ``load_df``; start of the first-level key in ``all_results``.
        no_prev_chunking: Forwarded to ``load_df``; adds ``_no_prev_chunking`` to the key if truthy.
        batched: Forwarded to ``load_df``; adds ``_batched`` to the key if truthy.
        gpt_model: Prompting model name; third-level key. Defaults to ``models[0]``.

    Returns:
        None. The module-level ``all_results`` dictionary is updated in place.
    """
    df = load_df(model_type, embedding, no_prev_chunking, gpt_model, batched)
    df = reshape_model_classification(df)

    # Evaluated in this order so the printed diagnostics keep their sequence.
    results = {
        'all_labels': eval_predictions_all_labels(df),
        'all_labels_exclude_not_available': eval_predictions_all_labels(df, False),
        'two_labels': eval_predictions_two_labels(df),
        'two_labels_exclude_not_available': eval_predictions_two_labels(df, False)
    }

    embedding_string = embedding + ('_no_prev_chunking' if no_prev_chunking else '') + ('_batched' if batched else '')

    # setdefault: configurations that were not pre-declared in all_results
    # (e.g. batched together with previous chunking) no longer raise KeyError.
    results_for_embedding = all_results.setdefault(embedding_string, {})
    if not isinstance(results_for_embedding.get(model_type), dict):
        results_for_embedding[model_type] = {}
    results_for_embedding[model_type][gpt_model] = results
    

### Small GROBID Model
- PDF text extracted with smaller GROBID model 
- full text from TEI document directly fed into index generation 
- Model temperature set to 0 (for top 3 excerpts retrieval and for classification via model prompting)
- Used embedding for index: text-embedding-3-small
- Used model for prompting: gpt-3.5-turbo-0125

#### Small Text Embedding

In [230]:
# Configuration: small_model / te3s / with previous chunking / unbatched.
model_type = "small_model"
embedding = "te3s"
no_prev_chunking = False
batched = False
gather_results(model_type=model_type, embedding=embedding, no_prev_chunking=no_prev_chunking, batched=batched)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


### Full GROBID Model
- PDF text extracted with full GROBID model
- full text from TEI document directly fed into index generation 
- Model temperature set to 0 (for top 3 excerpts retrieval and for classification via model prompting)
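- Used embedding for index: text-embedding-3-small or text-embedding-3-large (see the subsection headings)
- Used model for prompting: gpt-3.5-turbo-0125, unless a subsection names a different model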

#### Small Text Embedding

In [231]:
# Configuration: full_model / te3s / with previous chunking / unbatched.
model_type = "full_model"
embedding = "te3s"
no_prev_chunking = False
batched = False
gather_results(model_type=model_type, embedding=embedding, no_prev_chunking=no_prev_chunking, batched=batched)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


#### Small Text Embedding - no previous chunking

In [232]:
# Configuration: full_model / te3s / no previous chunking / unbatched.
model_type = "full_model"
embedding = "te3s"
no_prev_chunking = True
batched = False
gather_results(model_type=model_type, embedding=embedding, no_prev_chunking=no_prev_chunking, batched=batched)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


#### Small Text Embedding - no previous chunking - batched

In [233]:
# Configuration: full_model / te3s / no previous chunking / batched.
model_type = "full_model"
embedding = "te3s"
no_prev_chunking = True
batched = True
gather_results(model_type=model_type, embedding=embedding, no_prev_chunking=no_prev_chunking, batched=batched)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


#### Large Text Embedding

In [234]:
# Configuration: full_model / te3l / with previous chunking / unbatched.
model_type = "full_model"
embedding = "te3l"
no_prev_chunking = False
batched = False
gather_results(model_type=model_type, embedding=embedding, no_prev_chunking=no_prev_chunking, batched=batched)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


#### Large Text Embedding - no previous chunking

##### GPT 3.5 Turbo

In [235]:
# Configuration: full_model / te3l / no previous chunking / unbatched / models[0].
model_type = "full_model"
embedding = "te3l"
no_prev_chunking = True
batched = False
model = models[0]
# Pass the selected model explicitly, as the sibling model-comparison cells do,
# instead of leaving `model` unused and relying on the default argument.
gather_results(model_type, embedding, no_prev_chunking, batched, model)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


##### GPT 4

In [236]:
# Configuration: full_model / te3l / no previous chunking / unbatched / models[1].
model_type = "full_model"
embedding = "te3l"
no_prev_chunking = True
batched = False
model = models[1]
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
    gpt_model=model,
)

##### GPT 4o

In [237]:
# Configuration: full_model / te3l / no previous chunking / unbatched / models[2].
model_type = "full_model"
embedding = "te3l"
no_prev_chunking = True
batched = False
model = models[2]
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
    gpt_model=model,
)

##### Llama 3.1 (70b)

In [238]:
# Configuration: full_model / te3l / no previous chunking / unbatched / models[3].
model_type = "full_model"
embedding = "te3l"
no_prev_chunking = True
batched = False
model = models[3]
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
    gpt_model=model,
)

#### Large Text Embedding - no previous chunking - batched

In [239]:
# Configuration: full_model / te3l / no previous chunking / batched.
model_type = "full_model"
embedding = "te3l"
no_prev_chunking = True
batched = True
gather_results(model_type=model_type, embedding=embedding, no_prev_chunking=no_prev_chunking, batched=batched)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


### Full GROBID Model + TEI document text refactoring
- PDF text extracted with full GROBID model
- Only text from body of papers (actual content) extracted from TEI documents
- Model temperature set to 0 (for top 3 excerpts retrieval and for classification via model prompting)
- Used embedding for index: text-embedding-3-small or text-embedding-3-large (see the subsection headings)
- Used model for prompting: gpt-3.5-turbo-0125

#### Small Text Embedding

In [240]:
# Configuration: full_model_texts / te3s / with previous chunking / unbatched.
model_type = "full_model_texts"
embedding = "te3s"
no_prev_chunking = False
batched = False
gather_results(model_type=model_type, embedding=embedding, no_prev_chunking=no_prev_chunking, batched=batched)

#### Small Text Embedding - no previous chunking

In [241]:
# Configuration: full_model_texts / te3s / no previous chunking / unbatched.
model_type = "full_model_texts"
embedding = "te3s"
no_prev_chunking = True
batched = False
gather_results(model_type=model_type, embedding=embedding, no_prev_chunking=no_prev_chunking, batched=batched)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


#### Small Text Embedding - no previous chunking - batched

In [242]:
# Configuration: full_model_texts / te3s / no previous chunking / batched.
model_type = "full_model_texts"
embedding = "te3s"
no_prev_chunking = True
batched = True
gather_results(model_type=model_type, embedding=embedding, no_prev_chunking=no_prev_chunking, batched=batched)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


#### Large Text Embedding

In [243]:
# Configuration: full_model_texts / te3l / with previous chunking / unbatched.
model_type = "full_model_texts"
embedding = "te3l"
no_prev_chunking = False
batched = False
gather_results(model_type=model_type, embedding=embedding, no_prev_chunking=no_prev_chunking, batched=batched)

#### Large Text Embedding - no previous chunking

In [244]:
# Configuration: full_model_texts / te3l / no previous chunking / unbatched.
model_type = "full_model_texts"
embedding = "te3l"
no_prev_chunking = True
batched = False
gather_results(model_type=model_type, embedding=embedding, no_prev_chunking=no_prev_chunking, batched=batched)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


#### Large Text Embedding - no previous chunking - batched

In [245]:
# Configuration: full_model_texts / te3l / no previous chunking / batched.
model_type = "full_model_texts"
embedding = "te3l"
no_prev_chunking = True
batched = True
gather_results(model_type=model_type, embedding=embedding, no_prev_chunking=no_prev_chunking, batched=batched)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


### Save results

In [246]:
import json

# Write the accumulated all_results dictionary to disk as JSON with 4-space indentation.
with open("../data/all_results.json", "w") as json_file:
    json.dump(all_results, json_file, indent=4)

## Reproduce Table from Paper

In [247]:
# The table is built from the 'full_model' entries stored under 'te3l_no_prev_chunking',
# which hold one result set per prompting model.
results_for_table = all_results['te3l_no_prev_chunking']['full_model']

In [248]:
# Display the raw evaluation dictionaries the table is derived from.
results_for_table

{'gpt-3.5-turbo-0125': {'all_labels': {'accuracy': 0.567,
   'total': 247,
   'correct': 140,
   'false_predictions': 107,
   'type_predictions': {'unsubstantiate': {'total': 109,
     'unsubstantiate': 78,
     'partially substantiate': 16,
     'fully substantiate': 14,
     'invalid label': 1},
    'partially substantiate': {'total': 14,
     'unsubstantiate': 2,
     'partially substantiate': 7,
     'fully substantiate': 5,
     'invalid label': 0},
    'fully substantiate': {'total': 124,
     'unsubstantiate': 11,
     'partially substantiate': 58,
     'fully substantiate': 55,
     'invalid label': 0}}},
  'all_labels_exclude_not_available': {'accuracy': 0.57,
   'total': 244,
   'correct': 139,
   'false_predictions': 105,
   'type_predictions': {'unsubstantiate': {'total': 108,
     'unsubstantiate': 78,
     'partially substantiate': 15,
     'fully substantiate': 14,
     'invalid label': 1},
    'partially substantiate': {'total': 14,
     'unsubstantiate': 2,
     'parti

In [249]:
def calc_label_accuracies(results, exclude_not_available=False):
    """Derive per-label and overall accuracies for every model.

    Args:
        results: Mapping of model name to that model's evaluation results; each
            value must contain the category 'all_labels' (or
            'all_labels_exclude_not_available') with the keys 'accuracy' and
            'type_predictions'.
        exclude_not_available: Selects the '_exclude_not_available' category
            instead of 'all_labels'.

    Returns:
        Mapping of model name to a dict with the keys 'unsubstantiate',
        'partially substantiate', 'fully substantiate' and 'overall'. A label
        accuracy is the share of that target label's rows predicted with the
        same label, rounded to 3 decimals; it is NaN when the label has no
        target rows and stays None when the label is absent from
        'type_predictions'.
    """
    category_name = 'all_labels' + ('_exclude_not_available' if exclude_not_available else '')

    label_accuracies = {}
    for model, model_results in results.items():
        category_results = model_results[category_name]
        accuracies = {
            'unsubstantiate': None,
            'partially substantiate': None,
            'fully substantiate': None,
            'overall': None,
        }
        for label, counts in category_results['type_predictions'].items():
            # Accuracy is undefined for a label without target rows: report NaN
            # instead of raising ZeroDivisionError.
            accuracies[label] = (
                round(counts[label] / counts['total'], 3) if counts['total'] else float('nan')
            )
        accuracies['overall'] = round(category_results['accuracy'], 3)
        label_accuracies[model] = accuracies

    return label_accuracies

In [250]:
from tabulate import tabulate

def print_accuracies(label_accuracies):
    """Print one table row per model with its values shown as percentages.

    Args:
        label_accuracies: Mapping of model name to a dict with the keys
            'unsubstantiate', 'partially substantiate', 'fully substantiate'
            and 'overall'; every value is multiplied by 100 and shown with one
            decimal place.
    """
    value_keys = ('unsubstantiate', 'partially substantiate', 'fully substantiate', 'overall')
    headers = ['Model', 'Un', 'Partially', 'Fully', 'Overall']

    # One row per model: the model name followed by the four formatted values.
    table_data = []
    for model, accuracies in label_accuracies.items():
        formatted_values = [f"{accuracies[value_key] * 100:.1f}" for value_key in value_keys]
        table_data.append([model, *formatted_values])

    print(tabulate(table_data, headers=headers, tablefmt='pretty'))

In [251]:
# Accuracies over every downloaded reference (no availability filter).
print("Label Accuracies - Include All Downloaded")
print_accuracies(calc_label_accuracies(results_for_table, exclude_not_available=False))

Label Accuracies - Include All Downloaded
+--------------------+------+-----------+-------+---------+
|       Model        |  Un  | Partially | Fully | Overall |
+--------------------+------+-----------+-------+---------+
| gpt-3.5-turbo-0125 | 71.6 |   50.0    | 44.4  |  56.7   |
| gpt-4-0125-preview | 85.3 |   14.3    | 62.9  |  70.0   |
| gpt-4o-2024-05-13  | 84.4 |   57.1    | 36.3  |  58.7   |
|    llama3.1:70b    | 87.2 |   64.3    | 27.4  |  55.9   |
+--------------------+------+-----------+-------+---------+


In [252]:
# Accuracies taken from the 'all_labels_exclude_not_available' results.
print("Label Accuracies - Exclude Not Available in Paper")
print_accuracies(calc_label_accuracies(results_for_table, exclude_not_available=True))

Label Accuracies - Exclude Not Available in Paper
+--------------------+------+-----------+-------+---------+
|       Model        |  Un  | Partially | Fully | Overall |
+--------------------+------+-----------+-------+---------+
| gpt-3.5-turbo-0125 | 72.2 |   50.0    | 44.3  |  57.0   |
| gpt-4-0125-preview | 85.2 |   14.3    | 63.1  |  70.1   |
| gpt-4o-2024-05-13  | 84.3 |   57.1    | 36.1  |  58.6   |
|    llama3.1:70b    | 87.0 |   64.3    | 27.0  |  55.7   |
+--------------------+------+-----------+-------+---------+


### Compare to paper results

In [253]:
# Reproduced accuracies as fractions (all downloaded references), for comparison below.
calc_label_accuracies(results_for_table)

{'gpt-3.5-turbo-0125': {'unsubstantiate': 0.716,
  'partially substantiate': 0.5,
  'fully substantiate': 0.444,
  'overall': 0.567},
 'gpt-4-0125-preview': {'unsubstantiate': 0.853,
  'partially substantiate': 0.143,
  'fully substantiate': 0.629,
  'overall': 0.7},
 'gpt-4o-2024-05-13': {'unsubstantiate': 0.844,
  'partially substantiate': 0.571,
  'fully substantiate': 0.363,
  'overall': 0.587},
 'llama3.1:70b': {'unsubstantiate': 0.872,
  'partially substantiate': 0.643,
  'fully substantiate': 0.274,
  'overall': 0.559}}

In [254]:
# Reference accuracies to compare against, as fractions per model and label
# (presumably transcribed from the paper's results table; verify against the publication).
paper_results = {
    'gpt-3.5-turbo-0125': {
        'unsubstantiate': 0.795, 'partially substantiate': 0.571, 'fully substantiate': 0.306, 'overall': 0.540,
    },
    'gpt-4-0125-preview': {
        'unsubstantiate': 0.839, 'partially substantiate': 0.214, 'fully substantiate': 0.629, 'overall': 0.700,
    },
    'gpt-4o-2024-05-13': {
        'unsubstantiate': 0.866, 'partially substantiate': 0.500, 'fully substantiate': 0.347, 'overall': 0.588,
    },
}

In [255]:
def calc_difference_between_results(calculated_results, paper_results):
    """Relative deviation of the calculated accuracies from the paper's values.

    For every model present in both mappings and every key of the paper's
    entry, the result is ``1 - calculated / paper`` rounded to 3 decimals:
    positive when the calculated value is lower than the paper's, negative
    when it is higher.

    Args:
        calculated_results: Mapping of model name to a dict of accuracies.
        paper_results: Mapping of model name to a dict of reference accuracies;
            it determines which models and keys are compared.

    Returns:
        Mapping of model name to a dict of relative deviations. A deviation is
        NaN when the paper's value is 0, since the ratio is undefined there.

    Raises:
        KeyError: If a compared model lacks one of the paper entry's keys.
    """
    differences = {}
    for model, results in paper_results.items():
        if model not in calculated_results:
            # Models without a calculated counterpart are skipped.
            continue
        calculated = calculated_results[model]
        differences[model] = {
            # Guard the division: a reference value of 0 yields NaN, not ZeroDivisionError.
            key: round(1 - (1 / value * calculated[key]), 3) if value else float('nan')
            for key, value in results.items()
        }
    return differences

In [256]:
# Relative deviation from the paper per model and label (positive: the reproduced value is lower).
calc_difference_between_results(calc_label_accuracies(results_for_table), paper_results)

{'gpt-3.5-turbo-0125': {'unsubstantiate': 0.099,
  'partially substantiate': 0.124,
  'fully substantiate': -0.451,
  'overall': -0.05},
 'gpt-4-0125-preview': {'unsubstantiate': -0.017,
  'partially substantiate': 0.332,
  'fully substantiate': 0.0,
  'overall': 0.0},
 'gpt-4o-2024-05-13': {'unsubstantiate': 0.025,
  'partially substantiate': -0.142,
  'fully substantiate': -0.046,
  'overall': 0.002}}

In [257]:
# Print three tables in sequence: reproduced accuracies, the paper's values,
# and the relative deviation between the two.
label_accuracies = calc_label_accuracies(results_for_table, exclude_not_available=True)

print("Label Accuracies - Exclude Not Available in Paper")
print_accuracies(label_accuracies)
print("")

print("Paper Results")
print_accuracies(paper_results)
print("")

print("Difference from paper to calculated results")
print_accuracies(
    calc_difference_between_results(label_accuracies, paper_results)
)

Label Accuracies - Exclude Not Available in Paper
+--------------------+------+-----------+-------+---------+
|       Model        |  Un  | Partially | Fully | Overall |
+--------------------+------+-----------+-------+---------+
| gpt-3.5-turbo-0125 | 72.2 |   50.0    | 44.3  |  57.0   |
| gpt-4-0125-preview | 85.2 |   14.3    | 63.1  |  70.1   |
| gpt-4o-2024-05-13  | 84.3 |   57.1    | 36.1  |  58.6   |
|    llama3.1:70b    | 87.0 |   64.3    | 27.0  |  55.7   |
+--------------------+------+-----------+-------+---------+

Paper Results
+--------------------+------+-----------+-------+---------+
|       Model        |  Un  | Partially | Fully | Overall |
+--------------------+------+-----------+-------+---------+
| gpt-3.5-turbo-0125 | 79.5 |   57.1    | 30.6  |  54.0   |
| gpt-4-0125-preview | 83.9 |   21.4    | 62.9  |  70.0   |
| gpt-4o-2024-05-13  | 86.6 |   50.0    | 34.7  |  58.8   |
+--------------------+------+-----------+-------+---------+

Difference from paper to calculate