# Result Analysis

In [19]:
# Classification models covered by this analysis.
# NOTE(review): the cells shown below set `model` by hand instead of iterating this list — confirm it is still needed.
models = ["gpt-3.5-turbo-0125", "gpt-4.1-nano-2025-04-14", "gpt-4.1-mini-2025-04-14", "gpt-4.1-2025-04-14", "llama3.1.70b", "llama4.scout"]
# Chunking configurations; presumably encoded as "<chunk size>_<token overlap>" (sizes 256 / 1024, overlap 20).
chunkings = ["256_20", "1024_20"]
# Both settings of the only_text hyperparameter (see the hyperparameter notes further down).
only_texts = [False, True]

In [20]:
%run ../scripts/load_df_for_analysis.py

In [21]:
%run ../scripts/df_calculations.py

## Gather Results: Model Comparison

In [22]:
# Accumulator filled by gather_results(): model name -> configuration key -> {"all", "without_relabelled"} results.
all_results = {}

In [23]:
def gather_results(chunking, only_text, model):
    """Evaluate one stored prediction run and record it in ``all_results``.

    The DataFrame for the given ``chunking`` / ``only_text`` / ``model``
    combination is loaded with ``load_df_for_analysis`` and scored twice with
    ``eval_predictions``: once with ``include_relabelled_partially=True`` and
    once with ``include_relabelled_partially=False``.

    Both evaluations are stored under ``all_results[model][config_key]``, where
    ``config_key`` is ``chunking`` prefixed with ``"only_text_"`` when
    ``only_text`` is truthy. An existing entry for the same key is overwritten.

    Returns:
        None. The module-level ``all_results`` dict is mutated in place.
    """
    predictions_df = load_df_for_analysis(chunking, only_text, model)

    scores_by_variant = {
        "all": eval_predictions(predictions_df, include_relabelled_partially=True),
        "without_relabelled": eval_predictions(predictions_df, include_relabelled_partially=False),
    }

    # Same key format as the on-disk folder names, e.g. "only_text_1024_20" or "256_20".
    config_key = f"{'only_text_' if only_text else ''}{chunking}"

    # all_results is only mutated, never rebound, so no `global` declaration is required.
    all_results.setdefault(model, {})[config_key] = scores_by_variant

### Hyperparameters for all evaluations:
- The PDF text of each reference was extracted with the large GROBID model
- Ollama indexing for choosing the best chunks: embeddings from OpenAI's text-embedding-3-large, chunking with SentenceSplitter, and the top 3 matching chunks retrieved for each statement
- The model temperature is set to 0

### Hyperparameters evaluated here:
- only_text: if "True", only the scientific text was extracted (via code) from the GROBID TEI file; otherwise the whole text of the TEI document (including sources, authors, ...) was used for choosing the best chunks
- chunking: the chunk size is set to either 256 or 1024; the token overlap is always kept at 20
- model: different models are evaluated and compared for the actual classification part

### GPT 3.5 Turbo

In [24]:
# Model whose stored results are gathered by the next cell.
model = "gpt-3.5-turbo-0125"

In [25]:
# Gather results for every chunking / only_text combination of the current `model`.
for chunking in chunkings:
    for only_text in only_texts:
        try:
            gather_results(chunking, only_text, model)
        except Exception as e:
            # Report the failing combination and carry on with the remaining ones.
            print(f"Error gathering results for {model}, chunking: {chunking}, only_text: {only_text}: {e}")

Row 21 Model Classification could not be decoded: Expecting value: line 1 column 1 (char 0)

Row 21 Model Classification Label is not a valid label: None
Row 21 Model Classification Label is not a valid label: None
Row 21 Model Classification could not be decoded: Expecting value: line 1 column 1 (char 0)

Row 21 Model Classification Label is not a valid label: None
Row 21 Model Classification Label is not a valid label: None
Row 24 Model Classification could not be decoded: Expecting value: line 1 column 1 (char 0)

Row 24 Model Classification Label is not a valid label: None
Row 24 Model Classification Label is not a valid label: None
Row 21 Model Classification could not be decoded: Expecting value: line 1 column 1 (char 0)

Row 24 Model Classification could not be decoded: Expecting value: line 1 column 1 (char 0)

Row 21 Model Classification Label is not a valid label: None
Row 24 Model Classification Label is not a valid label: None
Row 21 Model Classification Label is not a vali

### GPT 4.1 Nano

In [26]:
# Model whose stored results are gathered by the next cell.
model = "gpt-4.1-nano-2025-04-14"

In [27]:
# Gather results for every chunking / only_text combination of the current `model`.
for chunking in chunkings:
    for only_text in only_texts:
        try:
            gather_results(chunking, only_text, model)
        except Exception as e:
            # Report the failing combination (e.g. a missing result pickle) and carry on with the remaining ones.
            print(f"Error gathering results for {model}, chunking: {chunking}, only_text: {only_text}: {e}")

Error gathering results for gpt-4.1-nano-2025-04-14, chunking: 1024_20, only_text: True: [Errno 2] No such file or directory: '../data/dfs/only_text_1024_20/gpt-4.1-nano-2025-04-14/ReferenceErrorDetection_data_with_prompt_results.pkl'


### GPT 4.1 Mini

In [28]:
# Model whose stored results are gathered by the next cell.
model = "gpt-4.1-mini-2025-04-14"

In [29]:
# Gather results for every chunking / only_text combination of the current `model`.
for chunking in chunkings:
    for only_text in only_texts:
        try:
            gather_results(chunking, only_text, model)
        except Exception as e:
            # Report the failing combination (e.g. a missing result pickle) and carry on with the remaining ones.
            print(f"Error gathering results for {model}, chunking: {chunking}, only_text: {only_text}: {e}")

Error gathering results for gpt-4.1-mini-2025-04-14, chunking: 1024_20, only_text: True: [Errno 2] No such file or directory: '../data/dfs/only_text_1024_20/gpt-4.1-mini-2025-04-14/ReferenceErrorDetection_data_with_prompt_results.pkl'


### Llama 3.1:70b

In [30]:
# Model whose stored results are gathered by the next cell.
model = "llama3.1.70b"

In [31]:
# Gather results for every chunking / only_text combination of the current `model`.
# Error handling matches the other model sections: a failing combination is reported
# and skipped instead of aborting the loop and leaving `all_results` half-populated.
for chunking in chunkings:
    for only_text in only_texts:
        try:
            gather_results(chunking, only_text, model)
        except Exception as e:
            print(f"Error gathering results for {model}, chunking: {chunking}, only_text: {only_text}: {e}")

Row 81 Model Classification could not be decoded: 'label'
{
  "citation": null,
  "conclusion": null,
  "introduction": null,
  "methodology": [
    "Atomic force microscopy force spectroscopy and trans-epithelial electrical resistance assessed changes in cell-cell tethering and paracellular permeability respectively.",
    "Carboxyfluorescein dye uptake, ATP-biosensing, and western blotting were used to assess the ability of Peptide 5 to block hemichannel activity, ATP-release, and ultimately disassembly of the adherens/tight junction complex."
  ],
  "question": null,
  "results": [
    "Co-incubation of TGF-β1 with Peptide 5 significantly reduced dye uptake and restored ATP release to near basal.",
    "Peptide 5 successfully prevented TGF-β1-evoked changes in expression of E-cadherin, N-cadherin, Claudin-2, and ZO-1 in human primary renal proximal tubule cells.",
    "Cx43 +/- mice exhibited minimal disassembly of the adherens and tight junction complex."
  ]
}
Row 81 Model Classif

### Save results

In [32]:
import json

# Serialize before opening the file: opening with "w" truncates it immediately, so a
# serialization error raised by json.dump() mid-write would leave a broken results file.
# With this order a failure leaves any previously saved file untouched.
serialized_results = json.dumps(all_results, indent=4)

with open("../data/all_results.json", "w", encoding="utf-8") as json_file:
    json_file.write(serialized_results)

## Reproduce Table from Paper

In [33]:
# NOTE(review): in the recorded run this lookup raises KeyError. gather_results() keys `all_results`
# by model name and then by configuration ("256_20", "only_text_1024_20", ...), so neither
# 'te3l_no_prev_chunking' nor 'full_model' exists in that layout. The cells below therefore show
# output from an earlier run. TODO: adapt the key path and confirm which configuration the table should use.
results_for_table = all_results['te3l_no_prev_chunking']['full_model']

KeyError: 'te3l_no_prev_chunking'

In [None]:
# Show the raw results selected for the table.
results_for_table

{'gpt-3.5-turbo-0125': {'all_labels': {'accuracy': 0.567,
   'total': 247,
   'correct': 140,
   'false_predictions': 107,
   'type_predictions': {'unsubstantiate': {'total': 109,
     'unsubstantiate': 78,
     'partially substantiate': 16,
     'fully substantiate': 14,
     'invalid label': 1},
    'partially substantiate': {'total': 14,
     'unsubstantiate': 2,
     'partially substantiate': 7,
     'fully substantiate': 5,
     'invalid label': 0},
    'fully substantiate': {'total': 124,
     'unsubstantiate': 11,
     'partially substantiate': 58,
     'fully substantiate': 55,
     'invalid label': 0}}},
  'all_labels_exclude_not_available': {'accuracy': 0.57,
   'total': 244,
   'correct': 139,
   'false_predictions': 105,
   'type_predictions': {'unsubstantiate': {'total': 108,
     'unsubstantiate': 78,
     'partially substantiate': 15,
     'fully substantiate': 14,
     'invalid label': 1},
    'partially substantiate': {'total': 14,
     'unsubstantiate': 2,
     'parti

In [None]:
# Per-model label accuracy table computed from `results_for_table` with the default arguments.
print("Label Accuracies - Include All Downloaded")
print_table_label_accuracies(calc_label_accuracies(results_for_table))

Label Accuracies - Include All Downloaded
+--------------------------------------------------------+------+-----------+-------+---------+
|                         Model                          |  Un  | Partially | Fully | Overall |
+--------------------------------------------------------+------+-----------+-------+---------+
|                   gpt-3.5-turbo-0125                   | 71.6 |   50.0    | 44.4  |  56.7   |
|                   gpt-4-0125-preview                   | 85.3 |   14.3    | 62.9  |  70.0   |
|                   gpt-4o-2024-05-13                    | 84.4 |   57.1    | 36.3  |  58.7   |
|                      llama3.1:70b                      | 87.2 |   64.3    | 27.4  |  55.9   |
|                     llama3.1:405b                      | 87.2 |   50.0    | 25.8  |  54.3   |
|                        llama3.3                        | 89.9 |   50.0    | 17.7  |  51.4   |
|                      llama4:scout                      | 86.2 |   57.1    | 29.0  |  55.9   

In [None]:
# Same table with the second argument set to True — presumably this excludes entries marked as
# not available (per the printed heading); confirm against calc_label_accuracies in df_calculations.py.
print("Label Accuracies - Exclude Not Available in Paper")
print_table_label_accuracies(calc_label_accuracies(results_for_table, True))

Label Accuracies - Exclude Not Available in Paper
+--------------------------------------------------------+------+-----------+-------+---------+
|                         Model                          |  Un  | Partially | Fully | Overall |
+--------------------------------------------------------+------+-----------+-------+---------+
|                   gpt-3.5-turbo-0125                   | 72.2 |   50.0    | 44.3  |  57.0   |
|                   gpt-4-0125-preview                   | 85.2 |   14.3    | 63.1  |  70.1   |
|                   gpt-4o-2024-05-13                    | 84.3 |   57.1    | 36.1  |  58.6   |
|                      llama3.1:70b                      | 87.0 |   64.3    | 27.0  |  55.7   |
|                     llama3.1:405b                      | 87.0 |   50.0    | 25.4  |  54.1   |
|                        llama3.3                        | 89.8 |   50.0    | 17.2  |  51.2   |
|                      llama4:scout                      | 86.1 |   57.1    | 28.7  | 

### Compare to paper results

In [None]:
# Show the per-model label accuracies as a plain dict, for comparison with the paper's numbers.
calc_label_accuracies(results_for_table)

{'gpt-3.5-turbo-0125': {'unsubstantiate': 0.716,
  'partially substantiate': 0.5,
  'fully substantiate': 0.444,
  'overall': 0.567},
 'gpt-4-0125-preview': {'unsubstantiate': 0.853,
  'partially substantiate': 0.143,
  'fully substantiate': 0.629,
  'overall': 0.7},
 'gpt-4o-2024-05-13': {'unsubstantiate': 0.844,
  'partially substantiate': 0.571,
  'fully substantiate': 0.363,
  'overall': 0.587},
 'llama3.1:70b': {'unsubstantiate': 0.872,
  'partially substantiate': 0.643,
  'fully substantiate': 0.274,
  'overall': 0.559},
 'llama3.1:405b': {'unsubstantiate': 0.872,
  'partially substantiate': 0.5,
  'fully substantiate': 0.258,
  'overall': 0.543},
 'llama3.3': {'unsubstantiate': 0.899,
  'partially substantiate': 0.5,
  'fully substantiate': 0.177,
  'overall': 0.514},
 'llama4:scout': {'unsubstantiate': 0.862,
  'partially substantiate': 0.571,
  'fully substantiate': 0.29,
  'overall': 0.559},
 'llama3.1:70b_annotated': {'unsubstantiate': 0.836,
  'partially substantiate': 0,
 

In [None]:
# Reference accuracies used for the comparison with the paper: model -> label (plus 'overall') -> accuracy.
# Values are fractions in [0, 1] and the layout matches the calc_label_accuracies() output.
# NOTE(review): numbers appear to be entered by hand — verify against the paper's table.
paper_results = {
    'gpt-3.5-turbo-0125': {
        'unsubstantiate': 0.795,
        'partially substantiate': 0.571,
        'fully substantiate': 0.306,
        'overall': 0.540,
    },
    'gpt-4-0125-preview': {
        'unsubstantiate': 0.839,
        'partially substantiate': 0.214,
        'fully substantiate': 0.629,
        'overall': 0.700,
    },
    'gpt-4o-2024-05-13': {
        'unsubstantiate': 0.866,
        'partially substantiate': 0.500,
        'fully substantiate': 0.347,
        'overall': 0.588,
    },
}

In [None]:
def calc_difference_between_results(calculated_results, paper_results, ndigits=3):
    """Return the per-model, per-metric differences ``calculated - paper``.

    Args:
        calculated_results: Mapping ``model -> {metric: value}`` with the values
            computed in this notebook.
        paper_results: Mapping ``model -> {metric: value}`` with the reference
            values. Only its models and metrics are compared.
        ndigits: Number of decimal places the differences are rounded to. Plain
            float subtraction yields representation noise such as
            ``-0.07900000000000007``; rounding removes it. Pass ``None`` to get
            the raw, unrounded differences.

    Returns:
        A dict ``model -> {metric: difference}`` containing only the models
        present in both inputs. Positive values mean the calculated result is
        higher than the reference value.

    Raises:
        KeyError: If a model present in both inputs lacks one of the reference
            metrics in ``calculated_results``.
    """
    differences = {}
    for model, reported in paper_results.items():
        # Models without a calculated counterpart are skipped.
        if model not in calculated_results:
            continue
        calculated = calculated_results[model]
        model_differences = {}
        for metric, reported_value in reported.items():
            delta = calculated[metric] - reported_value
            # round(x, None) would return an int, so None is handled explicitly.
            model_differences[metric] = delta if ndigits is None else round(delta, ndigits)
        differences[model] = model_differences
    return differences

In [None]:
# Differences (calculated - paper) for the models present in both result sets.
calc_difference_between_results(calc_label_accuracies(results_for_table), paper_results)

{'gpt-3.5-turbo-0125': {'unsubstantiate': -0.07900000000000007,
  'partially substantiate': -0.07099999999999995,
  'fully substantiate': 0.138,
  'overall': 0.026999999999999913},
 'gpt-4-0125-preview': {'unsubstantiate': 0.014000000000000012,
  'partially substantiate': -0.07100000000000001,
  'fully substantiate': 0.0,
  'overall': 0.0},
 'gpt-4o-2024-05-13': {'unsubstantiate': -0.02200000000000002,
  'partially substantiate': 0.07099999999999995,
  'fully substantiate': 0.016000000000000014,
  'overall': -0.0010000000000000009}}

In [None]:
# Three tables in sequence: the calculated accuracies, the paper's accuracies, and their difference.
# The second argument True presumably selects the "exclude not available" variant (per the printed
# heading) — confirm against calc_label_accuracies in df_calculations.py.
label_accuracies = calc_label_accuracies(results_for_table, True)
print("Label Accuracies - Exclude Not Available in Paper")
print_table_label_accuracies(label_accuracies)
print("")
print("Paper Results")
print_table_label_accuracies(paper_results)
print("")
print("Differences from paper results to calculated paper results")
# The differences dict has the same model -> metric layout, so the same table printer is reused.
print_table_label_accuracies(calc_difference_between_results(label_accuracies, paper_results))

Label Accuracies - Exclude Not Available in Paper
+--------------------------------------------------------+------+-----------+-------+---------+
|                         Model                          |  Un  | Partially | Fully | Overall |
+--------------------------------------------------------+------+-----------+-------+---------+
|                   gpt-3.5-turbo-0125                   | 72.2 |   50.0    | 44.3  |  57.0   |
|                   gpt-4-0125-preview                   | 85.2 |   14.3    | 63.1  |  70.1   |
|                   gpt-4o-2024-05-13                    | 84.3 |   57.1    | 36.1  |  58.6   |
|                      llama3.1:70b                      | 87.0 |   64.3    | 27.0  |  55.7   |
|                     llama3.1:405b                      | 87.0 |   50.0    | 25.4  |  54.1   |
|                        llama3.3                        | 89.8 |   50.0    | 17.2  |  51.2   |
|                      llama4:scout                      | 86.1 |   57.1    | 28.7  | 