# Result Analysis

In [77]:
# Prompting models under comparison; later cells select one by index.
models = [
    "gpt-3.5-turbo-0125",
    "gpt-4-0125-preview",
    "gpt-4o-2024-05-13",
    "llama3.1:70b",
    "llama3.1:405b",
    "llama3.3",
    "llama4:scout",
]

In [78]:
%run ../scripts/load_df_for_analysis.py

In [79]:
%run ../scripts/df_calculations.py

## Gather Results: Model Comparison

In [80]:
# One empty bucket per index configuration; gather_results fills them in.
# A comprehension is used so every key gets its own, independent dict.
all_results = {
    configuration: {}
    for configuration in (
        "te3s",
        "te3s_no_prev_chunking",
        "te3s_no_prev_chunking_batched",
        "te3l",
        "te3l_no_prev_chunking",
        "te3l_no_prev_chunking_batched",
    )
}

In [81]:
def gather_results(model_type, embedding, no_prev_chunking, batched, gpt_model=models[0], annotated=False, corrected_statements=False, two_labels=False):
    """Evaluate one experiment configuration and record it in ``all_results``.

    The data frame for the configuration is loaded with ``load_df``; rows whose
    'Suited for Task' value is "Added" are dropped when that column exists, and
    the frame is then passed through ``sort_df`` and
    ``reshape_model_classification`` before the four evaluations are run.

    Args:
        model_type: Forwarded to ``load_df``; also the second-level key in
            ``all_results`` (e.g. "full_model").
        embedding: Forwarded to ``load_df``; prefix of the first-level key in
            ``all_results`` (e.g. "te3s", "te3l").
        no_prev_chunking: Forwarded to ``load_df``; appends
            "_no_prev_chunking" to the first-level key when truthy.
        batched: Forwarded to ``load_df``; appends "_batched" to the
            first-level key when truthy.
        gpt_model: Name of the prompting model; forwarded to ``load_df`` and
            used as the base of the third-level key.
        annotated: Forwarded to ``load_df``; appends "_annotated" to the
            third-level key when truthy.
        corrected_statements: Forwarded to ``load_df``; appends
            "_corrected_statements" to the third-level key when truthy.
        two_labels: Forwarded to ``load_df``; appends "_two_labels" to the
            third-level key when truthy.

    Returns:
        None. The metrics are stored at
        ``all_results[<embedding key>][model_type][<model key>]``.
    """
    df = load_df(model_type, embedding, no_prev_chunking, gpt_model, batched, annotated, corrected_statements, two_labels)
    if 'Suited for Task' in df.columns:
        df = df[df['Suited for Task'] != "Added"]
    df = sort_df(df)
    df = reshape_model_classification(df)

    # NOTE(review): the second positional argument (False) presumably excludes
    # references that are not available - confirm in df_calculations.py.
    results = {
        'all_labels': eval_predictions_all_labels(df),
        'all_labels_exclude_not_available': eval_predictions_all_labels(df, False),
        'two_labels': eval_predictions_two_labels(df),
        'two_labels_exclude_not_available': eval_predictions_two_labels(df, False),
    }

    embedding_string = embedding + ('_no_prev_chunking' if no_prev_chunking else '') + ('_batched' if batched else '')
    # Build the key in a separate local instead of overwriting the parameter.
    result_key = gpt_model + ('_two_labels' if two_labels else '') + ('_annotated' if annotated else '') + ('_corrected_statements' if corrected_statements else '')

    # all_results is only mutated, never rebound, so no `global` statement is
    # needed. setdefault also covers flag combinations that were not
    # pre-registered (e.g. "te3s_batched"), which used to raise KeyError.
    per_embedding = all_results.setdefault(embedding_string, {})
    if not isinstance(per_embedding.get(model_type), dict):
        per_embedding[model_type] = {}
    per_embedding[model_type][result_key] = results
    

### Small GROBID Model
- PDF text extracted with smaller GROBID model 
- full text from TEI document directly fed into index generation 
- Model temperature set to 0 (for top 3 excerpts retrieval and for classification via model prompting)
- Used embedding for index: text-embedding-3-small
- Used model for prompting: gpt-3.5-turbo-0125

#### Small Text Embedding

In [82]:
# small_model / te3s, no_prev_chunking=False, batched=False; default prompting model.
model_type, embedding = "small_model", "te3s"
no_prev_chunking, batched = False, False
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


### Full GROBID Model
- PDF text extracted with full GROBID model
- full text from TEI document directly fed into index generation 
- Model temperature set to 0 (for top 3 excerpts retrieval and for classification via model prompting)
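- Used embedding for index: text-embedding-3-small (`te3s`) or text-embedding-3-large (`te3l`), as stated in each subsection
- Used model for prompting: gpt-3.5-turbo-0125, unless a subsection names a different model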

#### Small Text Embedding

In [83]:
# full_model / te3s, no_prev_chunking=False, batched=False; default prompting model.
model_type, embedding = "full_model", "te3s"
no_prev_chunking, batched = False, False
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


#### Small Text Embedding - no previous chunking

In [84]:
# full_model / te3s, no_prev_chunking=True, batched=False; default prompting model.
model_type, embedding = "full_model", "te3s"
no_prev_chunking, batched = True, False
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


#### Small Text Embedding - no previous chunking - batched

In [85]:
# full_model / te3s, no_prev_chunking=True, batched=True; default prompting model.
model_type, embedding = "full_model", "te3s"
no_prev_chunking, batched = True, True
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


#### Large Text Embedding

In [86]:
# full_model / te3l, no_prev_chunking=False, batched=False; default prompting model.
model_type, embedding = "full_model", "te3l"
no_prev_chunking, batched = False, False
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


#### Large Text Embedding - no previous chunking

##### GPT 3.5 Turbo

In [87]:
# full_model / te3l, no_prev_chunking=True, batched=False, evaluated for models[0].
model_type = "full_model"
embedding = "te3l"
no_prev_chunking = True
batched = False
model = models[0]
# Pass the selected model explicitly, like the sibling model cells do, instead
# of relying on it coinciding with gather_results' default gpt_model.
gather_results(model_type, embedding, no_prev_chunking, batched, model)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


##### GPT 4

In [88]:
# full_model / te3l, no_prev_chunking=True, batched=False, evaluated for models[1].
model_type, embedding = "full_model", "te3l"
no_prev_chunking, batched = True, False
model = models[1]
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
    gpt_model=model,
)

##### GPT 4o

In [89]:
# full_model / te3l, no_prev_chunking=True, batched=False, evaluated for models[2].
model_type, embedding = "full_model", "te3l"
no_prev_chunking, batched = True, False
model = models[2]
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
    gpt_model=model,
)

##### Llama 3.1 (70b)

In [90]:
# full_model / te3l, no_prev_chunking=True, batched=False, evaluated for models[3].
model_type, embedding = "full_model", "te3l"
no_prev_chunking, batched = True, False
model = models[3]
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
    gpt_model=model,
)

##### Llama 3.1 (405b)

In [91]:
# full_model / te3l, no_prev_chunking=True, batched=False, evaluated for models[4].
model_type, embedding = "full_model", "te3l"
no_prev_chunking, batched = True, False
model = models[4]
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
    gpt_model=model,
)

##### Llama 3.3 (70b)

In [92]:
# full_model / te3l, no_prev_chunking=True, batched=False, evaluated for models[5].
model_type, embedding = "full_model", "te3l"
no_prev_chunking, batched = True, False
model = models[5]
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
    gpt_model=model,
)

##### Llama 4 Scout

In [93]:
# full_model / te3l, no_prev_chunking=True, batched=False, evaluated for models[6].
model_type, embedding = "full_model", "te3l"
no_prev_chunking, batched = True, False
model = models[6]
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
    gpt_model=model,
)

Row 103 Model Classification could not be decoded: Expecting value: line 1 column 1 (char 0)
To assess whether the reference article supports the statement, let's analyze the information provided.


The statement suggests that agro-industries, farms, and civil society should develop a worldwide strategy for sustainable food systems to drive healthier, low-meat diets and reduce food waste.


The reference article explores the biophysical option space for feeding the world without deforestation. It discusses how different diets, yields, and agricultural practices can impact food security and environmental sustainability.


Based on the information provided, here is my assessment:


## Assessment

### Label: Partially substantiate

### Explanation: 
The reference article partially supports the statement. It highlights the importance of diet and food production strategies in achieving sustainable food systems. The article suggests that adopting diets with a lower share of livestock product

#### Large Text Embedding - no previous chunking - batched

In [94]:
# full_model / te3l, no_prev_chunking=True, batched=True; default prompting model.
model_type, embedding = "full_model", "te3l"
no_prev_chunking, batched = True, True
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


### Full GROBID Model + TEI document text refactoring
- PDF text extracted with full GROBID model
- Only text from body of papers (actual content) extracted from TEI documents
- Model temperature set to 0 (for top 3 excerpts retrieval and for classification via model prompting)
- Used embedding for index: text-embedding-3-small (`te3s`) or text-embedding-3-large (`te3l`), as stated in each subsection
- Used model for prompting: gpt-3.5-turbo-0125

#### Small Text Embedding

In [95]:
# full_model_texts / te3s, no_prev_chunking=False, batched=False; default prompting model.
model_type, embedding = "full_model_texts", "te3s"
no_prev_chunking, batched = False, False
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
)

#### Small Text Embedding - no previous chunking

In [96]:
# full_model_texts / te3s, no_prev_chunking=True, batched=False; default prompting model.
model_type, embedding = "full_model_texts", "te3s"
no_prev_chunking, batched = True, False
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


#### Small Text Embedding - no previous chunking - batched

In [97]:
# full_model_texts / te3s, no_prev_chunking=True, batched=True; default prompting model.
model_type, embedding = "full_model_texts", "te3s"
no_prev_chunking, batched = True, True
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


#### Large Text Embedding

In [98]:
# full_model_texts / te3l, no_prev_chunking=False, batched=False; default prompting model.
model_type, embedding = "full_model_texts", "te3l"
no_prev_chunking, batched = False, False
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
)

#### Large Text Embedding - no previous chunking

In [99]:
# full_model_texts / te3l, no_prev_chunking=True, batched=False; default prompting model.
model_type, embedding = "full_model_texts", "te3l"
no_prev_chunking, batched = True, False
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


#### Large Text Embedding - no previous chunking - batched

In [100]:
# full_model_texts / te3l, no_prev_chunking=True, batched=True; default prompting model.
model_type, embedding = "full_model_texts", "te3l"
no_prev_chunking, batched = True, True
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
)

Row 22 Model Classification could not be decoded: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
{

Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None
Row 22 Model Classification Label is not a valid label: None


### Annotated Data

#### Three Labels Classification

##### Llama 3.1 (70b) - Annotated Data

In [101]:
# full_model / te3l, no_prev_chunking=True, batched=False, models[3] on the annotated data.
model_type, embedding = "full_model", "te3l"
no_prev_chunking, batched = True, False

annotated = True

model = models[3]
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
    gpt_model=model,
    annotated=annotated,
)

##### Llama 3.1 (70b) - Annotated Data + Corrected Statements

In [102]:
# full_model / te3l, no_prev_chunking=True, batched=False, models[3] on the
# annotated data with corrected statements.
model_type, embedding = "full_model", "te3l"
no_prev_chunking, batched = True, False

annotated, corrected_statements = True, True

model = models[3]
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
    gpt_model=model,
    annotated=annotated,
    corrected_statements=corrected_statements,
)

#### Two Labels (Fully substantiated/Unsubstantiated) Classification

##### Llama 3.1 (70b) - Annotated Data

In [103]:
# full_model / te3l, no_prev_chunking=True, batched=False, models[3] on the
# annotated data with the two-label classification.
model_type, embedding = "full_model", "te3l"
no_prev_chunking, batched = True, False

annotated, two_labels = True, True

model = models[3]
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
    gpt_model=model,
    annotated=annotated,
    two_labels=two_labels,
)

##### Llama 3.1 (70b) - Annotated Data + Corrected Statements

In [104]:
# full_model / te3l, no_prev_chunking=True, batched=False, models[3] on the
# annotated data with corrected statements and the two-label classification.
model_type, embedding = "full_model", "te3l"
no_prev_chunking, batched = True, False

annotated, corrected_statements, two_labels = True, True, True

model = models[3]
gather_results(
    model_type=model_type,
    embedding=embedding,
    no_prev_chunking=no_prev_chunking,
    batched=batched,
    gpt_model=model,
    annotated=annotated,
    corrected_statements=corrected_statements,
    two_labels=two_labels,
)

### Save results

In [105]:
import json

# Persist every gathered result as indented JSON.
with open("../data/all_results.json", "w") as results_file:
    results_file.write(json.dumps(all_results, indent=4))

## Reproduce Table from Paper

In [106]:
# Results stored under the 'te3l_no_prev_chunking' / 'full_model' configuration,
# keyed by the model key that gather_results builds (model name plus suffixes).
results_for_table = all_results['te3l_no_prev_chunking']['full_model']

In [107]:
# Show the raw nested result dict selected for the table.
results_for_table

{'gpt-3.5-turbo-0125': {'all_labels': {'accuracy': 0.567,
   'total': 247,
   'correct': 140,
   'false_predictions': 107,
   'type_predictions': {'unsubstantiate': {'total': 109,
     'unsubstantiate': 78,
     'partially substantiate': 16,
     'fully substantiate': 14,
     'invalid label': 1},
    'partially substantiate': {'total': 14,
     'unsubstantiate': 2,
     'partially substantiate': 7,
     'fully substantiate': 5,
     'invalid label': 0},
    'fully substantiate': {'total': 124,
     'unsubstantiate': 11,
     'partially substantiate': 58,
     'fully substantiate': 55,
     'invalid label': 0}}},
  'all_labels_exclude_not_available': {'accuracy': 0.57,
   'total': 244,
   'correct': 139,
   'false_predictions': 105,
   'type_predictions': {'unsubstantiate': {'total': 108,
     'unsubstantiate': 78,
     'partially substantiate': 15,
     'fully substantiate': 14,
     'invalid label': 1},
    'partially substantiate': {'total': 14,
     'unsubstantiate': 2,
     'parti

In [108]:
# Per-label accuracy table for every model key in results_for_table.
print("Label Accuracies - Include All Downloaded")
print_table_label_accuracies(calc_label_accuracies(results_for_table))

Label Accuracies - Include All Downloaded
+--------------------------------------------------------+------+-----------+-------+---------+
|                         Model                          |  Un  | Partially | Fully | Overall |
+--------------------------------------------------------+------+-----------+-------+---------+
|                   gpt-3.5-turbo-0125                   | 71.6 |   50.0    | 44.4  |  56.7   |
|                   gpt-4-0125-preview                   | 85.3 |   14.3    | 62.9  |  70.0   |
|                   gpt-4o-2024-05-13                    | 84.4 |   57.1    | 36.3  |  58.7   |
|                      llama3.1:70b                      | 87.2 |   64.3    | 27.4  |  55.9   |
|                     llama3.1:405b                      | 87.2 |   50.0    | 25.8  |  54.3   |
|                        llama3.3                        | 89.9 |   50.0    | 17.7  |  51.4   |
|                      llama4:scout                      | 86.2 |   57.1    | 29.0  |  55.9   

In [109]:
# NOTE(review): the second positional argument (True) presumably selects the
# "exclude not available" variant of the results - confirm in df_calculations.py.
print("Label Accuracies - Exclude Not Available in Paper")
print_table_label_accuracies(calc_label_accuracies(results_for_table, True))

Label Accuracies - Exclude Not Available in Paper
+--------------------------------------------------------+------+-----------+-------+---------+
|                         Model                          |  Un  | Partially | Fully | Overall |
+--------------------------------------------------------+------+-----------+-------+---------+
|                   gpt-3.5-turbo-0125                   | 72.2 |   50.0    | 44.3  |  57.0   |
|                   gpt-4-0125-preview                   | 85.2 |   14.3    | 63.1  |  70.1   |
|                   gpt-4o-2024-05-13                    | 84.3 |   57.1    | 36.1  |  58.6   |
|                      llama3.1:70b                      | 87.0 |   64.3    | 27.0  |  55.7   |
|                     llama3.1:405b                      | 87.0 |   50.0    | 25.4  |  54.1   |
|                        llama3.3                        | 89.8 |   50.0    | 17.2  |  51.2   |
|                      llama4:scout                      | 86.1 |   57.1    | 28.7  | 

### Compare to paper results

In [110]:
# Display the per-label accuracies as a dict; these are the values compared
# against paper_results further down.
calc_label_accuracies(results_for_table)

{'gpt-3.5-turbo-0125': {'unsubstantiate': 0.716,
  'partially substantiate': 0.5,
  'fully substantiate': 0.444,
  'overall': 0.567},
 'gpt-4-0125-preview': {'unsubstantiate': 0.853,
  'partially substantiate': 0.143,
  'fully substantiate': 0.629,
  'overall': 0.7},
 'gpt-4o-2024-05-13': {'unsubstantiate': 0.844,
  'partially substantiate': 0.571,
  'fully substantiate': 0.363,
  'overall': 0.587},
 'llama3.1:70b': {'unsubstantiate': 0.872,
  'partially substantiate': 0.643,
  'fully substantiate': 0.274,
  'overall': 0.559},
 'llama3.1:405b': {'unsubstantiate': 0.872,
  'partially substantiate': 0.5,
  'fully substantiate': 0.258,
  'overall': 0.543},
 'llama3.3': {'unsubstantiate': 0.899,
  'partially substantiate': 0.5,
  'fully substantiate': 0.177,
  'overall': 0.514},
 'llama4:scout': {'unsubstantiate': 0.862,
  'partially substantiate': 0.571,
  'fully substantiate': 0.29,
  'overall': 0.559},
 'llama3.1:70b_annotated': {'unsubstantiate': 0.836,
  'partially substantiate': 0,
 

In [111]:
# Reference accuracies used for the comparison below (presumably copied from
# the paper's table - verify against the publication). One row per model:
# (model, unsubstantiate, partially substantiate, fully substantiate, overall).
paper_results = {
    model_name: {
        'unsubstantiate': unsubstantiate_accuracy,
        'partially substantiate': partially_accuracy,
        'fully substantiate': fully_accuracy,
        'overall': overall_accuracy,
    }
    for model_name, unsubstantiate_accuracy, partially_accuracy, fully_accuracy, overall_accuracy in (
        ('gpt-3.5-turbo-0125', 0.795, 0.571, 0.306, 0.540),
        ('gpt-4-0125-preview', 0.839, 0.214, 0.629, 0.700),
        ('gpt-4o-2024-05-13', 0.866, 0.500, 0.347, 0.588),
    )
}

In [112]:
def calc_difference_between_results(calculated_results, paper_results, ndigits=3):
    """Return ``calculated - paper`` per model and metric.

    Only models present in both mappings are compared; models that appear in
    just one of them are skipped.

    Args:
        calculated_results: Mapping ``model -> {metric: value}`` with the
            locally computed values.
        paper_results: Mapping ``model -> {metric: value}`` with the reference
            values; its metrics determine which keys are compared.
        ndigits: Number of decimals each difference is rounded to. Rounding
            removes binary floating-point noise such as
            ``-0.07900000000000007``. Pass ``None`` to get the raw differences.

    Returns:
        dict: ``model -> {metric: calculated - paper}``.

    Raises:
        KeyError: If a metric of ``paper_results`` is missing from the
            calculated results of a shared model.
    """
    differences = {}
    for model, reported in paper_results.items():
        if model not in calculated_results:
            continue
        calculated = calculated_results[model]
        per_metric = {}
        for key, reported_value in reported.items():
            delta = calculated[key] - reported_value
            # round(x, None) would return an int, so branch explicitly.
            per_metric[key] = delta if ndigits is None else round(delta, ndigits)
        differences[model] = per_metric
    return differences

In [113]:
# Calculated minus paper accuracy for each model listed in paper_results.
calc_difference_between_results(calc_label_accuracies(results_for_table), paper_results)

{'gpt-3.5-turbo-0125': {'unsubstantiate': -0.07900000000000007,
  'partially substantiate': -0.07099999999999995,
  'fully substantiate': 0.138,
  'overall': 0.026999999999999913},
 'gpt-4-0125-preview': {'unsubstantiate': 0.014000000000000012,
  'partially substantiate': -0.07100000000000001,
  'fully substantiate': 0.0,
  'overall': 0.0},
 'gpt-4o-2024-05-13': {'unsubstantiate': -0.02200000000000002,
  'partially substantiate': 0.07099999999999995,
  'fully substantiate': 0.016000000000000014,
  'overall': -0.0010000000000000009}}

In [114]:
# Print three tables in sequence: calculated accuracies, the reference values
# from paper_results, and their differences (calculated minus paper).
label_accuracies = calc_label_accuracies(results_for_table, True)

print("Label Accuracies - Exclude Not Available in Paper")
print_table_label_accuracies(label_accuracies)

print("", "Paper Results", sep="\n")
print_table_label_accuracies(paper_results)

print("", "Differences from paper results to calculated paper results", sep="\n")
print_table_label_accuracies(
    calc_difference_between_results(label_accuracies, paper_results)
)

Label Accuracies - Exclude Not Available in Paper
+--------------------------------------------------------+------+-----------+-------+---------+
|                         Model                          |  Un  | Partially | Fully | Overall |
+--------------------------------------------------------+------+-----------+-------+---------+
|                   gpt-3.5-turbo-0125                   | 72.2 |   50.0    | 44.3  |  57.0   |
|                   gpt-4-0125-preview                   | 85.2 |   14.3    | 63.1  |  70.1   |
|                   gpt-4o-2024-05-13                    | 84.3 |   57.1    | 36.1  |  58.6   |
|                      llama3.1:70b                      | 87.0 |   64.3    | 27.0  |  55.7   |
|                     llama3.1:405b                      | 87.0 |   50.0    | 25.4  |  54.1   |
|                        llama3.3                        | 89.8 |   50.0    | 17.2  |  51.2   |
|                      llama4:scout                      | 86.1 |   57.1    | 28.7  | 