# Generate results using the ground truth and predictions

In [1]:
import os
import json
import numpy as np
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    accuracy_score,
    f1_score
)

from keys import (
    DEPLOYMENT_GPT_4o_MINI,
    DEPLOYMENT_GPT_4o
)

### Helper Functions

In [2]:
def load_json_file(json_file):
    """Read a JSON document from disk and return the decoded Python object.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (dict, list, ...).
    """
    # Pin the encoding so decoding does not depend on the platform's locale
    # default (which is not UTF-8 on every system).
    with open(json_file, 'r', encoding='utf-8') as file:
        return json.load(file)

def load_ground_truths(ground_truth_path=None):
    """Load the human review ratings and the final decision of every paper.

    Args:
        ground_truth_path: Directory holding one JSON file per paper. Defaults
            to ``review-5k-dataset/300`` relative to the working directory.

    Returns:
        dict mapping each file name to ``{"rates": ..., "decision": ...}``.
        ``rates`` is copied unchanged from the file. ``decision`` is
        ``"accept"`` when the file's decision text contains "accept"
        (case-insensitive) and ``"reject"`` otherwise.
    """
    if ground_truth_path is None:
        ground_truth_path = os.path.join("review-5k-dataset", "300")

    ground_truths = {}
    # Sorted so the dict order (and anything sampled from it downstream) does
    # not depend on the arbitrary order os.listdir returns.
    for file in sorted(os.listdir(ground_truth_path)):
        # Skip OS/editor artefacts such as .DS_Store or .ipynb_checkpoints.
        if file.startswith("."):
            continue

        json_data = load_json_file(os.path.join(ground_truth_path, file))

        # Collapse every decision variant into a binary label.
        if "accept" in json_data["decision"].lower():
            decision = "accept"
        else:
            decision = "reject"

        ground_truths[file] = {
            "rates": json_data["rates"],
            "decision": decision
        }

    return ground_truths

def load_predictions(deloyment, predictions_root="LLM_Responses_Parsed"):
    """Load the parsed LLM reviews of one model deployment.

    Args:
        deloyment: Name of the deployment; used as the sub-directory of
            ``predictions_root`` that holds one JSON file per paper.
        predictions_root: Directory containing the per-deployment folders.

    Returns:
        dict mapping each file name to ``{"rates": ..., "decision": ...}``.
        ``rates`` is the list of ``rating`` values taken from the file's
        ``reviews`` entries. ``decision`` is ``"accept"`` when the file's
        decision text contains "accept" (case-insensitive), else ``"reject"``.
    """
    predictions_path = os.path.join(predictions_root, deloyment)

    predictions = {}
    # Sorted so the dict order does not depend on os.listdir's arbitrary order.
    for file in sorted(os.listdir(predictions_path)):
        # Skip OS/editor artefacts such as .DS_Store or .ipynb_checkpoints.
        if file.startswith("."):
            continue

        json_data = load_json_file(os.path.join(predictions_path, file))
        ratings = [item['rating'] for item in json_data["reviews"]]

        # Apply the same binary normalisation load_ground_truths uses, so a
        # differently cased or worded label cannot become a third class when
        # the two sets of decisions are compared.
        if "accept" in str(json_data["decision"]).lower():
            decision = "accept"
        else:
            decision = "reject"

        predictions[file] = {
            "rates": ratings,
            "decision": decision
        }

    return predictions

### Evaluation Functions

In [3]:
def compute_reviewer_n_minus_1_mae_mse_r2(ground_truth, use_all_possible_permutations=True, seed=None):
    """Leave-one-out agreement between one rating and the mean of the others.

    For a held-out index ``i`` of a paper, the true score is ``ratings[i]``
    and the predicted score is the mean of all other ratings of that paper.

    Args:
        ground_truth: dict mapping a paper id to a dict whose ``"rates"``
            entry is the sequence of ratings for that paper.
        use_all_possible_permutations: When True, every rating of every paper
            is held out once. When False, one randomly chosen rating per
            paper is held out.
        seed: Optional seed for the random choice. ``None`` keeps the previous
            behaviour of drawing from NumPy's global random state. Ignored
            when ``use_all_possible_permutations`` is True.

    Returns:
        dict with keys ``"MAE"``, ``"MSE"`` and ``"R2"``, each rounded to two
        decimals.
    """
    rng = None if seed is None else np.random.default_rng(seed)

    # Step-1: prepare true scores and predicted scores
    true_scores = []
    predicted_scores = []

    for paper_dict in ground_truth.values():
        ratings = paper_dict["rates"]
        n = len(ratings)

        # Leave-one-out needs at least one remaining rating to average;
        # papers with fewer than two ratings are skipped instead of producing
        # NaN (n == 1) or an exception from the random choice (n == 0).
        if n < 2:
            continue

        if use_all_possible_permutations:
            held_out_indices = range(n)
        elif rng is None:
            held_out_indices = [np.random.choice(n)]
        else:
            held_out_indices = [rng.choice(n)]

        for i in held_out_indices:
            other_ratings = [ratings[j] for j in range(n) if j != i]
            true_scores.append(ratings[i])
            predicted_scores.append(np.mean(other_ratings))

    # Step-2: compute MAE, MSE and R2
    mae = round(mean_absolute_error(true_scores, predicted_scores), 2)
    mse = round(mean_squared_error(true_scores, predicted_scores), 2)
    r2 = round(r2_score(true_scores, predicted_scores), 2)
    return {
        "MAE": mae,
        "MSE": mse,
        "R2": r2
    }

def compute_llm_reviewer_n_mae_mse_r2(ground_truth, predictions):
    """Compare per-paper mean LLM ratings with per-paper mean human ratings.

    Args:
        ground_truth: dict mapping a paper id to a dict with a ``"rates"``
            sequence (treated as the true ratings).
        predictions: dict with the same paper ids, each mapping to a dict
            with a ``"rates"`` sequence (treated as the predicted ratings).

    Returns:
        dict with keys ``"MAE"``, ``"MSE"`` and ``"R2"``, each rounded to two
        decimals.

    Raises:
        KeyError: if a paper id of ``ground_truth`` is absent from
            ``predictions``.
    """
    # One (human mean, LLM mean) pair per paper, in ground-truth order.
    paired_means = [
        (np.mean(human_entry["rates"]), np.mean(predictions[paper_key]["rates"]))
        for paper_key, human_entry in ground_truth.items()
    ]
    true_scores = [human_mean for human_mean, _ in paired_means]
    pred_scores = [llm_mean for _, llm_mean in paired_means]

    # Every metric takes (y_true, y_pred) and is reported with two decimals.
    metric_functions = (
        ("MAE", mean_absolute_error),
        ("MSE", mean_squared_error),
        ("R2", r2_score),
    )
    return {
        label: round(metric(true_scores, pred_scores), 2)
        for label, metric in metric_functions
    }

def compute_accuracy_and_f1(ground_truth, predictions):
    """Score predicted decisions against the true decisions.

    Args:
        ground_truth: dict mapping a paper id to a dict with a ``"decision"``
            entry (the true label).
        predictions: dict with the same paper ids, each mapping to a dict
            with a ``"decision"`` entry (the predicted label).

    Returns:
        dict with ``"accuracy"`` and ``"f1_score"`` (macro-averaged F1), both
        expressed as percentages rounded to two decimals.

    Raises:
        KeyError: if a paper id of ``ground_truth`` is absent from
            ``predictions``.
    """
    # Align both label lists on the ground-truth paper order.
    paper_keys = list(ground_truth)
    true_decisions = [ground_truth[key]["decision"] for key in paper_keys]
    pred_decisions = [predictions[key]["decision"] for key in paper_keys]

    accuracy_pct = accuracy_score(true_decisions, pred_decisions) * 100
    macro_f1_pct = f1_score(true_decisions, pred_decisions, average='macro') * 100

    return {
        "accuracy": round(accuracy_pct, 2),
        "f1_score": round(macro_f1_pct, 2)
    }

# Run Code

In [4]:
# Load the human reviews and the parsed reviews of both model deployments.
ground_truths = load_ground_truths()
predictions_4o = load_predictions(DEPLOYMENT_GPT_4o)
predictions_4o_mini = load_predictions(DEPLOYMENT_GPT_4o_MINI)

# Fail fast with a readable message: the paper-level metrics index the
# predictions by every ground-truth key, so a missing file would otherwise
# only surface later as a bare KeyError.
for _model_name, _model_predictions in (
    ("GPT-4o", predictions_4o),
    ("GPT-4o-mini", predictions_4o_mini),
):
    _missing_papers = sorted(set(ground_truths) - set(_model_predictions))
    assert not _missing_papers, "{}: no predictions for {} paper(s), e.g. {}".format(
        _model_name, len(_missing_papers), _missing_papers[:3]
    )

#### Compute scores

In [5]:
# The n-1 metrics below hold out one randomly chosen rating per paper
# (use_all_possible_permutations=False). Seed NumPy's global random state so
# that Restart & Run All reproduces the same numbers.
SEED = 42
np.random.seed(SEED)

# n-1 MAE, MSE, R2
expert = compute_reviewer_n_minus_1_mae_mse_r2(ground_truths, use_all_possible_permutations=False)
n_minus_1_4o = compute_reviewer_n_minus_1_mae_mse_r2(predictions_4o, use_all_possible_permutations=False)
n_minus_1_4o_mini = compute_reviewer_n_minus_1_mae_mse_r2(predictions_4o_mini, use_all_possible_permutations=False)

# n MAE, MSE, R2
n_4o = compute_llm_reviewer_n_mae_mse_r2(ground_truths, predictions_4o)
n_4o_mini = compute_llm_reviewer_n_mae_mse_r2(ground_truths, predictions_4o_mini)

# Accuracy, Macro F1
af_4o = compute_accuracy_and_f1(ground_truths, predictions_4o)
af_4o_mini = compute_accuracy_and_f1(ground_truths, predictions_4o_mini)


In [6]:
# Report every metric dict on its own line, in the order: leave-one-out (n-1)
# scores, paper-level (n) scores, then decision accuracy / macro F1.
# Each label keeps its trailing space; print() adds its own separator after it.
report_rows = [
    ("Expert: ", expert),
    ("4o n-1: ", n_minus_1_4o),
    ("4o mini n-1: ", n_minus_1_4o_mini),
    ("4o n: ", n_4o),
    ("4o mini n: ", n_4o_mini),
    ("4o af: ", af_4o),
    ("4o mini af: ", af_4o_mini),
]
for row_label, row_metrics in report_rows:
    print(row_label, row_metrics)


Expert:  {'MAE': 1.26, 'MSE': 2.62, 'R2': 0.08}
4o n-1:  {'MAE': 0.55, 'MSE': 0.45, 'R2': 0.05}
4o mini n-1:  {'MAE': 0.66, 'MSE': 0.68, 'R2': -0.47}
4o n:  {'MAE': 2.22, 'MSE': 6.23, 'R2': -3.77}
4o mini n:  {'MAE': 2.57, 'MSE': 7.88, 'R2': -5.04}
4o af:  {'accuracy': 35.0, 'f1_score': 27.41}
4o mini af:  {'accuracy': 33.67, 'f1_score': 25.19}


***
# Results
***

#### **Proxy Reviewer = $n-1$**  
For each paper, randomly choose the $i^{th}$ rating (each paper has 4 ratings). This $i^{th}$ rating is treated as the **ground truth rating**. All other ratings (the remaining 3 ratings for the paper) are averaged, and that mean is treated as the **predicted rating**. We then compute MAE, MSE and $R^2$ using this data.


#### **Proxy Reviewer = $n$** 
For each paper, we have 4 ground truth ratings (by ICLR etc.). We have 4 more reviews from LLMs (GPT-4o and GPT-4o-Mini), called the predicted ratings. We average the ground truth ratings to get one ground truth rating per paper; similarly, we average the predicted ratings to get one predicted rating per paper. We then compute MAE, MSE and $R^2$ using this data.


#### **Original Results (from paper)**
| Method             | Proxy(n-1) MAE | Proxy(n-1) MSE | Proxy(n) MAE | Proxy(n) MSE   | Accuracy | Macro F1 |
|--------------------|----------------|----------------|---------------|---------------|----------|----------|
| Expert  Individual |      1.16      |      2.34      |       -       |       -       | 75.40%   | 75.39%   |
| GPT 4o             |      2.24      |      6.61      |     2.30      |      6.53     | 52.58%   | 34.51%   |
| GPT 4o Mini        |      1.53      |      3.44      |     1.40      |      2.98     | 53.06%   | 34.72%   |


#### **Computed Results with 300 papers**
| Method             | Proxy(n-1) MAE | Proxy(n-1) MSE | Proxy(n-1) $R^2$ | Proxy(n) MAE  | Proxy(n) MSE  | Proxy(n) $R^2$ | Accuracy | Macro F1 |
|--------------------|----------------|----------------|------------------|---------------|---------------|----------------|----------|----------|
| Expert  Individual |      1.16      |      2.35      |       0.2        |       -       |       -       |       -        |    -     |    -     |
| GPT 4o             |      0.55      |      0.47      |       0.01       |     2.22      |      6.23     |   -3.78        |  35.0%   | 27.41%   |
| GPT 4o Mini        |      0.66      |      0.67      |      -0.52       |     2.57      |      7.88     |   -5.04        | 33.67%   | 25.19%   |


In [10]:
# Class balance of the ground-truth decisions: 0 = 'reject', 1 = anything else.
arr = [
    0 if paper_entry['decision'] == 'reject' else 1
    for paper_entry in ground_truths.values()
]
np.unique(arr, return_counts=True)

(array([0, 1]), array([199, 101]))

In [11]:
# Class balance of the GPT-4o decisions: 0 = 'reject', 1 = anything else.
arr = [
    0 if paper_entry['decision'] == 'reject' else 1
    for paper_entry in predictions_4o.values()
]
np.unique(arr, return_counts=True)

(array([0, 1]), array([  4, 296]))

In [12]:
# Class balance of the GPT-4o-mini decisions: 0 = 'reject', 1 = anything else.
arr = [
    0 if paper_entry['decision'] == 'reject' else 1
    for paper_entry in predictions_4o_mini.values()
]
np.unique(arr, return_counts=True)

(array([1]), array([300]))