In [1]:
import json
from sklearn.metrics import f1_score, accuracy_score
from rouge_score import rouge_scorer
import numpy as np
from copy import deepcopy
from tqdm import tqdm
from collections import defaultdict
import sys

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from datasets import Dataset

sys.path.insert(0, '../../')
from questions_construction.main import QUESTION_CATEGORIES
from questions_construction.domains import DOMAIN_NAMES
from common import *
from model_performances import *
import random

In [16]:
# Evaluation setting reused by the cells further down.
substitution = WITHOUT_RANDOM_SUB
ramification = WITHOUT_RAMIFICATIONS
prompt_type = ZERO_SHOT_PROMPT_KEY

# Test questions, indexed by their OUT_OBJ_ID field.
questions_by_id = {
    question[OUT_OBJ_ID]: question
    for question in open_jsonl(f'{DATA_PATH}/test_data.paraphrased.cleaned.jsonl')
}

In [23]:
def remove_keys(d):
    """Remove a fixed set of keys from a record, in place.

    Keys that are absent are silently skipped.

    Args:
        d: dict to clean; it is mutated.

    Returns:
        The same object `d`, so the result can be reassigned or chained.
    """
    # NOTE(review): the previous list named 'input_prompt_tokenized' twice. The
    # duplicate is dropped here; if a different key was intended, add it.
    for rm_key in ('label', 'checkpoint_130_generated_responses', 'input_prompt_tokenized', 'answer_tokenized'):
        # pop with a default never raises on a missing key.
        d.pop(rm_key, None)
    return d

def load_all_data(model_name, substitution=WITHOUT_RANDOM_SUB, ramification=WITHOUT_RAMIFICATIONS, prompt_type=ZERO_SHOT_PROMPT_KEY):
    """Load a model's true/false and free-answer results and merge them per question.

    Results are read from
    `{PROJECT_PATH}/data/prompting_results/...` (true/false) and
    `{PROJECT_PATH}/data/free_answers/...` (free answers), each passed through
    `data_all_single_run` and indexed by OUT_OBJ_ID.

    Args:
        model_name: file stem of the result files to read.
        substitution, ramification, prompt_type: forwarded to
            `data_all_single_run`; the latter two also select the result
            sub-directory.

    Returns:
        A list with one merged dict per question id that appears in at least
        one of the two result sets, in the iteration order of the test
        questions. On a key clash the free-answer value wins. Each dict has
        been passed through `remove_keys`.
    """
    questions_by_id = {
        question[OUT_OBJ_ID]: question
        for question in open_jsonl(f'{DATA_PATH}/test_data.paraphrased.cleaned.jsonl')
    }

    def _index_run(results_path):
        # Read one result file, post-process it, and key the records by id.
        raw_results = open_jsonl(results_path)
        records = data_all_single_run(questions_by_id, raw_results, substitution, ramification, model_name, prompt_type)
        return {record[OUT_OBJ_ID]: record for record in records}

    # True/False
    tf_by_id = _index_run(f'{PROJECT_PATH}/data/prompting_results/{ramification}/{prompt_type}/{model_name}.jsonl')
    # Free answers
    free_by_id = _index_run(f'{PROJECT_PATH}/data/free_answers/{ramification}/{prompt_type}/{model_name}.jsonl')

    merged_records = []
    for q_id in questions_by_id:
        # Later unpacking overrides earlier: free-answer fields take precedence.
        merged = {**tf_by_id.get(q_id, {}), **free_by_id.get(q_id, {})}
        if not merged:
            continue
        merged_records.append(remove_keys(merged))
    return merged_records

def prediction_criteria(d, prediction, ground_truth):
    """Record on `d` whether `prediction` agrees with `ground_truth`.

    Sets d[IS_RESPONSE_CORRECT_KEY] to:
      * 'N/A'        when `prediction` is neither TRUE_ANSWER nor FALSE_ANSWER,
      * TRUE_ANSWER  when it is one of those and equals `ground_truth`,
      * FALSE_ANSWER otherwise.

    `d` is modified in place and also returned.
    """
    if prediction not in (TRUE_ANSWER, FALSE_ANSWER):
        verdict = 'N/A'
    elif prediction == ground_truth:
        verdict = TRUE_ANSWER
    else:
        verdict = FALSE_ANSWER

    d[IS_RESPONSE_CORRECT_KEY] = verdict
    return d
    

In [24]:
# Score every record of each base model (load_all_data defaults) and write the
# result to '<model_name>.all.jsonl' in the working directory.
for model_name in ['llama_8b', 'llama_70b', 'gpt-4o']:
    data_all = load_all_data(model_name)

    for d in data_all:
        try:
            if d[OUT_OBJ_ANSWER_TYPE] == TRUE_FALSE_ANSWER_TYPE:
                prediction = TrueFalseStats.prediction_selection_criteria(d)
                ground_truth = d[OUT_OBJ_ANSWER]
            else:
                # Non-true/false records are scored against TRUE_ANSWER.
                prediction = FreeAnswerStats.prediction_selection_criteria(d)
                ground_truth = TRUE_ANSWER
        except Exception as e:
            # Best effort: keep going, but say which record failed and why.
            print(f'{model_name}: could not score record {d.get(OUT_OBJ_ID)!r}: {type(e).__name__}: {e}')
            # An explicit "no prediction" marker; prediction_criteria labels
            # anything other than TRUE_ANSWER/FALSE_ANSWER as 'N/A'
            # (assumes neither constant is None - TODO confirm).
            prediction = None
            ground_truth = None

        # Mutates d in place (sets the correctness field).
        prediction_criteria(d, prediction, ground_truth)
    save_jsonl(data_all, f'{model_name}.all.jsonl')

'evaluated_free_answer_response'


In [25]:
# Score the free-answer records of the 'llama_8b.finetuned_free' run and save
# them to '<model_name>.all.jsonl'.
model_name = 'llama_8b.finetuned_free'
model_free_results = open_jsonl(
    f'{PROJECT_PATH}/data/free_answers/{ramification}/{prompt_type}/{model_name}.jsonl'
)
data_all = data_all_single_run(
    questions_by_id, model_free_results, substitution, ramification, model_name, prompt_type
)

data_pruned = []
for d in data_all:
    if d[OUT_OBJ_ANSWER_TYPE] == FREE_ANSWER_TYPE:
        prediction = FreeAnswerStats.prediction_selection_criteria(d)
        ground_truth = TRUE_ANSWER
        # Both helpers mutate the record and hand the same object back.
        d = remove_keys(prediction_criteria(d, prediction, ground_truth))
        data_pruned.append(d)

save_jsonl(data_pruned, f'{model_name}.all.jsonl')

In [26]:
# Score the true/false records of the 'llama_8b.finetuned_tf' run and save
# them to '<model_name>.all.jsonl'.
model_name = 'llama_8b.finetuned_tf'
model_tf_results = open_jsonl(
    f'{PROJECT_PATH}/data/prompting_results/{ramification}/{prompt_type}/{model_name}.jsonl'
)
data_all = data_all_single_run(
    questions_by_id, model_tf_results, substitution, ramification, model_name, prompt_type
)

data_pruned = []
for d in data_all:
    if d[OUT_OBJ_ANSWER_TYPE] == TRUE_FALSE_ANSWER_TYPE:
        prediction = TrueFalseStats.prediction_selection_criteria(d)
        ground_truth = d[OUT_OBJ_ANSWER]
        # Both helpers mutate the record and hand the same object back.
        d = remove_keys(prediction_criteria(d, prediction, ground_truth))
        data_pruned.append(d)

save_jsonl(data_pruned, f'{model_name}.all.jsonl')

In [28]:
# Read 'gpt-4o.all.jsonl' from the working directory - presumably the file the
# base-model scoring loop saved for model_name == 'gpt-4o' (verify it is fresh).
data = open_jsonl('gpt-4o.all.jsonl')

In [35]:
# Correctness flags for the true/false records in `data` with plan_length == 1.
# Records whose verdict is neither TRUE_ANSWER nor FALSE_ANSWER (e.g. 'N/A')
# are left out rather than counted as wrong.
scores = []
for d in data:
    if (
        d['plan_length'] == 1
        and d['answer_type'] == TRUE_FALSE_ANSWER_TYPE
        and d['is_response_correct'] in (TRUE_ANSWER, FALSE_ANSWER)
    ):
        # Compare against the same constant the filter uses instead of a
        # hard-coded 'True' literal, so the two cannot drift apart.
        scores.append(d['is_response_correct'] == TRUE_ANSWER)

In [36]:
# Mean of the boolean flags in `scores`, i.e. the fraction that are True.
np.mean(scores)

0.8126195028680688

In [32]:
# Number of records that passed the filter used to build `scores`.
# NOTE(review): this cell's execution count (In [32]) is lower than that of the
# cells that build and average `scores` (In [35], In [36]), so the recorded
# output is from an earlier state: 0.8126... x 713 is not a whole number.
# Restart the kernel and run all cells to refresh it.
len(scores)

713