In [51]:
!pip install datasets



In [52]:
from datasets import load_dataset
dataset = load_dataset("derek-thomas/ScienceQA")
validation_data = dataset["validation"]
filtered_validation_data = validation_data.filter(lambda example: example["lecture"] != "" and example["solution"] != "")
filtered_validation_data

Dataset({
    features: ['image', 'question', 'choices', 'answer', 'hint', 'task', 'grade', 'subject', 'topic', 'category', 'skill', 'lecture', 'solution'],
    num_rows: 3216
})

In [53]:
import pandas as pd
import numpy as np
import csv
import os

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#this notebook is restoring the script used for postprocessing + evaluation (benchmarking.ipynb)

In [None]:
project_dir = "foundation_models"
input_folder = "benchmarking_val/outputs/paligemma_ft/raw_outputs"
output_folder = "benchmarking_val/outputs/paligemma_ft/parsed_outputs"
full_input_dir = os.path.join("/content/drive/MyDrive", project_dir, input_folder)
full_res_dir = os.path.join("/content/drive/MyDrive", project_dir, output_folder)
files = os.listdir(full_input_dir)
files = [x for x in files if x.startswith("google")]
files

['google_paligemma-3b-ft-science-qa-224_validation_output_setting_QTCH.csv',
 'google_paligemma-3b-ft-science-qa-224_validation_output_setting_QTCHLS.csv',
 'google_paligemma-3b-ft-science-qa-224_validation_output_setting_QTCHS.csv',
 'google_paligemma-3b-ft-science-qa-224_validation_output_setting_QTCHL.csv']

In [None]:
file_path = os.path.join(full_input_dir, files[0])
df = pd.read_csv(file_path, delimiter="\t")
print(f"Loaded dataframe: {file_path} with {len(df)} rows.")

Loaded dataframe: /content/drive/MyDrive/foundation_models/benchmarking_val/outputs/paligemma_ft/raw_outputs/google_paligemma-3b-ft-science-qa-224_validation_output_setting_QTCH.csv with 3216 rows.


In [None]:
df

Unnamed: 0,idx,input,output,subject
0,0,Question: What does the verbal irony in this t...,The answer is Think answer,language science
1,1,Question: Which animal's mouth is also adapted...,Answer: The answer is A.,natural science
2,2,Question: Is this a sentence fragment?\nDuring...,Answer:,language science
3,3,Question: Which correctly shows the title of a...,The answer is A.,language science
4,4,Question: Does this passage describe the weath...,The answer is C.,natural science
...,...,...,...,...
3211,3211,Question: How long is an adult great white sha...,The answer is A.,natural science
3212,3212,Question: Which animal is also adapted to be c...,The answer is A.,natural science
3213,3213,Question: Compare the motion of two fish. Whic...,The answer is a.,natural science
3214,3214,Question: Which continent is highlighted?\n Ta...,The answer is Europe.,social science


In [None]:
import re
import json
def parse_output(text):
    # remove additional comments from LLM
    cleaned_text = re.sub("(?:The answer is|Answer):?", "", str(text)).strip('.').strip()
    solution = ""

    return cleaned_text, solution

In [58]:
df

Unnamed: 0.1,Unnamed: 0,idx,input,output,subject,answer,explanation,answer_str
0,0,0,Question: What does the verbal irony in this t...,The answer is Think answer,language science,-1,,Think answer
1,1,1,Question: Which animal's mouth is also adapted...,Answer: The answer is A.,natural science,0,,A
2,2,2,Question: Is this a sentence fragment?\nDuring...,Answer:,language science,-1,,
3,3,3,Question: Which correctly shows the title of a...,The answer is A.,language science,0,,A
4,4,4,Question: Does this passage describe the weath...,The answer is C.,natural science,2,,C
...,...,...,...,...,...,...,...,...
3211,3211,3211,Question: How long is an adult great white sha...,The answer is A.,natural science,0,,A
3212,3212,3212,Question: Which animal is also adapted to be c...,The answer is A.,natural science,0,,A
3213,3213,3213,Question: Compare the motion of two fish. Whic...,The answer is a.,natural science,0,,a
3214,3214,3214,Question: Which continent is highlighted?\n Ta...,The answer is Europe.,social science,2,,Europe


In [55]:
# This cell also repeats the steps I did while evaluating LLaVA outputs with benchmarking.ipynb. Commented out are the original lines, below are the lines in a new environment
# I am also not doing evaluation now -- only producing answer and solution columns as agreed for consistency
# The script is merely for the sake of documentation

setting_map = ['QTCH', 'QTCHL', 'QTCHLS', 'QTCHS']
# setting_map = ['1', '2', '3', '4']
# device = 'mps'

# def eval(RES_DIR:str, MODEL_NAME:str): #I kept this horrible name initially trying to rewrite as little given code as possible but I can't that's just too bad.
# I'm not sure why ScienceQA creators decided in favor of so many bad names -- maybe they're trying to create noisy data knowing that code generation models will be trained on open github repositories?
# if not I think they used this whale for their work: https://www.youtube.com/watch?v=q6dKllQzVxU&t=8s
def postprocess_and_evaluate(RES_DIR:str, MODEL_NAME:str, split:str="validation"):
    for i in range(4):
        # load data
        SETTING = setting_map[i]
        SETTING_NUM = i+1
        print(f"Metrics for setting {SETTING_NUM} {SETTING}")
        # path_to_df = f"benchmarking/{MODEL_NAME}/{MODEL_NAME}_val_output_setting_{SETTING}.csv"
        path_to_df = f"{full_input_dir}/{MODEL_NAME}_{split}_output_setting_{SETTING}.csv"
        df = pd.read_csv(path_to_df, sep="\t", header=0)[["idx", "input", "output"]] #avoid duplicate columns
        print(f"Loaded dataframe {path_to_df} with {len(df)} rows")
        # append validation data as accuracy evaluation will depend on it (as well as comparing selected answer to the given choices)
        hf_df = filtered_validation_data.to_pandas()
        hf_df = hf_df.reset_index()
        hf_df = hf_df.rename(columns={"index": "idx"})
        # check correct number of processed datapoints
        if len(hf_df) != len(df):
            print(f"WARNING: Data may be incomplete. Original dataset has {len(hf_df)} rows")
        df = pd.merge(df, hf_df, on="idx", how="inner")
        print(f"Merged dataframe with metadata has {len(df)} rows")
        # parse generated answer json
        # answer_pred_col, solution_pred_col = "answer_output", "solution_output"
        answer_pred_col, solution_pred_col = "answer_str", "explanation"
        df[[answer_pred_col, solution_pred_col]] = df["output"].apply(parse_output).apply(pd.Series)
        # extract answer number (index in choices)
        # df[answer_pred_col] = df.apply(lambda row: find_answer_num(row[answer_pred_col], row["choices"]), axis=1)
        df['answer'] = df.apply(lambda row: find_answer_num(row[answer_pred_col], row["choices"]), axis=1)
        print(df.head(4))
        # commenting out evaluation to keep only postprocessing
        # solution_preds =  df[solution_pred_col].tolist()
        # #accuracy per subject calculations to evaluate answers
        # scores = get_scores(df)
        # print_scores(scores)
        # #textual similarity to evaluate solutions
        # metrics = calculate_metrics_solutions(solution_preds, df)
        # scores.update(metrics)
        # dict_save = {'model': MODEL_NAME, 'setting': SETTING}
        # dict_save.update(scores)
        # df_save = pd.DataFrame(dict_save)
        # df_save.to_csv(f"{RES_DIR}/{MODEL_NAME}_val_metrics_setting_{SETTING_NUM}.csv", sep="\t", encoding="utf-8")
        df[["idx", "input", "output", "subject", "answer", "explanation", "answer_str"]].to_csv(f"{RES_DIR}/{MODEL_NAME}_{split}_output_setting_{SETTING_NUM}.csv", sep="\t", encoding="utf-8")

def find_answer_num(answer_output, choices):
    letter_answers = ["A", "B", "C", "D", "E"]
    # Answer is a letter, e.g. Answer: A
    if answer_output.strip().upper() in letter_answers:
        return letter_answers.index(answer_output.strip().upper())
    else:
        # Answer repeats the string of correct choice
        try:
            # ignore any uncoventional format of answer output; if it is a list or anything else it is simply incorrect, pass
            return int(np.where(choices == str(answer_output))[0][0])
        except IndexError:
            # Answer is a number, e.g. Answer: 1
            try:
                return int(answer_output.strip())
            except:
                return -1

In [56]:
postprocess_and_evaluate(full_res_dir, "google_paligemma-3b-ft-science-qa-224")

Metrics for setting 1 QTCH
Loaded dataframe /content/drive/MyDrive/foundation_models/benchmarking_val/outputs/paligemma_ft/raw_outputs/google_paligemma-3b-ft-science-qa-224_validation_output_setting_QTCH.csv with 3216 rows
Merged dataframe with metadata has 3216 rows
   idx                                              input  \
0    0  Question: What does the verbal irony in this t...   
1    1  Question: Which animal's mouth is also adapted...   
2    2  Question: Is this a sentence fragment?\nDuring...   
3    3  Question: Which correctly shows the title of a...   

                       output  \
0  The answer is Think answer   
1    Answer: The answer is A.   
2                     Answer:   
3            The answer is A.   

                                               image  \
0                                               None   
1  {'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...   
2                                               None   
3                                   

In [57]:
df = pd.read_csv("/content/drive/MyDrive/foundation_models/benchmarking_val/outputs/paligemma_ft/parsed_outputs/google_paligemma-3b-ft-science-qa-224_validation_output_setting_1.csv", delimiter="\t")
df.value_counts("answer") #508 of -1

Unnamed: 0_level_0,count
answer,Unnamed: 1_level_1
0,1410
-1,1018
1,361
2,131
3,121
4,85
13,6
6,5
8,5
5,4
