## Benchmarking saved LLM results on 3-Shot AugARC

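### This script evaluates saved LLM predictions on the 3-Shot AugARC evaluation set.

Each task gets up to three attempts; attempt 0 is compared with the original ground truth, attempt 1 with the ground truth rotated by 90 degrees, and attempt 2 with the ground truth rotated by 270 degrees.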

In [None]:
import json


# Name of the model whose saved predictions are scored; it selects the
# results file below.
model_name = 'Llama-3-8B'
filename = f'{model_name}_AugARC_Evaluation_Results_Dict.json'

# Saved predictions, indexed later as data[task_id][attempt] (or
# data[task_id][test_index][attempt] for the special tasks).
# No empty-dict fallback is created first: if the file cannot be loaded the
# error should surface here instead of later cells silently scoring nothing.
# JSON text is UTF-8, so the encoding is stated rather than left to the
# platform default.
with open(filename, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
# Task IDs that are handled specially: their test cases are rendered and
# scored one by one (keys '0' and '1'), and the task counts as solved only
# when both are answered correctly.
special_task_ids = [
    '12997ef3',
    '1d398264',
    '31d5ba1a',
    '3b4c2228',
    '4852f2fa',
    '4c177718',
    '5d2a5c43',
    '6ea4a07e',
    '8b28cd80',
    '9110e3c5',
    '9b4c17c4',
    'b1fc8b8e',
    'bbb1b8b6',
    'c074846d',
    'd5c634a2',
    'da2b0fe3',
    'e21a174a',
    'e345f17b',
    'f3e62deb',
]

In [None]:
import os
import json

def read_json_files(directory):
    """Load every ``.json`` file in *directory* into a dict keyed by file stem.

    Args:
        directory: Path of the folder to scan (sub-folders are not visited).

    Returns:
        dict mapping each file name, minus its ``.json`` extension, to the
        parsed JSON content. Files that are not valid JSON are reported on
        stdout and left out of the result.
    """
    json_data = {}

    # Sorted so the resulting dict (and everything derived from it) has a
    # reproducible order; os.listdir makes no ordering promise.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.json'):
            continue

        # Use the real stem as the key instead of slicing a fixed number of
        # characters off the path, which is only right for 8-character names.
        task_id = os.path.splitext(filename)[0]
        file_path = os.path.join(directory, filename)

        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                json_data[task_id] = json.load(file)
            except json.JSONDecodeError as e:
                # Best effort: skip the broken file but say which one it was.
                print(f"Error reading {filename}: {e}")

    return json_data

# Folder scanned for task files; every *.json file found there is loaded.
directory = 'arc_data/evaluation'
# Parsed task files, keyed by the identifier read_json_files derives from
# each file name (used as the task ID in the cells below).
all_json_data = read_json_files(directory)

In [None]:
def rotate_matrix_90_degrees(matrix):
    """Return a copy of *matrix* (a list of rows) rotated 90 degrees clockwise.

    The rows are taken bottom-to-top and then transposed, so the first column
    read upwards becomes the first row. An empty matrix yields ``[]``.
    """
    bottom_up = matrix[::-1]
    rotated = []
    for column in zip(*bottom_up):
        rotated.append(list(column))
    return rotated

def rotate_matrix_270_degrees(matrix):
    """Return a copy of *matrix* rotated 270 degrees clockwise.

    This equals a 90-degree counter-clockwise turn: the matrix is transposed
    and the resulting rows are then reversed. An empty matrix yields ``[]``.
    """
    transposed = [list(column) for column in zip(*matrix)]
    transposed.reverse()
    return transposed

def transform_input_90(data, special_task=False):
    """Render a task as prompt text with every grid rotated by 90 degrees.

    Each case's 'input' and 'output' grids are rotated with
    ``rotate_matrix_90_degrees`` and written as
    ``\\n###Input:\\n<grid>\\n###Output:\\n<grid>``, where a grid is one text
    line per row with cells separated by single spaces.

    Args:
        data: Task dict with 'train' and 'test' lists; every case in them is
            a dict holding 'input' and 'output' grids (lists of rows).
        special_task: When True, the 'test' entry of the result is a dict
            keyed by the case index as a string instead of a single string.
            Keys '0' and '1' always exist and default to ''.

    Returns:
        dict with keys 'train' and 'test'. 'train' is the concatenation of
        all rendered training cases; 'test' is the same for the test cases,
        or the per-case dict described above when *special_task* is True.
    """

    def grid_to_text(grid):
        # One line per row; empty rows emit nothing, not even a newline.
        return ''.join(
            ' '.join(str(cell) for cell in row) + '\n'
            for row in grid
            if len(row) > 0
        )

    def render_case(case):
        # Rotate both grids first, then serialise them.
        input_matrix = rotate_matrix_90_degrees(case['input'])
        output_matrix = rotate_matrix_90_degrees(case['output'])
        return (
            f'\n###Input:\n{grid_to_text(input_matrix)}'
            f'\n###Output:\n{grid_to_text(output_matrix)}'
        )

    transformed_data = {}

    for key in ['train', 'test']:
        if key == 'test' and special_task:
            # Test cases are kept apart so each can be scored on its own.
            transformed_data_special = {'0': '', '1': ''}
            for indx, case in enumerate(data[key]):
                transformed_data_special[str(indx)] = render_case(case)
            transformed_data[key] = transformed_data_special
        else:
            transformed_data[key] = ''.join(
                render_case(case) for case in data[key]
            )

    return transformed_data

In [None]:
def transform_input_270(data, special_task=False):
    """Render a task as prompt text with every grid rotated by 270 degrees.

    Each case's 'input' and 'output' grids are rotated with
    ``rotate_matrix_270_degrees`` and written as
    ``\\n###Input:\\n<grid>\\n###Output:\\n<grid>``, where a grid is one text
    line per row with cells separated by single spaces.

    Args:
        data: Task dict with 'train' and 'test' lists; every case in them is
            a dict holding 'input' and 'output' grids (lists of rows).
        special_task: When True, the 'test' entry of the result is a dict
            keyed by the case index as a string instead of a single string.
            Keys '0' and '1' always exist and default to ''.

    Returns:
        dict with keys 'train' and 'test'. 'train' is the concatenation of
        all rendered training cases; 'test' is the same for the test cases,
        or the per-case dict described above when *special_task* is True.
    """

    def grid_to_text(grid):
        # One line per row; empty rows emit nothing, not even a newline.
        return ''.join(
            ' '.join(str(cell) for cell in row) + '\n'
            for row in grid
            if len(row) > 0
        )

    def render_case(case):
        # Rotate both grids first, then serialise them.
        input_matrix = rotate_matrix_270_degrees(case['input'])
        output_matrix = rotate_matrix_270_degrees(case['output'])
        return (
            f'\n###Input:\n{grid_to_text(input_matrix)}'
            f'\n###Output:\n{grid_to_text(output_matrix)}'
        )

    transformed_data = {}

    for key in ['train', 'test']:
        if key == 'test' and special_task:
            # Test cases are kept apart so each can be scored on its own.
            transformed_data_special = {'0': '', '1': ''}
            for indx, case in enumerate(data[key]):
                transformed_data_special[str(indx)] = render_case(case)
            transformed_data[key] = transformed_data_special
        else:
            transformed_data[key] = ''.join(
                render_case(case) for case in data[key]
            )

    return transformed_data

In [None]:
def transform_input(data, special_task=False):
    """Render a task as prompt text, keeping every grid in its given orientation.

    Each case is written as ``\\n###Input:\\n<grid>\\n###Output:\\n<grid>``,
    where a grid is one text line per row with cells separated by single
    spaces.

    Args:
        data: Task dict with 'train' and 'test' lists; every case in them is
            a dict holding 'input' and 'output' grids (lists of rows).
        special_task: When True, the 'test' entry of the result is a dict
            keyed by the case index as a string instead of a single string.
            Keys '0' and '1' always exist and default to ''.

    Returns:
        dict with keys 'train' and 'test'. 'train' is the concatenation of
        all rendered training cases; 'test' is the same for the test cases,
        or the per-case dict described above when *special_task* is True.
    """

    def grid_to_text(grid):
        # One line per row; empty rows emit nothing, not even a newline.
        return ''.join(
            ' '.join(str(cell) for cell in row) + '\n'
            for row in grid
            if len(row) > 0
        )

    def render_case(case):
        input_matrix = case['input']
        output_matrix = case['output']
        return (
            f'\n###Input:\n{grid_to_text(input_matrix)}'
            f'\n###Output:\n{grid_to_text(output_matrix)}'
        )

    transformed_data = {}

    for key in ['train', 'test']:
        if key == 'test' and special_task:
            # Test cases are kept apart so each can be scored on its own.
            transformed_data_special = {'0': '', '1': ''}
            for indx, case in enumerate(data[key]):
                transformed_data_special[str(indx)] = render_case(case)
            transformed_data[key] = transformed_data_special
        else:
            transformed_data[key] = ''.join(
                render_case(case) for case in data[key]
            )

    return transformed_data

In [None]:
# Render every loaded task three times: as given, rotated by 90 degrees and
# rotated by 270 degrees. Tasks listed in special_task_ids are rendered with
# special_task=True so that their test cases stay separate.
eval_data = {
    task_id: transform_input(task, task_id in special_task_ids)
    for task_id, task in all_json_data.items()
}

eval_data_90 = {
    task_id: transform_input_90(task, task_id in special_task_ids)
    for task_id, task in all_json_data.items()
}

eval_data_270 = {
    task_id: transform_input_270(task, task_id in special_task_ids)
    for task_id, task in all_json_data.items()
}

In [None]:
def extract_after_output(text):
    """Return the part of *text* that follows the first '###Output:\\n' marker.

    If the marker does not occur, *text* is returned unchanged.
    """
    _, marker, tail = text.partition('###Output:\n')
    # partition() reports a missing separator as an empty middle element.
    return tail if marker else text

def extract_before_output(text):
    """Return the part of *text* that precedes the first '###Output:\\n' marker.

    If the marker does not occur, *text* is returned unchanged.
    """
    # With no marker present, partition() hands back the whole text as head.
    head, _, _ = text.partition('###Output:\n')
    return head

In [None]:
total_correct = 0
correct_tasks = []
count = 0  # never updated in this cell; kept so the name stays defined

# Attempt j is scored against the ground truth rendered by the j-th entry:
# original orientation, rotated by 90 degrees, rotated by 270 degrees.
_references_by_attempt = (eval_data, eval_data_90, eval_data_270)


def _prediction(key, attempt, indx=None):
    """Return the stripped saved prediction, or None when it is unavailable.

    *indx* selects the test case for special tasks, whose predictions are
    nested one level deeper (data[key][str(indx)][str(attempt)]).
    """
    try:
        entry = data[key] if indx is None else data[key][str(indx)]
        return entry[str(attempt)].strip()
    except (KeyError, TypeError, AttributeError):
        # Missing task/attempt, or a stored value that is not a string.
        return None


def _expected(rendered, key, indx=None):
    """Return the stripped ground-truth output text from a rendered task."""
    test_text = rendered[key]['test']
    if indx is not None:
        test_text = test_text[str(indx)]
    return extract_after_output(test_text).strip()


def _solved(key, indx=None):
    """Return True if any of the three attempts matches its ground truth."""
    for attempt, rendered in enumerate(_references_by_attempt):
        current_solution = _prediction(key, attempt, indx)
        if current_solution is None:
            # No further attempts were saved. Stop without comparing:
            # reusing an earlier prediction here could yield a false match.
            return False
        if current_solution == _expected(rendered, key, indx):
            return True
    return False


for key in eval_data:
    if key in special_task_ids:
        # Special tasks count only when both test cases ('0' and '1') are
        # solved.
        task_solved = all(_solved(key, indx) for indx in range(2))
    else:
        task_solved = _solved(key)

    if task_solved:
        total_correct += 1
        correct_tasks.append(key)

In [None]:
# Report how many tasks were counted as solved.
print(total_correct)