In [None]:
import numpy as np
from tqdm.notebook import tqdm
import benchmark
import utils
from openai_cache import Completion

In [None]:
def construct_placement_prompt(scenario):
    """Build the few-shot text prompt for one scenario.

    The prompt lists the scenario's seen objects and receptacles, followed by
    one ``pick_and_place("<first>", "<second>")`` line per seen placement as a
    worked example. After a whitespace-only separator line it lists the unseen
    objects and the same receptacles, and ends with an unfinished
    ``pick_and_place("<first unseen object>",`` call left open for the model
    to continue.

    Args:
        scenario: Object exposing ``seen_objects``, ``receptacles``,
            ``seen_placements`` and ``unseen_objects``. Each seen placement is
            indexed with ``[0]`` and ``[1]`` (presumably object and
            receptacle -- confirm against the benchmark data).

    Returns:
        The prompt as a single string.

    Raises:
        IndexError: If ``scenario.unseen_objects`` is an empty sequence.
    """
    def quoted_list(names):
        # Render names as ["a", "b", ...] with double-quoted entries.
        return '[' + ', '.join([f'"{name}"' for name in names]) + ']'

    # The fourth line is intentionally four spaces: it is part of the prompt
    # text, so it is written explicitly to survive editor whitespace trimming.
    template = (
        'objects = {seen_objects_str}\n'
        'receptacles = {receptacles_str}\n'
        '{seen_placements_str}\n'
        '    \n'
        'objects = {unseen_objects_str}\n'
        'receptacles = {receptacles_str}\n'
        'pick_and_place("{first_object}",'
    )

    seen_block = quoted_list(scenario.seen_objects)
    receptacle_block = quoted_list(scenario.receptacles)
    example_lines = [
        f'pick_and_place("{pair[0]}", "{pair[1]}")'
        for pair in scenario.seen_placements
    ]
    unseen_block = quoted_list(scenario.unseen_objects)

    return template.format(
        seen_objects_str=seen_block,
        receptacles_str=receptacle_block,
        seen_placements_str='\n'.join(example_lines),
        unseen_objects_str=unseen_block,
        first_object=scenario.unseen_objects[0],
    )

In [None]:
def evaluate(scenarios, model='text-davinci-003', verbose=False):
    """Run the placement benchmark over ``scenarios`` and collect accuracies.

    For each scenario a prompt is built with ``construct_placement_prompt``,
    sent through a ``Completion`` client, and the text of the first returned
    choice is parsed and scored with ``benchmark.parse_placements`` and
    ``benchmark.check_placements``. Progress is shown with a tqdm bar.

    Args:
        scenarios: Iterable of scenario objects. ``len(scenarios)`` is only
            needed when ``verbose`` is true.
        model: Model name forwarded as the second argument of
            ``Completion.create``.
        verbose: When true, print the prompt, the raw completion, the
            annotator notes, the expected and parsed placements, and the
            accuracy for every scenario.

    Returns:
        A list with one accuracy value per scenario, in input order, as
        returned by ``benchmark.check_placements``.
    """
    client = Completion()
    scores = []
    for position, scenario in enumerate(tqdm(scenarios), start=1):
        if verbose:
            print(f'Scenario {position} of {len(scenarios)}\n')

        # Ask the model to continue the few-shot placement prompt.
        prompt = construct_placement_prompt(scenario)
        response = client.create(prompt, model)
        generated_text = response['choices'][0]['text']
        if verbose:
            # The prompt ends mid-call, so the completion is printed right after it.
            print(prompt, end='')
            utils.print_colored(generated_text, 'blue')
            print('\n' + 10 * '-' + '\n')

        # Parse the completion and score it against the expected placements.
        parsed = benchmark.parse_placements(generated_text, scenario.unseen_objects)
        flags, score = benchmark.check_placements(parsed, scenario.unseen_placements)
        scores.append(score)
        if not verbose:
            continue

        # Detailed per-scenario report.
        print(f'Annotator notes: {scenario.annotator_notes}\n')
        print('Correct placements:')
        for expected in scenario.unseen_placements:
            print(expected)
        print('\nParsed placements:')
        for predicted, is_correct in zip(parsed, flags):
            color = 'green' if is_correct else 'red'
            utils.print_colored(predicted, color)
        print(f'\nAccuracy: {score:.2f}')
        print('\n' + 80 * '-' + '\n')
    return scores

In [None]:
# Load the benchmark scenarios; the bare expression below makes the cell
# display how many were loaded.
scenarios = benchmark.load_scenarios()
len(scenarios)

In [None]:
# Score every scenario with the default model and display the mean accuracy
# rounded to three decimals. For a per-scenario printout of prompts,
# completions, and parsed placements, call evaluate(scenarios, verbose=True).
accuracies = evaluate(scenarios)
np.mean(accuracies).round(3)