In [None]:
import numpy as np
from tqdm.notebook import tqdm
import benchmark
import utils
from openai_cache import Completion

In [None]:
def construct_placement_prompt(summary, objects, receptacles):
    """Build the few-shot prompt asking the model to place each object.

    The prompt consists of one fixed worked example (clothes/toys) followed by
    a new section filled in with the given summary, objects and receptacles.
    It ends with an open ``pick_and_place("<first object>",`` call so that the
    completion starts by naming the receptacle for the first object.

    Args:
        summary: Text inserted after ``# Summary:`` in the new section.
        objects: Sequence of object names; must be indexable and non-empty
            because ``objects[0]`` seeds the trailing open call.
        receptacles: Iterable of receptacle names.

    Returns:
        The formatted prompt string.

    Raises:
        IndexError: If ``objects`` is an empty sequence.
    """
    def as_quoted_list(names):
        # Render names as a bracketed, comma-separated list of double-quoted
        # items, e.g. ["a", "b"]. Names are inserted verbatim (no escaping).
        quoted = [f'"{name}"' for name in names]
        return '[' + ', '.join(quoted) + ']'

    few_shot_template = '''# Summary: Put clothes in the laundry basket and toys in the storage box.
objects = ["socks", "toy car", "shirt", "Lego brick"]
receptacles = ["laundry basket", "storage box"]
pick_and_place("socks", "laundry basket")
pick_and_place("toy car", "storage box")
pick_and_place("shirt", "laundry basket")
pick_and_place("Lego brick", "storage box")

# Summary: {summary}
objects = {objects_str}
receptacles = {receptacles_str}
pick_and_place("{first_object}",'''
    fields = {
        'summary': summary,
        'objects_str': as_quoted_list(objects),
        'receptacles_str': as_quoted_list(receptacles),
        'first_object': objects[0],
    }
    return few_shot_template.format(**fields)

In [None]:
def evaluate(scenarios, eval_split='unseen', model='text-davinci-003', verbose=False):
    """Prompt the model for placements on every scenario and score them.

    For each scenario, a prompt is built from ``scenario.annotator_notes``, the
    objects of the selected split and ``scenario.receptacles``. The text of the
    first completion choice is parsed with ``benchmark.parse_placements`` and
    scored against the split's reference placements with
    ``benchmark.check_placements``.

    Args:
        scenarios: Iterable of scenario objects exposing ``annotator_notes``,
            ``receptacles``, ``seen_objects``/``unseen_objects`` and
            ``seen_placements``/``unseen_placements``. ``len()`` is only
            required when ``verbose`` is true.
        eval_split: Either ``'seen'`` or ``'unseen'``; selects which objects
            and reference placements are used.
        model: Model name forwarded to ``Completion.create``.
        verbose: When true, print the prompt, the completion and a
            per-scenario breakdown of parsed versus correct placements.

    Returns:
        A list with one accuracy value per scenario (the second element
        returned by ``benchmark.check_placements``), in iteration order.

    Raises:
        AssertionError: If ``eval_split`` is not ``'seen'`` or ``'unseen'``.
    """
    assert eval_split in {'unseen', 'seen'}
    use_seen_split = eval_split == 'seen'
    client = Completion()
    per_scenario_accuracy = []
    for i, scenario in enumerate(tqdm(scenarios)):
        if verbose:
            print(f'Scenario {i + 1} of {len(scenarios)}\n')

        # Ask the model where each object of the chosen split should go.
        if use_seen_split:
            split_objects = scenario.seen_objects
        else:
            split_objects = scenario.unseen_objects
        prompt = construct_placement_prompt(scenario.annotator_notes, split_objects, scenario.receptacles)
        response = client.create(prompt, model)
        completion_text = response['choices'][0]['text']
        if verbose:
            # Prompt and completion are printed back to back (no newline
            # between them) so they read as one continuous listing.
            print(prompt, end='')
            utils.print_colored(completion_text, 'blue')
            print('\n' + 10 * '-' + '\n')

        # Compare the parsed completion against the reference placements.
        parsed = benchmark.parse_placements(completion_text, split_objects)
        if use_seen_split:
            reference = scenario.seen_placements
        else:
            reference = scenario.unseen_placements
        hit_flags, accuracy = benchmark.check_placements(parsed, reference)
        per_scenario_accuracy.append(accuracy)
        if verbose:
            print(f'Annotator notes: {scenario.annotator_notes}\n')
            print('Correct placements:')
            for expected in reference:
                print(expected)
            print('\nParsed placements:')
            for predicted, is_hit in zip(parsed, hit_flags):
                if is_hit:
                    colour = 'green'
                else:
                    colour = 'red'
                utils.print_colored(predicted, colour)
            print(f'\nAccuracy: {accuracy:.2f}')
            print('\n' + 80 * '-' + '\n')
    return per_scenario_accuracy

In [None]:
# Load the benchmark scenarios; the bare len() below is the cell's displayed
# output and shows how many scenarios were loaded.
scenarios = benchmark.load_scenarios()
len(scenarios)

In [None]:
# Score the 'unseen' split and display the mean accuracy rounded to 3 decimals.
# Swap in the commented-out call below to also print per-scenario details.
#accuracies = evaluate(scenarios, 'unseen', verbose=True)
accuracies = evaluate(scenarios, 'unseen')
np.mean(accuracies).round(3)

In [None]:
# Score the 'seen' split and display the mean accuracy rounded to 3 decimals.
# NOTE: this rebinds `accuracies`, replacing the result of any earlier
# evaluate() call that was stored under the same name.
# Swap in the commented-out call below to also print per-scenario details.
#accuracies = evaluate(scenarios, 'seen', verbose=True)
accuracies = evaluate(scenarios, 'seen')
np.mean(accuracies).round(3)