In [None]:
%load_ext autoreload
%autoreload 2


In [None]:
!pip install -r requirements.txt

In [48]:
import nest_asyncio
nest_asyncio.apply()
import asyncio


import re
import json


In [49]:
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import clear_output, display
import time

# Load the saved trajectories; later cells index this as bridge_trajs[i][...].
# NOTE(security): allow_pickle=True lets NumPy unpickle the file's contents,
# which can execute arbitrary code — only load this file from a trusted source.
bridge_trajs = np.load("assets/bridge_v2_10_trajs.npy", allow_pickle=True)

In [50]:
from vlm_autoeval_robot_benchmark.utils.ecot_primitives import ecot_primitive_movements, inverse_ecot_primitive_movements
from vlm_autoeval_robot_benchmark.models.vlm import VLM, create_vlm_request, parse_vlm_response


In [51]:
def show_trajectory_video(descriptors, delay=0.2, wait_for_key=False):
    """Show observations as a video with either delay between frames or keystroke.

    Every frame is drawn with a title that lists the task instruction, each
    entry of that step's ``moves`` dict, and the ground-truth action.

    Args:
        descriptors: Dictionary containing trajectory information. The keys
            read here are ``obs`` (frames), ``moves`` (per-step dict mapping a
            label to a ``(text, array)`` pair), ``gt_actions`` (per-step
            arrays) and ``task_language_instruction``.
        delay: Time delay in seconds between frames if wait_for_key is False
        wait_for_key: If True, wait for Enter between frames (delay is unused)
    """
    fig = plt.figure(figsize=(10, 10))
    try:
        for i, obs in enumerate(descriptors["obs"]):
            clear_output(wait=True)
            # Start each frame from an empty figure; otherwise every imshow
            # call stacks one more image on the same axes.
            fig.clear()
            ax = fig.add_subplot(111)
            ax.imshow(obs)

            title = f"Frame {i} - {descriptors['task_language_instruction']}"
            for k, v in descriptors['moves'][i].items():
                # v is a (text, array) pair; the array is shown as a plain list.
                text_actions = f"{v[0]} - {v[1].tolist()}"
                title += f"\n------------------------\n{k} - {text_actions}"
            gt_actions = descriptors['gt_actions'][i].tolist()
            title += f"\n------------------------\nGT actions: {gt_actions}"
            ax.set_title(title)
            ax.axis('off')
            display(fig)

            if wait_for_key:
                input("Press Enter to continue...")  # Wait for Enter
            else:
                time.sleep(delay)
    finally:
        # Close the figure even if playback is interrupted part-way through.
        plt.close(fig)

In [52]:
def repackage_to_episode(traj):
    """Turn a trajectory's parallel per-step sequences into an episode dict.

    Reads ``traj["observations"]`` and ``traj["actions"]`` and pairs them up by
    index, producing one step per observation.

    Returns:
        ``{"steps": [{"observation": ..., "action": ...}, ...]}``
    """
    num_steps = len(traj["observations"])
    return {
        "steps": [
            {
                "observation": traj["observations"][idx],
                "action": traj["actions"][idx],
            }
            for idx in range(num_steps)
        ]
    }

In [53]:
# Passed as `threshold` / `window` to get_move_primitives_episode in get_descriptors.
THRESHOLD = 0.00
WINDOW = 2
# Position of the gripper reading inside each observation's "state" vector.
GRIPPER_INDEX = 6
# Gripper readings strictly above this value are labelled "OPEN".
GRIPPER_OPEN_THRESHOLD = 0.95


def get_gripper_position(gripper_state: float) -> str:
    """Label a gripper reading as "OPEN" or "CLOSED".

    Args:
        gripper_state: The gripper reading to classify.

    Returns:
        "OPEN" when the reading is strictly greater than
        GRIPPER_OPEN_THRESHOLD, otherwise "CLOSED".
    """
    if gripper_state > GRIPPER_OPEN_THRESHOLD:
        return "OPEN"
    return "CLOSED"

def get_descriptors(traj):
    """Collect the per-step descriptors of one trajectory.

    Args:
        traj: Trajectory mapping. Reads ``observations`` (each with an
            ``images0`` frame and a ``state`` vector), ``actions`` and,
            when present, ``language``.

    Returns:
        Dict with keys ``moves`` (one ``{"ecot": move}`` dict per move primitive),
        ``obs`` (list of ``images0`` frames), ``gt_actions`` (``traj["actions"]``
        as stored), ``task_language_instruction`` (first ``language`` entry, or
        None when the trajectory has no ``language`` key) and ``gripper_states``
        ("OPEN"/"CLOSED" per observation).
    """
    episode = repackage_to_episode(traj)
    raw_moves = ecot_primitive_movements.get_move_primitives_episode(
        episode, threshold=THRESHOLD, window=WINDOW
    )
    moves = [{"ecot": primitive} for primitive in raw_moves]

    frames = [step_obs["images0"] for step_obs in traj["observations"]]
    actions = traj["actions"]
    gripper_labels = [
        get_gripper_position(step_obs["state"][GRIPPER_INDEX])
        for step_obs in traj["observations"]
    ]

    if "language" in traj:
        instruction = traj["language"][0]
    else:
        instruction = None

    return {
        "moves": moves,
        "obs": frames,
        "gt_actions": actions,
        "task_language_instruction": instruction,
        "gripper_states": gripper_labels,
    }

In [54]:
# Note: task instructions phrased in the PAST tense confuse the models, so the affected ones are rewritten in the imperative via task_edits below.

"""
0 abriu a gaveta
1 close the drawer
2 removed the green thing from the drawer and placed it on the left side of the table.
3 take the red object out of the pot and put it on the left burner
4 take the blue stuffed animal and leaves it inside the drawer.
5 open the drawer
6 close the drawer
7 Open the drawer
8 close the drawer
9 removed the blue object from the drawer and put it on the lower left side of the table
"""

# Imperative replacements for some of the instructions listed above,
# keyed by trajectory index.
task_edits = {
    0: "open the drawer",
    2: "remove the green thing from the drawer and place it on the left side of the table",
    4: "take the blue stuffed animal and leave it inside the drawer",
    9: "remove the blue object from the drawer and put it on the lower left side of the table",
}

# Overwrite the first language entry of each affected trajectory in place.
for traj_index, instruction in task_edits.items():
    bridge_trajs[traj_index]["language"][0] = instruction

In [55]:
# Select one trajectory by index and compute its descriptors.
traj_idx = 2  # Change this to visualize different trajectories
episode_descriptors = get_descriptors(bridge_trajs[traj_idx])
# Uncomment to play back every frame of the selected episode:
# show_trajectory_video(episode_descriptors, delay=0.15)

In [None]:
# Print each trajectory's index next to its task instruction.
for position, bridge_traj in enumerate(bridge_trajs):
    instruction = get_descriptors(bridge_traj)['task_language_instruction']
    print(position, instruction)

In [57]:
from primitive_moves_tester import run_test, print_test_results

In [58]:
# Re-select the trajectory chosen via traj_idx and rebuild its descriptors.
traj = bridge_trajs[traj_idx]
episode_descriptors = get_descriptors(traj)

In [59]:
import io
import numpy as np
from PIL import Image

def numpy_array_to_png_bytes(arr: np.ndarray) -> bytes:
    """Encode an image array as PNG and return the encoded file contents.

    The result is the same byte string you would get by saving the image as a
    PNG file and reading that file back.

    Args:
        arr: NumPy array with shape (height, width, 3). Arrays whose dtype is
            not uint8 are cast with ``astype(np.uint8)`` before encoding.

    Returns:
        PNG file bytes

    Raises:
        ValueError: If ``arr`` is not 3-dimensional with a last axis of size 3.
    """
    shape = arr.shape
    if len(shape) != 3 or shape[2] != 3:
        raise ValueError(f"Expected array with shape (height, width, 3), got {arr.shape}")

    # PIL is handed uint8 pixels; other dtypes are cast first.
    pixels = arr if arr.dtype == np.uint8 else arr.astype(np.uint8)

    # Encode into an in-memory buffer instead of a file on disk.
    with io.BytesIO() as png_buffer:
        Image.fromarray(pixels).save(png_buffer, format="PNG")
        return png_buffer.getvalue()

In [60]:
# Build a subsampled copy of episode_descriptors that keeps every
# `sub_step_interval`-th step of each per-step sequence.
sub_episode_descriptors = dict()
sub_step_interval = 5

for k, v in episode_descriptors.items():
    # Per-step data may be a list, a tuple or a NumPy array (gt_actions is
    # whatever traj["actions"] holds). All of them must be subsampled together,
    # otherwise frame i and action i would no longer refer to the same step.
    if isinstance(v, (list, tuple, np.ndarray)):
        sub_episode_descriptors[k] = v[::sub_step_interval]
    else:
        # Non-sequence values (e.g. the task instruction) are copied as-is.
        sub_episode_descriptors[k] = v

In [61]:
# Model identifier handed to create_vlm_request; alternatives kept for quick switching.
# model = "gpt-4o"
# model = "gemini/gemini-2.0-flash"
# model = "gemini/gemini-2.0-pro-exp"
model = "gemini/gemini-2.5-pro-preview-03-25"

# Fixed scene description included in every request.
env_desc = "You are looking at a wooden desk with a black robot arm. The desk has a drawer with a handle and some objects on it."
# Task instruction of the selected (subsampled) episode.
task_desc = sub_episode_descriptors["task_language_instruction"]
# One PNG-encoded frame per subsampled observation.
img_bytes_list = list(map(numpy_array_to_png_bytes, sub_episode_descriptors["obs"]))
# "OPEN"/"CLOSED" label per subsampled observation.
gripper_descriptors_list = sub_episode_descriptors["gripper_states"]

In [None]:
from vlm_autoeval_robot_benchmark.models.vlm import parse_vlm_responses
import litellm
# NOTE(review): private litellm helper; presumably switches on verbose debug
# logging — confirm, and consider removing it before sharing notebook outputs.
litellm._turn_on_debug()

# Text passed as `prefix` / `suffix` of the history_dict built in run_episode.
HISTORY_PREFIX = "This shows the history of a robotics episode."
HISTORY_SUFFIX = "Consider this history to answer the question below. Describe the history in detail before answering the question."

async def run_episode(model, env_desc, task_desc, img_bytes_list, gripper_descriptors):
    """Build one VLM request per frame, send them together and parse the replies.

    Args:
        model: Model identifier forwarded to create_vlm_request.
        env_desc: Environment description forwarded to create_vlm_request.
        task_desc: Task instruction forwarded to create_vlm_request.
        img_bytes_list: Encoded frames, one per step.
        gripper_descriptors: Gripper label per step, indexed like img_bytes_list.

    Returns:
        Tuple ``(results, responses, reqs)``: the parsed results from
        parse_vlm_responses, the raw responses from vlm.generate_parallel, and
        the list of requests that were sent.

    NOTE(review): each request's history holds a single earlier frame. The
    index ``-min(2, len(history))`` selects the frame two steps back (or the
    only earlier frame at the second step), not the last two frames — confirm
    this is intended rather than a slice.
    """
    vlm = VLM()
    requests = []
    past_frames = []
    for step, frame_bytes in enumerate(img_bytes_list):
        history = None
        if past_frames:
            lookback = min(2, len(past_frames))
            history = dict(
                prefix=HISTORY_PREFIX,
                vlm_inputs=[past_frames[-lookback]],
                suffix=HISTORY_SUFFIX,
            )
        request = create_vlm_request(
            model,
            frame_bytes,
            env_desc,
            task_desc,
            gripper_position=gripper_descriptors[step],
            history_dict=history,
        )
        requests.append(request)
        # The current frame becomes available as history for later steps.
        past_frames.append(("Historical image", [frame_bytes]))

    responses = await vlm.generate_parallel(requests)
    results = parse_vlm_responses(responses)
    return results, responses, requests

# Run all requests for the subsampled episode in parallel (run_episode sends
# them through vlm.generate_parallel) and print the parsed results.
# asyncio.run is invoked from inside the notebook here; the nest_asyncio.apply()
# call in the import cell is presumably what permits that — TODO confirm.
results, responses, reqs = asyncio.run(run_episode(
    model,
    env_desc,
    task_desc,
    img_bytes_list,
    gripper_descriptors_list
))
print_test_results(results)

In [None]:
# Results that carry no 'answer' key are counted as bad; show the first one's
# error and raw response for debugging.
bad = [result for result in results if 'answer' not in result]
print(f"num bad: {len(bad)}")
if bad:
    print(bad[0]['error'])
    print(bad[0]['raw_response'])

In [64]:

# Attach the VLM's description and predicted action to each subsampled step's
# moves dict so show_trajectory_video lists them next to the existing entries.
for t, res in enumerate(results):
    # Results without an 'answer' (the "bad" ones checked in the previous cell)
    # cannot be annotated; skip them instead of failing with a KeyError and
    # leaving the remaining steps unannotated.
    if 'answer' not in res:
        print(f"step {t}: no parsed answer, skipping VLM annotation")
        continue

    # Hard-wrap the description every 100 characters so the plot title stays readable.
    description = res['description']
    sub_episode_descriptors['moves'][t]['vlm - desc'] = ("\n".join([description[x:x+100] for x in range(0, len(description), 100)]), np.array([]))

    # Each answer entry is indexed as [0] (label) and [1] (magnitude); the
    # magnitude is only shown when it is non-zero.
    action_texts = []
    for k in ['x', 'y', 'z', 'tilt', 'roll', 'rotation', 'gripper']:
        action_texts.append(f"{res['answer'][k][0]}" + (f"({res['answer'][k][1]})" if res['answer'][k][1] != 0.0 else ""))

    calculated_actions = inverse_ecot_primitive_movements.text_to_move_vector(res['answer'])
    # Entries that rendered as the literal text 'None' are left out.
    formatted_action_text = " , ".join([a for a in action_texts if a != 'None'])
    sub_episode_descriptors['moves'][t]['vlm'] = (formatted_action_text, calculated_actions)

In [None]:
# Inspect the 'gripper' entry of the first parsed answer.
results[0]['answer']['gripper']
# Disabled check: print the move vector computed from each later answer.
# for res in results[7:]:
#     print(inverse_ecot_primitive_movements.text_to_move_vector(res['answer']))

In [None]:
# Step through the subsampled, VLM-annotated episode. With wait_for_key=True the
# function waits for Enter between frames, so the delay value is not used.
show_trajectory_video(sub_episode_descriptors, delay=5, wait_for_key=True)