In [None]:
# Enable the autoreload extension in mode 2: imported modules are reloaded before
# each cell executes, so edits to project .py files take effect without a kernel restart.
%load_ext autoreload
%autoreload 2


In [None]:
!pip install -r requirements.txt

In [76]:
import nest_asyncio
nest_asyncio.apply()
import asyncio


import re
import json


In [77]:
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import clear_output, display
import time

# NOTE(security): allow_pickle=True lets np.load unpickle Python objects stored in the
# file, which can execute arbitrary code — only load this asset from a trusted source.
bridge_trajs = np.load("assets/bridge_v2_10_trajs.npy", allow_pickle=True)

In [78]:
from vlm_autoeval_robot_benchmark.utils.ecot_primitives import ecot_primitive_movements, inverse_ecot_primitive_movements
from vlm_autoeval_robot_benchmark.models.vlm import VLM, create_vlm_request, parse_vlm_response


In [79]:
def show_trajectory_video(descriptors, delay=0.2, wait_for_key=False):
    """Show observations as a video with either delay between frames or keystroke.

    Every frame is drawn with a title made of the frame index, the task
    instruction and each key/value pair of that frame's ``moves`` entry.

    Args:
        descriptors: Dictionary containing trajectory information. Reads the keys
            ``"obs"`` (sequence of frames passed to ``imshow``), ``"moves"``
            (per-frame dicts, indexed in step with ``"obs"``) and
            ``"task_language_instruction"``.
        delay: Time delay between frames if wait_for_key is False
        wait_for_key: If True, wait for Enter between frames; ``delay`` is
            ignored in that case.

    Raises:
        IndexError: If ``descriptors["moves"]`` has fewer entries than
            ``descriptors["obs"]``.
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    try:
        for i, obs in enumerate(descriptors["obs"]):
            clear_output(wait=True)
            # Reset the axes first: without this every frame adds another image
            # artist on top of the previous ones and redraws get slower and slower.
            ax.clear()
            ax.imshow(obs)
            title = f"Frame {i} - {descriptors['task_language_instruction']}"
            for k, v in descriptors['moves'][i].items():
                title += f"\n------------------------\n{k} - {v}"
            ax.set_title(title)
            ax.axis('off')  # must be re-applied because ax.clear() resets it
            display(fig)

            if wait_for_key:
                input("Press Enter to continue...")  # Blocks until Enter is pressed
            else:
                time.sleep(delay)
    finally:
        # Release the figure even when playback is interrupted (e.g. the kernel is
        # interrupted while waiting at the prompt).
        plt.close(fig)

In [80]:
def repackage_to_episode(traj):
    """Turn a trajectory of parallel sequences into an episode of per-step dicts.

    Args:
        traj: Mapping with "observations" and "actions" entries that can be
            indexed by step number.

    Returns:
        ``{"steps": [...]}`` holding one ``{"observation": ..., "action": ...}``
        dict for every observation in the trajectory.
    """
    step_count = len(traj["observations"])
    return {
        "steps": [
            {"observation": traj["observations"][idx], "action": traj["actions"][idx]}
            for idx in range(step_count)
        ]
    }

In [81]:
def get_descriptors(traj):
    """Bundle the per-step information of one trajectory into a single dict.

    Args:
        traj: Trajectory mapping. Reads "observations" (each entry providing
            "images0"), "actions" and, when present, "language".

    Returns:
        Dict with the keys:
            moves: one ``{"ecot": move}`` dict per primitive returned by
                ``ecot_primitive_movements.get_move_primitives_episode``.
            obs: the "images0" entry of every observation.
            gt_actions: ``traj["actions"]``, passed through unchanged.
            task_language_instruction: first element of ``traj["language"]``,
                or None when the trajectory has no "language" entry.
    """
    episode = repackage_to_episode(traj)
    primitives = ecot_primitive_movements.get_move_primitives_episode(episode, threshold=0.00)
    moves = [{"ecot": primitive} for primitive in primitives]
    frames = [observation["images0"] for observation in traj["observations"]]
    actions = traj["actions"]
    if "language" in traj:
        instruction = traj["language"][0]
    else:
        instruction = None
    return {
        "moves": moves,
        "obs": frames,
        "gt_actions": actions,
        "task_language_instruction": instruction,
    }

In [82]:
# Pick one trajectory and build its descriptors; uncomment the last line to play it back.
traj_idx = 1  # Change this to visualize different trajectories
episode_descriptors = get_descriptors(bridge_trajs[traj_idx])
# show_trajectory_video(episode_descriptors, delay=0.15)

In [None]:
# Print the index and task instruction of every loaded trajectory.
# Note: this builds the full descriptors for each trajectory just to read the instruction.
for i, traj in enumerate(bridge_trajs):
    descriptors = get_descriptors(traj)
    print(i, descriptors['task_language_instruction'])

In [84]:
from primitive_moves_tester import run_test, print_test_results

In [85]:
# Re-bind `traj` to the selected trajectory (the listing loop above leaves it pointing at
# the last one) and rebuild the descriptors for traj_idx.
traj = bridge_trajs[traj_idx]
episode_descriptors = get_descriptors(bridge_trajs[traj_idx])

In [86]:
import io
import numpy as np
from PIL import Image

def numpy_array_to_png_bytes(arr: np.ndarray) -> bytes:
    """
    Encode an RGB image array as PNG and return the encoded file contents.

    The result is what reading back a PNG file written from ``arr`` would give,
    but the encoding happens entirely in memory.

    Args:
        arr: NumPy array with shape (height, width, 3). Arrays whose dtype is not
            uint8 are converted with ``astype(np.uint8)`` (a plain cast, values
            are not rescaled).

    Returns:
        PNG file bytes

    Raises:
        ValueError: If ``arr`` is not 3-dimensional with a third axis of size 3.
    """
    is_rgb = len(arr.shape) == 3 and arr.shape[2] == 3
    if not is_rgb:
        raise ValueError(f"Expected array with shape (height, width, 3), got {arr.shape}")

    pixels = arr if arr.dtype == np.uint8 else arr.astype(np.uint8)

    # Let PIL write the PNG into an in-memory buffer and hand back its contents.
    with io.BytesIO() as buffer:
        Image.fromarray(pixels).save(buffer, format="PNG")
        return buffer.getvalue()

In [87]:
# Build a thinned-out copy of the episode descriptors that keeps every
# `sub_step_interval`-th step.
sub_step_interval = 2
sub_episode_descriptors = {}

for k, v in episode_descriptors.items():
    # Only list-valued entries are subsampled; every other value is carried over unchanged.
    sub_episode_descriptors[k] = v[::sub_step_interval] if isinstance(v, list) else v

In [88]:
# Model identifier used for the VLM requests; the commented lines are alternatives.
# model = "gpt-4o"
# model = "gemini/gemini-2.0-flash"
# model = "gemini/gemini-2.0-pro-exp"
model = "gemini/gemini-2.5-pro-preview-03-25"

# Free-text description of the scene.
env_desc = "You are looking at a wooden desk with a black robot arm. The desk has a drawer with a handle and some objects on it."
# Task instruction taken from the subsampled episode descriptors.
task_desc = sub_episode_descriptors["task_language_instruction"]
# PNG-encode every subsampled observation frame.
img_bytes = [numpy_array_to_png_bytes(img) for img in sub_episode_descriptors["obs"]]

In [None]:
from vlm_autoeval_robot_benchmark.models.vlm import parse_vlm_responses

async def run_episode(model, env_desc, task_desc, img_bytes_list):
    """Query the VLM once per image and parse the replies.

    Args:
        model: Model identifier forwarded to ``create_vlm_request``.
        env_desc: Environment description forwarded to ``create_vlm_request``.
        task_desc: Task description forwarded to ``create_vlm_request``.
        img_bytes_list: Iterable of encoded images; one request is built per item.

    Returns:
        Tuple ``(results, responses)``: the output of ``parse_vlm_responses`` and
        the value returned by ``VLM.generate_parallel`` that it was parsed from.
    """
    client = VLM()
    requests = []
    for frame_bytes in img_bytes_list:
        requests.append(create_vlm_request(model, frame_bytes, env_desc, task_desc))
    raw_responses = await client.generate_parallel(requests)
    return parse_vlm_responses(raw_responses), raw_responses

# Run all tests in parallel
# NOTE: asyncio.run is invoked from inside the notebook, where an event loop is normally
# already running; this relies on nest_asyncio.apply() having been executed earlier.
results, responses = asyncio.run(run_episode(
    model,
    env_desc,
    task_desc,
    img_bytes,  # assuming img_bytes is your list of image bytes
))
print_test_results(results)

In [None]:
# Collect the results that carry no 'answer' entry and show the first one for debugging.
# Assumes such results provide 'error' and 'raw_response' keys — TODO confirm against
# parse_vlm_responses.
bad = [r for r in results if 'answer' not in r]
print(f"num bad: {len(bad)}")
if len(bad) > 0:
    print(bad[0]['error'])
    print(bad[0]['raw_response'])

In [91]:
# Attach the VLM output of every frame to that frame's `moves` entry so it appears in the
# title drawn by show_trajectory_video.
for t, res in enumerate(results):
    if 'answer' not in res:
        # Results without a parsed answer would raise KeyError below; label the frame
        # instead and drop any description left over from an earlier run of this cell.
        error_text = res['error'] if 'error' in res else 'unknown error'
        sub_episode_descriptors['moves'][t].pop('vlm - desc', None)
        sub_episode_descriptors['moves'][t]['vlm'] = f"<no parsed answer: {error_text}>"
        continue
    description = res['description']
    # Hard-wrap the description every 100 characters so the title stays readable.
    sub_episode_descriptors['moves'][t]['vlm - desc'] = "\n".join([description[x:x+100] for x in range(0, len(description), 100)])
    action_texts = []
    for k in ['x', 'y', 'z', 'tilt', 'roll', 'rotation', 'gripper']:
        # Element 0 is printed as-is; element 1 is appended in parentheses unless it
        # equals the string '0.0'.
        action_texts.append(f"{res['answer'][k][0]}" + (f"({res['answer'][k][1]})" if res['answer'][k][1] != '0.0' else ""))
    # Entries that render exactly as 'None' are left out of the summary.
    sub_episode_descriptors['moves'][t]['vlm'] = " , ".join([a for a in action_texts if a != 'None'])

In [None]:
# Step through the annotated frames one at a time. With wait_for_key=True the function
# waits for Enter between frames, so the delay argument is not used.
show_trajectory_video(sub_episode_descriptors, delay=5, wait_for_key=True)

In [93]:
# Use the first trajectory for the state/action inspection below.
# NOTE: this rebinds `traj`, which earlier cells set to bridge_trajs[traj_idx].
traj = bridge_trajs[0]

In [94]:
# Gather the 'state' entry of every observation, and the actions, into NumPy arrays.
states = np.array([obs['state'] for obs in traj['observations']])
actions = np.array(traj['actions'])

In [None]:
# Display the shapes of both arrays as the cell output.
states.shape, actions.shape

In [None]:
# Print the states at steps i and i+1, their difference, and the action recorded at
# step i (all rounded to 5 decimals) — presumably to compare the state change with the
# commanded action.
i = 2
print(states[i].round(5))
print(states[i+1].round(5))
print((states[i+1] - states[i]).round(5))

print(actions[i].round(5))