In [1]:
import sys
import os
# Add the parent directory to the module search path.
# NOTE(review): presumably so that the `lerobot` package imported below resolves when the
# notebook is run from a sub-directory of the repository — confirm.
sys.path.append('../')

In [2]:
from pathlib import Path

import gym_pusht  # noqa: F401
import gymnasium as gym
import imageio
import numpy
import torch
from huggingface_hub import snapshot_download

from lerobot.common.policies.diffusion.modeling_diffusion import DiffusionPolicy

from lerobot.common.datasets.push_dataset_to_hub.utils import (
    concatenate_episodes,
    get_default_encoding,
    save_images_concurrently,
)
from lerobot.common.datasets.utils import (
    calculate_episode_data_index,
    hf_transform_to_torch,
)
from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Create the evaluation output directory (including missing parents; no error if it exists).
# NOTE(review): `output_directory` is rebound to "outputs/eval/dataset_test" further down,
# before anything is written into this directory.
output_directory = Path("outputs/eval/example_pusht_diffusion")
output_directory.mkdir(parents=True, exist_ok=True)

In [4]:
# Fetch the pretrained checkpoint and build the policy from it, in evaluation mode.
pretrained_policy_path = Path(snapshot_download("lerobot/diffusion_pusht"))
policy = DiffusionPolicy.from_pretrained(pretrained_policy_path)
policy.eval()

# Pick the compute device: CUDA when present, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("GPU is available. Device set to:", device)
else:
    print(f"GPU is not available. Device set to: {device}. Inference will be slower than on GPU.")
    # Decrease the number of reverse-diffusion steps (trades off a bit of quality for 10x speed)
    policy.diffusion.num_inference_steps = 10

# Move the policy onto the selected device.
policy = policy.to(device)

Fetching 11 files: 100%|███████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 184549.38it/s]


Loading weights from local directory
GPU is available. Device set to: cuda


In [5]:
# Create the PushT environment. The "pixels_agent_pos" observation type is what the
# rollout below relies on: it reads the "pixels" and "agent_pos" entries of each
# observation. Episodes are capped at 300 steps.
env = gym.make(
    "gym_pusht/PushT-v0",
    obs_type="pixels_agent_pos",
    max_episode_steps=300,
)

In [6]:
def build_ep_dict(
    ep_idx,
    observation_states: list,
    actions: list,
    rewards: list,
    dones: list,
    successes: list,
    frames: list,
    videos_dir: Path,
    fps=10,
):
    """Pack one recorded episode into a dict and write its frames to a video file.

    Args:
        ep_idx: index of the episode; used in the video file name and as the
            value of every ``episode_index`` entry.
        observation_states: per-step state tensors (stacked with ``torch.stack``).
        actions: per-step action tensors (stacked with ``torch.stack``).
        rewards: per-step rewards.
        dones: per-step done flags.
        successes: per-step success flags.
        frames: per-step rendered frames, saved with ``imageio.mimsave``.
        videos_dir: directory the video file is written into.
        fps: frame rate used for the video and for the per-frame timestamps.

    Returns:
        dict with the keys ``observation.image``, ``observation.state``,
        ``action``, ``episode_index``, ``frame_index``, ``timestamp``,
        ``next.reward``, ``next.done`` and ``next.success``.

    Raises:
        AssertionError: if any per-step list differs in length from ``frames``.
    """
    num_frames = len(frames)

    # Every per-step list must line up one-to-one with the rendered frames.
    for per_step in (observation_states, actions, rewards, dones, successes):
        assert len(per_step) == num_frames

    # Encode the frames as a video; the dict only stores a relative path plus
    # the timestamp of each frame inside that video.
    image_key = "observation.image"
    video_name = f"{image_key}_episode_{ep_idx:06d}.mp4"
    video_file = videos_dir / video_name
    imageio.mimsave(video_file, frames, fps=fps)
    print(f"Video of the evaluation is available in '{video_file}'.")

    video_refs = []
    for i in range(num_frames):
        video_refs.append({"path": f"videos/{video_name}", "timestamp": i / fps})

    frame_ids = torch.arange(0, num_frames, 1)
    return {
        image_key: video_refs,
        "observation.state": torch.stack(observation_states),
        "action": torch.stack(actions),
        "episode_index": torch.tensor([ep_idx] * num_frames, dtype=torch.int64),
        "frame_index": frame_ids,
        "timestamp": frame_ids / fps,
        "next.reward": torch.tensor(rewards),
        "next.done": torch.tensor(dones),
        "next.success": torch.tensor(successes),
    }

In [11]:
# Root folder of the test dataset and the sub-folder that receives the episode videos.
output_directory = Path("outputs/eval/dataset_test")
videos_dir = output_directory / 'videos'

# Creating the nested folder with parents=True also creates `output_directory`.
videos_dir.mkdir(parents=True, exist_ok=True)

In [35]:
def rollout_for_ep_dicts(num_episodes, videos_dir, seed=42):
    """Roll out the policy in the environment and collect one episode dict per rollout.

    Uses the notebook-level ``policy``, ``env`` and ``device`` objects and
    delegates the packing of each episode to ``build_ep_dict``.

    Args:
        num_episodes: number of episodes to roll out.
        videos_dir: directory forwarded to ``build_ep_dict`` for the episode videos.
        seed: seed passed to the first ``env.reset`` only. Later resets are
            unseeded so the environment's random generator keeps advancing;
            re-seeding on every reset would restart every episode from the
            same initial state.

    Returns:
        list with the dict returned by ``build_ep_dict`` for each episode, in order.
    """
    # Speed of the environment (i.e. its number of frames per second); it does not
    # change between episodes, so read it once.
    fps = env.metadata["render_fps"]

    ep_dicts = []

    for ep_idx in range(num_episodes):
        policy.reset()

        # Seed once for reproducibility, then let the generator run on.
        numpy_observation, _ = env.reset(seed=seed if ep_idx == 0 else None)

        frames = []
        observation_states = []
        actions = []
        rewards = []
        dones = []
        successes = []

        step = 0
        done = False
        while not done:
            frames.append(env.render())

            # Prepare observation for the policy running in Pytorch
            state = torch.from_numpy(numpy_observation["agent_pos"])
            image = torch.from_numpy(numpy_observation["pixels"])

            # Convert to float32, scaling the image from [0,255] to [0,1] and moving
            # it from channel last (h, w, c) to channel first (c, h, w).
            state_t = state.to(torch.float32)
            image = image.to(torch.float32) / 255
            image_t = image.permute(2, 0, 1)  # c x h x w

            # Send data tensors to the policy device
            state = state_t.to(device, non_blocking=True)
            image = image_t.to(device, non_blocking=True)

            # Add extra (empty) batch dimension, required to forward the policy
            state = state.unsqueeze(0)
            image = image.unsqueeze(0)

            # Create the policy input dictionary
            observation = {
                "observation.state": state,
                "observation.image": image,
            }

            # Predict the next action with respect to the current observation
            with torch.inference_mode():
                action = policy.select_action(observation)

            # Prepare the action for the environment
            action_t = action.squeeze(0).to("cpu")
            numpy_action = action_t.numpy()

            # Step through the environment and receive a new observation
            numpy_observation, reward, terminated, truncated, _ = env.step(numpy_action)
            if step % 100 == 0:
                print(f"{step=} {reward=} {terminated=}")

            # The rollout is considered done when the success state is reached (i.e. terminated
            # is True), or the maximum number of iterations is reached (i.e. truncated is True)
            done = bool(terminated or truncated)

            # Keep track of the CPU-side state and action together with the step outcome
            observation_states.append(state_t)
            actions.append(action_t)
            rewards.append(reward)
            successes.append(terminated)
            dones.append(done)

            step += 1

        if terminated:
            print("Success!")
        else:
            print("Failure!")

        ep_dicts.append(
            build_ep_dict(
                ep_idx,
                observation_states=observation_states,
                actions=actions,
                rewards=rewards,
                dones=dones,
                successes=successes,
                frames=frames,
                fps=fps,
                videos_dir=videos_dir,
            )
        )

    return ep_dicts

In [36]:
# Roll out two episodes; each episode's video is written into `videos_dir`.
ep_dicts = rollout_for_ep_dicts(num_episodes=2, videos_dir=videos_dir)

step=0 reward=0.0 terminated=False
step=100 reward=0.6541824427801204 terminated=False
step=200 reward=0.879881911712084 terminated=False




Success!




Video of the evaluation is available in 'outputs/eval/dataset_test/videos/observation.image_episode_000000.mp4'.
step=0 reward=0.0 terminated=False
step=100 reward=0.7817343968793377 terminated=False




Success!




Video of the evaluation is available in 'outputs/eval/dataset_test/videos/observation.image_episode_000001.mp4'.


In [37]:
from IPython.display import Video

# Display the video of the first recorded episode inline.
Video(videos_dir / 'observation.image_episode_000000.mp4', embed=True, width=640, height=360)

In [38]:
def convert_ep_dicts_to_data_dict(ep_dicts):
    """Merge per-episode dicts into a single dict and add a running ``index`` entry.

    Args:
        ep_dicts: list of episode dicts, passed as-is to ``concatenate_episodes``.

    Returns:
        The dict produced by ``concatenate_episodes`` with an extra ``"index"``
        tensor counting 0..N-1, where N is the length of its ``"frame_index"`` entry.
    """
    merged = concatenate_episodes(ep_dicts)
    # One global index per frame across all episodes.
    merged["index"] = torch.arange(0, merged["frame_index"].shape[0], 1)
    return merged

In [39]:
from lerobot.common.datasets.push_dataset_to_hub.pusht_zarr_format import to_hf_dataset, calculate_episode_data_index

In [40]:
# Take the frame rate from the environment instead of hard-coding it: the rollout
# encodes the videos and computes the per-frame timestamps with this same value,
# so the dataset metadata must agree with it.
fps = env.metadata["render_fps"]

# Merge the episodes, convert them to a Hugging Face dataset and compute the
# per-episode index ranges.
data_dict = convert_ep_dicts_to_data_dict(ep_dicts)
hf_dataset = to_hf_dataset(data_dict, video=True, keypoints_instead_of_image=False)
episode_data_index = calculate_episode_data_index(hf_dataset)

# Metadata passed to LeRobotDataset alongside the data.
info = {
    "codebase_version": CODEBASE_VERSION,
    "fps": fps,
    "video": True,
    "encoding": get_default_encoding(),
}

In [29]:
# Inspect the concatenated data. This displays the whole dict, one entry per frame
# for "observation.image", so the output is long.
data_dict

{'observation.image': [{'path': 'videos/observation.image_episode_000000.mp4',
   'timestamp': 0.0},
  {'path': 'videos/observation.image_episode_000000.mp4', 'timestamp': 0.1},
  {'path': 'videos/observation.image_episode_000000.mp4', 'timestamp': 0.2},
  {'path': 'videos/observation.image_episode_000000.mp4', 'timestamp': 0.3},
  {'path': 'videos/observation.image_episode_000000.mp4', 'timestamp': 0.4},
  {'path': 'videos/observation.image_episode_000000.mp4', 'timestamp': 0.5},
  {'path': 'videos/observation.image_episode_000000.mp4', 'timestamp': 0.6},
  {'path': 'videos/observation.image_episode_000000.mp4', 'timestamp': 0.7},
  {'path': 'videos/observation.image_episode_000000.mp4', 'timestamp': 0.8},
  {'path': 'videos/observation.image_episode_000000.mp4', 'timestamp': 0.9},
  {'path': 'videos/observation.image_episode_000000.mp4', 'timestamp': 1.0},
  {'path': 'videos/observation.image_episode_000000.mp4', 'timestamp': 1.1},
  {'path': 'videos/observation.image_episode_000000.

## Data save test: append the collected episodes to a Zarr store

In [47]:
import zarr
import numpy as np
import torch
import copy
import numcodecs

In [48]:
# Location of the Zarr store used for the save test.
output_path = "outputs/eval/dataset_test/zarr_test"

# Open the store in mode 'a'.
# NOTE(review): presumably 'a' keeps whatever is already stored at `output_path`, so
# re-running the append cell below would add the same rows again — confirm against the
# zarr documentation.
dataset_root = zarr.open(output_path, mode='a')

In [49]:
def append_episode_to_zarr(ep_dict, zarr_group):
    """Append every entry of ``ep_dict`` to the same-named array in ``zarr_group``.

    An array is created the first time a key is seen, with a zero-length first
    axis and one row per chunk; afterwards the data is appended along that axis.

    Args:
        ep_dict: mapping from key to array-like data whose first axis is the
            frame axis. Empty values are accepted.
        zarr_group: group that is tested with ``in``, indexed by key, and
            provides ``create_dataset``.
    """
    for key, value in ep_dict.items():
        # Convert once; the result provides shape and dtype and is what gets appended.
        data = np.asarray(value)

        # First time this key is seen: create its array.
        if key not in zarr_group:
            row_shape = data.shape[1:]

            # Object arrays (e.g. lists of dicts) need an object codec to be stored;
            # JSON is used for them. Deciding on the dtype rather than on value[0]
            # also works for empty input.
            object_codec = numcodecs.JSON() if data.dtype == object else None

            # No `maxshape` here: it is an h5py argument that zarr does not use
            # (zarr arrays can be appended to without it).
            zarr_group.create_dataset(
                key,
                shape=(0, *row_shape),
                chunks=(1, *row_shape),
                dtype=data.dtype,
                object_codec=object_codec,
            )

        # Append the data along the first axis.
        zarr_group[key].append(data)

In [51]:
# Same keys as `data_dict`; tensors are moved to the CPU and converted to numpy arrays,
# every other value is kept unchanged.
data_dict_to_save = dict(
    (name, entry.cpu().numpy()) if isinstance(entry, torch.Tensor) else (name, entry)
    for name, entry in data_dict.items()
)
data_dict_to_save

{'observation.image': [{'path': 'videos/observation.image_episode_000000.mp4',
   'timestamp': 0.0},
  {'path': 'videos/observation.image_episode_000000.mp4', 'timestamp': 0.1},
  {'path': 'videos/observation.image_episode_000000.mp4', 'timestamp': 0.2},
  {'path': 'videos/observation.image_episode_000000.mp4', 'timestamp': 0.3},
  {'path': 'videos/observation.image_episode_000000.mp4', 'timestamp': 0.4},
  {'path': 'videos/observation.image_episode_000000.mp4', 'timestamp': 0.5},
  {'path': 'videos/observation.image_episode_000000.mp4', 'timestamp': 0.6},
  {'path': 'videos/observation.image_episode_000000.mp4', 'timestamp': 0.7},
  {'path': 'videos/observation.image_episode_000000.mp4', 'timestamp': 0.8},
  {'path': 'videos/observation.image_episode_000000.mp4', 'timestamp': 0.9},
  {'path': 'videos/observation.image_episode_000000.mp4', 'timestamp': 1.0},
  {'path': 'videos/observation.image_episode_000000.mp4', 'timestamp': 1.1},
  {'path': 'videos/observation.image_episode_000000.

In [52]:
# Write every entry of the converted data into the opened Zarr group.
append_episode_to_zarr(data_dict_to_save, dataset_root)

  compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs)


# Test: load the collected episodes with LeRobotDataset

In [21]:
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset

# Build a LeRobotDataset from the objects created in this notebook (the Hugging Face
# dataset, the episode index ranges, the metadata and the videos directory).
# NOTE(review): 'place/holder' looks like a dummy repo id, since the data is passed in
# directly — confirm.
dataset = LeRobotDataset.from_preloaded(
    repo_id='place/holder',
    hf_dataset=hf_dataset,
    episode_data_index=episode_data_index,
    info=info,
    videos_dir=videos_dir,
)

In [22]:
# Show the dataset summary followed by the underlying Hugging Face dataset.
print(dataset, dataset.hf_dataset, sep="\n")

LeRobotDataset(
  Repository ID: 'place/holder',
  Split: 'train',
  Number of Samples: 420,
  Number of Episodes: 2,
  Type: video (.mp4),
  Recorded Frames per Second: 10,
  Camera Keys: ['observation.image'],
  Video Frame Keys: ['observation.image'],
  Transformations: None,
  Codebase Version: v1.6,
)
Dataset({
    features: ['observation.image', 'observation.state', 'action', 'episode_index', 'frame_index', 'timestamp', 'next.reward', 'next.done', 'next.success', 'index'],
    num_rows: 420
})


In [23]:
# Report basic statistics of the dataset in a single call, one line per statistic.
print(
    f"\naverage number of frames per episode: {dataset.num_samples / dataset.num_episodes:.3f}",
    f"frames per second used during data collection: {dataset.fps=}",
    f"keys to access images from cameras: {dataset.camera_keys=}\n",
    sep="\n",
)


average number of frames per episode: 210.000
frames per second used during data collection: dataset.fps=10
keys to access images from cameras: dataset.camera_keys=['observation.image']



In [24]:
# Look up the index range covered by the chosen episode.
episode_index = 0
from_idx, to_idx = (
    dataset.episode_data_index[bound][episode_index].item() for bound in ("from", "to")
)

print(f"episode {episode_index} start from index {from_idx} to index {to_idx}")

episode 0 start from index 0 to index 300


In [26]:
# Video frames are float32 in range [0,1] channel first (c,h,w) to follow pytorch convention.
# To visualize them, each frame is converted to uint8 in range [0,255] and to channel
# last (h,w,c) before being turned into a numpy array.
frames = [
    (dataset[idx]["observation.image"] * 255).type(torch.uint8).permute((1, 2, 0)).numpy()
    for idx in range(from_idx, to_idx)
]

In [27]:
# Build the target file name first, make sure its folder exists, then encode the frames.
video_path = f"outputs/examples/1_load_lerobot_dataset/episode_{episode_index}.mp4"
Path(video_path).parent.mkdir(parents=True, exist_ok=True)
imageio.mimsave(video_path, frames, fps=dataset.fps)

In [28]:
from IPython.display import Video

# Display the video that was just written to `video_path`.
Video(video_path, embed=True, width=640, height=360)