# LeRobot Policy : ACT (Action Chunking with Transformers) 

---

- Conda env : [lerobot](../README.md#setup-a-conda-environment)

----

- Ref: 
    - ...

## Device Setup

In [20]:
import torch

# Pick the compute backend in priority order: Apple MPS first, then CUDA,
# and finally the CPU when no accelerator is reported as available.
device = (
    "mps"
    if torch.backends.mps.is_available()
    else "cuda"
    if torch.cuda.is_available()
    else "cpu"
)

print(f"Available device : {device}")

Available device : cuda


In [10]:
# Show the GPU status table via the `nvidia-smi` shell command, but only
# when the CUDA backend was selected for `device`.
if device == "cuda":
    !nvidia-smi

Thu Sep 11 10:46:01 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 575.57.08              Driver Version: 575.57.08      CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti     On  |   00000000:01:00.0  On |                  N/A |
| 25%   44C    P5             36W /  250W |    1389MiB /  11264MiB |     19%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Dataset (aloha_sim_transfer_cube_human) Visualization

In [2]:
# Run lerobot's `visualize_dataset` script as a shell command on episode 0
# of the `lerobot/aloha_sim_transfer_cube_human` dataset.
!python -m lerobot.scripts.visualize_dataset \
    --repo-id lerobot/aloha_sim_transfer_cube_human  \
    --episode-index 0

Resolving data files: 100%|██████████████████| 50/50 [00:00<00:00, 77758.69it/s]
[0m[38;5;8m[[0m2025-09-10T20:49:57Z [0m[32mINFO [0m winit::platform_impl::linux::x11::window[0m[38;5;8m][0m Guessed window scale factor: 1
[0m[38;5;8m[[0m2025-09-10T20:49:58Z [0m[33mWARN [0m wgpu_hal::gles::egl[0m[38;5;8m][0m No config found!
[0m[38;5;8m[[0m2025-09-10T20:49:58Z [0m[33mWARN [0m wgpu_hal::gles::egl[0m[38;5;8m][0m EGL says it can present to the window but not natively
  0%|                                                    | 0/13 [00:00<?, ?it/s][0m[38;5;8m[[0m2025-09-10T20:49:58Z [0m[33mWARN [0m wgpu_hal::gles::adapter[0m[38;5;8m][0m Max vertex attribute stride unknown. Assuming it is 2048
[0m[38;5;8m[[0m2025-09-10T20:49:58Z [0m[33mWARN [0m wgpu_hal::vulkan::conv[0m[38;5;8m][0m Unrecognized present mode 1000361000
[0m[38;5;8m[[0m2025-09-10T20:49:58Z [0m[33mWARN [0m wgpu_hal::gles::adapter[0m[38;5;8m][0m Max vertex attribute stride unknow

## Training the ACT Model from Scratch

In [2]:
import os

# Directory passed to the training script as --output_dir; later cells also
# build the checkpoint and config paths from it.
output_dir = "./temp/outputs/aloha_sim_transfer_cube_human"
print(output_dir)

./temp/outputs/aloha_sim_transfer_cube_human


In [None]:
# Launch lerobot's training script for an ACT policy (--policy.type=act) on the
# lerobot/aloha_sim_transfer_cube_human dataset, with the aloha environment and
# the AlohaTransferCube-v0 task, for 20000 steps at batch size 16.
# `$device` and `$output_dir` are expanded by IPython from the notebook's Python
# variables of the same names, so the cells defining them must run first.
# NOTE(review): --save_freq/--eval_freq=500 presumably mean "checkpoint and
# evaluate every 500 steps" — confirm against the lerobot train config.
# Weights & Biases logging and pushing the policy to the hub are both disabled.
!python -m lerobot.scripts.train \
    --policy.type=act \
    --dataset.repo_id=lerobot/aloha_sim_transfer_cube_human \
    --env.type=aloha \
    --env.task=AlohaTransferCube-v0 \
    --batch_size=16  \
    --steps=20000 \
    --save_freq=500 \
    --eval_freq=500 \
    --policy.device=$device \
    --wandb.enable=false \
    --output_dir=$output_dir \
    --policy.push_to_hub=false

INFO 2025-09-10 12:33:55 ts/train.py:111 {'batch_size': 16,
 'dataset': {'episodes': None,
             'image_transforms': {'enable': False,
                                  'max_num_transforms': 3,
                                  'random_order': False,
                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
                                                                                   1.2]},
                                                         'type': 'ColorJitter',
                                                         'weight': 1.0},
                                          'contrast': {'kwargs': {'contrast': [0.8,
                                                                               1.2]},
                                                       'type': 'ColorJitter',
                                                       'weight': 1.0},
                                          'hue': {'kwargs': {'hue': [-0.05,
                

## Resume the training

In [3]:
# Path to the train_config.json stored under the `last` checkpoint inside
# `output_dir`; it is handed to the training script to resume the run.
# Relies on `os` and `output_dir` from the earlier training-setup cell.
config_path = os.path.join(output_dir, "checkpoints/last/pretrained_model/train_config.json")
print(config_path)

./temp/outputs/aloha_sim_transfer_cube_human/checkpoints/last/pretrained_model/train_config.json


In [5]:
# Re-run the training script with --resume=true, reading its settings from the
# saved train_config.json. `$config_path` is expanded by IPython from the
# notebook's `config_path` variable.
!python -m lerobot.scripts.train \
    --config_path=$config_path \
    --resume=true

INFO 2025-09-10 16:22:47 ts/train.py:111 {'batch_size': 16,
 'dataset': {'episodes': None,
             'image_transforms': {'enable': False,
                                  'max_num_transforms': 3,
                                  'random_order': False,
                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
                                                                                   1.2]},
                                                         'type': 'ColorJitter',
                                                         'weight': 1.0},
                                          'contrast': {'kwargs': {'contrast': [0.8,
                                                                               1.2]},
                                                       'type': 'ColorJitter',
                                                       'weight': 1.0},
                                          'hue': {'kwargs': {'hue': [-0.05,
                

## Video evaluation of the training sequence

In [None]:
from IPython.display import HTML, display

def display_video_grid(videos, cols=2, ratio=100):
    """
    Display videos in a grid with titles.

    Args:
        videos (list of tuples): [(filepath, title), ...]. Each filepath becomes
            the ``src`` of a ``<source>`` tag and each title is shown beneath
            its video. Both are HTML-escaped before being embedded, so
            characters such as ``&``, ``<`` or ``"`` cannot break the markup.
        cols (int): number of videos per row
        ratio (int or float): width of each video, as a percentage of its
            grid cell (defaults to 100).

    Returns:
        None. The assembled markup is rendered with ``display(HTML(...))``.
    """
    # Imported locally so the function does not depend on another cell
    # having imported it.
    from html import escape

    # CSS for grid
    style = f"""
    <style>
    .video-grid {{
      display: grid;
      grid-template-columns: repeat({cols}, 1fr);
      gap: 20px;
      margin-top: 20px;
    }}
    .video-item {{
      text-align: center;
    }}
    .video-item video {{
      width: {ratio}%;
      border-radius: 10px;
      box-shadow: 0 4px 10px rgba(0,0,0,0.2);
    }}
    .video-title {{
      margin-top: 8px;
      font-weight: bold;
      font-family: sans-serif;
    }}
    </style>
    """

    # Build one grid cell per video, then join once instead of repeatedly
    # concatenating strings.
    cells = []
    for path, title in videos:
        # str() lets pathlib.Path objects and non-string titles through.
        safe_src = escape(str(path), quote=True)
        safe_title = escape(str(title))
        cells.append(f"""
        <div class="video-item">
            <video controls>
                <source src="{safe_src}" type="video/mp4">
            </video>
            <div class="video-title">{safe_title}</div>
        </div>
        """)
    items = "".join(cells)

    markup = style + f'<div class="video-grid">{items}</div>'
    display(HTML(markup))


In [30]:
# Training checkpoints and evaluation episodes whose videos are compared.
steps = [
    "step_000500",
    "step_005000",
    "step_010000",
    "step_015000",
    "step_020000",
]
episodes = [
    "episode_0",
    "episode_1",
    "episode_2",
    "episode_3",
]

# One (file path, title) pair per video, ordered episode-major so that each
# grid row shows a single episode across all the listed checkpoints.
video_list = [
    (
        f"./output/train_videos_act_aloha_transfer/videos_{step}/eval_{episode}.mp4",
        f"{step}_{episode}",
    )
    for episode in episodes
    for step in steps
]

print(video_list)

display_video_grid(video_list, cols=len(steps))

[('./output/train_videos_act_aloha_transfer/videos_step_000500/eval_episode_0.mp4', 'step_000500_episode_0'), ('./output/train_videos_act_aloha_transfer/videos_step_005000/eval_episode_0.mp4', 'step_005000_episode_0'), ('./output/train_videos_act_aloha_transfer/videos_step_010000/eval_episode_0.mp4', 'step_010000_episode_0'), ('./output/train_videos_act_aloha_transfer/videos_step_015000/eval_episode_0.mp4', 'step_015000_episode_0'), ('./output/train_videos_act_aloha_transfer/videos_step_020000/eval_episode_0.mp4', 'step_020000_episode_0'), ('./output/train_videos_act_aloha_transfer/videos_step_000500/eval_episode_1.mp4', 'step_000500_episode_1'), ('./output/train_videos_act_aloha_transfer/videos_step_005000/eval_episode_1.mp4', 'step_005000_episode_1'), ('./output/train_videos_act_aloha_transfer/videos_step_010000/eval_episode_1.mp4', 'step_010000_episode_1'), ('./output/train_videos_act_aloha_transfer/videos_step_015000/eval_episode_1.mp4', 'step_015000_episode_1'), ('./output/train_v

## Evaluation

In [21]:
# Inputs and outputs of the evaluation run: the policy files saved under the
# `last` checkpoint of `output_dir`, and the directory the results go to.
eval_output = os.path.join("./output", "post_eval/act_aloha_transfer")
pretrained_path = os.path.join(output_dir, "checkpoints/last/pretrained_model")

print(pretrained_path, eval_output, sep="\n")

./temp/outputs/aloha_sim_transfer_cube_human/checkpoints/last/pretrained_model
./output/post_eval/act_aloha_transfer


In [22]:
# Run lerobot's eval script on the policy stored at `$pretrained_path`, using the
# aloha environment's AlohaTransferCube-v0 task: 50 episodes with an eval batch
# size of 50, writing results to `$eval_output`.
# `$pretrained_path`, `$eval_output` and `$device` are expanded by IPython from
# the notebook's Python variables of the same names.
# NOTE(review): --policy.use_amp=false presumably disables automatic mixed
# precision — confirm against the lerobot policy config.
!python -m lerobot.scripts.eval \
    --policy.path=$pretrained_path \
    --output_dir=$eval_output \
    --env.type=aloha \
    --env.task=AlohaTransferCube-v0 \
    --eval.n_episodes=50 \
    --eval.batch_size=50 \
    --policy.device=$device \
    --policy.use_amp=false

INFO 2025-09-11 11:05:18 pts/eval.py:462 {'env': {'episode_length': 400,
         'features': {'action': {'shape': (14,),
                                 'type': <FeatureType.ACTION: 'ACTION'>},
                      'agent_pos': {'shape': (14,),
                                    'type': <FeatureType.STATE: 'STATE'>},
                      'pixels/top': {'shape': (480, 640, 3),
                                     'type': <FeatureType.VISUAL: 'VISUAL'>}},
         'features_map': {'action': 'action',
                          'agent_pos': 'observation.state',
                          'pixels/top': 'observation.images.top',
                          'top': 'observation.image.top'},
         'fps': 50,
         'obs_type': 'pixels_agent_pos',
         'render_mode': 'rgb_array',
         'task': 'AlohaTransferCube-v0'},
 'eval': {'batch_size': 50, 'n_episodes': 50, 'use_async_envs': False},
 'job_name': 'aloha_act',
 'output_dir': PosixPath('output/post_eval/act_aloha_transfer'),
 'p

In [36]:
import re

# Directory holding the videos under `eval_output`.
eval_video_output_dir = os.path.join(eval_output, "videos")
all_entries = os.listdir(eval_video_output_dir)


def _natural_key(name):
    """Sort key that orders digit runs numerically (episode_2 before episode_10)."""
    # re.split with a capturing group alternates text / digits / text / ...,
    # so odd positions are always digit runs and ints never meet strs.
    return [int(tok) if i % 2 else tok for i, tok in enumerate(re.split(r"(\d+)", name))]


# Build (file path, title) pairs for the grid. Only .mp4 files are kept, so a
# stray file or sub-directory does not become a broken video tile, and the
# title is the file name without its extension, whatever the extension length.
eval_video_list = []
for vf in sorted(all_entries, key=_natural_key):
    stem, ext = os.path.splitext(vf)
    if ext.lower() != ".mp4":
        continue
    vfp = os.path.join(eval_video_output_dir, vf)
    v_title = stem
    eval_video_list.append((vfp, v_title))

print(eval_video_list)
display_video_grid(eval_video_list, cols=5)


[('./output/post_eval/act_aloha_transfer/videos/eval_episode_0.mp4', 'eval_episode_0'), ('./output/post_eval/act_aloha_transfer/videos/eval_episode_1.mp4', 'eval_episode_1'), ('./output/post_eval/act_aloha_transfer/videos/eval_episode_2.mp4', 'eval_episode_2'), ('./output/post_eval/act_aloha_transfer/videos/eval_episode_3.mp4', 'eval_episode_3'), ('./output/post_eval/act_aloha_transfer/videos/eval_episode_4.mp4', 'eval_episode_4'), ('./output/post_eval/act_aloha_transfer/videos/eval_episode_5.mp4', 'eval_episode_5'), ('./output/post_eval/act_aloha_transfer/videos/eval_episode_6.mp4', 'eval_episode_6'), ('./output/post_eval/act_aloha_transfer/videos/eval_episode_7.mp4', 'eval_episode_7'), ('./output/post_eval/act_aloha_transfer/videos/eval_episode_8.mp4', 'eval_episode_8'), ('./output/post_eval/act_aloha_transfer/videos/eval_episode_9.mp4', 'eval_episode_9')]
