In [None]:
!pip install robotic-transformer-pytorch==0.2.3
!pip install gradio

import gradio as gr
import torch
import numpy as np
from PIL import Image
from robotic_transformer_pytorch import MaxViT, RT1

# Camera frames are resized to IMAGE_SIZE x IMAGE_SIZE before being fed to the model.
IMAGE_SIZE = 224
# Window size handed to the MaxViT backbone.
WINDOW_SIZE = 7

# Vision backbone consumed by RT1 below.
vit = MaxViT(
    num_classes=1000,
    dim_conv_stem=64,
    dim=96,
    dim_head=32,
    depth=(2, 2, 5, 2),
    window_size=WINDOW_SIZE,
    mbconv_expansion_rate=4,
    mbconv_shrinkage_rate=0.25,
    dropout=0.1
)

# RT-1 policy with seven action dimensions.
# NOTE(review): no RT-1 checkpoint is loaded anywhere in this cell, so the
# predictions come from whatever weights the constructors initialise.
model = RT1(
    vit=vit,
    num_actions=7,
    depth=6,
    heads=8,
    dim_head=64,
    cond_drop_prob=0.2  # presumably the training-time text-conditioning drop rate; confirm in library docs
)

if torch.cuda.is_available():
    model = model.cuda()

# This notebook only runs inference. Without eval mode the dropout layers stay
# active and the same image/instruction pair yields different actions per call.
model.eval()

# Skill catalogue, one row per skill:
#   (skill name, symbolic preconditions, 7-value action template).
# The template length matches num_actions used for the RT-1 model.
_SKILL_ROWS = (
    ("pick(object)",
     ["object_visible", "gripper_empty"],
     [0.7, 0.2, -0.1, 0.0, 0.0, 0.0, 1.0]),
    ("place(location)",
     ["location_clear"],
     [0.5, -0.3, 0.4, 0.1, -0.2, 0.0, 0.0]),
    ("open_drawer(top)",
     ["drawer_closed", "gripper_empty"],
     [0.8, 0.1, 0.05, 0.9, 0.0, 0.5, 0.0]),
    ("close_drawer(middle)",
     ["drawer_open"],
     [-0.7, 0.2, 0.1, -0.8, 0.0, -0.4, 0.0]),
    ("retrieve_from_drawer(object)",
     ["drawer_open", "object_visible"],
     [0.6, 0.3, -0.2, 0.7, 0.1, 0.3, 1.0]),
    ("store_in_drawer(object)",
     ["drawer_open", "gripper_occupied"],
     [0.5, -0.2, 0.3, 0.8, -0.1, 0.4, 0.0]),
    ("place_upright(bottle)",
     ["object_grasped", "vertical_space"],
     [0.4, 0.1, 0.6, 0.0, 1.0, 0.0, 0.0]),
    ("knock_over(object)",
     ["object_stable"],
     [0.3, 0.8, 0.2, -0.3, -0.5, 0.7, 0.0]),
    ("pull_napkin(dispenser)",
     ["napkin_accessible"],
     [0.2, 0.9, 0.1, 0.4, 0.2, -0.3, 0.5]),
    ("open_jar(lid)",
     ["jar_visible", "gripper_empty"],
     [0.5, 0.4, 0.3, -0.2, 0.7, 0.6, 0.8]),
)

# Maps each skill name to its preconditions and reference action vector.
# Insertion order follows the row order above.
SKILLS = {
    skill_name: {
        "preconditions": list(required_state),
        "action_mapping": list(action_template),
    }
    for skill_name, required_state, action_template in _SKILL_ROWS
}

def process_rt1(image: np.ndarray, instruction: str):
    """Predict an action for one camera frame and match it to a known skill.

    The frame is converted to RGB, resized to IMAGE_SIZE x IMAGE_SIZE and
    passed to the global ``model`` as a single-frame clip together with the
    instruction. The arg-max bin of each action dimension is rescaled to
    [-1, 1] and compared, by cosine similarity, against every
    ``action_mapping`` in ``SKILLS``.

    Args:
        image: Camera frame as an array; values are cast to uint8.
        instruction: Natural-language command passed to the model.

    Returns:
        A 3-tuple of:
          * dict with ``"indices"`` (arg-max bin per action dimension) and
            ``"continuous"`` (the same values rescaled to [-1, 1], rounded
            to three decimals),
          * the name of the most similar skill,
          * that skill's cosine similarity formatted as a percentage string.

    Raises:
        gr.Error: If no image was supplied.
    """
    if image is None:
        # Gradio hands over None when the form is submitted without an image;
        # surface a readable message instead of an AttributeError traceback.
        raise gr.Error("Please provide a camera image before submitting.")

    # Follow the model's actual device rather than re-probing CUDA, so inputs
    # and weights can never end up on different devices.
    device = next(model.parameters()).device

    pil_image = Image.fromarray(image.astype('uint8')).convert('RGB')
    # HWC -> CHW, plus a leading batch axis.
    frame = torch.tensor(np.array(pil_image)).float().permute(2, 0, 1).unsqueeze(0)
    frame = torch.nn.functional.interpolate(
        frame,
        size=(IMAGE_SIZE, IMAGE_SIZE),
        mode='bilinear',
        align_corners=False
    )
    # Insert a length-1 axis after channels so the frame is passed as a
    # one-frame clip.
    video = frame.unsqueeze(2).to(device)

    with torch.no_grad():
        action_logits = model(video, [instruction])

    action_indices = torch.argmax(action_logits, dim=-1).squeeze()
    # Rescale bin indices to [-1, 1] using the real bin count instead of a
    # hard-coded 255 (identical for 256 bins).
    num_bins = action_logits.shape[-1]
    continuous_actions = (action_indices.float() / max(num_bins - 1, 1)) * 2 - 1

    # Score all skills in one batched cosine-similarity call.
    skill_names = list(SKILLS)
    targets = torch.tensor(
        [SKILLS[name]["action_mapping"] for name in skill_names],
        dtype=torch.float32,
        device=device,
    )
    scores = torch.cosine_similarity(
        continuous_actions.unsqueeze(0), targets, dim=1
    ).tolist()
    similarities = dict(zip(skill_names, scores))
    best_skill = max(similarities, key=similarities.get)

    return (
        {
            "indices": action_indices.cpu().numpy().tolist(),
            "continuous": continuous_actions.cpu().numpy().round(3).tolist()
        },
        best_skill,
        f"{similarities[best_skill] * 100:.1f}%"
    )

# Input widgets, listed in the positional order process_rt1 takes them.
_rt1_inputs = [
    gr.Image(label="Camera Input"),
    gr.Textbox(label="Instruction", placeholder="Enter robot command..."),
]

# Output widgets, one per element of the tuple process_rt1 returns.
_rt1_outputs = [
    gr.JSON(label="Action Output"),
    gr.Label(label="Selected Skill"),
    gr.Label(label="Confidence"),
]

# Web UI wrapping process_rt1; flagging is switched off.
interface = gr.Interface(
    fn=process_rt1,
    inputs=_rt1_inputs,
    outputs=_rt1_outputs,
    title="RT-1 Robotics Controller",
    description="Real-time robot action prediction with symbolic skill interpretation",
    allow_flagging="never",
)

# Start the app only when this file is the entry point.
if __name__ == "__main__":
    interface.launch(debug=True)


Collecting robotic-transformer-pytorch==0.2.3
  Downloading robotic_transformer_pytorch-0.2.3-py3-none-any.whl.metadata (768 bytes)
Collecting classifier-free-guidance-pytorch>=0.7.1 (from robotic-transformer-pytorch==0.2.3)
  Downloading classifier_free_guidance_pytorch-0.7.1-py3-none-any.whl.metadata (883 bytes)
Collecting beartype (from classifier-free-guidance-pytorch>=0.7.1->robotic-transformer-pytorch==0.2.3)
  Downloading beartype-0.20.2-py3-none-any.whl.metadata (33 kB)
Collecting environs (from classifier-free-guidance-pytorch>=0.7.1->robotic-transformer-pytorch==0.2.3)
  Downloading environs-14.1.1-py3-none-any.whl.metadata (15 kB)
Collecting ftfy (from classifier-free-guidance-pytorch>=0.7.1->robotic-transformer-pytorch==0.2.3)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting open-clip-torch>=2.8.0 (from classifier-free-guidance-pytorch>=0.7.1->robotic-transformer-pytorch==0.2.3)
  Downloading open_clip_torch-2.32.0-py3-none-any.whl.metadata (31 kB)
Col

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]



It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://08506b0e35082f5ffa.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
