In [1]:
import gymnasium as gym
import numpy as np
import transformers
from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

# Get prompts

In [3]:
def obs_to_text(obs):
    starter = "Observation from last step: "
    hull_angle_speed = f"Hull angle: {obs[0]:.2f}"
    angular_velocity = f"Angular velocity: {obs[1]:.2f}"
    x_velocity = f"X velocity: {obs[2]:.2f}"
    y_velocity = f"Y velocity: {obs[3]:.2f}"
    back_revolute_joint_angle = f"Back revolute joint angle: {obs[4]:.2f}"
    back_revolute_joint_speed = f"Back revolute joint speed: {obs[5]:.2f}"
    back_lower_leg_angle = f"Back lower leg angle: {obs[6]:.2f}"
    back_lower_leg_speed = f"Back lower leg speed: {obs[7]:.2f}"
    back_leg_ground_contact_flag = f"Back leg ground contact flag: {obs[8]:.2f}"
    front_revolute_joint_angle = f"Front revolute joint angle: {obs[9]:.2f}"
    front_revolute_joint_speed = f"Front revolute joint speed: {obs[10]:.2f}"
    front_lower_leg_angle = f"Front lower leg angle: {obs[11]:.2f}"
    front_lower_leg_speed = f"Front lower leg speed: {obs[12]:.2f}"
    front_leg_ground_contact_flag = f"Front leg ground contact flag: {obs[13]:.2f}"
    
    # Lidars
    lidar_angles = [0.0, 0.15, 0.3, 0.45, 0.6, 0.75, 0.9, 1.05, 1.2, 1.35] # In radians starting from the top of the hull
    lidar_distances = obs[14:24]
    lidar_readings = [f"Lidar {i+1} ({angle:.2f} rad): {distance:.2f}" for i, (angle, distance) in enumerate(zip(lidar_angles, lidar_distances))]
    
    return "\n".join([starter, hull_angle_speed, angular_velocity, x_velocity, y_velocity, back_revolute_joint_angle, back_revolute_joint_speed, back_lower_leg_angle, back_lower_leg_speed, back_leg_ground_contact_flag, front_revolute_joint_angle, front_revolute_joint_speed, front_lower_leg_angle, front_lower_leg_speed, front_leg_ground_contact_flag] + lidar_readings)

def reward_to_text(reward):
    return f"The reward from the last step was: {reward:.2f}"

def text_to_action(text):
    """
        Given an output by the LLM in the form:
            Move Back revolute joint {value}, Back lower leg {value}, Front revolute joint {value}, Front lower leg {value}
        This function will return the corresponding action values for the environment 
    """
    action = [0, 0, 0, 0]
    split_text = text.split(", ")
    for i, action_value in enumerate(split_text):
        action[i] = float(action_value.split(" ")[-1])
    return action
    

In [4]:
env = gym.make("BipedalWalker-v3", render_mode="human")
env.reset()

(array([ 2.7475136e-03, -4.0627820e-06,  3.1602487e-04, -1.5999971e-02,
         9.2093654e-02, -4.1704316e-04,  8.6019379e-01,  1.7114585e-03,
         1.0000000e+00,  3.2500479e-02, -4.1701546e-04,  8.5374027e-01,
         2.9493857e-04,  1.0000000e+00,  4.4081402e-01,  4.4582012e-01,
         4.6142277e-01,  4.8955020e-01,  5.3410280e-01,  6.0246104e-01,
         7.0914888e-01,  8.8593185e-01,  1.0000000e+00,  1.0000000e+00],
       dtype=float32),
 {})

In [25]:
obs, reward, done, info, _ = env.step(env.action_space.sample())

In [26]:
obs_text = obs_to_text(obs)
reward_text = reward_to_text(reward)
obs_text

'Observation from last step: \nHull angle: -0.01\nAngular velocity: 0.00\nX velocity: -0.00\nY velocity: 0.00\nBack revolute joint angle: 0.47\nBack revolute joint speed: 0.28\nBack lower leg angle: -0.10\nBack lower leg speed: -1.00\nBack leg ground contact flag: 0.00\nFront revolute joint angle: 0.33\nFront revolute joint speed: -0.03\nFront lower leg angle: 0.19\nFront lower leg speed: 0.02\nFront leg ground contact flag: 0.00\nLidar 1 (0.00 rad): 0.45\nLidar 2 (0.15 rad): 0.46\nLidar 3 (0.30 rad): 0.47\nLidar 4 (0.45 rad): 0.50\nLidar 5 (0.60 rad): 0.55\nLidar 6 (0.75 rad): 0.62\nLidar 7 (0.90 rad): 0.72\nLidar 8 (1.05 rad): 0.91\nLidar 9 (1.20 rad): 1.00\nLidar 10 (1.35 rad): 1.00'

In [22]:
obs_text_example = obs_text
reward_text_example = reward_text

In [28]:
primer = """
You are playing a challenge to walk a bipedal machine as far as possible without falling over. The machine has a hull, two legs, a back and front revolute joint, and lower leg. The machine has 10 lidar sensors that can detect the distance to objects in front of it. 

You will be given scenarios in the form of observations from the environment and the current reward based on the last action. Your goal is to provide the next action in the following format:

'Move Back revolute joint `value`, Back lower leg 'value', Front revolute joint 'value', Front lower leg 'value''

The values must be in the range [-1, 1], and your goal is to maximize the reward and move the machine forward effectively. Only provide the action and no additional explanation.

Example:
Observation: {obs}
Reward: {reward}

Action: Move Back revolute joint 0.5, Back lower leg -0.3, Front revolute joint 0.2, Front lower leg 0.7

Give me the next action for the following scenario:

observation: {obs_step}
reward: {reward_step}

Action: 
"""

input_text = primer.format(obs=obs_text_example, reward=reward_text_example, obs_step=obs_text, reward_step=reward_text)
input_primer = tokenizer(input_text, return_tensors="pt")

generate_ids = model.generate(
    input_primer["input_ids"], 
    max_length=1000, 
    num_return_sequences=1, 
    temperature=0.7, 
    top_k=50, 
    top_p=0.9, 
    do_sample=True, 
    pad_token_id=tokenizer.eos_token_id
)

output = tokenizer.decode(generate_ids[0], skip_special_tokens=True)
print(output)



You are playing a challenge to walk a bipedal machine as far as possible without falling over. The machine has a hull, two legs, a back and front revolute joint, and lower leg. The machine has 10 lidar sensors that can detect the distance to objects in front of it. 

You will be given scenarios in the form of observations from the environment and the current reward based on the last action. Your goal is to provide the next action in the following format:

'Move Back revolute joint `value`, Back lower leg 'value', Front revolute joint 'value', Front lower leg 'value''

The values must be in the range [-1, 1], and your goal is to maximize the reward and move the machine forward effectively. Only provide the action and no additional explanation.

Example:
Observation: Observation from last step: 
Hull angle: -0.01
Angular velocity: -0.01
X velocity: -0.00
Y velocity: 0.00
Back revolute joint angle: 0.43
Back revolute joint speed: -0.03
Back lower leg angle: 0.13
Back lower leg speed: 0.1