In [1]:
import gymnasium as gym
import numpy as np
import pandas as pd

In [38]:
def obs_to_text(obs):
    starter = "Observation from last step: "
    x_coordinate = f"X coordinate: {obs[0]:2f}"
    y_coordinate = f"Y coordinate: {obs[1]:.2f}"
    x_velocity = f"X velocity: {obs[2]:.2f}"
    y_velocity = f"Y velocity: {obs[3]:.2f}"
    lander_angle = f"Angle of the lander: {obs[4]:.2f}"
    angular_velocity = obs[5] * 2.5
    angular_velocity = f"Angular velocity of lander: {angular_velocity}"
    leg_1_contact_flag = f"Leg 1 contact with ground flag: {obs[6]}"
    leg_2_contact_flag = f"Leg 2 contact with ground flag: {obs[7]}"
     
    return "\n".join([starter, x_coordinate, y_coordinate, x_velocity, y_velocity, lander_angle, angular_velocity, leg_1_contact_flag, leg_2_contact_flag])

def reward_to_text(reward):
    return f"The reward from the last step was: {reward:.2f}"

def text_to_action(text):
    """
        Given an output by the LLM in the form:
        0 - "do nothing"
        1 - "fire left orientation engine"
        2 - "fire main engine"
        3 - "fire right orientation engine"
        This function will return the corresponding action values for the environment 
    """
    action = text[0]
    return int(action)

In [56]:
primer = """
Imagine you are an expert at controlling a lunar lander, and you have been given the task of controlling a lunar landing rover in a reinforcement learning environment. The challenge is to land the lander between the two yellow flags on the surface that mark the landing zone. The lander has a square body and two legs. This environment is a classic rocket trajectory optimization problem. According to Pontryagin's maximum principle, it is optimal to fire the engine at full throttle or turn it off. This is the reason why this environment has discrete actions: engine on or off.

You will be given scenarios in the form of observations from the environment and the reward based on the last action and the Current Total Score. Your goal is to provide one of the following actions based on the previous observation:

'0 - Do nothing, 1 - fire left orientation engine, 2 - fire main engine, 3 - fire right orientation engine'

Your goal is to maximize the reward and land the lander on the landing pad. Only provide the action and no additional explanation. The episode finishes if:

- the lander crashes (the lander body gets in contact with the moon);

- the lander gets outside of the viewport (x coordinate is greater than 1);

Rewards:
After every step a reward is granted. The total reward of an episode is the sum of the rewards for all the steps within that episode.

For each step, the reward:

- is increased/decreased the closer/further the lander is to the landing pad.

- is increased/decreased the slower/faster the lander is moving.

- is decreased the more the lander is tilted (angle not horizontal).

- is increased by 10 points for each leg that is in contact with the ground.

- is decreased by 0.03 points each frame a side engine is firing.

- is decreased by 0.3 points each frame the main engine is firing.

The episode receive an additional reward of -100 or +100 points for crashing or landing safely respectively. An episode is considered a solution if it scores at least 200 points.

Observation:
The observation state is an 8-dimensional vector: the coordinates of the lander in x & y, its linear velocities in x & y, its angle, its angular velocity, and two booleans that represent whether each leg is in contact with the ground or not.

Example Observation:
'Observation from last step: \nX coordinate: -0.02\nY coordinate: -0.03\nX velocity: -0.03\nY velocity: -0.01\nAngle of the lander: 0.48\nAngular velocity of lander: 0.48\nLeg 1 contact with ground flag: -0\nLeg 1 contact with ground flag: 1'
'The reward from the last step was: -0.25'
Score: -0.25

Action: 3 - fire right orientation engine
"""

obs_skeleton = """
_
{obs_step}
{reward_step}
score: {score_step}

Action: 
"""

obs_skeleton_reset = """
Now, given the following observation, rewards and score give me the next action for the following scenario:
_
{obs_step}
reward: None as it is the first step
score: 0

Action:
"""

## Training Testing

In [14]:
!pip3 install openai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


In [None]:
import openai
import os
import time

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
STEPS = 500
OBS_TEXT_LENGTH = 800
PRIMER_STARTER_LENGTH = 2000
NUMBER_OF_OBS_MEMORY = 3 
save_dir = "runs/lunar_lander/"
NUMBER_ACTIONS = 5 

In [60]:
# Init env
env = gym.make("LunarLander-v3", continuous=False, render_mode="human")
obs, info = env.reset()

client = openai.OpenAI(api_key=OPENAI_API_KEY)
def get_action(primer, client):
    print("Getting action from LLM")
    response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "user", "content": primer}
    ],
    response_format={
      "type": "text"
    },
    temperature=0.2,
    max_tokens=200,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
    )

    action_text = response.choices[0].message.content
    print(action_text)
    return action_text

def save_obs(obs, reward, score, step, store={}):
    obs_text = obs_to_text(obs)
    reward_text = reward_to_text(reward)
    score_text = f"Score: {score}"
    obs_skeleton_filled = obs_skeleton.format(obs_step=obs_text, reward_step=reward_text, score_step=score_text)
    inner = {
        "obs": obs,
        "obs_text": obs_text,
        "reward": reward,
        "reward_text": reward_text,
        "score": score,
        "skeleton": obs_skeleton_filled
    }
    store[step] = inner



In [61]:
score, reward, done, store = 0, 0, False, {}
obs, _ = env.reset()

# Make first primer
obs_text = obs_to_text(obs)
primer += obs_skeleton_reset.format(obs_step=obs_text)

for i in range(STEPS):
    # sleep to not exceed the rate limit
    time.sleep(5)

    # Get action from gpt
    action_text = get_action(primer, client)
    action = text_to_action(action_text)

    save_obs(obs, reward, score, i, store)

    # Perform action
    obs, reward, done, _, _ = env.step(action)
    if done:
        print("The lander has crashed or moved outside the viewport (X coordinate > 1). The total score was: ", score)
        break

    # Update score
    score += reward

    # Generate new primer
    obs_text = obs_to_text(obs)
    reward_text = reward_to_text(reward)
    obs_step = obs_skeleton.format(obs_step=obs_text, reward_step=reward_text, score_step=score)
    primer += action_text + "\n" + obs_step
    
# Check length of primer
    if len(primer) > PRIMER_STARTER_LENGTH + OBS_TEXT_LENGTH*NUMBER_OF_OBS_MEMORY or len(primer) > 4096:
        # split at "_" get the primer front and the continuation
        primer_splits = primer.split("_")
        primer = primer_splits[0] + "_" + "".join(primer_splits[2:]) # Remove the first observation and reward from memory

# Save the store as df
df = pd.DataFrame(store).T

Getting action from LLM
2 - fire main engine
Getting action from LLM
2 - fire main engine
Getting action from LLM
2 - fire main engine
Getting action from LLM
2 - fire main engine
Getting action from LLM
2 - fire main engine
Getting action from LLM
2 - fire main engine
Getting action from LLM
2 - fire main engine
Getting action from LLM
2 - fire main engine
Getting action from LLM
2 - fire main engine
Getting action from LLM
2 - fire main engine
Getting action from LLM
2 - fire main engine
Getting action from LLM
2 - fire main engine
Getting action from LLM
2 - fire main engine
Getting action from LLM
2 - fire main engine
Getting action from LLM
2 - fire main engine
Getting action from LLM
2 - fire main engine
Getting action from LLM
2 - fire main engine
Getting action from LLM
2 - fire main engine
Getting action from LLM
2 - fire main engine
Getting action from LLM
2 - fire main engine
Getting action from LLM
2 - fire main engine
Getting action from LLM
2 - fire main engine
Getting ac

In [62]:
env.close()

In [64]:
# save df
df.to_csv(save_dir + "run_1.csv")

## Increase number of actions per step

In [65]:
def rewards_to_text(rewards):
    # Gets a list of rewards and returns a string with the rewards in the form "[reward1, reward2, reward3]"
    return "[" + ", ".join([str(reward) for reward in rewards]) + "]"

In [66]:
primer = """
Imagine you are an expert at controlling a lunar lander, and you have been given the task of controlling a lunar landing rover in a reinforcement learning environment. The challenge is to land the lander between the two yellow flags on the surface that mark the landing zone. The lander has a square body and two legs. This environment is a classic rocket trajectory optimization problem. According to Pontryagin's maximum principle, it is optimal to fire the engine at full throttle or turn it off. This is the reason why this environment has discrete actions: engine on or off.

You will be given scenarios in the form of observations from the environment and the reward based on the last action and the Current Total Score. Your goal is to provide 5 of the following actions based on the previous observation:

'0 - Do nothing, 1 - fire left orientation engine, 2 - fire main engine, 3 - fire right orientation engine'

Your goal is to maximize the reward and land the lander on the landing pad. Only provide the action and no additional explanation. The episode finishes if:

- the lander crashes (the lander body gets in contact with the moon);

- the lander gets outside of the viewport (x coordinate is greater than 1);

Rewards:
After every step a reward is granted. The total reward of an episode is the sum of the rewards for all the steps within that episode.

For each step, the reward:

- is increased/decreased the closer/further the lander is to the landing pad.

- is increased/decreased the slower/faster the lander is moving.

- is decreased the more the lander is tilted (angle not horizontal).

- is increased by 10 points for each leg that is in contact with the ground.

- is decreased by 0.03 points each frame a side engine is firing.

- is decreased by 0.3 points each frame the main engine is firing.

The episode receive an additional reward of -100 or +100 points for crashing or landing safely respectively. An episode is considered a solution if it scores at least 200 points.

Observation:
The observation state is an 8-dimensional vector: the coordinates of the lander in x & y, its linear velocities in x & y, its angle, its angular velocity, and two booleans that represent whether each leg is in contact with the ground or not.

Example Observation:
'Observation from last step: \nX coordinate: -0.02\nY coordinate: -0.03\nX velocity: -0.03\nY velocity: -0.01\nAngle of the lander: 0.48\nAngular velocity of lander: 0.48\nLeg 1 contact with ground flag: -0\nLeg 1 contact with ground flag: 1'
'The reward from the last step was: -0.25'
Score: -0.25

Example Actions:

3 - fire right orientation engine
2 - fire main engine
0 - Do nothing
1 - fire left orientation engine
"""

obs_skeleton = """
_
{obs_step}
rewards: {reward_step}
score: {score_step}

Actions 
"""

obs_skeleton_reset = """
Now, given the following observation, rewards and score give me the optimal best {no_actions} action for the following scenario:
_
{obs_step}
reward: None as it is the first step
score: 0

Action:
"""

In [67]:
STEPS = 100
OBS_TEXT_LENGTH = 1200
PRIMER_STARTER_LENGTH = 2400
NUMBER_OF_OBS_MEMORY = 4
save_dir = "runs/lunar_lander/"
NUMBER_ACTIONS = 5

In [68]:
#Init env
env = gym.make("LunarLander-v3", continuous=False, render_mode="human")
obs, info = env.reset()

client = openai.OpenAI(api_key=OPENAI_API_KEY)
def get_action(primer, client):
    print("Getting action from LLM")
    response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "user", "content": primer}
    ],
    response_format={
      "type": "text"
    },
    temperature=0.2,
    max_tokens=200,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
    )

    action_text = response.choices[0].message.content
    return action_text

def save_obs(obs, reward, score, step, actions_text, actions, store={}):
    obs_text = obs_to_text(obs)
    reward_text = reward_to_text(reward)
    score_text = f"Score: {score}"
    obs_skeleton_filled = obs_skeleton.format(obs_step=obs_text, reward_step=reward_text, score_step=score_text)
    inner = {
        "obs": obs,
        "obs_text": obs_text,
        "reward": reward,
        "reward_text": reward_text,
        "score": score,
        "skeleton": obs_skeleton_filled,
        "actions_text": actions_text,
        "actions": actions
    }
    store[step] = inner


In [69]:
score, rewards, done, store = 0, [], False, {}
action_text_store, action_store = [], []
obs, _ = env.reset()

# Make first primer
obs_text = obs_to_text(obs)
primer += obs_skeleton_reset.format(obs_step=obs_text, no_actions=NUMBER_ACTIONS)

for i in range(STEPS):
    # sleep to not exceed the rate limit
    time.sleep(20)

    # Get action from gpt
    response = get_action(primer, client)
    action_texts = response.split("\n")
    for action_text in action_texts:
        # Get each of the actions given by the LLM
        action = text_to_action(action_text)
        print(action)
        # perform action
        obs, reward, done, _, _ = env.step(action)

        # Update score
        score += reward
        
        # save the reward
        rewards.append(reward)
        # save the action
        action_text_store.append(action_text)
        action_store.append(action)

        # Add the action to the primer
        primer += action_text + "\n"

        if done:
            print("The lander has crashed or moved outside the viewport (X coordinate > 1). The total score was: ", score)
            break

    save_obs(obs, reward, score, i, action_text_store, action_store, store)

    # Generate new primer
    obs_text = obs_to_text(obs)
    reward_text = rewards_to_text(rewards)
    obs_step = obs_skeleton.format(obs_step=obs_text, reward_step=reward_text, score_step=score)
    primer += obs_step # actions are already added

    # reset rewards
    rewards = []
    action_text_store = []
    action_store = []

    # Check length of primer
    if len(primer) > PRIMER_STARTER_LENGTH + OBS_TEXT_LENGTH*NUMBER_OF_OBS_MEMORY:
        # split at "_" get the primer front and the continuation
        primer_splits = primer.split("_")
        primer = primer_splits[0] + "_" + "".join(primer_splits[2:]) # Remove the first observation and reward from memory

# Save the store as df
df = pd.DataFrame(store).T

Getting action from LLM
2
2
2
0
0
Getting action from LLM
2
2
2
0
0
Getting action from LLM
2
2
2
0
0
Getting action from LLM
2
2
1
0
0
Getting action from LLM
2
2
1
0
0
Getting action from LLM
2
2
1
0
0
Getting action from LLM
2
2
1
0
0
Getting action from LLM
2
2
1
0
0
Getting action from LLM
2
2
1
0
0
Getting action from LLM
2
1
2
1
2
Getting action from LLM
2
1
2
1
2
Getting action from LLM
2
1
2
1
2
Getting action from LLM
2
1
2
1
2
Getting action from LLM
2
1
2
1
2
Getting action from LLM
2
1
2
1
2
Getting action from LLM
2
1
2
1
2
Getting action from LLM
2
1
2
1
2
Getting action from LLM
2
The lander has crashed or moved outside the viewport (X coordinate > 1). The total score was:  -500.79239674224465


KeyboardInterrupt: 

In [70]:
env.close()

In [71]:
df.to_csv(save_dir + "run_2.csv")

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
df

Unnamed: 0,obs,obs_text,reward,reward_text,score,skeleton
0,"[-0.0036893846, 1.4124846, -0.37371176, 0.069520235, 0.0042818994, 0.08465126, 0.0, 0.0]",Observation from last step: \nX coordinate: -0.003689\nY coordinate: 1.41\nX velocity: -0.37\nY velocity: 0.07\nAngle of the lander: 0.00\nAngular velocity of lander: 0.21162815392017365\nLeg 1 contact with ground flag: 0.0\nLeg 2 contact with ground flag: 0.0,0.0,The reward from the last step was: 0.00,0.0,\n_\nObservation from last step: \nX coordinate: -0.003689\nY coordinate: 1.41\nX velocity: -0.37\nY velocity: 0.07\nAngle of the lander: 0.00\nAngular velocity of lander: 0.21162815392017365\nLeg 1 contact with ground flag: 0.0\nLeg 2 contact with ground flag: 0.0\nThe reward from the last step was: 0.00\nscore: Score: 0\n\nAction: \n
1,"[-0.0073950766, 1.4142457, -0.37472457, 0.07826131, 0.0083990665, 0.082351506, 0.0, 0.0]",Observation from last step: \nX coordinate: -0.007395\nY coordinate: 1.41\nX velocity: -0.37\nY velocity: 0.08\nAngle of the lander: 0.01\nAngular velocity of lander: 0.20587876439094543\nLeg 1 contact with ground flag: 0.0\nLeg 2 contact with ground flag: 0.0,-1.157953,The reward from the last step was: -1.16,-1.157953,\n_\nObservation from last step: \nX coordinate: -0.007395\nY coordinate: 1.41\nX velocity: -0.37\nY velocity: 0.08\nAngle of the lander: 0.01\nAngular velocity of lander: 0.20587876439094543\nLeg 1 contact with ground flag: 0.0\nLeg 2 contact with ground flag: 0.0\nThe reward from the last step was: -1.16\nscore: Score: -1.1579530160004083\n\nAction: \n
2,"[-0.011085033, 1.416822, -0.37326404, 0.11446548, 0.012627807, 0.08458259, 0.0, 0.0]",Observation from last step: \nX coordinate: -0.011085\nY coordinate: 1.42\nX velocity: -0.37\nY velocity: 0.11\nAngle of the lander: 0.01\nAngular velocity of lander: 0.21145647391676903\nLeg 1 contact with ground flag: 0.0\nLeg 2 contact with ground flag: 0.0,-1.743997,The reward from the last step was: -1.74,-2.90195,\n_\nObservation from last step: \nX coordinate: -0.011085\nY coordinate: 1.42\nX velocity: -0.37\nY velocity: 0.11\nAngle of the lander: 0.01\nAngular velocity of lander: 0.21145647391676903\nLeg 1 contact with ground flag: 0.0\nLeg 2 contact with ground flag: 0.0\nThe reward from the last step was: -1.74\nscore: Score: -2.901950458788781\n\nAction: \n
3,"[-0.014760876, 1.420248, -0.37199038, 0.15222766, 0.016981823, 0.08708827, 0.0, 0.0]",Observation from last step: \nX coordinate: -0.014761\nY coordinate: 1.42\nX velocity: -0.37\nY velocity: 0.15\nAngle of the lander: 0.02\nAngular velocity of lander: 0.21772067993879318\nLeg 1 contact with ground flag: 0.0\nLeg 2 contact with ground flag: 0.0,-2.232561,The reward from the last step was: -2.23,-5.134511,\n_\nObservation from last step: \nX coordinate: -0.014761\nY coordinate: 1.42\nX velocity: -0.37\nY velocity: 0.15\nAngle of the lander: 0.02\nAngular velocity of lander: 0.21772067993879318\nLeg 1 contact with ground flag: 0.0\nLeg 2 contact with ground flag: 0.0\nThe reward from the last step was: -2.23\nscore: Score: -5.134511066490655\n\nAction: \n
4,"[-0.018348884, 1.4238799, -0.36367413, 0.16135535, 0.021799827, 0.09636881, 0.0, 0.0]",Observation from last step: \nX coordinate: -0.018349\nY coordinate: 1.42\nX velocity: -0.36\nY velocity: 0.16\nAngle of the lander: 0.02\nAngular velocity of lander: 0.24092203006148338\nLeg 1 contact with ground flag: 0.0\nLeg 2 contact with ground flag: 0.0,-0.74208,The reward from the last step was: -0.74,-5.876591,\n_\nObservation from last step: \nX coordinate: -0.018349\nY coordinate: 1.42\nX velocity: -0.36\nY velocity: 0.16\nAngle of the lander: 0.02\nAngular velocity of lander: 0.24092203006148338\nLeg 1 contact with ground flag: 0.0\nLeg 2 contact with ground flag: 0.0\nThe reward from the last step was: -0.74\nscore: Score: -5.876591224873477\n\nAction: \n
5,"[-0.021973897, 1.4277126, -0.36724564, 0.17026229, 0.02649633, 0.09393876, 0.0, 0.0]",Observation from last step: \nX coordinate: -0.021974\nY coordinate: 1.43\nX velocity: -0.37\nY velocity: 0.17\nAngle of the lander: 0.03\nAngular velocity of lander: 0.23484690114855766\nLeg 1 contact with ground flag: 0.0\nLeg 2 contact with ground flag: 0.0,-1.851221,The reward from the last step was: -1.85,-7.727812,\n_\nObservation from last step: \nX coordinate: -0.021974\nY coordinate: 1.43\nX velocity: -0.37\nY velocity: 0.17\nAngle of the lander: 0.03\nAngular velocity of lander: 0.23484690114855766\nLeg 1 contact with ground flag: 0.0\nLeg 2 contact with ground flag: 0.0\nThe reward from the last step was: -1.85\nscore: Score: -7.727811859816541\n\nAction: \n
6,"[-0.025512123, 1.4319589, -0.35908085, 0.18862669, 0.031691797, 0.103918865, 0.0, 0.0]",Observation from last step: \nX coordinate: -0.025512\nY coordinate: 1.43\nX velocity: -0.36\nY velocity: 0.19\nAngle of the lander: 0.03\nAngular velocity of lander: 0.2597971633076668\nLeg 1 contact with ground flag: 0.0\nLeg 2 contact with ground flag: 0.0,-1.331493,The reward from the last step was: -1.33,-9.059305,\n_\nObservation from last step: \nX coordinate: -0.025512\nY coordinate: 1.43\nX velocity: -0.36\nY velocity: 0.19\nAngle of the lander: 0.03\nAngular velocity of lander: 0.2597971633076668\nLeg 1 contact with ground flag: 0.0\nLeg 2 contact with ground flag: 0.0\nThe reward from the last step was: -1.33\nscore: Score: -9.05930505464487\n\nAction: \n
7,"[-0.029059887, 1.4370517, -0.36012548, 0.22622225, 0.036986995, 0.10591395, 0.0, 0.0]",Observation from last step: \nX coordinate: -0.029060\nY coordinate: 1.44\nX velocity: -0.36\nY velocity: 0.23\nAngle of the lander: 0.04\nAngular velocity of lander: 0.26478487998247147\nLeg 1 contact with ground flag: 0.0\nLeg 2 contact with ground flag: 0.0,-3.312946,The reward from the last step was: -3.31,-12.372251,\n_\nObservation from last step: \nX coordinate: -0.029060\nY coordinate: 1.44\nX velocity: -0.36\nY velocity: 0.23\nAngle of the lander: 0.04\nAngular velocity of lander: 0.26478487998247147\nLeg 1 contact with ground flag: 0.0\nLeg 2 contact with ground flag: 0.0\nThe reward from the last step was: -3.31\nscore: Score: -12.372251110108273\n\nAction: \n
8,"[-0.032670595, 1.4422561, -0.3662054, 0.2311681, 0.04206606, 0.101590514, 0.0, 0.0]",Observation from last step: \nX coordinate: -0.032671\nY coordinate: 1.44\nX velocity: -0.37\nY velocity: 0.23\nAngle of the lander: 0.04\nAngular velocity of lander: 0.2539762854576111\nLeg 1 contact with ground flag: 0.0\nLeg 2 contact with ground flag: 0.0,-2.114003,The reward from the last step was: -2.11,-14.486254,\n_\nObservation from last step: \nX coordinate: -0.032671\nY coordinate: 1.44\nX velocity: -0.37\nY velocity: 0.23\nAngle of the lander: 0.04\nAngular velocity of lander: 0.2539762854576111\nLeg 1 contact with ground flag: 0.0\nLeg 2 contact with ground flag: 0.0\nThe reward from the last step was: -2.11\nscore: Score: -14.486254202837895\n\nAction: \n
9,"[-0.036305808, 1.4482481, -0.36872444, 0.2661604, 0.047213342, 0.10295504, 0.0, 0.0]",Observation from last step: \nX coordinate: -0.036306\nY coordinate: 1.45\nX velocity: -0.37\nY velocity: 0.27\nAngle of the lander: 0.05\nAngular velocity of lander: 0.25738760828971863\nLeg 1 contact with ground flag: 0.0\nLeg 2 contact with ground flag: 0.0,-3.591118,The reward from the last step was: -3.59,-18.077373,\n_\nObservation from last step: \nX coordinate: -0.036306\nY coordinate: 1.45\nX velocity: -0.37\nY velocity: 0.27\nAngle of the lander: 0.05\nAngular velocity of lander: 0.25738760828971863\nLeg 1 contact with ground flag: 0.0\nLeg 2 contact with ground flag: 0.0\nThe reward from the last step was: -3.59\nscore: Score: -18.07737263730614\n\nAction: \n
