In [2]:
from openai import OpenAI
import yaml
import os
import numpy as np
import pandas as pd

In [3]:
def file_to_string(filename):
    """Return the entire text content of ``filename``.

    The file is opened in text mode for reading and closed again before
    the function returns.
    """
    with open(filename, mode='r') as source:
        contents = source.read()
    return contents

In [4]:
# Read the OpenAI API key from a local YAML file rather than hardcoding it in
# the notebook. NOTE(review): ./gpt/key.yaml should be kept out of version
# control — TODO confirm.
with open('./gpt/key.yaml', 'r') as stream:
    config = yaml.safe_load(stream)

# Single shared client; the prompt helpers in this notebook call it directly.
client = OpenAI(api_key=config['OPENAI_API_KEY'])

# Playing with curriculum and reflection

In [9]:
def generate_curriculum(model="gpt-4-1106-preview"):
    """Ask the chat model for a task curriculum and return its reply.

    The system and user prompts are read from ``./curriculum_system.txt`` and
    ``./curriculum_user.txt``. The reply text is printed and returned.

    Args:
        model: Name of the chat-completion model to query. The default is the
            model this notebook was written against; other names the author
            listed are gpt-4-0613, gpt-4-32k and gpt-3.5-turbo-1106.

    Returns:
        The message content of the first returned choice.
    """
    initial_system = file_to_string('./curriculum_system.txt')
    initial_user = file_to_string('./curriculum_user.txt')

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": initial_system},
            {"role": "user", "content": initial_user},
        ],
    )

    reply = completion.choices[0].message.content
    print(reply)
    return reply

In [10]:
# Query the model for a curriculum (the helper also prints the reply).
curriculum_txt = generate_curriculum()

# Persist the reply; other cells read ./curriculum.md back as prompt input.
with open('./curriculum.md', 'w') as file:
    file.write(curriculum_txt)

Task 1 Name
Understanding the Basics of Balance

Task 1 Description
The agent should learn to maintain the hopper in an upright position with zero velocity (not falling over) for an extended period of time. The starting position is set with the hopper's torso standing vertically with a slight randomized initial disturbance in position and velocity. The reward is positively proportional to the time maintaining balance without exceeding a predefined maximum angle deviation from vertical.

Task 2 Name
Stationary Hopping

Task 2 Description
The agent must learn to perform a single hop and land back to the starting position without tilting or falling over. The hopper must remain in place (x-coordinate should not change significantly) while only moving in the z direction. The reward is given for successfully performing a hop and returning to the balance position within a small margin of error for x-position and angles of body parts. 

Task 3 Name
Controlled Hopping

Task 3 Description
This t

In [None]:
def generate_reward():
    """Request reward functions for the saved curriculum and return the reply.

    The user prompt is the content of ``./curriculum_user.txt`` followed by
    the content of ``./curriculum.md``; the system prompt comes from
    ``./reward_system.txt``. The reply text is printed and returned.

    NOTE(review): a later cell in this notebook defines another
    ``generate_reward`` with different prompt inputs; whichever cell ran last
    decides which version is bound to the name.
    """
    system_prompt = file_to_string('./reward_system.txt')
    environment_part = file_to_string('./curriculum_user.txt')
    curriculum_part = file_to_string('./curriculum.md')

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": environment_part + curriculum_part},
    ]
    # Model alternatives noted by the author:
    # gpt-4-1106-preview, gpt-4-0613, gpt-4-32k, gpt-3.5-turbo-1106
    completion = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=messages,
    )

    reply = completion.choices[0].message.content
    print(reply)
    return reply

In [None]:
# Query the model through generate_reward() and keep the reply text.
reward_functions = generate_reward()

In [None]:
def generate_reflection(model="gpt-4-1106-preview"):
    """Ask the chat model for a reflection and return its reply.

    The user prompt is the concatenation, in this order, of
    ``./reflection_user.txt``, ``./reflection_task.txt``,
    ``./reflection_learning_curve.txt`` and ``./reflection_reason.txt``.
    The system prompt comes from ``./reflection_system.txt``. The reply text
    is printed and returned.

    Args:
        model: Name of the chat-completion model to query. The default is the
            model this notebook was written against; other names the author
            listed are gpt-4-0613, gpt-4-32k and gpt-3.5-turbo-1106.

    Returns:
        The message content of the first returned choice.
    """
    reflection_system = file_to_string('./reflection_system.txt')
    reflection_env_code = file_to_string('./reflection_user.txt')
    task = file_to_string('./reflection_task.txt')
    learning_curve = file_to_string('./reflection_learning_curve.txt')
    reason = file_to_string('./reflection_reason.txt')

    user = reflection_env_code + task + learning_curve + reason

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": reflection_system},
            {"role": "user", "content": user},
        ],
    )

    reply = completion.choices[0].message.content
    print(reply)
    return reply

In [None]:
# Query the model through generate_reflection() and keep the reply text.
reflection = generate_reflection()

In [None]:
# Save the reflection reply next to the notebook, overwriting any previous copy.
with open('./reflection.md', 'w') as file:
    file.write(reflection)

# Playing with Trajectory Feedback

In [6]:
def feedback():
    """Ask the model to compare two recorded trajectories and return the reply.

    The user prompt is built from ``./trajectory_user.txt``,
    ``./trajectory_task.txt`` and the raw contents of
    ``./simple_hopping_observation.txt`` (agent 1) and
    ``./move_forward_observation.txt`` (agent 2). The system prompt comes from
    ``./trajectory_system.txt``. The reply text is printed and returned.

    NOTE(review): a later cell redefines ``feedback`` with two parameters;
    once that cell has run, this zero-argument form is no longer callable.
    """
    system_prompt = file_to_string('./trajectory_system.txt')
    env_code = file_to_string('./trajectory_user.txt')
    task_text = file_to_string('./trajectory_task.txt')
    first_trajectory = file_to_string('./simple_hopping_observation.txt')
    second_trajectory = file_to_string('./move_forward_observation.txt')

    user_prompt = "".join([
        env_code,
        task_text,
        "Trajectory of the agent 1: \n",
        first_trajectory,
        "\nTrajectory of the agent 2: \n",
        second_trajectory,
    ])

    # Model alternatives noted by the author:
    # gpt-4-1106-preview, gpt-4-0613, gpt-4-32k, gpt-3.5-turbo-1106
    completion = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )

    reply = completion.choices[0].message.content
    print(reply)
    return reply

In [22]:
# Compare the two recorded trajectories; the printed reply is shown below the cell.
decision = feedback()

Decision: Agent 2
Reason: The task specifies that the robot should maintain vertical jumping and landing without progressing in the x-direction. Upon examining the trajectories for both agents, it is evident that agent 1 has x positions that vary from 0.00 to 1.00 and keep increasing, which indicates horizontal movement. In contrast, agent 2 shows the x position consistently being 0.00 or very close to it across all timesteps, indicating successful adherence to the task of not moving horizontally while performing vertical jumps. Agent 2's trajectory is better aligned with the task description as it minimizes horizontal displacement.


In [34]:
def single_feedback():
    """Ask the model to judge a single recorded trajectory and return the reply.

    The user prompt is built from ``./trajectory_user.txt``,
    ``./trajectory_task.txt`` and the raw contents of
    ``./stand_still_observation.txt``. The system prompt comes from
    ``./trajectory_system.txt``. The reply text is printed and returned.

    NOTE(review): a later cell redefines ``single_feedback`` with eight
    parameters; once that cell has run, this zero-argument form is replaced.
    """
    system_prompt = file_to_string('./trajectory_system.txt')
    env_code = file_to_string('./trajectory_user.txt')
    task_text = file_to_string('./trajectory_task.txt')
    trajectory_text = file_to_string('./stand_still_observation.txt')

    user_prompt = "".join([
        env_code,
        task_text,
        "Trajectory of the agent: \n",
        trajectory_text,
    ])

    # Model alternatives noted by the author:
    # gpt-4-1106-preview, gpt-4-0613, gpt-4-32k, gpt-3.5-turbo-1106
    completion = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )

    reply = completion.choices[0].message.content
    print(reply)
    return reply

In [44]:
# Judge the stand-still trajectory on its own; the printed reply is shown below the cell.
decision = single_feedback()

Decision: Not following
Reason: The initial z-coordinate of the torso is 1.25 and falls within the healthy range. However, the robot's z-coordinate starts to dip below 1.25 partway through the trajectory, dropping down to 1.20 and eventually to 1.21, which is outside the specified healthy range of z-coordinate. Therefore, the robot is not correctly maintaining the height parameter of the task.


In [53]:
def generate_reward():
    """Request reward functions for the trajectory task and return the reply.

    The user prompt is the content of ``./reward_user.txt`` followed by the
    content of ``./trajectory_task.txt``; the system prompt comes from
    ``./reward_system.txt``. The reply text is printed and returned.

    NOTE(review): this reuses the name of an earlier ``generate_reward`` in
    this notebook that reads different prompt files; running this cell
    replaces that earlier definition.
    """
    system_prompt = file_to_string('./reward_system.txt')
    environment_part = file_to_string('./reward_user.txt')
    task_part = file_to_string('./trajectory_task.txt')

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": environment_part + task_part},
    ]
    # Model alternatives noted by the author:
    # gpt-4-1106-preview, gpt-4-0613, gpt-4-32k, gpt-3.5-turbo-1106
    completion = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=messages,
    )

    reply = completion.choices[0].message.content
    print(reply)
    return reply

In [54]:
# Generate the reward-function samples and save them; compare_rewards() reads
# ./rewards.md back as part of its prompt.
rewards = generate_reward()
with open('./rewards.md', 'w') as file:
    file.write(rewards)

To devise reward functions for Task 2 where the goal is to maintain vertical jumping and landing without progressing in the x-direction, we need to utilize the given observation details. Specifically, the reward functions will focus on the z height of the torso (`observation[1]`), the change in x position (`observation[0]` before and `next_observation[0]` after), as well as the torques applied as actions and whether the hopper is in a healthy state.

Sample 1
```python
from typing import List, Tuple, Dict
import numpy as np

def compute_reward_1(observation: List, action: List, next_observation: List) -> Tuple[np.float64, Dict[str, np.float64]]:
    z_height = next_observation[1]
    change_in_x = abs(next_observation[0] - observation[0])
    healthy = 1 if next_observation[1] >= 0.7 else 0  # Using 0.7 as the threshold for healthy z height
    
    # Reward Components
    vertical_reward = np.clip(z_height, 0, 2)  # Encouraging vertical movement
    horizontal_penalty = -change_in_x  

In [55]:
def compare_rewards(model="gpt-4-1106-preview"):
    """Ask the chat model to choose among the saved reward samples.

    The user prompt is the concatenation, in this order, of
    ``./reward_user.txt``, ``./trajectory_task.txt`` and ``./rewards.md``.
    The system prompt comes from ``./compare_reward_system.txt``. The reply
    text is printed and returned.

    Args:
        model: Name of the chat-completion model to query. The default is the
            model this notebook was written against; other names the author
            listed are gpt-4-0613, gpt-4-32k and gpt-3.5-turbo-1106.

    Returns:
        The message content of the first returned choice.
    """
    reward_system = file_to_string('./compare_reward_system.txt')
    reward_user = file_to_string('./reward_user.txt')
    task = file_to_string('./trajectory_task.txt')
    rewards = file_to_string('./rewards.md')

    user = reward_user + task + rewards

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": reward_system},
            {"role": "user", "content": user},
        ],
    )

    reply = completion.choices[0].message.content
    print(reply)
    return reply

In [65]:
# Let the model pick among the saved reward samples; the printed reply is shown below the cell.
decision = compare_rewards()

Decision: Sample 4

Reason:
Looking at the task description for simple hopping, the agent's goal is to maintain vertical jumping and landing without moving in the x-direction. Sample 4's reward function aligns well with the task at hand.

Let's break down the reasons according to the reward function criteria:

1. Alignment with task description: In Sample 4, the reward function provides a binary reward for maintaining the healthy z-height between 0.7 and 2.0 and torso angle within the acceptable range of -0.2 to 0.2. The conditions align with the environment's `is_healthy` method and encourage the behavior of staying in place while hopping. The stillness and x-movement penalty that should be present according to the task description is implicitly addressed by rewarding only the correct z-height range, which can be achieved by vertical movement.

2. Simplicity: Sample 4 offers the simplest reward structure among the presented samples, which is a combination of the upright posture reward

# Chain of Rewards

In [7]:
def chain_reward(model="gpt-4-1106-preview"):
    """Request reward functions for the saved curriculum using the chain prompt.

    The user prompt is the content of ``./chain_user.txt`` followed by the
    content of ``./curriculum.md``; the system prompt comes from
    ``./reward_system.txt``. The reply text is printed and returned.

    Args:
        model: Name of the chat-completion model to query. The default is the
            model this notebook was written against; other names the author
            listed are gpt-4-0613, gpt-4-32k and gpt-3.5-turbo-1106.

    Returns:
        The message content of the first returned choice.
    """
    reward_system = file_to_string('./reward_system.txt')
    env_user = file_to_string('./chain_user.txt')
    reward_user = file_to_string('./curriculum.md')

    user = env_user + reward_user

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": reward_system},
            {"role": "user", "content": user},
        ],
    )

    reply = completion.choices[0].message.content
    print(reply)
    return reply

In [8]:
# Keep the reply under its own name. Assigning it to `chain_reward` would
# replace the function with a string, so re-running this cell would fail with
# "TypeError: 'str' object is not callable".
chain_reward_txt = chain_reward()
with open('./chain_reward.md', 'w') as file:
    file.write(chain_reward_txt)

# Success Function generation

In [4]:
def success(model="gpt-4-1106-preview"):
    """Ask the chat model for a success function and return its reply.

    The system and user prompts are read from ``./success_system.txt`` and
    ``./success_user.txt``. The reply text is printed and returned.

    Args:
        model: Name of the chat-completion model to query. The default is the
            model this notebook was written against; other names the author
            listed are gpt-4-0613, gpt-4-32k and gpt-3.5-turbo-1106.

    Returns:
        The message content of the first returned choice.
    """
    system = file_to_string('./success_system.txt')
    user = file_to_string('./success_user.txt')

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
    )

    reply = completion.choices[0].message.content
    print(reply)
    return reply

In [8]:
# Generate the success-function reply and save it next to the notebook.
success_function = success()
with open('./success.md', 'w') as file:
    file.write(success_function)

Based on the provided environment code and the description of Task 2, the success of the "Learn to Squat and Rise" task can be evaluated using the z height of the torso, the angles of the joints, and the overall health of the state as described in the `is_healthy` method. We need to ensure that the Hopper stays within healthy ranges without hopping or moving horizontally. 

The success function will check that:
1. The next z height is within the specified healthy range.
2. The hopper should not be hopping, which implies there should be minimal change in the x component of position between observations.
3. The angles of the thigh, leg, and foot joints remain within healthy boundaries.
4. The hopper stays 'healthy' according to the `is_healthy` function.

```python
def compute_success(observation, action, next_observation) -> bool:
    # Extract the relevant parts of the observation
    prev_z, next_z = observation[1], next_observation[1]
    prev_x, next_x = observation[0], next_observa

# Environment specific feedback

In [8]:
def single_feedback(x_pos_av, x_pos_std, z_pos_av, z_pos_std, x_vel_av, x_vel_std, z_vel_av, z_vel_std):
  """Ask the model to judge one trajectory, given its summary statistics.

  The user prompt is built from ``./trajectory_user.txt``,
  ``./trajectory_task.txt``, the raw contents of
  ``./stand_still_observation.txt`` and a summary of the statistics passed
  in. Previously the eight statistic arguments were accepted but never used,
  so callers' values had no effect on the prompt; they are now appended to it.

  Args:
      x_pos_av, x_pos_std: Average and standard deviation of the x position.
      z_pos_av, z_pos_std: Average and standard deviation of the z position.
      x_vel_av, x_vel_std: Average and standard deviation of the x velocity.
      z_vel_av, z_vel_std: Average and standard deviation of the z velocity.

  Returns:
      The message content of the first returned choice (also printed).

  NOTE(review): an earlier cell defines a zero-argument ``single_feedback``;
  running this cell replaces it.
  """
  reflection_system = file_to_string('./trajectory_system.txt')
  reflection_env_code = file_to_string('./trajectory_user.txt')
  task = file_to_string('./trajectory_task.txt')
  trajectory_1 = file_to_string('./stand_still_observation.txt')

  # Same labels as the *_info summary strings built elsewhere in this notebook.
  statistics = "".join(
      "\n" + label + ": " + str(value)
      for label, value in (
          ("Average x position", x_pos_av),
          ("Standard deviation of x position", x_pos_std),
          ("Average z position", z_pos_av),
          ("Standard deviation of z position", z_pos_std),
          ("Average x velocity", x_vel_av),
          ("Standard deviation of x velocity", x_vel_std),
          ("Average z velocity", z_vel_av),
          ("Standard deviation of z velocity", z_vel_std),
      )
  )

  user = (
      reflection_env_code + task
      + "Trajectory of the agent: \n" + trajectory_1
      + "\nTrajectory information of the agent: \n" + statistics
  )

  completion = client.chat.completions.create(
      model="gpt-4-1106-preview", # gpt-4-1106-preview, gpt-4-0613, gpt-4-32k, gpt-3.5-turbo-1106
      messages=[
        {"role": "system", "content": reflection_system},
        {"role": "user", "content": user},
      ]
  )

  reply = completion.choices[0].message.content
  print(reply)

  return reply

In [49]:
# Load the stand-still trajectory from comma-separated numeric text.
obs = np.loadtxt('./stand_still_observation.txt', delimiter=',')

In [50]:
# Position samples: column 0 of `obs` is used as x, column 1 as z.
x_pos, z_pos = obs[:, 0], obs[:, 1]

# Mean and standard deviation of each position series.
x_pos_avg, x_pos_std = np.mean(x_pos), np.std(x_pos)
z_pos_avg, z_pos_std = np.mean(z_pos), np.std(z_pos)

# Velocities as finite differences between consecutive samples.
# NOTE(review): 0.008 is presumably the time between samples — TODO confirm.
x_pos_next = obs[1:, 0]
z_pos_next = obs[1:, 1]
x_vel = (x_pos_next - x_pos[:-1]) / 0.008
z_vel = (z_pos_next - z_pos[:-1]) / 0.008
x_vel_avg, x_vel_std = np.mean(x_vel), np.std(x_vel)
z_vel_avg, z_vel_std = np.mean(z_vel), np.std(z_vel)

# Text summary, one "label: value" line per statistic, each preceded by a newline.
stand_still_info = "".join(
    "\n" + label + ": " + str(value)
    for label, value in (
        ("Average x position", x_pos_avg),
        ("Standard deviation of x position", x_pos_std),
        ("Average z position", z_pos_avg),
        ("Standard deviation of z position", z_pos_std),
        ("Average x velocity", x_vel_avg),
        ("Standard deviation of x velocity", x_vel_std),
        ("Average z velocity", z_vel_avg),
        ("Standard deviation of z velocity", z_vel_std),
    )
)

In [51]:
# Load the simple-hopping trajectory from comma-separated numeric text (rebinds `obs`).
obs = np.loadtxt('./simple_hopping_observation.txt', delimiter=',')

In [52]:
# Position samples: column 0 of `obs` is used as x, column 1 as z.
x_pos, z_pos = obs[:, 0], obs[:, 1]

# Mean and standard deviation of each position series.
x_pos_avg, x_pos_std = np.mean(x_pos), np.std(x_pos)
z_pos_avg, z_pos_std = np.mean(z_pos), np.std(z_pos)

# Velocities as finite differences between consecutive samples.
# NOTE(review): 0.008 is presumably the time between samples — TODO confirm.
x_pos_next = obs[1:, 0]
z_pos_next = obs[1:, 1]
x_vel = (x_pos_next - x_pos[:-1]) / 0.008
z_vel = (z_pos_next - z_pos[:-1]) / 0.008
x_vel_avg, x_vel_std = np.mean(x_vel), np.std(x_vel)
z_vel_avg, z_vel_std = np.mean(z_vel), np.std(z_vel)

# Text summary, one "label: value" line per statistic, each preceded by a newline.
simple_hopping_info = "".join(
    "\n" + label + ": " + str(value)
    for label, value in (
        ("Average x position", x_pos_avg),
        ("Standard deviation of x position", x_pos_std),
        ("Average z position", z_pos_avg),
        ("Standard deviation of z position", z_pos_std),
        ("Average x velocity", x_vel_avg),
        ("Standard deviation of x velocity", x_vel_std),
        ("Average z velocity", z_vel_avg),
        ("Standard deviation of z velocity", z_vel_std),
    )
)

In [53]:
# Load the move-forward trajectory from comma-separated numeric text (rebinds `obs`).
obs = np.loadtxt('./move_forward_observation.txt', delimiter=',')

In [54]:
# Position samples: column 0 of `obs` is used as x, column 1 as z.
x_pos, z_pos = obs[:, 0], obs[:, 1]

# Mean and standard deviation of each position series.
x_pos_avg, x_pos_std = np.mean(x_pos), np.std(x_pos)
z_pos_avg, z_pos_std = np.mean(z_pos), np.std(z_pos)

# Velocities as finite differences between consecutive samples.
# NOTE(review): 0.008 is presumably the time between samples — TODO confirm.
x_pos_next = obs[1:, 0]
z_pos_next = obs[1:, 1]
x_vel = (x_pos_next - x_pos[:-1]) / 0.008
z_vel = (z_pos_next - z_pos[:-1]) / 0.008
x_vel_avg, x_vel_std = np.mean(x_vel), np.std(x_vel)
z_vel_avg, z_vel_std = np.mean(z_vel), np.std(z_vel)

# Text summary, one "label: value" line per statistic, each preceded by a newline.
move_forward_info = "".join(
    "\n" + label + ": " + str(value)
    for label, value in (
        ("Average x position", x_pos_avg),
        ("Standard deviation of x position", x_pos_std),
        ("Average z position", z_pos_avg),
        ("Standard deviation of z position", z_pos_std),
        ("Average x velocity", x_vel_avg),
        ("Standard deviation of x velocity", x_vel_std),
        ("Average z velocity", z_vel_avg),
        ("Standard deviation of z velocity", z_vel_std),
    )
)

In [55]:
def feedback(traj_1_info, traj_2_info, model="gpt-4-1106-preview"):
    """Ask the model to compare two agents from their trajectory summaries.

    The user prompt is built from ``./trajectory_user.txt``,
    ``./trajectory_task.txt`` and the two summary strings, labelled as
    agent 1 and agent 2 respectively. The system prompt comes from
    ``./trajectory_system.txt``. The reply text is printed and returned.

    Args:
        traj_1_info: Text summary inserted as the information for agent 1.
        traj_2_info: Text summary inserted as the information for agent 2.
        model: Name of the chat-completion model to query. The default is the
            model this notebook was written against; other names the author
            listed are gpt-4-0613, gpt-4-32k and gpt-3.5-turbo-1106.

    Returns:
        The message content of the first returned choice.

    NOTE(review): an earlier cell defines a zero-argument ``feedback``;
    running this cell replaces it.
    """
    reflection_system = file_to_string('./trajectory_system.txt')
    reflection_env_code = file_to_string('./trajectory_user.txt')
    task = file_to_string('./trajectory_task.txt')

    user = (
        reflection_env_code + task
        + "Trajectory information of the agent 1: \n" + traj_1_info
        + "\nTrajectory information of the agent 2: \n" + traj_2_info
    )

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": reflection_system},
            {"role": "user", "content": user},
        ],
    )

    reply = completion.choices[0].message.content
    print(reply)
    return reply

In [76]:
# Agent 1 is the simple-hopping summary, agent 2 is the stand-still summary.
feedback_txt = feedback(simple_hopping_info, stand_still_info)

Decision: Agent 2
Reason: Agent 2 better maintains its position in the x-direction, as both its average x position and standard deviation of x position are zero, indicating that it does not move horizontally. This is in line with the task description, which specifies that the agent's goal is to maintain vertical jumping and landing without progressing in the x-direction. The average z position is reasonably high, which indicates hopping. Standard deviation for z position is very small, showing consistent hopping height. Moreover, the average x velocity is zero with no standard deviation, which suggests that there is no horizontal movement over time, and a small standard deviation of z velocity indicates controlled vertical hopping. Agent 1 shows some horizontal deviation and hence, does not strictly follow the task description compared to Agent 2.


In [57]:
# Show the three summaries in order: stand still, simple hopping, move forward.
print(stand_still_info, simple_hopping_info, move_forward_info, sep="\n")


Average x position: 0.0
Standard deviation of x position: 0.0
Average z position: 1.2101798201798204
Standard deviation of z position: 0.0033100869613864286
Average x velocity: 0.0
Standard deviation of x velocity: 0.0
Average z velocity: 0.0
Standard deviation of z velocity: 0.18540496217739175

Average x position: 0.21748251748251748
Standard deviation of x position: 0.05654607400669276
Average z position: 1.4222977022977024
Standard deviation of z position: 0.16082117967756113
Average x velocity: -7.105427357601002e-18
Standard deviation of x velocity: 0.9826622003516774
Average z velocity: 0.0
Standard deviation of z velocity: 1.4437364717980912

Average x position: 1.0750218340611355
Standard deviation of x position: 0.9117063154529592
Average z position: 1.3292576419213973
Standard deviation of z position: 0.16504033910927382
Average x velocity: 1.6995614035087723
Standard deviation of x velocity: 1.124478351067322
Average z velocity: -0.2905701754385964
Standard deviation of z 