# Basic Example

In [5]:
# !pip install gymnasium
# !pip install ray
# !pip install ray[rllib]
# !pip install torch
import csbsingleenv

In [6]:
env = csbsingleenv.CodersStrikeBackSingle()

In [7]:
env.reset()
done = False
rewards = []
while not done:
    obs, reward, done, _, _ = env.step(env.action_space.sample())
    rewards.append(reward)
print(sum(rewards))

-10100


In [8]:
import ray
from ray import tune, air
from ray.rllib.algorithms.ppo import PPOConfig
from functools import partial

In [9]:
if ray.is_initialized():
  ray.shutdown()
ray.init(num_cpus=4)

2024-05-06 16:16:37,018	INFO worker.py:1749 -- Started a local Ray instance.


0,1
Python version:,3.10.12
Ray version:,2.20.0


In [10]:
config = (PPOConfig()
          .environment(csbsingleenv.CodersStrikeBackSingle)
          .framework('torch')
          .training(gamma=0.9)
          .rollouts(num_rollout_workers=3)
)

stop = {"timesteps_total": 12000}

tuner = tune.Tuner(
    "PPO",
    param_space=config.to_dict(),
    run_config=air.RunConfig(stop=stop),
)

tuner.fit()

0,1
Current time:,2024-05-06 16:17:03
Running for:,00:00:25.70
Memory:,16.0/125.5 GiB

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CodersStrikeBackSingle_b01d7_00000,TERMINATED,206.211.132.160:2765464,3,19.4064,12000,-10103.3,-10100,-10184,103.31


2024-05-06 16:17:03,336	INFO tune.py:1007 -- Wrote the latest version of all result files and experiment state to '/home/healthcare/ray_results/PPO_2024-05-06_16-16-37' in 0.0338s.
2024-05-06 16:17:03,355	INFO tune.py:1039 -- Total run time: 25.75 seconds (25.66 seconds for the tuning loop).
[36m(PPO pid=2765464)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/healthcare/ray_results/PPO_2024-05-06_16-16-37/PPO_CodersStrikeBackSingle_b01d7_00000_0_2024-05-06_16-16-37/checkpoint_000000)


ResultGrid<[
  Result(
    metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 4.284105129139398, 'cur_kl_coeff': 0.3, 'cur_lr': 5.0000000000000016e-05, 'total_loss': 7.254170803229014, 'policy_loss': -0.04909878835152154, 'vf_loss': 7.298242694588118, 'vf_explained_var': -0.00012150464519377678, 'kl': 0.016756310231823775, 'entropy': 4.3490105490530695, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 128.0, 'num_grad_updates_lifetime': 2325.5, 'diff_num_grad_updates_vs_sampler_policy': 464.5}}, 'num_env_steps_sampled': 12000, 'num_env_steps_trained': 12000, 'num_agent_steps_sampled': 12000, 'num_agent_steps_trained': 12000}, 'sampler_results': {'episode_reward_max': -10100.0, 'episode_reward_min': -10184.0, 'episode_reward_mean': -10103.31, 'episode_len_mean': 103.31, 'episode_media': {}, 'episodes_this_iter': 38, 'episodes_timesteps_total': 10331, 'po

# Empty Customized Environment

In [11]:
import csbsingleenv

import sys
# Assuming your Python file is in 'src' under 'my_project' directory
sys.path.append('../..')
import env_customizer

In [12]:
import ray
from ray import tune, air
from ray.rllib.algorithms.ppo import PPOConfig
from functools import partial

In [13]:
if ray.is_initialized():
  ray.shutdown()

runtime_env = {"py_modules": ["../.."]}
ray.init(num_cpus=4, runtime_env=runtime_env)

2024-05-06 16:17:07,467	INFO worker.py:1749 -- Started a local Ray instance.
2024-05-06 16:17:07,546	INFO packaging.py:530 -- Creating a file package for local directory '../..'.
2024-05-06 16:17:07,615	INFO packaging.py:358 -- Pushing file package 'gcs://_ray_pkg_076655efa3af12e2.zip' (9.23MiB) to Ray cluster...
2024-05-06 16:17:07,659	INFO packaging.py:371 -- Successfully pushed file package 'gcs://_ray_pkg_076655efa3af12e2.zip'.


0,1
Python version:,3.10.12
Ray version:,2.20.0


In [14]:
config = (PPOConfig()
          .environment(env_customizer.CustomizedEnvironment,
                      env_config={"env" : csbsingleenv.CodersStrikeBackSingle()})
          .framework('torch')
          .training(gamma=0.9)
          .rollouts(num_rollout_workers=3)
)

stop = {"timesteps_total": 12000}

tuner = tune.Tuner(
    "PPO",
    param_space=config.to_dict(),
    run_config=air.RunConfig(stop=stop),
)

tuner.fit()

0,1
Current time:,2024-05-06 16:17:34
Running for:,00:00:26.19
Memory:,16.1/125.5 GiB

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CustomizedEnvironment_c24a0_00000,TERMINATED,206.211.132.160:2766424,3,19.3051,12000,-10105.9,-10100,-10206,105.9


2024-05-06 16:17:34,307	INFO tune.py:1007 -- Wrote the latest version of all result files and experiment state to '/home/healthcare/ray_results/PPO_2024-05-06_16-17-08' in 0.0376s.
[36m(PPO pid=2766424)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/healthcare/ray_results/PPO_2024-05-06_16-17-08/PPO_CustomizedEnvironment_c24a0_00000_0_2024-05-06_16-17-08/checkpoint_000000)
2024-05-06 16:17:34,413	INFO tune.py:1039 -- Total run time: 26.32 seconds (26.16 seconds for the tuning loop).


ResultGrid<[
  Result(
    metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 4.622612444303369, 'cur_kl_coeff': 0.20000000000000004, 'cur_lr': 5.0000000000000016e-05, 'total_loss': 7.1091064996616815, 'policy_loss': -0.038836950011631496, 'vf_loss': 7.144342686027609, 'vf_explained_var': -6.009820968874039e-05, 'kl': 0.01800365592763421, 'entropy': 4.373368304519243, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 128.0, 'num_grad_updates_lifetime': 2325.5, 'diff_num_grad_updates_vs_sampler_policy': 464.5}}, 'num_env_steps_sampled': 12000, 'num_env_steps_trained': 12000, 'num_agent_steps_sampled': 12000, 'num_agent_steps_trained': 12000}, 'sampler_results': {'episode_reward_max': -10100.0, 'episode_reward_min': -10206.0, 'episode_reward_mean': -10105.9, 'episode_len_mean': 105.9, 'episode_media': {}, 'episodes_this_iter': 37, 'episodes_timesteps_total

# Real Customized Environment

In [15]:
import csbsingleenv

import sys
# Assuming your Python file is in 'src' under 'my_project' directory
sys.path.append('../..')
import env_customizer

In [26]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces, logger

# Note, this is only meant as an example of how to write a custom environment.
# I doubt it'll actually learn well.
class CustomizedRacing:
    def __init__(self):
        min_pos = -200000.0
        max_pos = 200000.0
        min_vel = -2000.0
        max_vel = 2000.0
        screen_max = [16000,9000]
        # This observation space will only have the next checkpoint rather than the next 5.
        self.observation_space = spaces.Box(
            low=np.array([0, -np.pi, min_pos, min_pos, min_vel, min_vel]+[0,0]),
            high=np.array([3, np.pi, max_pos, max_pos, max_vel, max_vel]+screen_max),
            dtype=np.float64
        )

        # This action will be how much to turn, rather than a point to turn towards.
        self.action_space = spaces.Box(
            low = np.array([-1.0, 0.0]),
            high = np.array([1.0, 100]),
            dtype=np.float64
        )

    # We can define and reset any extra variables we want to keep track of here.
    def reset(self):
        self.last_checkpoint = None
        self.last_sqr_dist = None

    def reward(self, env, rew):
        # Add reward for getting closer to the next checkpoint
        cur_dist = csbsingleenv.get_sqr_distance(env.pos, env.next_checkpoint())
        if self.last_sqr_dist is not None and cur_dist < self.last_sqr_dist:
            rew += 2
        self.last_sqr_dist = cur_dist
        
        # Add reward for passing a checkpoint
        cur_checkpoint = env.checkpoint_index
        if self.last_checkpoint is not None and cur_checkpoint != self.last_checkpoint:
            rew += 150
        self.last_checkpoint = cur_checkpoint
        
        if env.failed:
            return 0
        return rew
    
    # Need to define the traformation.  Should be easy here since we just need to drop the excess data
    def transform_observation(self, env, obs):
        return obs[:8]

    # Need to also transform the angle change into a point to aim at.
    # Will define it as 1000 distance from the angle relative to current pointed direction
    def transform_action(self, env, action):
        preferred_dir = env.theta + action[0]*env.maxSteeringAngle
        dir_vector = 1000*csbsingleenv.angle_to_vector(preferred_dir)
        point_to_aim_at = env.pos + dir_vector
        trans_action = np.array([point_to_aim_at.x, point_to_aim_at.y, action[1]])
        return trans_action

In [17]:
env = env_customizer.CustomizedEnvironment(
    env_config={"env":csbsingleenv.CodersStrikeBackSingle(),
                "customizer":CustomizedRacing()})

In [18]:
env.reset()

(array([3.00000000e+00, 3.06283315e+00, 5.78800000e+03, 8.16600000e+03,
        0.00000000e+00, 0.00000000e+00, 4.41000000e+02, 8.58800000e+03]),
 {})

In [19]:
env.step(np.array([0,100]))

(array([ 3.00000000e+00,  3.06283315e+00,  5.68800000e+03,  8.17400000e+03,
        -8.41500000e+01,  5.95000000e+00,  4.41000000e+02,  8.58800000e+03]),
 -1,
 False,
 False,
 {})

In [20]:
import ray
from ray import tune, air
from ray.rllib.algorithms.ppo import PPOConfig
from functools import partial

In [21]:
if ray.is_initialized():
  ray.shutdown()

runtime_env = {"py_modules": ["../.."]}
ray.init(runtime_env=runtime_env)

2024-05-06 16:17:38,827	INFO worker.py:1749 -- Started a local Ray instance.
2024-05-06 16:17:38,896	INFO packaging.py:530 -- Creating a file package for local directory '../..'.
2024-05-06 16:17:38,964	INFO packaging.py:358 -- Pushing file package 'gcs://_ray_pkg_076655efa3af12e2.zip' (9.23MiB) to Ray cluster...
2024-05-06 16:17:39,009	INFO packaging.py:371 -- Successfully pushed file package 'gcs://_ray_pkg_076655efa3af12e2.zip'.


0,1
Python version:,3.10.12
Ray version:,2.20.0


In [27]:
checkpoint_config = air.CheckpointConfig(checkpoint_frequency=5)

In [31]:
config = (PPOConfig()
          .environment(env_customizer.CustomizedEnvironment,
                      env_config={"env" : csbsingleenv.CodersStrikeBackSingle(),
                                 "customizer" : CustomizedRacing()})
          .framework('torch')
          .training(gamma=0.99)
          .rollouts(num_rollout_workers=20, num_envs_per_worker=5, rollout_fragment_length="auto")
          .resources(num_gpus=4, num_learner_workers=4, num_gpus_per_learner_worker=1)
)
config.num_sgd_iter = 6
stop = {"timesteps_total": 1000000}

tuner = tune.Tuner(
    "PPO",
    param_space=config.to_dict(),
    run_config=air.RunConfig(stop=stop,
                             storage_path="~/Desktop/Tristan/rl-projects/results",
                             checkpoint_config=checkpoint_config)

)

tuner.fit()

0,1
Current time:,2024-05-06 17:14:01
Running for:,00:10:31.83
Memory:,24.0/125.5 GiB

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CustomizedEnvironment_3bf8b_00000,RUNNING,206.211.132.160:2996519,80,605.843,320000,10.39,187,-33,68.19


[36m(PPO pid=2996519)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/healthcare/Desktop/Tristan/rl-projects/results/PPO_2024-05-06_17-03-29/PPO_CustomizedEnvironment_3bf8b_00000_0_2024-05-06_17-03-29/checkpoint_000000)
[36m(PPO pid=2996519)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/healthcare/Desktop/Tristan/rl-projects/results/PPO_2024-05-06_17-03-29/PPO_CustomizedEnvironment_3bf8b_00000_0_2024-05-06_17-03-29/checkpoint_000001)
[36m(PPO pid=2996519)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/healthcare/Desktop/Tristan/rl-projects/results/PPO_2024-05-06_17-03-29/PPO_CustomizedEnvironment_3bf8b_00000_0_2024-05-06_17-03-29/checkpoint_000002)
[36m(PPO pid=2996519)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/healthcare/Desktop/Tristan/rl-projects/results/PPO_2024-05-06_17-03-29/PPO_CustomizedEnvironment_3bf8b_00000_0_2024-05-06_17-03-29/checkpo