# Basic Example

In [3]:
import csbsingleenv

In [4]:
env = csbsingleenv.CodersStrikeBackSingle()

In [5]:
env.reset()
done = False
rewards = []
while not done:
    obs, reward, done, _, _ = env.step(env.action_space.sample())
    rewards.append(reward)
print(sum(rewards))

-10100


In [6]:
import ray
from ray import tune, air
from ray.rllib.algorithms.ppo import PPOConfig
from functools import partial



In [7]:
if ray.is_initialized():
  ray.shutdown()
ray.init(num_cpus=4)

2024-04-24 14:32:44,203	INFO worker.py:1633 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


0,1
Python version:,3.10.12
Ray version:,2.7.0
Dashboard:,http://127.0.0.1:8265


In [None]:
config = (PPOConfig()
          .environment(csbsingleenv.CodersStrikeBackSingle)
          .framework('torch')
          .training(gamma=0.9)
          .rollouts(num_rollout_workers=3)
)

stop = {"timesteps_total": 12000}

tuner = tune.Tuner(
    "PPO",
    param_space=config.to_dict(),
    run_config=air.RunConfig(stop=stop),
)

tuner.fit()

# Empty Customized Environment

In [22]:
import csbsingleenv

import sys
# Assuming your Python file is in 'src' under 'my_project' directory
sys.path.append('../..')
import env_customizer

In [23]:
import ray
from ray import tune, air
from ray.rllib.algorithms.ppo import PPOConfig
from functools import partial

In [21]:
if ray.is_initialized():
  ray.shutdown()

runtime_env = {"py_modules": ["../.."]}
ray.init(num_cpus=4, runtime_env=runtime_env)

2024-04-28 20:24:50,623	INFO worker.py:1633 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
2024-04-28 20:24:51,021	INFO packaging.py:518 -- Creating a file package for local directory '../..'.
2024-04-28 20:24:51,247	INFO packaging.py:346 -- Pushing file package 'gcs://_ray_pkg_f822a251cc46b8e1.zip' (10.12MiB) to Ray cluster...
2024-04-28 20:24:51,270	INFO packaging.py:359 -- Successfully pushed file package 'gcs://_ray_pkg_f822a251cc46b8e1.zip'.


0,1
Python version:,3.10.12
Ray version:,2.7.0
Dashboard:,http://127.0.0.1:8265


In [None]:
config = (PPOConfig()
          .environment(env_customizer.CustomizedEnvironment,
                      env_config={"env" : csbsingleenv.CodersStrikeBackSingle()})
          .framework('torch')
          .training(gamma=0.9)
          .rollouts(num_rollout_workers=3)
)

stop = {"timesteps_total": 12000}

tuner = tune.Tuner(
    "PPO",
    param_space=config.to_dict(),
    run_config=air.RunConfig(stop=stop),
)

tuner.fit()

# Real Customized Environment

In [1]:
import csbsingleenv

import sys
# Assuming your Python file is in 'src' under 'my_project' directory
sys.path.append('../..')
import env_customizer



In [94]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces, logger

# Note, this is only meant as an example of how to write a custom environment.
# I doubt it'll actually learn well.
class CustomizedRacing:
    def __init__(self):
        min_pos = -200000.0
        max_pos = 200000.0
        min_vel = -2000.0
        max_vel = 2000.0
        screen_max = [16000,9000]
        # This observation space will only have the next checkpoint rather than the next 5.
        self.observation_space = spaces.Box(
            low=np.array([0, -np.pi, min_pos, min_pos, min_vel, min_vel]+[0,0]),
            high=np.array([3, np.pi, max_pos, max_pos, max_vel, max_vel]+screen_max),
            dtype=np.float64
        )

        # This action will be how much to turn, rather than a point to turn towards.
        self.action_space = spaces.Box(
            low = np.array([-1.0, 0.0]),
            high = np.array([1.0, 100]),
            dtype=np.float64
        )

    # We can define and reset any extra variables we want to keep track of here.
    def reset(self):
        self.last_checkpoint = None
        self.last_sqr_dist = None

    def reward(self, env, rew):
        # Add reward for getting closer to the next checkpoint
        cur_dist = csbsingleenv.get_sqr_distance(env.pos, env.next_checkpoint())
        if self.last_sqr_dist is not None and cur_dist < self.last_sqr_dist:
            rew += 0.5
        self.last_sqr_dist = cur_dist
        
        # Add reward for passing a checkpoint
        cur_checkpoint = env.checkpoint_index
        if self.last_checkpoint is not None and cur_checkpoint != self.last_checkpoint:
            rew += 150
        self.last_checkpoint = cur_checkpoint
        
        if env.failed:
            return 0
        return rew
    
    # Need to define the traformation.  Should be easy here since we just need to drop the excess data
    def transform_observation(self, env, obs):
        return obs[:8]

    # Need to also transform the angle change into a point to aim at.
    # Will define it as 1000 distance from the angle relative to current pointed direction
    def transform_action(self, env, action):
        preferred_dir = env.theta + action[0]*env.maxSteeringAngle
        dir_vector = 1000*csbsingleenv.angle_to_vector(preferred_dir)
        point_to_aim_at = env.pos + dir_vector
        trans_action = np.array([point_to_aim_at.x, point_to_aim_at.y, action[1]])
        return trans_action

In [95]:
env = env_customizer.CustomizedEnvironment(
    env_config={"env":csbsingleenv.CodersStrikeBackSingle(),
                "customizer":CustomizedRacing()})

In [96]:
env.reset()

(array([3.00000000e+00, 1.53122625e+00, 1.20770000e+04, 6.58000000e+02,
        0.00000000e+00, 0.00000000e+00, 1.22550000e+04, 5.15400000e+03]),
 {})

In [97]:
env.step(np.array([0,100]))

(array([3.00000000e+00, 1.53122625e+00, 1.20810000e+04, 7.58000000e+02,
        2.55000000e+00, 8.41500000e+01, 1.22550000e+04, 5.15400000e+03]),
 -1,
 False,
 False,
 {})

In [98]:
import ray
from ray import tune, air
from ray.rllib.algorithms.ppo import PPOConfig
from functools import partial

In [99]:
if ray.is_initialized():
  ray.shutdown()

runtime_env = {"py_modules": ["../.."]}
ray.init(num_cpus=4, runtime_env=runtime_env)

2024-04-28 22:02:08,712	INFO worker.py:1633 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
2024-04-28 22:02:08,931	INFO packaging.py:518 -- Creating a file package for local directory '../..'.
2024-04-28 22:02:09,185	INFO packaging.py:346 -- Pushing file package 'gcs://_ray_pkg_a38743e6ce750843.zip' (10.12MiB) to Ray cluster...
2024-04-28 22:02:09,227	INFO packaging.py:359 -- Successfully pushed file package 'gcs://_ray_pkg_a38743e6ce750843.zip'.


0,1
Python version:,3.10.12
Ray version:,2.7.0
Dashboard:,http://127.0.0.1:8265


In [100]:
config = (PPOConfig()
          .environment(env_customizer.CustomizedEnvironment,
                      env_config={"env" : csbsingleenv.CodersStrikeBackSingle(),
                                 "customizer" : CustomizedRacing()})
          .framework('torch')
          .training(gamma=0.99)
          .rollouts(num_rollout_workers=3)
)

stop = {"timesteps_total": 100000}

tuner = tune.Tuner(
    "PPO",
    param_space=config.to_dict(),
    run_config=air.RunConfig(stop=stop),
)

tuner.fit()

0,1
Current time:,2024-04-28 22:06:15
Running for:,00:04:04.87
Memory:,6.0/7.5 GiB

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CustomizedEnvironment_a2d3d_00000,TERMINATED,10.212.93.252:2440749,25,235.461,100000,-57.38,172.5,-94,106.63


[2m[36m(PPO pid=2440749)[0m Install gputil for GPU system monitoring.
[2m[36m(PPO pid=2440749)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/kajames/ray_results/PPO_2024-04-28_22-02-10/PPO_CustomizedEnvironment_a2d3d_00000_0_2024-04-28_22-02-10/checkpoint_000000)
2024-04-28 22:06:15,919	INFO tune.py:1143 -- Total run time: 245.00 seconds (244.84 seconds for the tuning loop).


ResultGrid<[
  Result(
    metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'__all__': {'num_agent_steps_trained': 128.0, 'num_env_steps_trained': 4000.0, 'total_loss': 8.874769193022999}, 'default_policy': {'total_loss': 8.874769193022999, 'policy_loss': -0.028919879482117796, 'vf_loss': 8.889652047838483, 'vf_loss_unclipped': 427.2954352730627, 'vf_explained_var': -0.0033682089116273406, 'entropy': 2.986706386624115, 'mean_kl_loss': 0.013863695907958147, 'default_optimizer_lr': 5.000000000000001e-05, 'curr_lr': 5e-05, 'curr_entropy_coeff': 0.0, 'curr_kl_coeff': 1.0125000476837158}}, 'num_env_steps_sampled': 100000, 'num_env_steps_trained': 0, 'num_agent_steps_sampled': 100000, 'num_agent_steps_trained': 0}, 'sampler_results': {'episode_reward_max': 172.5, 'episode_reward_min': -94.0, 'episode_reward_mean': -57.38, 'episode_len_mean': 106.63, 'episode_media': {}, 'episodes_this_iter': 41, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {