# Custom Env

In [2]:
import gym
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete

import os
import numpy as np
import random

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

### Types of Spaces

In [9]:
Discrete(3).sample()

2

In [49]:
Box(low=np.array([0]), high=np.array([100])).sample()
Box(0, 100, shape=(1, )).sample()



array([47.028362], dtype=float32)

In [31]:
Tuple((Discrete(3), Box(0, 1, shape=(3,)))).sample()

(1, array([0.54304576, 0.614665  , 0.68051237], dtype=float32))

In [27]:
Dict({'height': Discrete(2)}).sample()

OrderedDict([('height', 0)])

In [39]:
MultiBinary(4).sample()

array([1, 0, 1, 0], dtype=int8)

In [45]:
MultiDiscrete([3, 6, 2]).sample()

array([1, 0, 0])

## Env
- Build an agent to give best shower
- Temp changes randomly?
- Optimum 37-39
- Optimum score will be 90% of shower_length

In [3]:
ENV_NAME = 'Shower-v0'

In [4]:
class ShowerEnv(Env):
    def __init__(self):
        self.action_space = Discrete(3)
        self.observation_space = Box(low=np.array([0]), high=np.array([100]))
        
    def step(self, action):
        self.state += action-1
        
        self.shower_length -= 1

        if self.state >= 37 and self.state <= 39:
            reward = 1
        else:
            reward = -1

        done = True if self.shower_length <= 0 else False

        if self.state < 0 or self.state > 100:
            reward = -100
            done = True
            self.state = 0 if self.state < 0 else 100

        info = {}

        return self.state, reward, done, info
        
    def render(self):
        pass
    
    def reset(self):
        self.state = np.array([38 + random.randint(-3, 3)]).astype(float) # initial state
        self.shower_length = 60
        return self.state

#### Testing

In [5]:
env = ShowerEnv()

episodes = 5
for ep in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0
    while not done:
        env.render()
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        score += reward
    
    print(f"Ep #{ep} - Score: {score}")
env.close()

Ep #1 - Score: -46
Ep #2 - Score: -20
Ep #3 - Score: 16
Ep #4 - Score: -58
Ep #5 - Score: -60




### Vectorized Stack of Envs

In [6]:
env = ShowerEnv()
env = DummyVecEnv([lambda: env])

## Model

### Initializing

In [7]:
LOG_PATH = os.path.join('training', 'logs', ENV_NAME)

In [9]:
# new
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=LOG_PATH)

Using cpu device


In [21]:
# load
MODEL_PATH = os.path.join('training', 'saved_models', ENV_NAME, 'v1')
model = PPO.load(MODEL_PATH, env=env)

### Training

In [18]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=50, verbose=1)

eval_callback = EvalCallback(
    env,
    callback_on_new_best=stop_callback,
    eval_freq=1000,
    best_model_save_path=os.path.join('training', 'saved_models', ENV_NAME, input()),
    verbose=1,
)

model.learn(total_timesteps=20000, callback=eval_callback)

Logging to training/logs/Shower-v0/PPO_4
Eval num_timesteps=1000, episode_reward=-12.00 +/- 58.79
Episode length: 60.00 +/- 0.00
---------------------------------
| eval/              |          |
|    mean_ep_length  | 60       |
|    mean_reward     | -12      |
| time/              |          |
|    total_timesteps | 1000     |
---------------------------------
New best mean reward!
Eval num_timesteps=2000, episode_reward=-12.00 +/- 58.79
Episode length: 60.00 +/- 0.00
---------------------------------
| eval/              |          |
|    mean_ep_length  | 60       |
|    mean_reward     | -12      |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------
-----------------------------
| time/              |      |
|    fps             | 1890 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
Eval num_timesteps=3000, episode_reward=36.00 +/- 48.00
Episode length: 60.

<stable_baselines3.ppo.ppo.PPO at 0x7ff5a395eca0>

In [23]:
model.learn(total_timesteps=40000)

Logging to training/logs/Shower-v0/PPO_5
-----------------------------
| time/              |      |
|    fps             | 2756 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1877        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.004458856 |
|    clip_fraction        | 0.0856      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.07       |
|    explained_variance   | 0.000411    |
|    learning_rate        | 0.0003      |
|    loss                 | 25.9        |
|    n_updates            | 160         |
|    policy_gradient_loss | -0.00349    |
|    value_loss           | 47.4        |
-----------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x7ff59af24370>

In [None]:
# save model
SAVE_PATH = os.path.join('training', 'saved_models', ENV_NAME, 'v1')
model.save(SAVE_PATH)
del model

### Evaluation

In [24]:
evaluate_policy(model, env, n_eval_episodes=100, render=False)
# returns reward, std deviation

(59.52, 0.854166260162505)

In [10]:
env.close()

### Test

In [28]:
episodes = 100
for ep in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0
    while not done:
        # env.render()
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
    
    print(f"Ep #{ep} - Score: {score}")
env.close()

Ep #1 - Score: [44.]
Ep #2 - Score: [58.]
Ep #3 - Score: [30.]
Ep #4 - Score: [46.]
Ep #5 - Score: [48.]
Ep #6 - Score: [48.]
Ep #7 - Score: [30.]
Ep #8 - Score: [52.]
Ep #9 - Score: [34.]
Ep #10 - Score: [42.]
Ep #11 - Score: [46.]
Ep #12 - Score: [40.]
Ep #13 - Score: [58.]
Ep #14 - Score: [28.]
Ep #15 - Score: [50.]
Ep #16 - Score: [54.]
Ep #17 - Score: [48.]
Ep #18 - Score: [52.]
Ep #19 - Score: [46.]
Ep #20 - Score: [54.]
Ep #21 - Score: [50.]
Ep #22 - Score: [46.]
Ep #23 - Score: [54.]
Ep #24 - Score: [52.]
Ep #25 - Score: [56.]
Ep #26 - Score: [54.]
Ep #27 - Score: [44.]
Ep #28 - Score: [54.]
Ep #29 - Score: [40.]
Ep #30 - Score: [54.]
Ep #31 - Score: [46.]
Ep #32 - Score: [42.]
Ep #33 - Score: [34.]
Ep #34 - Score: [56.]
Ep #35 - Score: [46.]
Ep #36 - Score: [46.]
Ep #37 - Score: [50.]
Ep #38 - Score: [56.]
Ep #39 - Score: [48.]
Ep #40 - Score: [34.]
Ep #41 - Score: [46.]
Ep #42 - Score: [48.]
Ep #43 - Score: [40.]
Ep #44 - Score: [40.]
Ep #45 - Score: [30.]
Ep #46 - Score: [52