In [1]:
!pip install gymnasium stable-baselines3[extra] stable-baselines3 torch
# pip install stable-baselines3[extra]
# pip install stable-baselines3
# pip install torch



In [2]:
import json
import urllib.request

class GymAPI:

    def __init__(self, url):
        self.url = url

    def new_game(self):
        req = urllib.request.Request(f'{self.url}/new_game', method ='POST')
        with urllib.request.urlopen(req, timeout=60) as resp:
            json_data = resp.read()
            json_dict = json.loads(json_data)
            return json_dict
    
    def reset(self, uuid):
        req = urllib.request.Request(f'{self.url}/reset/{uuid}', method = 'POST')
        with urllib.request.urlopen(req, timeout=60) as resp:
            json_data = resp.read()
            json_dict = json.loads(json_data)
            return json_dict
        
    def step(self, uuid, action):
        data = json.dumps({'action':action}).encode()
        req = urllib.request.Request(f'{self.url}/step/{uuid}', data=data, headers={'Content-Type': 'application/json'}, method = 'POST')
        with urllib.request.urlopen(req, timeout=60) as resp:
            json_data = resp.read()
            json_dict = json.loads(json_data)
            return json_dict

In [None]:
# api = GymAPI("http://#.##.###.###:####")

# try:
#     game_info = api.new_game()
#     print('New game started', game_info)

#     uuid = game_info['uuid']

#     init_obs = api.reset(uuid)
#     print('Game reset, initial observation', init_obs)

#     action=1
#     result = api.step(uuid,action)
#     print('Result after step:', result)

# except Exception as e:
#     print('Error during API interaction:', e)

New game started {'action_space': {'class': 'Discrete', 'n': 4}, 'info': 'The uuid needs to be included in each call if you want to make an action in this environment. The values of observation and action space describe the objects of gymnasium.spaces, you are expected to create them yourself on your custom Wrapper', 'observation_space': {'class': 'Box', 'dtype': 'numpy.float32', 'high': 4, 'low': 0, 'shape': '(2,)'}, 'uuid': 'afa8031c-94ba-489a-9e42-626da060ba00'}
Game reset, initial observation {'observation': [0.0, 0.0]}
Result after step: {'done': False, 'info': {}, 'observation': [0, 0], 'reward': -1, 'truncated': False}


In [3]:
import numpy as np
import gymnasium as gym
from gymnasium.spaces import Box, Discrete
from typing import Optional

class MazeEnv(gym.Env):
    
    def __init__(self,api):
        super(MazeEnv, self).__init__()
        self.api = api
        self.uuid = None

        maze_game = self.api.new_game()
        self.uuid = maze_game['uuid']

        self.observation_space = gym.spaces.Dict(
                {
                    'agent': gym.spaces.Box(0.0, 4.0, shape=(2,), dtype=np.float32),
               }
            ) 

        self.action_space = gym.spaces.Discrete(4)


    def reset(self, seed: Optional[int] = None, options: Optional[dict] = None):
        
        super().reset(seed=seed)

        response = self.api.reset(self.uuid)
        observation = {
        "agent": np.array(response["observation"], dtype=np.float32)
                        }

        return observation, {}
    
    def step(self, action:int):

        action = int(action)
        
        response = self.api.step(self.uuid, action)
       
        observation = {
        "agent": np.array(response["observation"], dtype=np.float32)
                }
        reward = response['reward']
        done = response['done']
        truncated = response['truncated']
        info = response['info']        

        return observation, reward, done, truncated, info

In [None]:
# api = GymAPI("http://#.##.###.###:####")
# env = MazeEnv(api)

# obs, info = env.reset()
# print("Initial observation:", obs)

# for _ in range(10):
#     action = env.action_space.sample() 
#     obs, reward, done, truncated, info = env.step(action)
#     print(f"Step result: Observation={obs}, Reward={reward}, Done={done}, Truncated={truncated}")

#     if done or truncated:
#         break

Initial observation: {'agent': array([0., 0.], dtype=float32)}
Step result: Observation={'agent': array([1., 0.], dtype=float32)}, Reward=-0.04, Done=False, Truncated=False
Step result: Observation={'agent': array([1., 0.], dtype=float32)}, Reward=-1, Done=False, Truncated=False
Step result: Observation={'agent': array([1., 0.], dtype=float32)}, Reward=-1, Done=False, Truncated=False
Step result: Observation={'agent': array([0., 0.], dtype=float32)}, Reward=-0.04, Done=False, Truncated=False
Step result: Observation={'agent': array([0., 0.], dtype=float32)}, Reward=-1, Done=False, Truncated=False
Step result: Observation={'agent': array([1., 0.], dtype=float32)}, Reward=-0.04, Done=False, Truncated=False
Step result: Observation={'agent': array([2., 0.], dtype=float32)}, Reward=-0.04, Done=False, Truncated=False
Step result: Observation={'agent': array([2., 0.], dtype=float32)}, Reward=-1, Done=False, Truncated=False
Step result: Observation={'agent': array([1., 0.], dtype=float32)}, R

Έναρξη εκπαίδευσης

In [4]:
pip install tensorboard




In [None]:
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

api = GymAPI("http://#.##.###.###:####")

learning_rate = 0.0015
exploration_initial_eps = 1.0
exploration_final_eps = 0.1
exploration_fraction = 0.7
learning_starts=1000


env = MazeEnv(api)
env = Monitor(env, "./dqn_monitor_logs/")

In [47]:
#pip install tqdm rich

Note: you may need to restart the kernel to use updated packages.


In [5]:
import tqdm 
import rich

In [None]:
dqn_model1 = DQN("MultiInputPolicy", env, learning_rate=learning_rate, learning_starts=learning_starts, exploration_initial_eps = exploration_initial_eps, exploration_final_eps = exploration_final_eps, exploration_fraction = exploration_fraction, verbose=1, tensorboard_log="./maze_dqn_tensorboard/")
dqn_model1.learn(total_timesteps=100000, tb_log_name="dqn_first_run", progress_bar=True)
dqn_model1.save("DQN_model1")

In [None]:
MER_dqn, STD_dqn = evaluate_policy(dqn_model1, env, n_eval_episodes=100, render=False)
print(f"Mean Reward: {MER_dqn}, Std Reward: {STD_dqn}")

PPO

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv
#from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

api = GymAPI("http://#.##.###.###:####")

learning_rate = 0.001
n_envs = 8
n_steps = 1024
batch_size = 64
n_epochs = 10
gamma = 0.9
clip_range = 0.1

def create_env():
    env = MazeEnv(api)
    env = Monitor(env, "./ppo_monitor_logs/")
    return env

In [6]:
# Parallel environments
vec_env = make_vec_env(create_env, n_envs=n_envs, vec_env_cls=SubprocVecEnv)

ppo_model1 = PPO("MultiInputPolicy", vec_env, learning_rate=learning_rate, n_steps=n_steps, gamma=gamma, batch_size=batch_size, n_epochs=n_epochs, verbose=1, tensorboard_log="./maze_ppo_tensorboard/", device = 'cpu')
ppo_model1.learn(total_timesteps = 143360, tb_log_name="ppo_first_run", progress_bar=True)
ppo_model1.save("PPO_model1")

Using cpu device
Logging to ./maze_ppo_tensorboard/ppo_first_run_17


Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 291      |
|    ep_rew_mean     | -115     |
| time/              |          |
|    fps             | 68       |
|    iterations      | 1        |
|    time_elapsed    | 119      |
|    total_timesteps | 8192     |
---------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 356         |
|    ep_rew_mean          | -121        |
| time/                   |             |
|    fps                  | 65          |
|    iterations           | 2           |
|    time_elapsed         | 251         |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.021800369 |
|    clip_fraction        | 0.31        |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.37       |
|    explained_variance   | -0.00676    |
|    learning_rate        | 0.001       |
|    loss                 | 0.791       |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0219     |
|    value_loss           | 27.5        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 461         |
|    ep_rew_mean          | -137        |
| time/                   |             |
|    fps                  | 63          |
|    iterations           | 3           |
|    time_elapsed         | 385         |
|    total_timesteps      | 24576       |
| train/                  |             |
|    approx_kl            | 0.017389048 |
|    clip_fraction        | 0.311       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.33       |
|    explained_variance   | 0.128       |
|    learning_rate        | 0.001       |
|    loss                 | 3.58        |
|    n_updates            | 20          |
|    policy_gradient_loss | -0.0204     |
|    value_loss           | 35.6        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 399         |
|    ep_rew_mean          | -86.3       |
| time/                   |             |
|    fps                  | 62          |
|    iterations           | 4           |
|    time_elapsed         | 520         |
|    total_timesteps      | 32768       |
| train/                  |             |
|    approx_kl            | 0.015070673 |
|    clip_fraction        | 0.225       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.26       |
|    explained_variance   | 0.152       |
|    learning_rate        | 0.001       |
|    loss                 | 19.2        |
|    n_updates            | 30          |
|    policy_gradient_loss | -0.0127     |
|    value_loss           | 79.4        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 262         |
|    ep_rew_mean          | 9.29        |
| time/                   |             |
|    fps                  | 62          |
|    iterations           | 5           |
|    time_elapsed         | 656         |
|    total_timesteps      | 40960       |
| train/                  |             |
|    approx_kl            | 0.012422793 |
|    clip_fraction        | 0.162       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.18       |
|    explained_variance   | 0.544       |
|    learning_rate        | 0.001       |
|    loss                 | 34.3        |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0114     |
|    value_loss           | 54.9        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 199         |
|    ep_rew_mean          | 43.2        |
| time/                   |             |
|    fps                  | 62          |
|    iterations           | 6           |
|    time_elapsed         | 792         |
|    total_timesteps      | 49152       |
| train/                  |             |
|    approx_kl            | 0.012048295 |
|    clip_fraction        | 0.15        |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.11       |
|    explained_variance   | 0.672       |
|    learning_rate        | 0.001       |
|    loss                 | 33          |
|    n_updates            | 50          |
|    policy_gradient_loss | -0.0123     |
|    value_loss           | 73.5        |
-----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 185        |
|    ep_rew_mean          | 54.1       |
| time/                   |            |
|    fps                  | 61         |
|    iterations           | 7          |
|    time_elapsed         | 926        |
|    total_timesteps      | 57344      |
| train/                  |            |
|    approx_kl            | 0.01216251 |
|    clip_fraction        | 0.171      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.08      |
|    explained_variance   | 0.762      |
|    learning_rate        | 0.001      |
|    loss                 | 17.4       |
|    n_updates            | 60         |
|    policy_gradient_loss | -0.00986   |
|    value_loss           | 45.1       |
----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 163         |
|    ep_rew_mean          | 66.7        |
| time/                   |             |
|    fps                  | 61          |
|    iterations           | 8           |
|    time_elapsed         | 1062        |
|    total_timesteps      | 65536       |
| train/                  |             |
|    approx_kl            | 0.012512701 |
|    clip_fraction        | 0.145       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.02       |
|    explained_variance   | 0.781       |
|    learning_rate        | 0.001       |
|    loss                 | 33          |
|    n_updates            | 70          |
|    policy_gradient_loss | -0.0074     |
|    value_loss           | 52.5        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 82.4        |
|    ep_rew_mean          | 85.9        |
| time/                   |             |
|    fps                  | 61          |
|    iterations           | 9           |
|    time_elapsed         | 1201        |
|    total_timesteps      | 73728       |
| train/                  |             |
|    approx_kl            | 0.020704962 |
|    clip_fraction        | 0.171       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.964      |
|    explained_variance   | 0.823       |
|    learning_rate        | 0.001       |
|    loss                 | 24.1        |
|    n_updates            | 80          |
|    policy_gradient_loss | -0.0162     |
|    value_loss           | 45.2        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 42.5        |
|    ep_rew_mean          | 93.3        |
| time/                   |             |
|    fps                  | 60          |
|    iterations           | 10          |
|    time_elapsed         | 1347        |
|    total_timesteps      | 81920       |
| train/                  |             |
|    approx_kl            | 0.026578661 |
|    clip_fraction        | 0.263       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.911      |
|    explained_variance   | 0.855       |
|    learning_rate        | 0.001       |
|    loss                 | 43.2        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0358     |
|    value_loss           | 67.5        |
-----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 20.1       |
|    ep_rew_mean          | 98.3       |
| time/                   |            |
|    fps                  | 59         |
|    iterations           | 12         |
|    time_elapsed         | 1664       |
|    total_timesteps      | 98304      |
| train/                  |            |
|    approx_kl            | 0.03378561 |
|    clip_fraction        | 0.305      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.605     |
|    explained_variance   | 0.907      |
|    learning_rate        | 0.001      |
|    loss                 | 36.2       |
|    n_updates            | 110        |
|    policy_gradient_loss | -0.0495    |
|    value_loss           | 70.6       |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 17.1       |
|    ep_rew_mean          | 98.8       |
| time/                   |            |
|    fps                  | 58         |
|    iterations           | 13         |
|    time_elapsed         | 1828       |
|    total_timesteps      | 106496     |
| train/                  |            |
|    approx_kl            | 0.03692009 |
|    clip_fraction        | 0.309      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.387     |
|    explained_variance   | 0.927      |
|    learning_rate        | 0.001      |
|    loss                 | 18.4       |
|    n_updates            | 120        |
|    policy_gradient_loss | -0.049     |
|    value_loss           | 47.7       |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 15         |
|    ep_rew_mean          | 99.1       |
| time/                   |            |
|    fps                  | 57         |
|    iterations           | 14         |
|    time_elapsed         | 2000       |
|    total_timesteps      | 114688     |
| train/                  |            |
|    approx_kl            | 0.06359939 |
|    clip_fraction        | 0.211      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.175     |
|    explained_variance   | 0.953      |
|    learning_rate        | 0.001      |
|    loss                 | 7.61       |
|    n_updates            | 130        |
|    policy_gradient_loss | -0.0464    |
|    value_loss           | 25.3       |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 14.4       |
|    ep_rew_mean          | 99.3       |
| time/                   |            |
|    fps                  | 56         |
|    iterations           | 15         |
|    time_elapsed         | 2173       |
|    total_timesteps      | 122880     |
| train/                  |            |
|    approx_kl            | 0.04408013 |
|    clip_fraction        | 0.0659     |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.1       |
|    explained_variance   | 0.967      |
|    learning_rate        | 0.001      |
|    loss                 | 5.68       |
|    n_updates            | 140        |
|    policy_gradient_loss | -0.0211    |
|    value_loss           | 15.1       |
----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 16.4        |
|    ep_rew_mean          | 99.2        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 16          |
|    time_elapsed         | 2341        |
|    total_timesteps      | 131072      |
| train/                  |             |
|    approx_kl            | 0.052749343 |
|    clip_fraction        | 0.0746      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.0674     |
|    explained_variance   | 0.99        |
|    learning_rate        | 0.001       |
|    loss                 | 4.4         |
|    n_updates            | 150         |
|    policy_gradient_loss | -0.0181     |
|    value_loss           | 5.54        |
-----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 15         |
|    ep_rew_mean          | 99         |
| time/                   |            |
|    fps                  | 55         |
|    iterations           | 17         |
|    time_elapsed         | 2521       |
|    total_timesteps      | 139264     |
| train/                  |            |
|    approx_kl            | 0.08811796 |
|    clip_fraction        | 0.14       |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.0912    |
|    explained_variance   | 0.78       |
|    learning_rate        | 0.001      |
|    loss                 | 71.2       |
|    n_updates            | 160        |
|    policy_gradient_loss | -0.0354    |
|    value_loss           | 104        |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 15.3       |
|    ep_rew_mean          | 98.3       |
| time/                   |            |
|    fps                  | 54         |
|    iterations           | 18         |
|    time_elapsed         | 2696       |
|    total_timesteps      | 147456     |
| train/                  |            |
|    approx_kl            | 0.07238458 |
|    clip_fraction        | 0.172      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.0471    |
|    explained_variance   | 0.903      |
|    learning_rate        | 0.001      |
|    loss                 | 13.7       |
|    n_updates            | 170        |
|    policy_gradient_loss | -0.0461    |
|    value_loss           | 46.8       |
----------------------------------------


In [12]:
MER_ppo, STD_ppo = evaluate_policy(ppo_model1, vec_env, n_eval_episodes=100)
print(f"Mean Reward: {MER_ppo:.2f}, Std: {STD_ppo}")

Mean Reward: 99.48, Std: 2.842170943040401e-14
