In [1]:
from sb3_contrib import QRDQN
from stable_baselines3.dqn import MlpPolicy
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
from stable_baselines3.common.callbacks import EvalCallback, BaseCallback
from stable_baselines3.common.vec_env import DummyVecEnv, VecEnv, sync_envs_normalization
from stable_baselines3.common.monitor import Monitor
import gymnasium as gym

In [2]:
from gymnasium_env.envs.f1_env import RewardFunction
from gymnasium_env.envs.f1_env import RewardFunctionPerPositionAtFinalLap

from racesim.config import GridConfigMixed
from racesim.config import GridConfig

In [3]:
class F1EnvEvalCallback(EvalCallback):
    """EvalCallback variant that rebuilds the F1 evaluation environment after each evaluation.

    After every evaluation round (every ``eval_freq`` calls) the evaluation
    environment is recreated with ``gym.make("F1Env/Basic-v0", ...)`` using the
    stored seed, reward function and grid configuration.
    NOTE(review): presumably this makes every evaluation start from the same
    seeded state so results are comparable across evaluations -- confirm
    against the F1Env seeding semantics.

    :param eval_env: Environment used for the first evaluation round.
    :param callback_on_new_best: Forwarded to ``EvalCallback``.
    :param callback_after_eval: Forwarded to ``EvalCallback``.
    :param n_eval_episodes: Forwarded to ``EvalCallback``.
    :param eval_freq: Forwarded to ``EvalCallback``; also controls how often
        the evaluation environment is rebuilt.
    :param log_path: Forwarded to ``EvalCallback``.
    :param best_model_save_path: Forwarded to ``EvalCallback``.
    :param deterministic: Forwarded to ``EvalCallback``.
    :param render: Forwarded to ``EvalCallback``.
    :param verbose: Forwarded to ``EvalCallback``.
    :param warn: Forwarded to ``EvalCallback``.
    :param seed_eval_env: ``seed`` keyword passed to ``gym.make`` when the
        evaluation environment is rebuilt.
    :param reward_function: ``reward_function`` keyword passed to ``gym.make``
        when the evaluation environment is rebuilt.
    :param grid_config: ``grid_config`` keyword passed to ``gym.make`` when the
        evaluation environment is rebuilt.
    """

    def __init__(
        self,
        eval_env: Union[gym.Env, VecEnv],
        callback_on_new_best: Optional[BaseCallback] = None,
        callback_after_eval: Optional[BaseCallback] = None,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        log_path: Optional[str] = None,
        best_model_save_path: Optional[str] = None,
        deterministic: bool = True,
        render: bool = False,
        verbose: int = 1,
        warn: bool = True,
        seed_eval_env: int = 0,
        reward_function: Optional[RewardFunction] = None,
        grid_config: Optional[GridConfig] = None
    ):
        # Keyword arguments keep the forwarding correct even if the parent
        # signature is ever reordered upstream.
        super().__init__(
            eval_env,
            callback_on_new_best=callback_on_new_best,
            callback_after_eval=callback_after_eval,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            log_path=log_path,
            best_model_save_path=best_model_save_path,
            deterministic=deterministic,
            render=render,
            verbose=verbose,
            warn=warn,
        )
        self.seed_eval_env = seed_eval_env
        self.reward_function = reward_function
        self.grid_config = grid_config

    def _on_step(self) -> bool:
        """Run the parent step, then rebuild the evaluation env after an evaluation round.

        :return: The parent's verdict on whether training should continue.
        """
        continue_training = super()._on_step()
        # Same trigger condition as the one guarding the rebuild in the
        # original code: only on steps that are a multiple of ``eval_freq``.
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            self.set_env_for_next_eval()
        return continue_training

    def set_env_for_next_eval(self) -> None:
        """Replace ``self.eval_env`` with a freshly built, seeded evaluation environment.

        The new environment is created first so that a failure in ``gym.make``
        leaves the current evaluation environment in place. The previous
        environment is closed afterwards so that one environment is not leaked
        per evaluation round.
        """
        previous_env = self.eval_env
        fresh_env = gym.make("F1Env/Basic-v0", seed=self.seed_eval_env,
                             reward_function=self.reward_function, grid_config=self.grid_config)
        # Wrap in a VecEnv so the attribute keeps the vectorised type used by
        # Stable-Baselines3 evaluation code instead of a bare gym.Env.
        self.eval_env = DummyVecEnv([lambda: fresh_env])
        if previous_env is not None:
            previous_env.close()

In [4]:
# --- Seeding -----------------------------------------------------------------
# `seed` is not referenced by the cells visible in this notebook; the training
# loop seeds each run with its loop index instead.
seed = 0
# Number of independently seeded training runs to launch.
n_seeds = 1
# Seed passed to the evaluation environment.
seed_eval = 50

# --- Environment configuration ----------------------------------------------
reward_function = RewardFunctionPerPositionAtFinalLap()
grid_config = GridConfigMixed()

# --- Policy configuration -----------------------------------------------------
# Extra keyword arguments forwarded to the QR-DQN policy.
policy_kwargs = {"n_quantiles": 21}

In [5]:
# Train one QR-DQN agent per seed; each run gets its own log directory and
# saved model, and is evaluated periodically on a separately seeded env.
for i in range(n_seeds):
    print(f"-----------------\n\n RUN {i}\n\n--------------------------")
    run_log_dir = f"./logs_{i}/"

    env = gym.make("F1Env/Basic-v0", seed=i, reward_function=reward_function, grid_config=grid_config)
    env_to_eval = gym.make("F1Env/Basic-v0", seed=seed_eval, reward_function=reward_function, grid_config=grid_config)

    eval_callback = F1EnvEvalCallback(
        eval_env=env_to_eval,
        best_model_save_path=run_log_dir,
        log_path=run_log_dir,
        n_eval_episodes=20,
        eval_freq=25000,
        deterministic=True,
        render=False,
        seed_eval_env=seed_eval,
        reward_function=reward_function,
        grid_config=grid_config,
    )

    model_reward_position_change = QRDQN(
        "MlpPolicy",
        env,
        gamma=1,
        buffer_size=100000,
        learning_starts=5000,
        train_freq=(16, "step"),
        exploration_initial_eps=0.25,
        exploration_fraction=0.2,
        exploration_final_eps=0.020,
        learning_rate=0.00005,
        batch_size=256,
        policy_kwargs=policy_kwargs,
        verbose=0,
        tensorboard_log="./qr-dqn_r3/",
        seed=i,
    )
    try:
        # `learn` is annotated to take an integer step count, so pass an int
        # rather than the float literal 2e6.
        model_reward_position_change.learn(total_timesteps=2_000_000, callback=eval_callback)
        model_reward_position_change.save(path=f"./model_reward_position_change_{i}")
    finally:
        # Release the environments of this run before the next seed starts,
        # even if training was interrupted. The callback may have swapped its
        # evaluation env during training, so close whichever one it holds now.
        env.close()
        eval_callback.eval_env.close()

-----------------

 RUN 0

--------------------------




Eval num_timesteps=25000, episode_reward=-1.00 +/- 0.00
Episode length: 77.00 +/- 0.00
New best mean reward!
Eval num_timesteps=50000, episode_reward=-0.75 +/- 0.43
Episode length: 77.00 +/- 0.00
New best mean reward!
Eval num_timesteps=75000, episode_reward=-0.05 +/- 0.22
Episode length: 77.00 +/- 0.00
New best mean reward!
Eval num_timesteps=100000, episode_reward=-0.25 +/- 0.43
Episode length: 77.00 +/- 0.00
Eval num_timesteps=125000, episode_reward=-0.33 +/- 0.50
Episode length: 77.00 +/- 0.00
Eval num_timesteps=150000, episode_reward=-0.29 +/- 1.06
Episode length: 77.00 +/- 0.00
Eval num_timesteps=175000, episode_reward=-0.34 +/- 1.55
Episode length: 77.00 +/- 0.00
Eval num_timesteps=200000, episode_reward=0.46 +/- 2.14
Episode length: 77.00 +/- 0.00
New best mean reward!
Eval num_timesteps=225000, episode_reward=1.75 +/- 2.31
Episode length: 77.00 +/- 0.00
New best mean reward!
Eval num_timesteps=250000, episode_reward=2.34 +/- 1.66
Episode length: 77.00 +/- 0.00
New best mean re

In [6]:
# import numpy as np
# from tqdm import tqdm
# import torch
# from stable_baselines3.common.utils import obs_as_tensor
# env_to_test = gym.make("F1Env/Basic-v0")

In [7]:
# agent = model_reward_position_change
# stops = 0

# for episode in tqdm(range(1)):
#     t = 0
#     total_rw = 0
#     obs, info = env_to_test.reset()
#     print(env_to_test.our_driver.tyre.__dict__)
#     # print(obs)
#     # print("------")
#     print("START POSITION: ", env_to_test.our_driver.position)
#     done = False

#     # play one episode
#     while not done:
#         # display(env_to_test.race_simulation.to_df())
#         action, _ = agent.predict(obs, deterministic=True)
#         with torch.no_grad():
#             observation = obs.reshape((-1,) + agent.observation_space.shape)
#             observation = obs_as_tensor(observation, 'cpu')
#             q_values = agent.q_net(observation)
#             # print(q_values)
#         if action != 0:
#             stops+=1
#             print(action, " - laps to go: ", env_to_test.our_driver.laps_to_go, " - position: ", env_to_test.our_driver.position)
#             print(q_values)
#         next_obs, reward, terminated, truncated, info = env_to_test.step(action)
#         # print('REWARDA: ', reward)

#         total_rw += reward

#         done = terminated or truncated
#         obs = next_obs
#         # print(obs)
#         # print("------")
#         t+=1

#     # print("Reward: " , total_rw)
# # display(env_to_test.race_simulation.to_df())
# print("Reward: " , total_rw, " - ", "Stops: ", stops)
# env_to_test.race_simulation.driver_being_controlled.position

In [8]:
# env_to_test.race_simulation.to_df()