In [1]:
from stable_baselines3 import DQN
from stable_baselines3.dqn import MlpPolicy
from stable_baselines3 import PPO
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
from stable_baselines3.common.callbacks import EvalCallback, BaseCallback
from stable_baselines3.common.vec_env import DummyVecEnv, VecEnv, sync_envs_normalization
from stable_baselines3.common.monitor import Monitor
import gymnasium as gym

In [2]:
from gymnasium_env.envs.f1_env import RewardFunction
from gymnasium_env.envs.f1_env import RewardFunctionPerPositionAtFinalLap

from racesim.config import GridConfigMixed
from racesim.config import GridConfig

In [3]:
class F1EnvEvalCallback(EvalCallback):
    """EvalCallback that swaps in a freshly built F1 environment after each evaluation.

    Every time an evaluation round has run (every ``eval_freq`` callback
    calls), the evaluation environment is rebuilt with ``gym.make`` using the
    stored seed, reward function and grid configuration.
    NOTE(review): presumably this is done so that every evaluation round
    starts from the same seeded environment state -- confirm against F1Env.

    Args:
        eval_env: Environment used for the first evaluation round.
        callback_on_new_best: Forwarded unchanged to ``EvalCallback``.
        callback_after_eval: Forwarded unchanged to ``EvalCallback``.
        n_eval_episodes: Forwarded unchanged to ``EvalCallback``.
        eval_freq: Forwarded to ``EvalCallback``; also the cadence at which
            the evaluation environment is rebuilt.
        log_path: Forwarded unchanged to ``EvalCallback``.
        best_model_save_path: Forwarded unchanged to ``EvalCallback``.
        deterministic: Forwarded unchanged to ``EvalCallback``.
        render: Forwarded unchanged to ``EvalCallback``.
        verbose: Forwarded unchanged to ``EvalCallback``.
        warn: Forwarded unchanged to ``EvalCallback``.
        seed_eval_env: ``seed`` keyword passed to ``gym.make`` when the
            evaluation environment is rebuilt.
        reward_function: ``reward_function`` keyword passed to ``gym.make``
            when the evaluation environment is rebuilt.
        grid_config: ``grid_config`` keyword passed to ``gym.make`` when the
            evaluation environment is rebuilt.
    """

    def __init__(
        self,
        eval_env: Union[gym.Env, VecEnv],
        callback_on_new_best: Optional[BaseCallback] = None,
        callback_after_eval: Optional[BaseCallback] = None,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        log_path: Optional[str] = None,
        best_model_save_path: Optional[str] = None,
        deterministic: bool = True,
        render: bool = False,
        verbose: int = 1,
        warn: bool = True,
        seed_eval_env: int = 0,
        reward_function: Optional[RewardFunction] = None,
        grid_config: Optional[GridConfig] = None,
    ):
        # Forward by keyword so the call does not silently break if the
        # parent's positional parameter order ever changes.
        super().__init__(
            eval_env,
            callback_on_new_best=callback_on_new_best,
            callback_after_eval=callback_after_eval,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            log_path=log_path,
            best_model_save_path=best_model_save_path,
            deterministic=deterministic,
            render=render,
            verbose=verbose,
            warn=warn,
        )
        self.seed_eval_env = seed_eval_env
        self.reward_function = reward_function
        self.grid_config = grid_config

    def _on_step(self) -> bool:
        """Run the parent's step logic, then rebuild the eval env after an evaluation round.

        Returns:
            The value returned by ``EvalCallback._on_step``.
        """
        continue_training = super()._on_step()
        # Same trigger condition as used here for evaluation cadence: the
        # environment is replaced only after the round has been handled by
        # the parent call above.
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            self.set_env_for_next_eval()
        return continue_training

    def set_env_for_next_eval(self) -> None:
        """Replace ``self.eval_env`` with a newly created, vectorised F1 environment.

        The previous evaluation environment is closed once the replacement
        has been built, so environments do not pile up over a long run.
        """
        previous_env = self.eval_env
        # Build the replacement first: if gym.make raises, the callback still
        # holds the previous (open) environment.
        fresh_env = gym.make("F1Env/Basic-v0", seed=self.seed_eval_env,
                             reward_function=self.reward_function, grid_config=self.grid_config)
        # EvalCallback.__init__ wraps a non-vectorised env in a DummyVecEnv;
        # do the same here so self.eval_env keeps a consistent type.
        self.eval_env = DummyVecEnv([lambda: fresh_env])
        if previous_env is not None:
            previous_env.close()

In [4]:
# --- Experiment configuration (read by the training cell) ---

# Number of independent training runs; the training loop uses the run index
# itself as the seed for that run.
n_seeds = 3

# NOTE(review): `seed` is not referenced by any cell visible here -- confirm
# whether it is still needed.
seed = 0

# Seed handed to the evaluation environment.
seed_eval = 50

# Reward function instance shared by the training and evaluation environments.
reward_function = RewardFunctionPerPositionAtFinalLap()

# Grid configuration instance shared by the training and evaluation environments.
grid_config = GridConfigMixed()

In [5]:
# Train one DQN agent per seed. Each run gets its own log directory, is
# periodically evaluated through F1EnvEvalCallback, and is saved at the end.
for i in range(n_seeds):
    print(f"-----------------\n\n RUN {i}\n\n--------------------------")

    # Per-run directory used for both evaluation logs and the best-model checkpoint.
    log_dir = f"./logs_{i}/"

    # The training env is seeded with the run index; the eval env always uses seed_eval.
    env = gym.make("F1Env/Basic-v0", seed=i, reward_function=reward_function, grid_config=grid_config)
    env_to_eval = gym.make("F1Env/Basic-v0", seed=seed_eval, reward_function=reward_function, grid_config=grid_config)

    eval_callback = F1EnvEvalCallback(
        eval_env=env_to_eval,
        best_model_save_path=log_dir,
        log_path=log_dir,
        n_eval_episodes=20,
        eval_freq=25000,
        deterministic=True,
        render=False,
        seed_eval_env=seed_eval,
        reward_function=reward_function,
        grid_config=grid_config,
    )

    model_reward_position_change = DQN(
        "MlpPolicy",
        env,
        gamma=1,
        buffer_size=100000,
        learning_starts=5000,
        train_freq=(16, "step"),
        exploration_initial_eps=0.25,
        exploration_fraction=0.2,
        exploration_final_eps=0.020,
        learning_rate=0.00005,
        batch_size=256,
        verbose=0,
        tensorboard_log="./dqn_r3/",
        seed=i,
    )

    try:
        # total_timesteps is documented as an int; the original passed the float 1e6.
        model_reward_position_change.learn(total_timesteps=1_000_000, callback=eval_callback)
        model_reward_position_change.save(path=f"./model_reward_position_change_{i}")
    finally:
        # Release this run's environments before the next run creates new
        # ones, including when training is interrupted or fails.
        env.close()
        eval_callback.eval_env.close()

-----------------

 RUN 0

--------------------------




Eval num_timesteps=25000, episode_reward=-1.00 +/- 0.00
Episode length: 77.00 +/- 0.00
New best mean reward!
Eval num_timesteps=50000, episode_reward=-0.40 +/- 0.49
Episode length: 77.00 +/- 0.00
New best mean reward!
Eval num_timesteps=75000, episode_reward=-0.50 +/- 0.50
Episode length: 77.00 +/- 0.00
Eval num_timesteps=100000, episode_reward=0.41 +/- 2.47
Episode length: 77.00 +/- 0.00
New best mean reward!
Eval num_timesteps=125000, episode_reward=-0.03 +/- 1.02
Episode length: 77.00 +/- 0.00
Eval num_timesteps=150000, episode_reward=0.55 +/- 1.66
Episode length: 77.00 +/- 0.00
New best mean reward!
Eval num_timesteps=175000, episode_reward=0.86 +/- 1.86
Episode length: 77.00 +/- 0.00
New best mean reward!
Eval num_timesteps=200000, episode_reward=0.49 +/- 1.65
Episode length: 77.00 +/- 0.00
Eval num_timesteps=225000, episode_reward=1.71 +/- 1.81
Episode length: 77.00 +/- 0.00
New best mean reward!
Eval num_timesteps=250000, episode_reward=0.60 +/- 1.41
Episode length: 77.00 +/- 0.

In [6]:
# import numpy as np
# from tqdm import tqdm
# import torch
# from stable_baselines3.common.utils import obs_as_tensor
# env_to_test = gym.make("F1Env/Basic-v0")

In [7]:
# agent = model_reward_position_change
# stops = 0

# for episode in tqdm(range(1)):
#     t = 0
#     total_rw = 0
#     obs, info = env_to_test.reset()
#     print(env_to_test.our_driver.tyre.__dict__)
#     # print(obs)
#     # print("------")
#     print("START POSITION: ", env_to_test.our_driver.position)
#     done = False

#     # play one episode
#     while not done:
#         # display(env_to_test.race_simulation.to_df())
#         action, _ = agent.predict(obs, deterministic=True)
#         with torch.no_grad():
#             observation = obs.reshape((-1,) + agent.observation_space.shape)
#             observation = obs_as_tensor(observation, 'cpu')
#             q_values = agent.q_net(observation)
#             # print(q_values)
#         if action != 0:
#             stops+=1
#             print(action, " - laps to go: ", env_to_test.our_driver.laps_to_go, " - position: ", env_to_test.our_driver.position)
#             print(q_values)
#         next_obs, reward, terminated, truncated, info = env_to_test.step(action)
#         # print('REWARDA: ', reward)

#         total_rw += reward

#         done = terminated or truncated
#         obs = next_obs
#         # print(obs)
#         # print("------")
#         t+=1

#     # print("Reward: " , total_rw)
# # display(env_to_test.race_simulation.to_df())
# print("Reward: " , total_rw, " - ", "Stops: ", stops)
# env_to_test.race_simulation.driver_being_controlled.position

  0%|                                                     | 0/1 [00:00<?, ?it/s]

{'compound': 0, 'base_lap_time': 70.0193, 'squared_deg': 0.00281, 'linear_deg': -0.01999, 'used_laps': 0}
START POSITION:  14
3  - laps to go:  78  - position:  14
tensor([[0.5046, 0.4748, 0.5634, 0.6001]])
1  - laps to go:  77  - position:  20
tensor([[0.5628, 0.6032, 0.4985, 0.5832]])
2  - laps to go:  58  - position:  20
tensor([[1.5260, 1.5265, 1.5397, 1.2385]])
1  - laps to go:  37  - position:  19
tensor([[1.5190, 1.5768, 1.3950, 0.8807]])


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  2.15it/s]

Reward:  -3  -  Stops:  4





17

In [8]:
# env_to_test.race_simulation.to_df()

Unnamed: 0,id,interval,gap,lapsToGo,lastLapTime,position,potentialLapTime,compound,usedLaps,secondDry
0,6,0.0,0.0,0,73.2121,1,73.31312,0,38,True
1,13,7.389859,7.389859,0,73.425885,2,73.5354,0,39,True
2,14,0.711938,8.101797,0,73.641985,3,73.6798,2,45,True
3,10,1.602005,9.703802,0,73.645327,4,73.6836,2,45,True
4,7,6.182444,15.886246,0,74.105853,5,74.15684,2,48,True
5,2,10.065012,25.951258,0,73.608189,6,73.7228,0,40,True
6,5,5.140064,31.091321,0,73.462855,7,73.495,2,44,True
7,8,2.390633,33.481954,0,73.570424,8,73.6073,2,44,True
8,1,8.235431,41.717385,0,74.090032,9,74.13932,2,48,True
9,3,0.29779,42.015175,0,73.058395,10,72.32952,0,33,True
