In [1]:
#%pip install gym[box2d]

In [2]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import BaseCallback

In [3]:
class TrainAndLoggingCallback(BaseCallback):
    """Callback that periodically saves the model being trained.

    Every ``check_freq`` callback calls the current model is written to
    ``save_path`` under the name ``best_model_<n_calls>``.

    Args:
        check_freq: Number of callback calls between two saves.
        save_path: Directory that receives the saved models. It is created
            when the callback is initialised. ``None`` disables saving.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        """Create the save directory if a save path was configured."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save the model every ``check_freq`` calls.

        Returns:
            bool: Always ``True``. Stable-Baselines3 uses the return value of
            ``_on_step`` to decide whether training continues; implicitly
            returning ``None`` can abort training after the first step on
            versions that test the value for truthiness.
        """
        # Mirror the None check in _init_callback so a callback without a
        # save path does not crash inside os.path.join.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)
        return True

In [6]:
# Save a model every 200000 callback calls into Training/Models.
callback = TrainAndLoggingCallback(
    check_freq=200_000,
    save_path=os.path.join('Training', 'Models'),
)

In [4]:
# Wrap a single CarRacing-v0 instance in a DummyVecEnv. The factory builds
# the environment itself rather than closing over a global name.
env = DummyVecEnv([lambda: gym.make('CarRacing-v0')])

In [11]:
# TensorBoard logs for this run go under Training/Logs.
log_path = os.path.join('Training', 'Logs')
model = PPO(
    policy='CnnPolicy',
    env=env,
    verbose=1,
    tensorboard_log=log_path,
)
# Alternative: continue from a previously saved model instead of a fresh one.
#model = PPO.load('./Training/Models/PPO_500k_CarRacing.zip', env)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [12]:
# Train for ten million timesteps, passing the save callback to the learner.
model.learn(callback=callback, total_timesteps=10_000_000)

Track generation: 1210..1515 -> 305-tiles track
Logging to Training\Logs\PPO_4
Track generation: 1214..1521 -> 307-tiles track
Track generation: 1140..1429 -> 289-tiles track
-----------------------------
| time/              |      |
|    fps             | 155  |
|    iterations      | 1    |
|    time_elapsed    | 13   |
|    total_timesteps | 2048 |
-----------------------------
Track generation: 1306..1637 -> 331-tiles track
Track generation: 1228..1539 -> 311-tiles track
------------------------------------------
| time/                   |              |
|    fps                  | 135          |
|    iterations           | 2            |
|    time_elapsed         | 30           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0065173777 |
|    clip_fraction        | 0.0843       |
|    clip_range           | 0.2          |
|    entropy_loss         | -4.27        |
|    explained_variance   | 0.00726      |
|  

KeyboardInterrupt: 

In [36]:
# Load a saved model from disk; the file name follows the
# 'best_model_<n_calls>' pattern used by TrainAndLoggingCallback.
checkpoint_file = './Training/Models/best_model_400000.zip'
loaded_model = PPO.load(checkpoint_file)

In [38]:
# Capture the evaluation result: with env.close() as the last statement of
# the cell, the value returned by evaluate_policy was previously discarded.
try:
    mean_reward, std_reward = evaluate_policy(
        loaded_model, env, render=True, n_eval_episodes=1
    )
finally:
    # Close the environment even if evaluation raises or is interrupted.
    env.close()
print(f"mean reward - {mean_reward} || std reward - {std_reward}")

Track generation: 1184..1484 -> 300-tiles track
Track generation: 1131..1418 -> 287-tiles track


In [132]:
# Play five rendered episodes with `model` and print each episode's score.
# NOTE(review): this loop uses `model`, not the `loaded_model` evaluated in
# the cell above — confirm which policy is meant to be shown here.
try:
    for n in range(1, 6):
        obs = env.reset()
        score = 0
        done = False
        while not done:
            env.render()
            # predict() returns a pair; only the action is needed here.
            # NOTE(review): the observation copy is kept from the original;
            # presumably predict() needs its own array — confirm.
            action, _state = model.predict(obs.copy())
            obs, reward, done, info = env.step(action)
            score += reward
        print(f"episode - {n} || score - {score}")
finally:
    # Close the environment (and its render window) even when the loop is
    # interrupted or an episode raises.
    env.close()

Track generation: 976..1227 -> 251-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1121..1411 -> 290-tiles track
episode - 1 || score - 412.1107266435878
Track generation: 1020..1279 -> 259-tiles track
episode - 2 || score - 551.1627906976701
Track generation: 1167..1472 -> 305-tiles track
episode - 3 || score - 54.60526315789862
Track generation: 939..1184 -> 245-tiles track
episode - 4 || score - 354.9180327868705
Track generation: 1147..1438 -> 291-tiles track
episode - 5 || score - 462.0689655172273
