Goal: use reinforcement learning to train an agent that plays the Atari game Breakout

1) Import Dependencies

In [1]:
import gym                                                      # import the openAI gym
from stable_baselines3 import A2C                               # import algorithm
from stable_baselines3.common.vec_env import VecFrameStack      # import stuff for vectorizing environment (speed up training)
from stable_baselines3.common.evaluation import evaluate_policy # import policy evaluation stuff
from stable_baselines3.common.env_util import make_atari_env    # for working with atari environment
import os                                                       # for paths and stuff

2) Test Environment

In [3]:
# The Atari ROMs must be installed before the environment can be created:
#   pip install gymnasium[accept-rom-license]
# Further details: https://gymnasium.farama.org/environments/atari/#atari
# NOTE(review): this notebook imports `gym`, not `gymnasium` — confirm the
# install command above matches the package that is actually in use.

environment_name = 'Breakout-v4'

# Create the Breakout environment in 'human' render mode.
env = gym.make(
    environment_name,
    render_mode='human',
)

# Comment out the close() call below to see the computer play Atari.
# NOTE(review): the following cells keep calling env.reset() / env.step() on
# this same `env` object after it is closed here — confirm that is intended.
env.close()


In [4]:
# Reset the environment. As the output below shows, the call returns a tuple:
# the initial frame (a uint8 array) and an info dict with the keys
# 'lives', 'episode_frame_number' and 'frame_number'.
env.reset()

(array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        ...,
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]], dtype=uint8),
 {'lives': 5, 'episode_frame_number': 0, 'frame_number': 0})

In [5]:
# Inspect the type of the action space; the output below reports Discrete(4).
env.action_space

Discrete(4)

In [6]:
# Inspect the observation space. Observations are images: the output below
# reports Box(0, 255, (210, 160, 3), uint8), i.e. the value range, the image
# dimensions and the dtype.
env.observation_space

Box(0, 255, (210, 160, 3), uint8)

In [7]:
# Smoke-test the environment by playing a few episodes with random actions.
episodes = 5
for episode in range(1, episodes + 1):
    # reset() returns (observation, info); info is overwritten by step() below
    obs, info = env.reset()
    score = 0
    done = False

    while not done:
        # draw the current frame
        env.render()

        # No trained model is used here: the action is sampled at random
        # from the action space.
        action = env.action_space.sample()

        # Apply the action; step() returns the new observation, the reward,
        # two end-of-episode flags (terminated / truncated) and an info dict.
        obs, reward, terminated, truncated, info = env.step(action)

        # accumulate the episode reward
        score += reward

        # the episode is over as soon as either flag is set
        done = truncated or terminated

    print('Episode:{} Score:{}'.format(episode, score))
env.close()

  logger.warn(
  if not isinstance(terminated, (bool, np.bool8)):


Episode:1 Score:2.0
Episode:2 Score:0.0
Episode:3 Score:0.0
Episode:4 Score:2.0
Episode:5 Score:0.0


3) Vectorize Environment and Train Model

In [2]:
# Build a vectorized Breakout environment: 4 copies of the game run side by
# side (n_envs=4), created with a fixed seed for reproducibility.
env = make_atari_env('Breakout-v4', n_envs=4, seed=0)

# Stack the 4 most recent frames of each environment into one observation
# (n_stack=4). This stacks frames over time, not environments.
env = VecFrameStack(env, n_stack=4)

# The environment is intentionally NOT closed here: the next cells hand `env`
# to A2C and train on it, so it has to stay open until training is finished.

In [5]:
# Configure the model for training.
# TensorBoard logs are written under Training/Logs.
log_path = os.path.join('Training', 'Logs')

# A2C is a different algorithm from PPO or DQN. 'CnnPolicy' is used instead of
# an MLP policy because the observations are images.
model = A2C(
    'CnnPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [6]:
# Train the agent for 100,000 time steps. The call is left as the last
# expression of the cell, so its return value is displayed as the cell output.
model.learn(total_timesteps=100_000)

Logging to Training\Logs\A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 272      |
|    ep_rew_mean        | 1.36     |
| time/                 |          |
|    fps                | 324      |
|    iterations         | 100      |
|    time_elapsed       | 6        |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.38    |
|    explained_variance | 0.0783   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.0368   |
|    value_loss         | 0.0553   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 285      |
|    ep_rew_mean        | 1.63     |
| time/                 |          |
|    fps                | 405      |
|    iterations         | 200      |
|    time_elapsed       | 9        |
|    total_timesteps    | 4000     |
| train

<stable_baselines3.a2c.a2c.A2C at 0x2865ae26a50>

In [7]:
# Save the trained model under Training/Saved Models.
a2c_path = os.path.join(
    'Training',
    'Saved Models',
    'A2C_Breakout_Model',
)
model.save(a2c_path)

In [8]:
# Remove the in-memory model; the evaluation section below loads it again
# from the file saved at a2c_path.
del model

4) Evaluate and Test

In [5]:
# Evaluate on a single copy of the game (n_envs=1), built with the same
# helper and the same 4-frame stacking that were used for training.
env = make_atari_env('Breakout-v4', n_envs=1, seed=0)
env = VecFrameStack(env, n_stack=4)

# Load the saved model from disk and attach the evaluation environment to it.
# NOTE(review): the recorded output of this cell shows a ValueError about
# mismatched observation spaces, and the notebook's execution counts are out
# of order — the saved file may come from a different run. Re-run the notebook
# top to bottom to confirm the load succeeds.
a2c_path = os.path.join(
    'Training',
    'Saved Models',
    'A2C_Breakout_Model',
)
model = A2C.load(a2c_path, env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.




ValueError: Observation spaces do not match: Box(0, 255, (4, 84, 84), uint8) != Box(0, 255, (3, 210, 160), uint8)

: 

In [4]:
# Run the loaded model for 10 evaluation episodes with rendering requested.
# The call is the last expression of the cell, so its return value (a pair of
# numbers, shown in the output below) is displayed.
evaluate_policy(
    model,
    env,
    n_eval_episodes=10,
    render=True,
)

  logger.warn(


(7.4, 2.0591260281974)