<a href="https://colab.research.google.com/github/laurelkeys/machine-learning/blob/master/assignment-4/Trajectories.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [126]:
import os

from google.colab import drive
drive.mount('/content/drive', force_remount=True)
PATH_TO_DATA = os.path.join("drive", "My Drive", "unicamp", "MC886", "atari")

# PATH_TO_DATA = ""

Mounted at /content/drive


In [127]:
SAVE_DIR = os.path.join(PATH_TO_DATA, "Dataset", "data")
os.makedirs(SAVE_DIR, exist_ok=True)

SAVE_DIR # where the trajectories for each game will be saved to

'drive/My Drive/unicamp/MC886/atari/Dataset/data'

In [128]:
LOG_DIR = os.path.join(PATH_TO_DATA, "Dataset", "data", "results")
os.makedirs(LOG_DIR, exist_ok=True)

LOG_DIR # where the stats for each game will be saved to

'drive/My Drive/unicamp/MC886/atari/Dataset/data/results'

## Generate a dataset of trajectories from pre-trained RL agents on [Atari](https://gym.openai.com/envs/#atari) [environments](https://github.com/openai/gym/wiki/Table-of-environments).
That is, by the end of this notebook we will have $observation \rightarrow action$ mappings, where $observation$s are images of shape `IMG_SHAPE` and $action$s are integer values in the range $[0, 18)$, meaning:

| 0 | 1 | 2 | 3 | 4 | 5 |
| --- | --- | --- | --- | --- | --- |
| NOOP | FIRE | UP | RIGHT | LEFT | DOWN |


| 6 | 7 | 8 | 9 |
| --- | --- | --- | --- |
| UPRIGHT | UPLEFT | DOWNRIGHT | DOWNLEFT |


| 10 | 11 | 12 | 13 |
| --- | --- | --- | --- |
| UPFIRE | RIGHTFIRE | LEFTFIRE | DOWNFIRE |


| 14 | 15 | 16 | 17 |
| --- | --- | --- | --- |
| UPRIGHTFIRE | UPLEFTFIRE | DOWNRIGHTFIRE | DOWNLEFTFIRE |

In [0]:
# number of trajectories to generate
N_OF_TRAJECTORIES = 200

# number of steps per trajectory
N_OF_STEPS = 1000

# list of string tuples in the format (RL Algorithm, Game Environment)
GAMES = [
    ("PPO2", "BreakoutNoFrameskip-v4"),
    ("PPO2", "PongNoFrameskip-v4"),
]

In [130]:
[env_id for algo, env_id in GAMES]

['BreakoutNoFrameskip-v4', 'PongNoFrameskip-v4']

## Install dependencies

Note that we're not installing [MPI](https://mpi4py.readthedocs.io/en/stable/), so the following algorithms will probably not work: `DDPG`, `GAIL`, `PPO1`, `TRPO`.

In [0]:
!apt-get update                                                  > /dev/null 2>&1
!apt-get install swig cmake zlib1g-dev ffmpeg freeglut3-dev xvfb > /dev/null 2>&1
!pip install pytablewriter                                       > /dev/null 2>&1
# !pip install pytablewriter pyyaml optuna scikit-optimize         > /dev/null 2>&1

In [132]:
#### Stable Baselines only supports TF 1.x for now ####
try:
    # Colab only
    %tensorflow_version 1.x
except Exception:
    pass

import tensorflow as tf
from tensorflow import keras
print(tf.__version__)

1.15.0


In [0]:
import os
from time import time
from IPython.display import clear_output

import cv2
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# NOTE use tqdm.write() instead of print() inside of tqdm wrapped loops
from tqdm import tqdm

import gym
from gym.envs.atari.atari_env import ACTION_MEANING

### Update [Stable Baselines](https://github.com/hill-a/stable-baselines) and clone [RL Zoo Baselines](https://github.com/araffin/rl-baselines-zoo)

In [134]:
!pip list | grep baselines

stable-baselines         2.9.0a0    


In [0]:
!yes | pip uninstall stable-baselines                           > /dev/null 2>&1
!pip install git+https://github.com/hill-a/stable-baselines.git > /dev/null 2>&1

In [136]:
!pip list | grep baselines

stable-baselines         2.9.0a0    


In [0]:
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack, DummyVecEnv

# HACK to save logs
from stable_baselines import logger
os.environ["OPENAI_LOG_FORMAT"] = 'csv' # 'stdout,log,csv,tensorboard'
os.environ["OPENAI_LOGDIR"] = os.path.abspath(LOG_DIR)
logger.configure()

# NOTE add more algorithms here if you want to use them
from stable_baselines import PPO2, ACER, ACKTR
ALGO_IMPL = {
    'PPO2': PPO2,
    'ACER': ACER,
    'ACKTR': ACKTR,
}

In [0]:
!git clone https://github.com/araffin/rl-baselines-zoo.git      > /dev/null 2>&1

## Load the pre-trained agents

In [139]:
!ls rl-baselines-zoo/trained_agents/

a2c  acer  acktr  ddpg	dqn  her  ppo2	sac  td3  trpo


In [0]:
PATH_TO_AGENTS = os.path.join("rl-baselines-zoo", "trained_agents")

In [141]:
ext = "NoFrameskip-v4.pkl"
# check the available pre-trained models
algorithms = ["PPO2"]
for algo in algorithms:
    algo_path = os.path.join(PATH_TO_AGENTS, algo.lower())
    print(algo_path + '/')
    for f in sorted(os.listdir(algo_path), key=lambda x: x[::-1]):
        # sort by the reverse filename, so env types get grouped together
        if f.endswith(ext):
            print("├──", f)

rl-baselines-zoo/trained_agents/ppo2/
├── PongNoFrameskip-v4.pkl
├── MsPacmanNoFrameskip-v4.pkl
├── EnduroNoFrameskip-v4.pkl
├── BeamRiderNoFrameskip-v4.pkl
├── SpaceInvadersNoFrameskip-v4.pkl
├── QbertNoFrameskip-v4.pkl
├── SeaquestNoFrameskip-v4.pkl
├── BreakoutNoFrameskip-v4.pkl


In [142]:
for i in range(2):
    clear_output() # HACK to remove TensorFlow warnings
    for algo, env_id in GAMES:
        print(f"('{algo}', '{env_id}')")
        agent_path = os.path.join(PATH_TO_AGENTS, algo.lower(), env_id + '.pkl')
        model = ALGO_IMPL[algo].load(agent_path, verbose=0)
        print("observation_space:", model.observation_space)
        print("action_space:", model.action_space)
        print()

('PPO2', 'BreakoutNoFrameskip-v4')
observation_space: Box(84, 84, 4)
action_space: Discrete(4)

('PPO2', 'PongNoFrameskip-v4')
observation_space: Box(84, 84, 4)
action_space: Discrete(6)



In [0]:
VERBOSE = 2 # 0, 1 or 2

In [144]:
print("N_OF_STEPS:", N_OF_STEPS)
print("N_OF_TRAJECTORIES:", N_OF_TRAJECTORIES)
print(N_OF_STEPS, "*", N_OF_TRAJECTORIES, "=", N_OF_STEPS * N_OF_TRAJECTORIES)

N_OF_STEPS: 1000
N_OF_TRAJECTORIES: 200
1000 * 200 = 200000


## WIP

In [0]:
TRAJ_IMAGES_FOLDER = "images" # name of the folder in which to save the observations
TRAJ_FILE_NAME = "trajectory" # name of the .npz trajectory file

SAVE_IMAGES_AS_NUMPY_ARRAY = False # if False, the images will be saved as .png in TRAJ_IMAGES_FOLDER, 
                                   # and the 'observations' key in the .npz file will have the path to them

PRINT_EARLY_DONE = True
PRINT_ACTIONS_TAKEN = True

PRINT_EVERY_N_TRAJECTORIES = N_OF_TRAJECTORIES // 10 # set to N_OF_TRAJECTORIES + 1 not to print

In [146]:
print("PRINT_EVERY_N_TRAJECTORIES:", PRINT_EVERY_N_TRAJECTORIES)
print("N_OF_TRAJECTORIES:", N_OF_TRAJECTORIES)
print("N_OF_STEPS:", N_OF_STEPS)

time_start = time()
print("\n================")
for algo, env_id in GAMES:
    time_start_env = time()

    env = make_atari_env(env_id, num_env=1, seed=0)
    env = VecFrameStack(env, n_stack=4) # Frame-stacking with 4 frames
    agent_path = os.path.join(PATH_TO_AGENTS, algo.lower(), env_id + '.pkl')
    
    print(f"('{algo}', '{env_id}')")
    print(f"Getting pre-trained agent from: '{agent_path}'\n")
    
    # setup paths where data will be saved to
    traj_name = f"{env_id}_{algo}_{N_OF_STEPS}s"
    if not SAVE_IMAGES_AS_NUMPY_ARRAY:
        images_folder = os.path.join(SAVE_DIR, traj_name, TRAJ_IMAGES_FOLDER)
        os.makedirs(images_folder, exist_ok=True)
        if VERBOSE > 0:
            print("Images will be recorded to {}/\n".format(images_folder))

    model = ALGO_IMPL[algo].load(agent_path, env)
    
    for trajectory in tqdm(range(N_OF_TRAJECTORIES), position=0, leave=True):
        # store the "obs -> action" mapping
        observed_states, actions_taken = [], []
        
        # episode stats
        ep_rewards, ep_starts = np.zeros((N_OF_STEPS,)), []

        obs = env.reset() # (84, 84, 4)
        ep_starts.append(True)
        for step in range(N_OF_STEPS):
            observed_states.append(obs)
            action = model.predict(obs)
            actions_taken.append(action)
            obs, reward, done, infos = env.step(action)
            ep_starts.append(done[0])
            ep_rewards[step] = reward[0]
            if done[0]:
                obs = env.reset()
                if PRINT_EARLY_DONE:
                    tqdm.write(f" Done at step {step + 1} (reseting env)")
        
        # NOTE action, reward and done are arrays since we're using a vectorized env
        observed_states = [obs[0] for obs in observed_states]
        actions_taken = [action[0][0] for action in actions_taken]
        
        # left pad the trajectory with 0's
        traj_number_str = str(trajectory + 1).zfill(len(str(N_OF_TRAJECTORIES)))

        if not SAVE_IMAGES_AS_NUMPY_ARRAY:
            _observed_states = []
            for step, obs in enumerate(observed_states):
                image_path = os.path.join(images_folder, traj_number_str, f"{str(step).zfill(len(str(N_OF_STEPS)))}.png")
                os.makedirs(image_path, exist_ok=True)
                if obs.shape[-1] == 3:
                    obs = cv2.cvtColor(obs, cv2.COLOR_RGB2BGR)
                cv2.imwrite(image_path, obs)
                _observed_states.append(image_path)
            observed_states = np.array(_observed_states)
        else:
            observed_states = np.concatenate(observed_states).reshape((-1,) + env.observation_space.shape)

        actions_taken = np.array(actions_taken)
        ep_starts = np.array(ep_starts[:-1])

        data = {
            'actions': actions_taken,
            'observations': observed_states,
            'rewards': ep_rewards,
            'episode_starts': ep_starts
        } # type: Dict[str, np.ndarray]

        save_path = os.path.join(SAVE_DIR, traj_name, traj_number_str)
        os.makedirs(save_path, exist_ok=True)
        np.savez_compressed(file=os.path.join(save_path, TRAJ_FILE_NAME), **data)
        
        if (trajectory + 1) % 10 == 0:
            tqdm.write(f" Saved trajectory {trajectory + 1} (of {N_OF_TRAJECTORIES})")
            if VERBOSE > 1:
                tqdm.write(" Mean reward: {:.2f}, ep_rewards.shape == {}".format(
                        np.mean(ep_rewards), ep_rewards.shape))

        if PRINT_ACTIONS_TAKEN and trajectory == N_OF_TRAJECTORIES - 1:
            tqdm.write("\n Actions taken: {}".format(", ".join([ACTION_MEANING[action] for action in set(actions_taken)])))
    
    env.close()
    print(f"Δt = {(time() - time_start_env):.2f}s")
    print("================\n")

print(f"Total Δt = {(time() - time_start):.2f}s")

PRINT_EVERY_N_TRAJECTORIES: 20
N_OF_TRAJECTORIES: 200
N_OF_STEPS: 1000

('PPO2', 'BreakoutNoFrameskip-v4')
Getting pre-trained agent from: 'rl-baselines-zoo/trained_agents/ppo2/BreakoutNoFrameskip-v4.pkl'

Images will be recorded to drive/My Drive/unicamp/MC886/atari/Dataset/data/BreakoutNoFrameskip-v4_PPO2_1000s/images/



  0%|          | 1/200 [00:05<17:46,  5.36s/it]

 Done at step 111 (reseting env)


  0%|          | 1/200 [00:07<17:46,  5.36s/it]

 Done at step 520 (reseting env)
 Done at step 540 (reseting env)


  0%|          | 1/200 [00:07<17:46,  5.36s/it]

 Done at step 833 (reseting env)


  0%|          | 1/200 [00:08<17:46,  5.36s/it]

 Done at step 987 (reseting env)


  1%|          | 2/200 [00:12<18:05,  5.48s/it]

 Done at step 521 (reseting env)


  2%|▏         | 3/200 [00:16<18:07,  5.52s/it]

 Done at step 7 (reseting env)


  2%|▏         | 3/200 [00:18<18:07,  5.52s/it]

 Done at step 488 (reseting env)


  2%|▏         | 4/200 [00:22<18:02,  5.52s/it]

 Done at step 14 (reseting env)


  2%|▏         | 4/200 [00:22<18:02,  5.52s/it]

 Done at step 199 (reseting env)


  2%|▏         | 4/200 [00:23<18:02,  5.52s/it]

 Done at step 330 (reseting env)


  2%|▏         | 4/200 [00:23<18:02,  5.52s/it]

 Done at step 503 (reseting env)


  2%|▏         | 4/200 [00:24<18:02,  5.52s/it]

 Done at step 654 (reseting env)


  2%|▎         | 5/200 [00:27<18:01,  5.55s/it]

 Done at step 6 (reseting env)


  2%|▎         | 5/200 [00:29<18:01,  5.55s/it]

 Done at step 559 (reseting env)
 Done at step 579 (reseting env)


  2%|▎         | 5/200 [00:30<18:01,  5.55s/it]

 Done at step 674 (reseting env)


  2%|▎         | 5/200 [00:30<18:01,  5.55s/it]

 Done at step 837 (reseting env)


  3%|▎         | 6/200 [00:33<18:06,  5.60s/it]

 Done at step 64 (reseting env)


  3%|▎         | 6/200 [00:34<18:06,  5.60s/it]

 Done at step 179 (reseting env)


  3%|▎         | 6/200 [00:36<18:06,  5.60s/it]

 Done at step 991 (reseting env)


  4%|▎         | 7/200 [00:39<18:15,  5.68s/it]

 Done at step 8 (reseting env)


  4%|▎         | 7/200 [00:40<18:15,  5.68s/it]

 Done at step 437 (reseting env)


  4%|▍         | 8/200 [00:47<18:01,  5.63s/it]

 Done at step 763 (reseting env)


  4%|▍         | 9/200 [00:50<17:38,  5.54s/it]

 Done at step 121 (reseting env)


  5%|▌         | 10/200 [00:55<17:27,  5.51s/it]

 Saved trajectory 10 (of 200)

 Mean reward: 0.03, ep_rewards.shape == (1000,)


  5%|▌         | 10/200 [00:56<17:27,  5.51s/it]

 Done at step 279 (reseting env)


  5%|▌         | 10/200 [00:57<17:27,  5.51s/it]

 Done at step 670 (reseting env)


  5%|▌         | 10/200 [00:58<17:27,  5.51s/it]

 Done at step 833 (reseting env)


  6%|▌         | 11/200 [01:01<17:33,  5.57s/it]

 Done at step 61 (reseting env)


  6%|▌         | 11/200 [01:02<17:33,  5.57s/it]

 Done at step 173 (reseting env)


  6%|▌         | 11/200 [01:02<17:33,  5.57s/it]

 Done at step 453 (reseting env)


  6%|▌         | 11/200 [01:04<17:33,  5.57s/it]

 Done at step 950 (reseting env)


  8%|▊         | 15/200 [01:23<16:55,  5.49s/it]

 Done at step 124 (reseting env)


  8%|▊         | 15/200 [01:24<16:55,  5.49s/it]

 Done at step 317 (reseting env)


  8%|▊         | 16/200 [01:31<16:55,  5.52s/it]

 Done at step 770 (reseting env)


  8%|▊         | 16/200 [01:31<16:55,  5.52s/it]

 Done at step 873 (reseting env)


  8%|▊         | 17/200 [01:35<17:09,  5.62s/it]

 Done at step 150 (reseting env)


  8%|▊         | 17/200 [01:36<17:09,  5.62s/it]

 Done at step 465 (reseting env)


  9%|▉         | 18/200 [01:40<17:19,  5.71s/it]

 Done at step 56 (reseting env)


  9%|▉         | 18/200 [01:41<17:19,  5.71s/it]

 Done at step 230 (reseting env)


  9%|▉         | 18/200 [01:42<17:19,  5.71s/it]

 Done at step 389 (reseting env)
 Done at step 409 (reseting env)


  9%|▉         | 18/200 [01:42<17:19,  5.71s/it]

 Done at step 526 (reseting env)


  9%|▉         | 18/200 [01:42<17:19,  5.71s/it]

 Done at step 625 (reseting env)


  9%|▉         | 18/200 [01:43<17:19,  5.71s/it]

 Done at step 797 (reseting env)
 Done at step 845 (reseting env)


 10%|█         | 20/200 [01:52<17:13,  5.74s/it]

 Saved trajectory 20 (of 200)

 Mean reward: 0.06, ep_rewards.shape == (1000,)


 10%|█         | 20/200 [01:53<17:13,  5.74s/it]

 Done at step 452 (reseting env)


 12%|█▏        | 23/200 [02:08<16:11,  5.49s/it]

 Done at step 3 (reseting env)


 12%|█▏        | 23/200 [02:10<16:11,  5.49s/it]

 Done at step 708 (reseting env)


 12%|█▏        | 24/200 [02:15<16:11,  5.52s/it]

 Done at step 542 (reseting env)


 12%|█▎        | 25/200 [02:19<16:09,  5.54s/it]

 Done at step 8 (reseting env)


 12%|█▎        | 25/200 [02:19<16:09,  5.54s/it]

 Done at step 82 (reseting env)


 12%|█▎        | 25/200 [02:21<16:09,  5.54s/it]

 Done at step 556 (reseting env)


 12%|█▎        | 25/200 [02:21<16:09,  5.54s/it]

 Done at step 620 (reseting env)


 12%|█▎        | 25/200 [02:22<16:09,  5.54s/it]

 Done at step 772 (reseting env)


 12%|█▎        | 25/200 [02:22<16:09,  5.54s/it]

 Done at step 950 (reseting env)


 13%|█▎        | 26/200 [02:25<16:11,  5.59s/it]

 Done at step 117 (reseting env)


 13%|█▎        | 26/200 [02:27<16:11,  5.59s/it]

 Done at step 810 (reseting env)


 14%|█▎        | 27/200 [02:31<16:02,  5.56s/it]

 Done at step 238 (reseting env)


 14%|█▎        | 27/200 [02:32<16:02,  5.56s/it]

 Done at step 401 (reseting env)


 14%|█▎        | 27/200 [02:32<16:02,  5.56s/it]

 Done at step 594 (reseting env)


 14%|█▍        | 28/200 [02:36<16:02,  5.60s/it]

 Done at step 6 (reseting env)


 14%|█▍        | 28/200 [02:37<16:02,  5.60s/it]

 Done at step 422 (reseting env)


 14%|█▍        | 29/200 [02:42<15:51,  5.56s/it]

 Done at step 80 (reseting env)


 14%|█▍        | 29/200 [02:43<15:51,  5.56s/it]

 Done at step 399 (reseting env)


 15%|█▌        | 30/200 [02:47<15:42,  5.55s/it]

 Saved trajectory 30 (of 200)

 Mean reward: 0.04, ep_rewards.shape == (1000,)


 15%|█▌        | 30/200 [02:49<15:42,  5.55s/it]

 Done at step 624 (reseting env)


 15%|█▌        | 30/200 [02:49<15:42,  5.55s/it]

 Done at step 730 (reseting env)


 16%|█▌        | 31/200 [02:53<15:32,  5.52s/it]

 Done at step 242 (reseting env)


 16%|█▌        | 31/200 [02:53<15:32,  5.52s/it]

 Done at step 340 (reseting env)


 16%|█▌        | 32/200 [03:00<15:25,  5.51s/it]

 Done at step 566 (reseting env)


 16%|█▌        | 32/200 [03:01<15:25,  5.51s/it]

 Done at step 860 (reseting env)


 16%|█▌        | 32/200 [03:01<15:25,  5.51s/it]

 Done at step 997 (reseting env)


 16%|█▋        | 33/200 [03:04<15:21,  5.52s/it]

 Done at step 14 (reseting env)


 16%|█▋        | 33/200 [03:05<15:21,  5.52s/it]

 Done at step 326 (reseting env)


 16%|█▋        | 33/200 [03:05<15:21,  5.52s/it]

 Done at step 461 (reseting env)


 17%|█▋        | 34/200 [03:11<15:21,  5.55s/it]

 Done at step 654 (reseting env)


 18%|█▊        | 35/200 [03:15<15:14,  5.54s/it]

 Done at step 190 (reseting env)


 18%|█▊        | 35/200 [03:17<15:14,  5.54s/it]

 Done at step 632 (reseting env)
 Done at step 678 (reseting env)


 18%|█▊        | 36/200 [03:21<15:06,  5.53s/it]

 Done at step 256 (reseting env)


 18%|█▊        | 37/200 [03:26<15:15,  5.62s/it]

 Done at step 6 (reseting env)


 18%|█▊        | 37/200 [03:27<15:15,  5.62s/it]

 Done at step 179 (reseting env)


 18%|█▊        | 37/200 [03:27<15:15,  5.62s/it]

 Done at step 277 (reseting env)


 18%|█▊        | 37/200 [03:28<15:15,  5.62s/it]

 Done at step 549 (reseting env)


                                                

 Done at step 98 (reseting env)


 20%|█▉        | 39/200 [03:38<15:09,  5.65s/it]

 Done at step 148 (reseting env)


 20%|█▉        | 39/200 [03:38<15:09,  5.65s/it]

 Done at step 245 (reseting env)
 Done at step 263 (reseting env)


 20%|█▉        | 39/200 [03:40<15:09,  5.65s/it]

 Done at step 569 (reseting env)


 20%|██        | 40/200 [03:49<19:49,  7.44s/it]

 Saved trajectory 40 (of 200)

 Mean reward: 0.03, ep_rewards.shape == (1000,)


 20%|██        | 41/200 [03:55<18:04,  6.82s/it]

 Done at step 107 (reseting env)


 20%|██        | 41/200 [03:55<18:04,  6.82s/it]

 Done at step 215 (reseting env)


 20%|██        | 41/200 [03:56<18:04,  6.82s/it]

 Done at step 496 (reseting env)


 21%|██        | 42/200 [04:00<17:05,  6.49s/it]

 Done at step 103 (reseting env)


 21%|██        | 42/200 [04:01<17:05,  6.49s/it]

 Done at step 206 (reseting env)


 21%|██        | 42/200 [04:01<17:05,  6.49s/it]

 Done at step 427 (reseting env)


 22%|██▏       | 43/200 [04:08<16:29,  6.30s/it]

 Done at step 617 (reseting env)


 22%|██▏       | 43/200 [04:09<16:29,  6.30s/it]

 Done at step 926 (reseting env)


 22%|██▏       | 43/200 [04:09<16:29,  6.30s/it]

 Done at step 1000 (reseting env)


 22%|██▏       | 44/200 [04:12<15:56,  6.13s/it]

 Done at step 17 (reseting env)


 22%|██▎       | 45/200 [04:17<15:24,  5.97s/it]

 Done at step 4 (reseting env)


 22%|██▎       | 45/200 [04:18<15:24,  5.97s/it]

 Done at step 140 (reseting env)


 22%|██▎       | 45/200 [04:19<15:24,  5.97s/it]

 Done at step 740 (reseting env)


 22%|██▎       | 45/200 [04:20<15:24,  5.97s/it]

 Done at step 905 (reseting env)


 23%|██▎       | 46/200 [04:24<15:09,  5.91s/it]

 Done at step 304 (reseting env)


 24%|██▎       | 47/200 [04:31<14:45,  5.79s/it]

 Done at step 771 (reseting env)


 24%|██▎       | 47/200 [04:31<14:45,  5.79s/it]

 Done at step 983 (reseting env)


 24%|██▍       | 48/200 [04:35<14:37,  5.77s/it]

 Done at step 410 (reseting env)


 24%|██▍       | 48/200 [04:36<14:37,  5.77s/it]

 Done at step 481 (reseting env)


 24%|██▍       | 48/200 [04:36<14:37,  5.77s/it]

 Done at step 647 (reseting env)


 24%|██▍       | 48/200 [04:37<14:37,  5.77s/it]

 Done at step 714 (reseting env)


 24%|██▍       | 49/200 [04:42<14:30,  5.77s/it]

 Done at step 668 (reseting env)


 25%|██▌       | 50/200 [04:46<14:21,  5.75s/it]

 Saved trajectory 50 (of 200)

 Mean reward: 0.04, ep_rewards.shape == (1000,)


 25%|██▌       | 50/200 [04:47<14:21,  5.75s/it]

 Done at step 344 (reseting env)


 25%|██▌       | 50/200 [04:49<14:21,  5.75s/it]

 Done at step 954 (reseting env)


 26%|██▌       | 51/200 [04:52<14:09,  5.70s/it]

 Done at step 343 (reseting env)


 26%|██▌       | 51/200 [04:53<14:09,  5.70s/it]

 Done at step 567 (reseting env)


 26%|██▌       | 52/200 [05:00<13:53,  5.63s/it]

 Done at step 959 (reseting env)


  0%|          | 0/200 [45:47<?, ?it/s]

 Done at step 5 (reseting env)
 Done at step 55 (reseting env)


 26%|██▋       | 53/200 [05:04<13:46,  5.62s/it]

 Done at step 375 (reseting env)


 26%|██▋       | 53/200 [05:04<13:46,  5.62s/it]

 Done at step 509 (reseting env)


 26%|██▋       | 53/200 [05:05<13:46,  5.62s/it]

 Done at step 837 (reseting env)


 26%|██▋       | 53/200 [05:05<13:46,  5.62s/it]

 Done at step 954 (reseting env)


 27%|██▋       | 54/200 [05:09<13:58,  5.74s/it]

 Done at step 103 (reseting env)


 27%|██▋       | 54/200 [05:10<13:58,  5.74s/it]

 Done at step 512 (reseting env)


 28%|██▊       | 55/200 [05:15<13:47,  5.71s/it]

 Done at step 225 (reseting env)


 28%|██▊       | 55/200 [05:16<13:47,  5.71s/it]

 Done at step 572 (reseting env)


 28%|██▊       | 55/200 [05:17<13:47,  5.71s/it]

 Done at step 935 (reseting env)


 28%|██▊       | 56/200 [05:21<13:36,  5.67s/it]

 Done at step 476 (reseting env)


 28%|██▊       | 56/200 [05:21<13:36,  5.67s/it]

 Done at step 597 (reseting env)


 28%|██▊       | 56/200 [05:22<13:36,  5.67s/it]

 Done at step 781 (reseting env)


 28%|██▊       | 57/200 [05:25<13:26,  5.64s/it]

 Done at step 7 (reseting env)


 29%|██▉       | 58/200 [05:31<13:17,  5.62s/it]

 Done at step 48 (reseting env)


 29%|██▉       | 58/200 [05:32<13:17,  5.62s/it]

 Done at step 507 (reseting env)


 29%|██▉       | 58/200 [05:32<13:17,  5.62s/it]

 Done at step 570 (reseting env)


 29%|██▉       | 58/200 [05:33<13:17,  5.62s/it]

 Done at step 636 (reseting env)


 30%|███       | 60/200 [05:42<13:00,  5.58s/it]

 Saved trajectory 60 (of 200)

 Mean reward: 0.06, ep_rewards.shape == (1000,)


 30%|███       | 60/200 [05:43<13:00,  5.58s/it]

 Done at step 537 (reseting env)


 30%|███       | 60/200 [05:44<13:00,  5.58s/it]

 Done at step 811 (reseting env)
 Done at step 860 (reseting env)


 30%|███       | 61/200 [05:48<12:56,  5.59s/it]

 Done at step 84 (reseting env)


 31%|███       | 62/200 [05:53<12:45,  5.55s/it]

 Done at step 47 (reseting env)


 31%|███       | 62/200 [05:54<12:45,  5.55s/it]

 Done at step 196 (reseting env)


 31%|███       | 62/200 [05:54<12:45,  5.55s/it]

 Done at step 418 (reseting env)


 31%|███       | 62/200 [05:56<12:45,  5.55s/it]

 Done at step 832 (reseting env)


 32%|███▏      | 64/200 [06:05<12:28,  5.50s/it]

 Done at step 229 (reseting env)


 32%|███▏      | 64/200 [06:05<12:28,  5.50s/it]

 Done at step 326 (reseting env)


 32%|███▎      | 65/200 [06:11<12:30,  5.56s/it]

 Done at step 602 (reseting env)


 33%|███▎      | 66/200 [06:16<12:29,  5.60s/it]

 Done at step 141 (reseting env)


 34%|███▎      | 67/200 [06:21<12:18,  5.55s/it]

 Done at step 3 (reseting env)


 34%|███▎      | 67/200 [06:21<12:18,  5.55s/it]

 Done at step 146 (reseting env)


 34%|███▍      | 68/200 [06:27<12:11,  5.54s/it]

 Done at step 312 (reseting env)


 34%|███▍      | 68/200 [06:27<12:11,  5.54s/it]

 Done at step 387 (reseting env)


 34%|███▍      | 68/200 [06:28<12:11,  5.54s/it]

 Done at step 717 (reseting env)


 34%|███▍      | 68/200 [06:29<12:11,  5.54s/it]

 Done at step 893 (reseting env)


 34%|███▍      | 69/200 [06:32<12:12,  5.59s/it]

 Done at step 24 (reseting env)


 35%|███▌      | 70/200 [06:38<12:09,  5.61s/it]

 Saved trajectory 70 (of 200)

 Mean reward: 0.03, ep_rewards.shape == (1000,)
 Done at step 8 (reseting env)


 35%|███▌      | 70/200 [06:39<12:09,  5.61s/it]

 Done at step 392 (reseting env)


 36%|███▌      | 72/200 [06:49<12:13,  5.73s/it]

 Done at step 4 (reseting env)


 36%|███▌      | 72/200 [06:50<12:13,  5.73s/it]

 Done at step 157 (reseting env)


 36%|███▌      | 72/200 [06:51<12:13,  5.73s/it]

 Done at step 514 (reseting env)


 36%|███▋      | 73/200 [06:56<12:04,  5.70s/it]

 Done at step 375 (reseting env)


 36%|███▋      | 73/200 [06:57<12:04,  5.70s/it]

 Done at step 755 (reseting env)


 37%|███▋      | 74/200 [07:03<11:55,  5.68s/it]

 Done at step 994 (reseting env)


 38%|███▊      | 75/200 [07:06<11:38,  5.59s/it]

 Done at step 11 (reseting env)


 38%|███▊      | 75/200 [07:07<11:38,  5.59s/it]

 Done at step 247 (reseting env)


 38%|███▊      | 76/200 [07:13<11:30,  5.57s/it]

 Done at step 664 (reseting env)


 38%|███▊      | 76/200 [07:14<11:30,  5.57s/it]

 Done at step 806 (reseting env)


 38%|███▊      | 77/200 [07:17<11:32,  5.63s/it]

 Done at step 14 (reseting env)


 39%|███▉      | 78/200 [07:25<11:24,  5.61s/it]

 Done at step 892 (reseting env)


 40%|███▉      | 79/200 [07:28<11:11,  5.55s/it]

 Done at step 8 (reseting env)


 40%|███▉      | 79/200 [07:29<11:11,  5.55s/it]

 Done at step 324 (reseting env)


 40%|███▉      | 79/200 [07:30<11:11,  5.55s/it]

 Done at step 487 (reseting env)


 40%|███▉      | 79/200 [07:31<11:11,  5.55s/it]

 Done at step 883 (reseting env)
 Done at step 924 (reseting env)


 40%|████      | 80/200 [07:34<11:23,  5.70s/it]

 Saved trajectory 80 (of 200)

 Mean reward: 0.03, ep_rewards.shape == (1000,)
 Done at step 8 (reseting env)


 40%|████      | 81/200 [07:40<11:10,  5.63s/it]

 Done at step 5 (reseting env)


 40%|████      | 81/200 [07:40<11:10,  5.63s/it]

 Done at step 91 (reseting env)


 40%|████      | 81/200 [07:42<11:10,  5.63s/it]

 Done at step 807 (reseting env)


 40%|████      | 81/200 [07:42<11:10,  5.63s/it]

 Done at step 881 (reseting env)


 40%|████      | 81/200 [07:43<11:10,  5.63s/it]

 Done at step 957 (reseting env)


 42%|████▏     | 83/200 [07:51<10:57,  5.62s/it]

 Done at step 6 (reseting env)


 42%|████▏     | 84/200 [07:56<10:48,  5.59s/it]

 Done at step 10 (reseting env)


 42%|████▏     | 84/200 [07:57<10:48,  5.59s/it]

 Done at step 204 (reseting env)


 42%|████▎     | 85/200 [08:03<10:44,  5.60s/it]

 Done at step 152 (reseting env)


 42%|████▎     | 85/200 [08:03<10:44,  5.60s/it]

 Done at step 273 (reseting env)


                                       

 Done at step 690 (reseting env)
 Done at step 708 (reseting env)


 43%|████▎     | 86/200 [08:10<10:48,  5.69s/it]

 Done at step 728 (reseting env)


 44%|████▎     | 87/200 [08:15<10:46,  5.72s/it]

 Done at step 409 (reseting env)
 Done at step 429 (reseting env)


 44%|████▍     | 88/200 [08:19<10:37,  5.69s/it]

 Done at step 5 (reseting env)


 44%|████▍     | 88/200 [08:22<10:37,  5.69s/it]

 Done at step 845 (reseting env)


 44%|████▍     | 89/200 [08:26<10:28,  5.67s/it]

 Done at step 434 (reseting env)


 44%|████▍     | 89/200 [08:27<10:28,  5.67s/it]

 Done at step 698 (reseting env)


 45%|████▌     | 90/200 [08:31<10:19,  5.63s/it]

 Saved trajectory 90 (of 200)

 Mean reward: 0.05, ep_rewards.shape == (1000,)
 Done at step 22 (reseting env)


 45%|████▌     | 90/200 [08:31<10:19,  5.63s/it]

 Done at step 193 (reseting env)


 45%|████▌     | 90/200 [08:33<10:19,  5.63s/it]

 Done at step 903 (reseting env)


 45%|████▌     | 90/200 [08:34<10:19,  5.63s/it]

 Done at step 989 (reseting env)


 46%|████▌     | 91/200 [08:37<10:23,  5.72s/it]

 Done at step 181 (reseting env)


 46%|████▌     | 91/200 [08:39<10:23,  5.72s/it]

 Done at step 680 (reseting env)


 46%|████▋     | 93/200 [08:50<10:11,  5.72s/it]

 Done at step 640 (reseting env)


 46%|████▋     | 93/200 [08:50<10:11,  5.72s/it]

 Done at step 701 (reseting env)


 46%|████▋     | 93/200 [08:51<10:11,  5.72s/it]

 Done at step 970 (reseting env)


 47%|████▋     | 94/200 [08:53<09:56,  5.62s/it]

 Done at step 35 (reseting env)


 47%|████▋     | 94/200 [08:55<09:56,  5.62s/it]

 Done at step 472 (reseting env)


 48%|████▊     | 95/200 [08:59<09:53,  5.65s/it]

 Done at step 6 (reseting env)


 48%|████▊     | 95/200 [09:00<09:53,  5.65s/it]

 Done at step 253 (reseting env)


 48%|████▊     | 96/200 [09:05<09:44,  5.62s/it]

 Done at step 2 (reseting env)


 48%|████▊     | 97/200 [09:10<09:34,  5.57s/it]

 Done at step 2 (reseting env)


 48%|████▊     | 97/200 [09:12<09:34,  5.57s/it]

 Done at step 570 (reseting env)


 48%|████▊     | 97/200 [09:13<09:34,  5.57s/it]

 Done at step 863 (reseting env)


 49%|████▉     | 98/200 [09:16<09:27,  5.56s/it]

 Done at step 6 (reseting env)


 49%|████▉     | 98/200 [09:17<09:27,  5.56s/it]

 Done at step 608 (reseting env)


 49%|████▉     | 98/200 [09:18<09:27,  5.56s/it]

 Done at step 750 (reseting env)


 50%|████▉     | 99/200 [09:21<09:24,  5.59s/it]

 Done at step 3 (reseting env)


 50%|█████     | 100/200 [09:27<09:23,  5.64s/it]

 Saved trajectory 100 (of 200)

 Mean reward: 0.04, ep_rewards.shape == (1000,)
 Done at step 1 (reseting env)


 50%|█████     | 100/200 [09:27<09:23,  5.64s/it]

 Done at step 126 (reseting env)


 50%|█████     | 100/200 [09:29<09:23,  5.64s/it]

 Done at step 661 (reseting env)


 50%|█████     | 101/200 [09:33<09:16,  5.62s/it]

 Done at step 6 (reseting env)


 51%|█████     | 102/200 [09:38<09:11,  5.63s/it]

 Done at step 1 (reseting env)


 51%|█████     | 102/200 [09:39<09:11,  5.63s/it]

 Done at step 243 (reseting env)


 51%|█████     | 102/200 [09:40<09:11,  5.63s/it]

 Done at step 426 (reseting env)


 52%|█████▏    | 103/200 [09:46<09:08,  5.66s/it]

 Done at step 605 (reseting env)


 52%|█████▏    | 104/200 [09:49<08:59,  5.62s/it]

 Done at step 7 (reseting env)


 52%|█████▏    | 104/200 [09:51<08:59,  5.62s/it]

 Done at step 509 (reseting env)


 52%|█████▏    | 104/200 [09:52<08:59,  5.62s/it]

 Done at step 943 (reseting env)


 52%|█████▎    | 105/200 [09:57<08:56,  5.65s/it]

 Done at step 765 (reseting env)


 53%|█████▎    | 106/200 [10:01<08:48,  5.62s/it]

 Done at step 182 (reseting env)


 54%|█████▎    | 107/200 [10:06<08:38,  5.57s/it]

 Done at step 3 (reseting env)


 54%|█████▎    | 107/200 [10:07<08:38,  5.57s/it]

 Done at step 315 (reseting env)


 54%|█████▎    | 107/200 [10:08<08:38,  5.57s/it]

 Done at step 619 (reseting env)


 54%|█████▎    | 107/200 [10:09<08:38,  5.57s/it]

 Done at step 782 (reseting env)


 54%|█████▎    | 107/200 [10:09<08:38,  5.57s/it]

 Done at step 931 (reseting env)


 54%|█████▍    | 108/200 [10:12<08:39,  5.64s/it]

 Done at step 9 (reseting env)


 54%|█████▍    | 108/200 [10:14<08:39,  5.64s/it]

 Done at step 743 (reseting env)


 54%|█████▍    | 108/200 [10:15<08:39,  5.64s/it]

 Done at step 939 (reseting env)


 55%|█████▍    | 109/200 [10:18<08:33,  5.64s/it]

 Done at step 111 (reseting env)


 55%|█████▌    | 110/200 [10:23<08:29,  5.66s/it]

 Saved trajectory 110 (of 200)

 Mean reward: 0.05, ep_rewards.shape == (1000,)


 55%|█████▌    | 110/200 [10:24<08:29,  5.66s/it]

 Done at step 284 (reseting env)


 55%|█████▌    | 110/200 [10:25<08:29,  5.66s/it]

 Done at step 422 (reseting env)


 55%|█████▌    | 110/200 [10:25<08:29,  5.66s/it]

 Done at step 601 (reseting env)


 56%|█████▌    | 111/200 [10:31<08:29,  5.72s/it]

 Done at step 545 (reseting env)


 56%|█████▌    | 111/200 [10:32<08:29,  5.72s/it]

 Done at step 906 (reseting env)


 56%|█████▋    | 113/200 [10:40<08:06,  5.59s/it]

 Done at step 111 (reseting env)


 56%|█████▋    | 113/200 [10:41<08:06,  5.59s/it]

 Done at step 196 (reseting env)


 56%|█████▋    | 113/200 [10:42<08:06,  5.59s/it]

 Done at step 451 (reseting env)


 56%|█████▋    | 113/200 [10:42<08:06,  5.59s/it]

 Done at step 624 (reseting env)


 57%|█████▋    | 114/200 [10:47<08:05,  5.64s/it]

 Done at step 464 (reseting env)


 57%|█████▊    | 115/200 [10:52<07:57,  5.62s/it]

 Done at step 20 (reseting env)


 57%|█████▊    | 115/200 [10:52<07:57,  5.62s/it]

 Done at step 126 (reseting env)


 57%|█████▊    | 115/200 [10:53<07:57,  5.62s/it]

 Done at step 397 (reseting env)


 57%|█████▊    | 115/200 [10:54<07:57,  5.62s/it]

 Done at step 810 (reseting env)


 58%|█████▊    | 117/200 [11:03<07:48,  5.64s/it]

 Done at step 3 (reseting env)
 Done at step 54 (reseting env)


 58%|█████▊    | 117/200 [11:03<07:48,  5.64s/it]

 Done at step 184 (reseting env)


 59%|█████▉    | 118/200 [11:09<07:42,  5.64s/it]

 Done at step 9 (reseting env)


 60%|█████▉    | 119/200 [11:16<07:34,  5.61s/it]

 Done at step 702 (reseting env)


 60%|██████    | 120/200 [11:20<07:30,  5.63s/it]

 Saved trajectory 120 (of 200)

 Mean reward: 0.06, ep_rewards.shape == (1000,)
 Done at step 8 (reseting env)


 60%|██████    | 120/200 [11:20<07:30,  5.63s/it]

 Done at step 76 (reseting env)


 60%|██████    | 120/200 [11:21<07:30,  5.63s/it]

 Done at step 358 (reseting env)


 60%|██████    | 120/200 [11:21<07:30,  5.63s/it]

 Done at step 479 (reseting env)


  0%|          | 0/200 [50:01<?, ?it/s]

 Done at step 651 (reseting env)
 Done at step 671 (reseting env)
 Done at step 691 (reseting env)


 60%|██████    | 121/200 [11:28<07:22,  5.60s/it]

 Done at step 944 (reseting env)


 61%|██████    | 122/200 [11:31<07:19,  5.64s/it]

 Done at step 27 (reseting env)


 62%|██████▏   | 123/200 [11:37<07:15,  5.66s/it]

 Done at step 9 (reseting env)


 62%|██████▏   | 123/200 [11:38<07:15,  5.66s/it]

 Done at step 341 (reseting env)


 62%|██████▏   | 123/200 [11:38<07:15,  5.66s/it]

 Done at step 510 (reseting env)


 62%|██████▏   | 124/200 [11:42<07:10,  5.66s/it]

 Done at step 3 (reseting env)


 62%|██████▏   | 124/200 [11:43<07:10,  5.66s/it]

 Done at step 128 (reseting env)


 62%|██████▏   | 124/200 [11:43<07:10,  5.66s/it]

 Done at step 324 (reseting env)


 62%|██████▏   | 124/200 [11:44<07:10,  5.66s/it]

 Done at step 616 (reseting env)


 62%|██████▎   | 125/200 [11:49<07:06,  5.69s/it]

 Done at step 313 (reseting env)


 62%|██████▎   | 125/200 [11:50<07:06,  5.69s/it]

 Done at step 613 (reseting env)


 62%|██████▎   | 125/200 [11:50<07:06,  5.69s/it]

 Done at step 714 (reseting env)
 Done at step 734 (reseting env)


 63%|██████▎   | 126/200 [11:54<06:59,  5.67s/it]

 Done at step 213 (reseting env)


 63%|██████▎   | 126/200 [11:55<06:59,  5.67s/it]

 Done at step 379 (reseting env)


 64%|██████▎   | 127/200 [12:01<07:01,  5.77s/it]

 Done at step 245 (reseting env)


 64%|██████▎   | 127/200 [12:01<07:01,  5.77s/it]

 Done at step 329 (reseting env)


 64%|██████▍   | 128/200 [12:06<06:56,  5.78s/it]

 Done at step 3 (reseting env)


 64%|██████▍   | 129/200 [12:12<06:49,  5.77s/it]

 Done at step 265 (reseting env)


 64%|██████▍   | 129/200 [12:13<06:49,  5.77s/it]

 Done at step 492 (reseting env)


 64%|██████▍   | 129/200 [12:13<06:49,  5.77s/it]

 Done at step 641 (reseting env)


 65%|██████▌   | 130/200 [12:17<06:43,  5.76s/it]

 Saved trajectory 130 (of 200)

 Mean reward: 0.04, ep_rewards.shape == (1000,)
 Done at step 4 (reseting env)


 66%|██████▌   | 131/200 [12:25<06:31,  5.67s/it]

 Done at step 860 (reseting env)


 66%|██████▌   | 131/200 [12:25<06:31,  5.67s/it]

 Done at step 928 (reseting env)


 66%|██████▌   | 132/200 [12:28<06:20,  5.60s/it]

 Done at step 45 (reseting env)


 66%|██████▌   | 132/200 [12:28<06:20,  5.60s/it]

 Done at step 164 (reseting env)


 66%|██████▌   | 132/200 [12:29<06:20,  5.60s/it]

 Done at step 232 (reseting env)


 66%|██████▌   | 132/200 [12:29<06:20,  5.60s/it]

 Done at step 359 (reseting env)


 66%|██████▌   | 132/200 [12:29<06:20,  5.60s/it]

 Done at step 473 (reseting env)


 66%|██████▋   | 133/200 [12:34<06:19,  5.66s/it]

 Done at step 15 (reseting env)


 66%|██████▋   | 133/200 [12:35<06:19,  5.66s/it]

 Done at step 564 (reseting env)


 66%|██████▋   | 133/200 [12:36<06:19,  5.66s/it]

 Done at step 742 (reseting env)


 68%|██████▊   | 135/200 [12:46<06:06,  5.63s/it]

 Done at step 237 (reseting env)


 68%|██████▊   | 135/200 [12:47<06:06,  5.63s/it]

 Done at step 731 (reseting env)


 68%|██████▊   | 135/200 [12:48<06:06,  5.63s/it]

 Done at step 987 (reseting env)


 68%|██████▊   | 136/200 [12:51<06:00,  5.64s/it]

 Done at step 80 (reseting env)


 68%|██████▊   | 136/200 [12:51<06:00,  5.64s/it]

 Done at step 249 (reseting env)


 68%|██████▊   | 137/200 [12:58<05:53,  5.62s/it]

 Done at step 515 (reseting env)


 68%|██████▊   | 137/200 [12:58<05:53,  5.62s/it]

 Done at step 720 (reseting env)


 68%|██████▊   | 137/200 [12:59<05:53,  5.62s/it]

 Done at step 987 (reseting env)


 69%|██████▉   | 138/200 [13:02<05:44,  5.56s/it]

 Done at step 217 (reseting env)


 69%|██████▉   | 138/200 [13:03<05:44,  5.56s/it]

 Done at step 383 (reseting env)


 70%|██████▉   | 139/200 [13:07<05:37,  5.53s/it]

 Done at step 3 (reseting env)


 70%|██████▉   | 139/200 [13:09<05:37,  5.53s/it]

 Done at step 755 (reseting env)


 70%|███████   | 140/200 [13:13<05:34,  5.58s/it]

 Saved trajectory 140 (of 200)

 Mean reward: 0.07, ep_rewards.shape == (1000,)
 Done at step 7 (reseting env)


 70%|███████   | 140/200 [13:14<05:34,  5.58s/it]

 Done at step 327 (reseting env)


 70%|███████   | 140/200 [13:14<05:34,  5.58s/it]

 Done at step 498 (reseting env)


 70%|███████   | 140/200 [13:15<05:34,  5.58s/it]

 Done at step 772 (reseting env)


 71%|███████   | 142/200 [13:25<05:25,  5.62s/it]

 Done at step 317 (reseting env)


 71%|███████   | 142/200 [13:26<05:25,  5.62s/it]

 Done at step 496 (reseting env)


 71%|███████   | 142/200 [13:26<05:25,  5.62s/it]

 Done at step 600 (reseting env)
 Done at step 620 (reseting env)


 72%|███████▏  | 143/200 [13:30<05:21,  5.63s/it]

 Done at step 101 (reseting env)


 72%|███████▏  | 144/200 [13:37<05:17,  5.67s/it]

 Done at step 483 (reseting env)


 72%|███████▏  | 144/200 [13:37<05:17,  5.67s/it]

 Done at step 593 (reseting env)


 72%|███████▏  | 144/200 [13:38<05:17,  5.67s/it]

 Done at step 691 (reseting env)


 72%|███████▏  | 144/200 [13:38<05:17,  5.67s/it]

 Done at step 928 (reseting env)


 72%|███████▎  | 145/200 [13:42<05:15,  5.73s/it]

 Done at step 247 (reseting env)


 74%|███████▎  | 147/200 [13:53<04:57,  5.62s/it]

 Done at step 285 (reseting env)


 74%|███████▎  | 147/200 [13:54<04:57,  5.62s/it]

 Done at step 451 (reseting env)
 Done at step 471 (reseting env)


 74%|███████▎  | 147/200 [13:54<04:57,  5.62s/it]

 Done at step 547 (reseting env)


 74%|███████▎  | 147/200 [13:55<04:57,  5.62s/it]

 Done at step 697 (reseting env)


 74%|███████▎  | 147/200 [13:55<04:57,  5.62s/it]

 Done at step 905 (reseting env)


 74%|███████▍  | 148/200 [13:59<04:55,  5.69s/it]

 Done at step 93 (reseting env)


 74%|███████▍  | 148/200 [14:00<04:55,  5.69s/it]

 Done at step 494 (reseting env)


 74%|███████▍  | 148/200 [14:01<04:55,  5.69s/it]

 Done at step 802 (reseting env)


 74%|███████▍  | 149/200 [14:04<04:48,  5.66s/it]

 Done at step 73 (reseting env)


 74%|███████▍  | 149/200 [14:07<04:48,  5.66s/it]

 Done at step 980 (reseting env)


 75%|███████▌  | 150/200 [14:10<04:43,  5.68s/it]

 Saved trajectory 150 (of 200)

 Mean reward: 0.04, ep_rewards.shape == (1000,)


 75%|███████▌  | 150/200 [14:10<04:43,  5.68s/it]

 Done at step 217 (reseting env)


 75%|███████▌  | 150/200 [14:11<04:43,  5.68s/it]

 Done at step 525 (reseting env)


 75%|███████▌  | 150/200 [14:12<04:43,  5.68s/it]

 Done at step 688 (reseting env)


 76%|███████▌  | 152/200 [14:21<04:30,  5.63s/it]

 Done at step 75 (reseting env)


 76%|███████▌  | 152/200 [14:22<04:30,  5.63s/it]

 Done at step 332 (reseting env)


 76%|███████▌  | 152/200 [14:22<04:30,  5.63s/it]

 Done at step 512 (reseting env)


 76%|███████▌  | 152/200 [14:23<04:30,  5.63s/it]

 Done at step 597 (reseting env)


 76%|███████▌  | 152/200 [14:24<04:30,  5.63s/it]

 Done at step 868 (reseting env)


 76%|███████▋  | 153/200 [14:27<04:23,  5.60s/it]

 Done at step 61 (reseting env)


 77%|███████▋  | 154/200 [14:32<04:17,  5.60s/it]

 Done at step 106 (reseting env)


 77%|███████▋  | 154/200 [14:33<04:17,  5.60s/it]

 Done at step 490 (reseting env)


 77%|███████▋  | 154/200 [14:34<04:17,  5.60s/it]

 Done at step 660 (reseting env)


 77%|███████▋  | 154/200 [14:35<04:17,  5.60s/it]

 Done at step 895 (reseting env)


 78%|███████▊  | 155/200 [14:39<04:12,  5.61s/it]

 Done at step 487 (reseting env)


 78%|███████▊  | 156/200 [14:44<04:05,  5.58s/it]

 Done at step 155 (reseting env)


 78%|███████▊  | 156/200 [14:44<04:05,  5.58s/it]

 Done at step 231 (reseting env)
 Done at step 251 (reseting env)


 78%|███████▊  | 156/200 [14:45<04:05,  5.58s/it]

 Done at step 496 (reseting env)


 78%|███████▊  | 156/200 [14:45<04:05,  5.58s/it]

 Done at step 717 (reseting env)


 78%|███████▊  | 157/200 [14:51<04:03,  5.67s/it]

 Done at step 729 (reseting env)


 79%|███████▉  | 158/200 [14:55<03:58,  5.68s/it]

 Done at step 229 (reseting env)


 79%|███████▉  | 158/200 [14:56<03:58,  5.68s/it]

 Done at step 405 (reseting env)


 79%|███████▉  | 158/200 [14:57<03:58,  5.68s/it]

 Done at step 613 (reseting env)


 80%|███████▉  | 159/200 [15:01<03:53,  5.68s/it]

 Done at step 152 (reseting env)


 80%|███████▉  | 159/200 [15:02<03:53,  5.68s/it]

 Done at step 514 (reseting env)


 80%|████████  | 160/200 [15:06<03:47,  5.68s/it]

 Saved trajectory 160 (of 200)

 Mean reward: 0.06, ep_rewards.shape == (1000,)
 Done at step 5 (reseting env)


 80%|████████  | 160/200 [15:07<03:47,  5.68s/it]

 Done at step 481 (reseting env)


 80%|████████  | 160/200 [15:08<03:47,  5.68s/it]

 Done at step 676 (reseting env)


 81%|████████  | 162/200 [15:18<03:32,  5.60s/it]

 Done at step 333 (reseting env)


 81%|████████  | 162/200 [15:19<03:32,  5.60s/it]

 Done at step 432 (reseting env)


 81%|████████  | 162/200 [15:20<03:32,  5.60s/it]

 Done at step 883 (reseting env)
 Done at step 901 (reseting env)


 82%|████████▏ | 163/200 [15:23<03:28,  5.64s/it]

 Done at step 10 (reseting env)


 82%|████████▏ | 163/200 [15:23<03:28,  5.64s/it]

 Done at step 177 (reseting env)


                                       

 Done at step 587 (reseting env)


 82%|████████▏ | 163/200 [15:25<03:28,  5.64s/it]

 Done at step 635 (reseting env)
 Done at step 685 (reseting env)


 82%|████████▎ | 165/200 [15:34<03:17,  5.64s/it]

 Done at step 5 (reseting env)


 84%|████████▎ | 167/200 [15:45<03:04,  5.58s/it]

 Done at step 6 (reseting env)


 84%|████████▎ | 167/200 [15:46<03:04,  5.58s/it]

 Done at step 368 (reseting env)


 84%|████████▎ | 167/200 [15:47<03:04,  5.58s/it]

 Done at step 466 (reseting env)
 Done at step 486 (reseting env)


 84%|████████▎ | 167/200 [15:48<03:04,  5.58s/it]

 Done at step 961 (reseting env)


 84%|████████▍ | 169/200 [15:57<02:54,  5.63s/it]

 Done at step 6 (reseting env)


 84%|████████▍ | 169/200 [15:57<02:54,  5.63s/it]

 Done at step 123 (reseting env)


 84%|████████▍ | 169/200 [15:58<02:54,  5.63s/it]

 Done at step 337 (reseting env)


 84%|████████▍ | 169/200 [15:58<02:54,  5.63s/it]

 Done at step 449 (reseting env)


 84%|████████▍ | 169/200 [15:59<02:54,  5.63s/it]

 Done at step 695 (reseting env)


 85%|████████▌ | 170/200 [16:02<02:50,  5.69s/it]

 Saved trajectory 170 (of 200)

 Mean reward: 0.04, ep_rewards.shape == (1000,)


 86%|████████▌ | 171/200 [16:08<02:44,  5.66s/it]

 Done at step 6 (reseting env)


 86%|████████▌ | 171/200 [16:09<02:44,  5.66s/it]

 Done at step 352 (reseting env)


 86%|████████▌ | 171/200 [16:11<02:44,  5.66s/it]

 Done at step 943 (reseting env)


 86%|████████▌ | 172/200 [16:14<02:40,  5.74s/it]

 Done at step 25 (reseting env)


 86%|████████▌ | 172/200 [16:15<02:40,  5.74s/it]

 Done at step 207 (reseting env)


 86%|████████▌ | 172/200 [16:16<02:40,  5.74s/it]

 Done at step 526 (reseting env)


 87%|████████▋ | 174/200 [16:27<02:29,  5.76s/it]

 Done at step 422 (reseting env)


 87%|████████▋ | 174/200 [16:27<02:29,  5.76s/it]

 Done at step 492 (reseting env)


 87%|████████▋ | 174/200 [16:28<02:29,  5.76s/it]

 Done at step 862 (reseting env)


 90%|█████████ | 180/200 [16:59<01:50,  5.51s/it]

 Saved trajectory 180 (of 200)

 Mean reward: 0.00, ep_rewards.shape == (1000,)
 Done at step 3 (reseting env)


 90%|█████████ | 181/200 [17:05<01:45,  5.54s/it]

 Done at step 280 (reseting env)


 90%|█████████ | 181/200 [17:06<01:45,  5.54s/it]

 Done at step 623 (reseting env)




 Done at step 2 (reseting env)


 91%|█████████ | 182/200 [17:10<01:42,  5.68s/it]

 Done at step 59 (reseting env)
 Done at step 79 (reseting env)


 92%|█████████▏| 183/200 [17:17<01:37,  5.75s/it]

 Done at step 477 (reseting env)


 92%|█████████▏| 183/200 [17:19<01:37,  5.75s/it]

 Done at step 957 (reseting env)


 92%|█████████▏| 184/200 [17:23<01:31,  5.72s/it]

 Done at step 405 (reseting env)


 92%|█████████▎| 185/200 [17:27<01:24,  5.60s/it]

 Done at step 5 (reseting env)


 93%|█████████▎| 186/200 [17:33<01:18,  5.64s/it]

 Done at step 1 (reseting env)


 93%|█████████▎| 186/200 [17:33<01:18,  5.64s/it]

 Done at step 128 (reseting env)


 94%|█████████▎| 187/200 [17:41<01:13,  5.66s/it]

 Done at step 788 (reseting env)


 94%|█████████▍| 188/200 [17:44<01:07,  5.65s/it]

 Done at step 8 (reseting env)


 94%|█████████▍| 188/200 [17:45<01:07,  5.65s/it]

 Done at step 248 (reseting env)


 94%|█████████▍| 188/200 [17:46<01:07,  5.65s/it]

 Done at step 508 (reseting env)


 94%|█████████▍| 189/200 [17:52<01:01,  5.63s/it]

 Done at step 821 (reseting env)


 95%|█████████▌| 190/200 [17:55<00:56,  5.63s/it]

 Saved trajectory 190 (of 200)

 Mean reward: 0.06, ep_rewards.shape == (1000,)
 Done at step 24 (reseting env)


 95%|█████████▌| 190/200 [17:56<00:56,  5.63s/it]

 Done at step 148 (reseting env)


 95%|█████████▌| 190/200 [17:57<00:56,  5.63s/it]

 Done at step 657 (reseting env)


 95%|█████████▌| 190/200 [17:58<00:56,  5.63s/it]

 Done at step 782 (reseting env)


 95%|█████████▌| 190/200 [17:58<00:56,  5.63s/it]

 Done at step 910 (reseting env)


 96%|█████████▌| 191/200 [18:01<00:50,  5.65s/it]

 Done at step 133 (reseting env)


 96%|█████████▌| 191/200 [18:02<00:50,  5.65s/it]

 Done at step 251 (reseting env)


 96%|█████████▌| 191/200 [18:03<00:50,  5.65s/it]

 Done at step 758 (reseting env)


 96%|█████████▌| 191/200 [18:04<00:50,  5.65s/it]

 Done at step 896 (reseting env)


 96%|█████████▋| 193/200 [18:12<00:39,  5.68s/it]

 Done at step 6 (reseting env)


 96%|█████████▋| 193/200 [18:14<00:39,  5.68s/it]

 Done at step 542 (reseting env)


 96%|█████████▋| 193/200 [18:14<00:39,  5.68s/it]

 Done at step 629 (reseting env)


 97%|█████████▋| 194/200 [18:18<00:33,  5.65s/it]

 Done at step 165 (reseting env)


 97%|█████████▋| 194/200 [18:19<00:33,  5.65s/it]

 Done at step 240 (reseting env)


 98%|█████████▊| 195/200 [18:25<00:28,  5.60s/it]

 Done at step 506 (reseting env)


 98%|█████████▊| 195/200 [18:26<00:28,  5.60s/it]

 Done at step 937 (reseting env)


 98%|█████████▊| 196/200 [18:29<00:22,  5.57s/it]

 Done at step 114 (reseting env)


 98%|█████████▊| 196/200 [18:32<00:22,  5.57s/it]

 Done at step 898 (reseting env)


 98%|█████████▊| 197/200 [18:35<00:16,  5.51s/it]

 Done at step 91 (reseting env)


 98%|█████████▊| 197/200 [18:35<00:16,  5.51s/it]

 Done at step 226 (reseting env)


 99%|█████████▉| 198/200 [18:41<00:11,  5.66s/it]

 Done at step 298 (reseting env)


100%|█████████▉| 199/200 [18:46<00:05,  5.66s/it]

 Done at step 2 (reseting env)


100%|█████████▉| 199/200 [18:47<00:05,  5.66s/it]

 Done at step 262 (reseting env)


100%|██████████| 200/200 [18:52<00:00,  5.69s/it]


 Saved trajectory 200 (of 200)

 Mean reward: 0.01, ep_rewards.shape == (1000,)

 Actions taken: NOOP, FIRE, UP, RIGHT
Δt = 1133.99s

('PPO2', 'PongNoFrameskip-v4')
Getting pre-trained agent from: 'rl-baselines-zoo/trained_agents/ppo2/PongNoFrameskip-v4.pkl'

Images will be recorded to drive/My Drive/unicamp/MC886/atari/Dataset/data/PongNoFrameskip-v4_PPO2_1000s/images/



 21%|██        | 42/200 [55:55<18:37,  7.07s/it]

 Done at step 688 (reseting env)


 21%|██        | 42/200 [56:05<18:37,  7.07s/it]

 Done at step 388 (reseting env)


 21%|██        | 42/200 [56:16<18:37,  7.07s/it]

 Done at step 361 (reseting env)


 21%|██        | 42/200 [56:29<18:37,  7.07s/it]

 Done at step 903 (reseting env)


  5%|▌         | 10/200 [00:55<17:28,  5.52s/it]

 Saved trajectory 10 (of 200)

 Mean reward: 0.01, ep_rewards.shape == (1000,)


 21%|██        | 42/200 [56:44<18:37,  7.07s/it]

 Done at step 573 (reseting env)


 21%|██        | 42/200 [56:59<18:37,  7.07s/it]

 Done at step 81 (reseting env)


 21%|██        | 42/200 [57:07<18:37,  7.07s/it]

 Done at step 945 (reseting env)


 21%|██        | 42/200 [57:22<18:37,  7.07s/it]

 Done at step 430 (reseting env)


 21%|██        | 42/200 [57:32<18:37,  7.07s/it]

 Done at step 150 (reseting env)


 10%|█         | 20/200 [01:49<16:28,  5.49s/it]

 Saved trajectory 20 (of 200)

 Mean reward: 0.01, ep_rewards.shape == (1000,)


 21%|██        | 42/200 [57:40<18:37,  7.07s/it]

 Done at step 851 (reseting env)


 21%|██        | 42/200 [57:55<18:37,  7.07s/it]

 Done at step 462 (reseting env)


 21%|██        | 42/200 [58:10<18:37,  7.07s/it]

 Done at step 73 (reseting env)


 21%|██        | 42/200 [58:23<18:37,  7.07s/it]

 Done at step 674 (reseting env)


 15%|█▌        | 30/200 [02:44<15:46,  5.57s/it]

 Saved trajectory 30 (of 200)

 Mean reward: 0.01, ep_rewards.shape == (1000,)


 21%|██        | 42/200 [58:38<18:37,  7.07s/it]

 Done at step 139 (reseting env)


 21%|██        | 42/200 [58:51<18:37,  7.07s/it]

 Done at step 695 (reseting env)


 21%|██        | 42/200 [59:01<18:37,  7.07s/it]

 Done at step 392 (reseting env)


 21%|██        | 42/200 [59:18<18:37,  7.07s/it]

 Done at step 80 (reseting env)


 21%|██        | 42/200 [59:25<18:37,  7.07s/it]

 Done at step 763 (reseting env)


 20%|██        | 40/200 [03:40<14:54,  5.59s/it]

 Saved trajectory 40 (of 200)

 Mean reward: 0.01, ep_rewards.shape == (1000,)


 21%|██        | 42/200 [59:40<18:37,  7.07s/it]

 Done at step 513 (reseting env)


 21%|██        | 42/200 [59:51<18:37,  7.07s/it]

 Done at step 212 (reseting env)


 21%|██        | 42/200 [1:00:01<18:37,  7.07s/it]

 Done at step 31 (reseting env)


 21%|██        | 42/200 [1:00:14<18:37,  7.07s/it]

 Done at step 569 (reseting env)


 25%|██▌       | 50/200 [04:35<13:36,  5.45s/it]

 Saved trajectory 50 (of 200)

 Mean reward: 0.01, ep_rewards.shape == (1000,)


 21%|██        | 42/200 [1:00:28<18:37,  7.07s/it]

 Done at step 56 (reseting env)


 21%|██        | 42/200 [1:00:36<18:37,  7.07s/it]

 Done at step 741 (reseting env)


 21%|██        | 42/200 [1:00:51<18:37,  7.07s/it]

 Done at step 353 (reseting env)


 21%|██        | 42/200 [1:01:01<18:37,  7.07s/it]

 Done at step 52 (reseting env)


 21%|██        | 42/200 [1:01:09<18:37,  7.07s/it]

 Done at step 753 (reseting env)


 30%|███       | 60/200 [05:30<12:53,  5.53s/it]

 Saved trajectory 60 (of 200)

 Mean reward: 0.01, ep_rewards.shape == (1000,)


 21%|██        | 42/200 [1:01:19<18:37,  7.07s/it]

 Done at step 438 (reseting env)


 21%|██        | 42/200 [1:01:32<18:37,  7.07s/it]

 Done at step 925 (reseting env)


 21%|██        | 42/200 [1:01:47<18:37,  7.07s/it]

 Done at step 533 (reseting env)


 21%|██        | 42/200 [1:01:57<18:37,  7.07s/it]

 Done at step 235 (reseting env)


 35%|███▌      | 70/200 [06:25<12:02,  5.56s/it]

 Saved trajectory 70 (of 200)

 Mean reward: 0.01, ep_rewards.shape == (1000,)


 21%|██        | 42/200 [1:02:14<18:37,  7.07s/it]

 Done at step 299 (reseting env)


 21%|██        | 42/200 [1:02:22<18:37,  7.07s/it]

 Done at step 982 (reseting env)


 21%|██        | 42/200 [1:02:37<18:37,  7.07s/it]

 Done at step 749 (reseting env)


 21%|██        | 42/200 [1:02:53<18:37,  7.07s/it]

 Done at step 349 (reseting env)


 21%|██        | 42/200 [1:03:05<18:37,  7.07s/it]

 Done at step 835 (reseting env)


 40%|████      | 80/200 [07:20<11:08,  5.57s/it]

 Saved trajectory 80 (of 200)

 Mean reward: 0.01, ep_rewards.shape == (1000,)


 21%|██        | 42/200 [1:03:16<18:37,  7.07s/it]

 Done at step 679 (reseting env)


 21%|██        | 42/200 [1:03:31<18:37,  7.07s/it]

 Done at step 258 (reseting env)


 21%|██        | 42/200 [1:03:43<18:37,  7.07s/it]

 Done at step 722 (reseting env)


 21%|██        | 42/200 [1:03:58<18:37,  7.07s/it]

 Done at step 302 (reseting env)


 45%|████▌     | 90/200 [08:15<10:03,  5.49s/it]

 Saved trajectory 90 (of 200)

 Mean reward: 0.01, ep_rewards.shape == (1000,)


 21%|██        | 42/200 [1:04:09<18:37,  7.07s/it]

 Done at step 211 (reseting env)


 21%|██        | 42/200 [1:04:17<18:37,  7.07s/it]

 Done at step 895 (reseting env)


 21%|██        | 42/200 [1:04:27<18:37,  7.07s/it]

 Done at step 661 (reseting env)


 21%|██        | 42/200 [1:04:43<18:37,  7.07s/it]

 Done at step 208 (reseting env)


 21%|██        | 42/200 [1:04:50<18:37,  7.07s/it]

 Done at step 892 (reseting env)


 50%|█████     | 100/200 [09:11<09:16,  5.57s/it]

 Saved trajectory 100 (of 200)

 Mean reward: 0.01, ep_rewards.shape == (1000,)


 21%|██        | 42/200 [1:05:01<18:37,  7.07s/it]

 Done at step 629 (reseting env)


 21%|██        | 42/200 [1:05:11<18:37,  7.07s/it]

 Done at step 308 (reseting env)


 21%|██        | 42/200 [1:05:21<18:37,  7.07s/it]

 Done at step 7 (reseting env)


 21%|██        | 42/200 [1:05:34<18:37,  7.07s/it]

 Done at step 835 (reseting env)


 21%|██        | 42/200 [1:05:50<18:37,  7.07s/it]

 Done at step 373 (reseting env)


 55%|█████▌    | 110/200 [10:06<08:24,  5.60s/it]

 Saved trajectory 110 (of 200)

 Mean reward: 0.01, ep_rewards.shape == (1000,)


 21%|██        | 42/200 [1:06:00<18:37,  7.07s/it]

 Done at step 71 (reseting env)


 21%|██        | 42/200 [1:06:08<18:37,  7.07s/it]

 Done at step 757 (reseting env)


 21%|██        | 42/200 [1:06:18<18:37,  7.07s/it]

 Done at step 458 (reseting env)


 21%|██        | 42/200 [1:06:28<18:37,  7.07s/it]

 Done at step 156 (reseting env)


 21%|██        | 42/200 [1:06:41<18:37,  7.07s/it]

 Done at step 742 (reseting env)


 60%|██████    | 120/200 [11:02<07:25,  5.57s/it]

 Saved trajectory 120 (of 200)

 Mean reward: 0.01, ep_rewards.shape == (1000,)


 21%|██        | 42/200 [1:06:56<18:37,  7.07s/it]

 Done at step 277 (reseting env)


 21%|██        | 42/200 [1:07:09<18:37,  7.07s/it]

 Done at step 770 (reseting env)


 21%|██        | 42/200 [1:07:24<18:37,  7.07s/it]

 Done at step 318 (reseting env)


 21%|██        | 42/200 [1:07:35<18:37,  7.07s/it]

 Done at step 82 (reseting env)


 21%|██        | 42/200 [1:07:42<18:37,  7.07s/it]

 Done at step 771 (reseting env)


 65%|██████▌   | 130/200 [11:57<06:28,  5.55s/it]

 Saved trajectory 130 (of 200)

 Mean reward: 0.01, ep_rewards.shape == (1000,)


 21%|██        | 42/200 [1:07:57<18:37,  7.07s/it]

 Done at step 385 (reseting env)


 21%|██        | 42/200 [1:08:10<18:37,  7.07s/it]

 Done at step 874 (reseting env)


 21%|██        | 42/200 [1:08:20<18:37,  7.07s/it]

 Done at step 575 (reseting env)


 21%|██        | 42/200 [1:08:37<18:37,  7.07s/it]

 Done at step 397 (reseting env)


 70%|███████   | 140/200 [12:54<05:46,  5.77s/it]

 Saved trajectory 140 (of 200)

 Mean reward: 0.00, ep_rewards.shape == (1000,)


 21%|██        | 42/200 [1:08:50<18:37,  7.07s/it]

 Done at step 948 (reseting env)


 21%|██        | 42/200 [1:09:01<18:37,  7.07s/it]

 Done at step 698 (reseting env)


 21%|██        | 42/200 [1:09:17<18:37,  7.07s/it]

 Done at step 367 (reseting env)


 21%|██        | 42/200 [1:09:29<18:37,  7.07s/it]

 Done at step 920 (reseting env)


 75%|███████▌  | 150/200 [13:50<04:46,  5.73s/it]

 Saved trajectory 150 (of 200)

 Mean reward: 0.01, ep_rewards.shape == (1000,)


 21%|██        | 42/200 [1:09:40<18:37,  7.07s/it]

 Done at step 620 (reseting env)


 21%|██        | 42/200 [1:09:56<18:37,  7.07s/it]

 Done at step 247 (reseting env)


 21%|██        | 42/200 [1:10:06<18:37,  7.07s/it]

 Done at step 3 (reseting env)


 21%|██        | 42/200 [1:10:19<18:37,  7.07s/it]

 Done at step 535 (reseting env)


 80%|████████  | 160/200 [14:46<03:43,  5.58s/it]

 Saved trajectory 160 (of 200)

 Mean reward: 0.01, ep_rewards.shape == (1000,)


 21%|██        | 42/200 [1:10:34<18:37,  7.07s/it]

 Done at step 114 (reseting env)


 21%|██        | 42/200 [1:10:42<18:37,  7.07s/it]

 Done at step 850 (reseting env)


 21%|██        | 42/200 [1:10:53<18:37,  7.07s/it]

 Done at step 722 (reseting env)


 21%|██        | 42/200 [1:11:08<18:37,  7.07s/it]

 Done at step 314 (reseting env)


 21%|██        | 42/200 [1:11:24<18:37,  7.07s/it]

 Done at step 35 (reseting env)


 85%|████████▌ | 170/200 [15:42<02:46,  5.54s/it]

 Saved trajectory 170 (of 200)

 Mean reward: 0.01, ep_rewards.shape == (1000,)


 21%|██        | 42/200 [1:11:37<18:37,  7.07s/it]

 Done at step 738 (reseting env)


 21%|██        | 42/200 [1:11:48<18:37,  7.07s/it]

 Done at step 437 (reseting env)


 21%|██        | 42/200 [1:12:00<18:37,  7.07s/it]

 Done at step 989 (reseting env)


 21%|██        | 42/200 [1:12:11<18:37,  7.07s/it]

 Done at step 749 (reseting env)


 21%|██        | 42/200 [1:12:21<18:37,  7.07s/it]

 Done at step 481 (reseting env)


 90%|█████████ | 180/200 [16:37<01:51,  5.58s/it]

 Saved trajectory 180 (of 200)

 Mean reward: 0.01, ep_rewards.shape == (1000,)


 21%|██        | 42/200 [1:12:37<18:37,  7.07s/it]

 Done at step 357 (reseting env)


 21%|██        | 42/200 [1:12:50<18:37,  7.07s/it]

 Done at step 984 (reseting env)


 21%|██        | 42/200 [1:13:00<18:37,  7.07s/it]

 Done at step 669 (reseting env)


 21%|██        | 42/200 [1:13:11<18:37,  7.07s/it]

 Done at step 353 (reseting env)


 95%|█████████▌| 190/200 [17:32<00:55,  5.54s/it]

 Saved trajectory 190 (of 200)

 Mean reward: 0.01, ep_rewards.shape == (1000,)


 21%|██        | 42/200 [1:13:23<18:37,  7.07s/it]

 Done at step 975 (reseting env)


 21%|██        | 42/200 [1:13:40<18:37,  7.07s/it]

 Done at step 914 (reseting env)


 21%|██        | 42/200 [1:13:55<18:37,  7.07s/it]

 Done at step 461 (reseting env)


 21%|██        | 42/200 [1:14:11<18:37,  7.07s/it]

 Done at step 11 (reseting env)


100%|██████████| 200/200 [18:28<00:00,  5.64s/it]

 Saved trajectory 200 (of 200)

 Mean reward: 0.01, ep_rewards.shape == (1000,)

 Actions taken: NOOP, FIRE, UP, RIGHT, LEFT, DOWN
Δt = 1110.49s

Total Δt = 2244.48s





In [150]:
!ls drive/My\ Drive/unicamp/MC886/atari/Dataset/data/PongNoFrameskip-v4_PPO2_1000s/images/001/

0000.png  0125.png  0250.png  0375.png	0500.png  0625.png  0750.png  0875.png
0001.png  0126.png  0251.png  0376.png	0501.png  0626.png  0751.png  0876.png
0002.png  0127.png  0252.png  0377.png	0502.png  0627.png  0752.png  0877.png
0003.png  0128.png  0253.png  0378.png	0503.png  0628.png  0753.png  0878.png
0004.png  0129.png  0254.png  0379.png	0504.png  0629.png  0754.png  0879.png
0005.png  0130.png  0255.png  0380.png	0505.png  0630.png  0755.png  0880.png
0006.png  0131.png  0256.png  0381.png	0506.png  0631.png  0756.png  0881.png
0007.png  0132.png  0257.png  0382.png	0507.png  0632.png  0757.png  0882.png
0008.png  0133.png  0258.png  0383.png	0508.png  0633.png  0758.png  0883.png
0009.png  0134.png  0259.png  0384.png	0509.png  0634.png  0759.png  0884.png
0010.png  0135.png  0260.png  0385.png	0510.png  0635.png  0760.png  0885.png
0011.png  0136.png  0261.png  0386.png	0511.png  0636.png  0761.png  0886.png
0012.png  0137.png  0262.png  0387.png	0512.png  0637.png  0762.

## Use Stable Baseline's `generate_expert_traj`

In [0]:
TRAJ_IMAGES_FOLDER = "images" # name of the folder in which to save the observations
TRAJ_FILE_NAME = "trajectory" # name of the .npz trajectory file

In [0]:
print(LOG_DIR)
print(logger.get_dir())

data/results
/content/data/results


In [0]:
from stable_baselines.gail import generate_expert_traj

time_start = time()
print("================")
for algo, env_id in GAMES:
    time_start_env = time()

    env = make_atari_env(env_id, num_env=1, seed=0)
    env = VecFrameStack(env, n_stack=4) # Frame-stacking with 4 frames
    agent_path = os.path.join(PATH_TO_AGENTS, algo.lower(), env_id + '.pkl')
    
    print(f"('{algo}', '{env_id}')")
    print(f"Getting pre-trained agent from: '{agent_path}'")
    if VERBOSE > 1:
        print(f"env.envs: {env.envs}")
    print()
    
    model = ALGO_IMPL[algo].load(agent_path, env)
    
    traj_name = f"{env_id}_{algo}_{N_OF_STEPS}s"
    generate_expert_traj(model, env=env, 
                         n_episodes=N_OF_TRAJECTORIES, 
                         save_path=os.path.join(SAVE_DIR, traj_name, TRAJ_FILE_NAME), 
                         image_folder=TRAJ_IMAGES_FOLDER)

    env.close()
    print(f"\nΔt = {(time() - time_start_env):.2f}s")
    print("================")

print(f"\nTotal Δt = {(time() - time_start):.2f}s")

('PPO2', 'BreakoutNoFrameskip-v4')
Getting pre-trained agent from: 'rl-baselines-zoo/trained_agents/ppo2/BreakoutNoFrameskip-v4.pkl'
env.envs: [<ClipRewardEnv<WarpFrame<FireResetEnv<EpisodicLifeEnv<Monitor<MaxAndSkipEnv<NoopResetEnv<TimeLimit<AtariEnv<BreakoutNoFrameskip-v4>>>>>>>>>>]

Images will be recorded to data/BreakoutNoFrameskip-v4_PPO2_20s/images/
Image shape: (84, 84, 4)
actions (2182, 1)
obs (2182,)
rewards (2182, 1)
episode_returns (2,)
episode_starts (2182,)

Δt = 8.31s
('PPO2', 'PongNoFrameskip-v4')
Getting pre-trained agent from: 'rl-baselines-zoo/trained_agents/ppo2/PongNoFrameskip-v4.pkl'
env.envs: [<ClipRewardEnv<WarpFrame<FireResetEnv<EpisodicLifeEnv<Monitor<MaxAndSkipEnv<NoopResetEnv<TimeLimit<AtariEnv<PongNoFrameskip-v4>>>>>>>>>>]

Images will be recorded to data/PongNoFrameskip-v4_PPO2_20s/images/
Image shape: (84, 84, 4)
actions (3400, 1)
obs (3400,)
rewards (3400, 1)
episode_returns (2,)
episode_starts (3400,)

Δt = 11.97s

Total Δt = 20.28s


## Download generated dataset

In [0]:
trajectories = []
image_folders = []
for save_folder in os.listdir(SAVE_DIR):
    for x in os.listdir(os.path.join(SAVE_DIR, save_folder)):
        path = os.path.join(SAVE_DIR, save_folder, x)
        if x == TRAJ_IMAGES_FOLDER:
            image_folders.append(path)
        elif os.path.splitext(x)[0] == TRAJ_FILE_NAME:
            trajectories.append(path)

assert len(image_folders) == len(trajectories), f"{len(image_folders)} != {len(trajectories)} (len(image_folders) != len(trajectories))"

print('\n\n'.join([traj + '\n' + img_folder for traj, img_folder in zip(trajectories, image_folders)]))

data/BreakoutNoFrameskip-v4_PPO2_20s/trajectory.npz
data/BreakoutNoFrameskip-v4_PPO2_20s/images

data/PongNoFrameskip-v4_PPO2_20s/trajectory.npz
data/PongNoFrameskip-v4_PPO2_20s/images


In [0]:
x = np.load('data/PongNoFrameskip-v4_PPO2_20s/trajectory.npz', 
            allow_pickle=True, mmap_mode='r')
print(x.files)

['actions', 'obs', 'rewards', 'episode_returns', 'episode_starts']


In [0]:
def print_trajectory_info(x):
    # actions taken
    print("- actions:", x['actions'].shape)
    print("  actions taken:", ', '.join([ACTION_MEANING[action] for action in set(x['actions'].reshape(-1))]))
    # path to the observed images
    print("- obs:", x['obs'].shape)
    # reward for each step
    print("- rewards:", x['rewards'].shape)
    print("  reward values:", ', '.join([str(r) for r in set(x['rewards'].reshape(-1))]))
    # reward for each trajectory
    print("- episode_returns:", x['episode_returns'].shape)
    print("  episode returns", x['episode_returns'])
    # `done` value returned by env.step(action)
    print("- episode_starts:", x['episode_starts'].shape)
    print("  episode starts:", [i for i, ep_start in enumerate(x['episode_starts']) if ep_start])

In [0]:
print("================")
for trajectory in trajectories:
    print(trajectory)
    print_trajectory_info(np.load(trajectory, allow_pickle=True, mmap_mode='r'))
    print("================")

data/BreakoutNoFrameskip-v4_PPO2_20s/trajectory.npz
- actions: (2182, 1)
  actions taken: NOOP, FIRE, UP, RIGHT
- obs: (2182,)
- rewards: (2182, 1)
  reward values: 0.0, 1.0
- episode_returns: (2,)
  episode returns [96.  1.]
- episode_starts: (2182,)
  episode starts: [0, 2033]
data/PongNoFrameskip-v4_PPO2_20s/trajectory.npz
- actions: (3400, 1)
  actions taken: NOOP, FIRE, UP, RIGHT, LEFT, DOWN
- obs: (3400,)
- rewards: (3400, 1)
  reward values: 0.0, 1.0
- episode_returns: (2,)
  episode returns [21. 21.]
- episode_starts: (3400,)
  episode starts: [0, 1691]


## Evaluate agents
Let's evaluate a few trajectories of each game from `GAMES` to get a sense of how the agents are performing

In [0]:
# number of trajectories to evaluate each agent on
N_OF_EVAL_TRAJECTORIES = 2

results = {
    'algo': [],
    'env_id': [],
    'mean_reward': [],
    'std_reward': [],
    'n_timesteps': [],
    'n_episodes': []
}

TENSORBOARD_LOG = False
# https://stable-baselines.readthedocs.io/en/master/guide/tensorboard.html

if TENSORBOARD_LOG:
    %load_ext tensorboard
    %tensorboard --logdir tmp/tb_logs

In [0]:
time_start = time()
print("================")
for algo, env_id in GAMES:
    time_start_env = time()

    env = make_atari_env(env_id, num_env=1, seed=0)
    env = VecFrameStack(env, n_stack=4) # Frame-stacking with 4 frames
    agent_path = os.path.join(PATH_TO_AGENTS, algo.lower(), env_id + '.pkl')
    
    print(f"('{algo}', '{env_id}')")
    print(f"Getting pre-trained agent from: '{agent_path}'")
    if VERBOSE > 1:
        print(f"env.envs: {env.envs}")
    print()
    
    model = ALGO_IMPL[algo].load(agent_path, env)
    if TENSORBOARD_LOG:
        model.tensorboard_log = os.path.join(LOG_DIR, f"tb_logs")
        print(f"Adding TensorBoard logs to '{model.tensorboard_log}/'")
    
    for trajectory in tqdm(range(N_OF_EVAL_TRAJECTORIES), position=0, leave=True):
        # episode stats
        ep_len, ep_reward, ep_rewards = 0, 0.0, []

        obs = env.reset() # (84, 84, 4)
        for step in range(N_OF_STEPS):
            action = model.predict(obs)
            # clip action to avoid out of bound errors
            if isinstance(env.action_space, gym.spaces.Box):
                action = np.clip(action, env.action_space.low, env.action_space.high)
            
            obs, reward, done, infos = env.step(action)
            # NOTE action, reward and done are arrays since we're using a vectorized env
            _env = env.envs[0]
            # tqdm.write(f"env.envs[0].episode_rewards: {env.envs[0].episode_rewards}")
            # tqdm.write(f"env.envs[0].episode_lengths: {env.envs[0].episode_lengths}")
            # tqdm.write(f"env.envs[0].episode_times: {env.envs[0].episode_times}")
            # tqdm.write(f"env.envs[0].rewards: {env.envs[0].rewards}")

            # NOTE the return reward is not the Atari score
            #      so we have to get it from the infos dict
            ep_infos = infos[0].get('episode')
            if ep_infos is not None:
                tqdm.write(f"\nAtari Episode Score: {ep_infos['r']:.2f}")
                tqdm.write(f"Atari Episode Length: {ep_infos['l']}")
                #tqdm.write(f"\nAtari Episode Score: {round(sum(_env.rewards), 6):.2f}")
                #tqdm.write(f"Atari Episode Length: {len(_env.rewards)}")
            
            # FIXME ep_infos is always None 
            # try checking stable-baselines' Monitor.step()
            if infos is not None: # debug
                # if step == 0:
                #     tqdm.write(f"(DEBUG) step == 0: {infos}")
                if len(infos) > 1:
                    tqdm.write(f"(DEBUG) infos: {infos}")
                elif len(infos[0].keys()) > 1:
                    if 'terminal_observation' not in infos[0].keys() or len(infos[0].keys()) > 2:
                        tqdm.write(f"(DEBUG) infos[0]: {infos[0]}")

            ep_len += 1
            ep_reward += reward[0]
            if done:
                obs = env.reset()
                ep_rewards.append(ep_reward)
                if VERBOSE > 1:
                    tqdm.write(f"\nEpisode Reward: {ep_reward:.2f}")
                    tqdm.write(f"Episode Length: {ep_len}")
                ep_reward = 0.0
                ep_len = 0
        
        if VERBOSE > 0:
            tqdm.write("\nMean reward: {:.2f}, len(ep_rewards) == {}".format(
                       np.mean(ep_rewards) if len(ep_rewards) > 0 else 0.0, 
                       len(ep_rewards)))

    env.close()
    print(f"Δt = {(time() - time_start_env):.2f}s")
    print("================")

print(f"Total Δt = {(time() - time_start):.2f}s")

In [0]:
!ls data/results/

## Generate trajectories

Note that we use `make_atari_env` + `VecFrameStack` for `NoFrameskip-v4` environments, so each frame is converted to grayscale and downscaled from 210x160 to 84x84. Therefore, the $observation$ shape is `(84, 84, 4)` (four stacked frames), and **not** `(210, 160, 3)`, nor `(84, 84, 1)`.

In [0]:
PRINT_EARLY_DONE = False
PRINT_ACTIONS_TAKEN = False

PRINT_EVERY_N_TRAJECTORIES = N_OF_TRAJECTORIES // 10
# uncomment below not to print
# PRINT_EVERY_N_TRAJECTORIES = N_OF_TRAJECTORIES + 1

**TODO: evaluate the trajectories before saving the final datasets**  
**TODO: add button to load from data/ or save to drive**

In [0]:
time_start = time()
print("PRINT_EVERY_N_TRAJECTORIES:", PRINT_EVERY_N_TRAJECTORIES)
print("N_OF_TRAJECTORIES:", N_OF_TRAJECTORIES)
print("N_OF_STEPS:", N_OF_STEPS)
print("================")
for algo, env_id in GAMES:
    time_start_env = time()

    env = make_atari_env(env_id, num_env=1, seed=0)
    env = VecFrameStack(env, n_stack=4) # Frame-stacking with 4 frames
    agent_path = os.path.join(PATH_TO_AGENTS, algo.lower(), env_id + '.pkl')
    
    print(f"('{algo}', '{env_id}')")
    print(f"Getting pre-trained agent from: '{agent_path}'\n")
    
    model = ALGO_IMPL[algo].load(agent_path, env)
    
    for trajectory in tqdm(range(N_OF_TRAJECTORIES), position=0, leave=True):
        # store the "obs -> action" mapping
        observed_states, actions_taken = [], []

        obs = env.reset() # (84, 84, 4)
        for step in range(N_OF_STEPS):
            observed_states.append(obs)
            action = model.predict(obs)
            actions_taken.append(action)
            obs, reward, done, infos = env.step(action)
            if done:
                obs = env.reset()
                if PRINT_EARLY_DONE:
                    print(f"done at step {step + 1} (reseting env)")
        
        # NOTE action, reward and done are arrays since we're using a vectorized env
        observed_states = [obs[0] for obs in observed_states]
        actions_taken = [action[0][0] for action in actions_taken]
        
        np.savez_compressed(file=os.path.join(SAVE_DIR, f"{env_id}_{algo}_t{trajectory+1}_{N_OF_STEPS}s"), 
                            observations=observed_states, actions=actions_taken)
        
        if (trajectory + 1) % 10 == 0:
            print(f" Saved trajectory {trajectory+1} (of {N_OF_TRAJECTORIES})")

        if PRINT_ACTIONS_TAKEN and trajectory == N_OF_TRAJECTORIES - 1:
            print("\nActions taken:", ", ".join([ACTION_MEANING[action] for action in set(actions_taken)]))

    del observed_states
    del actions_taken
    env.close()
    print(f"Δt = {(time() - time_start_env):.2f}s")
    print("================")

print(f"Total Δt = {(time() - time_start):.2f}s")

In [0]:
trajectory_filenames = []
for r, ds, fs in os.walk(SAVE_DIR): # r=root, d=directories, f=files
    print(r + '/')
    for f in fs:
        print("|___", f)
        trajectory_filenames.append(f)

In [0]:
test_trajectory_filename = trajectory_filenames[0]
print(f"Loading from '{test_trajectory_filename}'\n")

test_trajectory_load = np.load(os.path.join(SAVE_DIR, test_trajectory_filename), 
                               allow_pickle=True)

print("observations shape:", test_trajectory_load['observations'].shape)
print("actions shape:", test_trajectory_load['actions'].shape)

In [0]:
# https://github.com/araffin/rl-baselines-zoo/blob/master/utils/record_video.py
# https://github.com/araffin/rl-baselines-zoo/blob/master/enjoy.py
# https://github.com/hill-a/stable-baselines#try-it-online-with-colab-notebooks-

## Old

In [0]:
# def save_as_image(observation, save_dir, img_name, prefix="img_", downscale=False):
#     # downscaling the image
#     if downscale:
#         im_array = cv2.resize(observation, INP_IMAGE_SHAPE) # TODO test tf.image.resize
#         im_array = np.array(im_array, dtype='float32')
#         im_array = (im_array/127.5) - 1
#         im = PIL.Image.fromarray(im_array, 'RGB')
#     else:
#         try:
#             im = PIL.Image.fromarray(observation, 'RGB')
#         except:
#             print(type(observation))
#     imname = "{}{}.png".format(prefix, img_name)
#     im.save(os.path.join(save_dir, imname))

In [0]:
# # you can change the default values here
# save_dir = SAVE_DIR
# num_images = IMAGES_TO_GENERATE

In [0]:
# os.makedirs(save_dir, exist_ok=True)

In [0]:
# envs = [gym.make(env_id) for env_id in ENV_IDS]

In [0]:
# for env_id, env in zip(ENV_IDS, envs):
#     print(env_id)
#     env_dir = os.path.join(save_dir, f"{env_id}_{IMAGES_TO_GENERATE}")
#     os.makedirs(env_dir, exist_ok=True)
    
#     env.reset()
#     i, current_env_images = 0, 0
    
#     actions_taken = []
#     while i < num_images:
#         # take a random action (sampled from the action space)
#         action = env.action_space.sample()
#         actions_taken.append(action)
#         assert 0 <= action < 18, f"action = {action}"
#         obs, _, done, _ = env.step(action)
#         if np.mean(obs) > 0.01:
#             save_as_image(obs, env_dir, str(i))
#             i += 1
#         else:
#             print("should I have been reached?")
#             continue
#         if done:
#             print(f"reseting {env_id} at i={i}")
#             env.reset()
    
#     actions_taken = np.asarray(actions_taken, dtype='int8')
#     print(actions_taken.shape, actions_taken.size, actions_taken.dtype)
#     np.save(os.path.join(save_dir, f"{env_id}_{IMAGES_TO_GENERATE}_actions"), actions_taken)

In [0]:
# IMG_SIZE = 160 # All images will be resized to 160x160

# def load_image(image_path):
#     image = tf.io.read_file(image_path)
#     image = tf.image.decode_png(image, channels=3)
#     image = tf.cast(image, tf.float32)
#     image = (image/127.5) - 1
#     image = tf.image.resize(image, (IMG_SIZE, IMG_SIZE))
#     return image, image_path

# IMG_SHAPE = (IMG_SIZE, IMG_SIZE, 3)

# # Create the base model from the pre-trained model MobileNet V2
# base_model = tf.keras.applications.MobileNetV2(input_shape=IMG_SHAPE,
#                                                include_top=False,
#                                                weights='imagenet')

# s = time()
# # Get unique images
# encode_train = img_name_vector

# # Feel free to change batch_size according to your system configuration
# image_dataset = tf.data.Dataset.from_tensor_slices(encode_train)
# image_dataset = image_dataset.map(
#   load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(16)

# print((time()-s)/1000)

# for img, path in image_dataset:
#   batch_features = image_features_extract_model(img)
#   batch_features = tf.reshape(batch_features,
#                               (batch_features.shape[0], -1, batch_features.shape[3]))

#   for bf, p in zip(batch_features, path):
#     path_of_feature = p.numpy().decode("utf-8")
#     np.save(path_of_feature, bf.numpy())