### Based on https://github.com/higgsfield/RL-Adventure and https://medium.com/swlh/introduction-to-reinforcement-learning-coding-sarsa-part-4-2d64d6e37617

In [1]:
%matplotlib inline
import collections
import cv2
import gym
import gzip
import matplotlib.pyplot as plot
import numpy as np
import random
import time
import torch as t
from tqdm import tqdm, tqdm_notebook
from IPython.display import clear_output

In [2]:
class LazyFrames(object):
    """Lightweight container for the frames that make up one observation.

    It keeps references to the individual frames instead of a concatenated
    copy, so frames shared between consecutive observations are only stored
    once. Its sole purpose is to cut memory usage, which can be huge for
    DQN replay buffers holding on the order of 1M frames.

    Convert it to a numpy array only right before passing it to the model.
    """

    def __init__(self, frames):
        self._frames = frames

    def __array__(self, dtype=None):
        """Concatenate the stored frames along axis 0, optionally cast to `dtype`."""
        stacked = np.concatenate(self._frames, axis=0)
        return stacked if dtype is None else stacked.astype(dtype)

class ImageToPyTorch(gym.ObservationWrapper):
    """
    Move the last axis of each observation to the front.

    For the usual (height, width, channels) image layout this yields the
    channel-first (channels, height, width) layout.
    The pixel values themselves are left untouched (no rescaling), so the
    declared observation space reuses the bounds and dtype of the wrapped
    environment's space with the axes reordered the same way.
    """
    def __init__(self, env):
        super(ImageToPyTorch, self).__init__(env)
        old_space = self.observation_space
        # Only the axis order changes, so transpose the per-element bounds
        # instead of hard-coding a [0, 1] range the observations never obey.
        self.observation_space = gym.spaces.Box(
            low=old_space.low.transpose(2, 0, 1),
            high=old_space.high.transpose(2, 0, 1),
            dtype=old_space.dtype,
        )

    def observation(self, observation):
        """Return `observation` with axes reordered from (0, 1, 2) to (2, 0, 1)."""
        return observation.transpose(2, 0, 1)
    
class FrameStack(gym.Wrapper):
    """Wrapper whose observations are the k most recent frames.

    Observations are returned as `LazyFrames`, which is much more memory
    efficient than an eagerly concatenated array.

    See Also
    --------
    baselines.common.atari_wrappers.LazyFrames
    """

    def __init__(self, env, k):
        super(FrameStack, self).__init__(env)
        self.k = k
        # Bounded deque: appending a new frame silently drops the oldest one.
        self.frames = collections.deque(maxlen=k)
        base_shape = env.observation_space.shape
        stacked_shape = (base_shape[0] * k, base_shape[1], base_shape[2])
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=stacked_shape)

    def reset(self):
        """Reset the wrapped env and fill the whole stack with the first frame."""
        first_frame = self.env.reset()
        self.frames.extend([first_frame] * self.k)
        return self._get_ob()

    def step(self, action):
        """Step the wrapped env, push the new frame and return the stacked observation."""
        frame, reward, done, info = self.env.step(action)
        self.frames.append(frame)
        stacked = self._get_ob()
        return stacked, reward, done, info

    def _get_ob(self):
        """Snapshot the current frame stack as a `LazyFrames` object."""
        assert len(self.frames) == self.k
        snapshot = list(self.frames)
        return LazyFrames(snapshot)

class ResizeObservation(gym.ObservationWrapper):
    """Shrink every observation to half its size along the first two axes."""

    def __init__(self, env):
        super(ResizeObservation, self).__init__(env)
        full_shape = env.observation_space.shape
        half_first = full_shape[0] // 2
        half_second = full_shape[1] // 2
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(half_first, half_second, full_shape[2])
        )
        # The target size is passed to cv2.resize with the first two axes
        # swapped relative to the array shape (cv2 expects width before height).
        self.resize_to = (half_second, half_first)

    def observation(self, observation):
        """Return `observation` resized to `self.resize_to` using area interpolation."""
        resized = cv2.resize(observation, self.resize_to, interpolation=cv2.INTER_AREA)
        return resized

# Build the evaluation environment. Wrappers are applied innermost-first:
#   Monitor            - records into the current directory ('.', force=True)
#   ResizeObservation  - halves the first two observation axes
#   ImageToPyTorch     - moves the last observation axis to the front
#   FrameStack         - returns the 4 most recent frames as one observation
env = FrameStack(
    ImageToPyTorch(
        ResizeObservation(
            gym.wrappers.Monitor(gym.make('TennisDeterministic-v4'), '.', force=True)
        )
    ),
    4,
)

In [3]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
USE_CUDA = t.cuda.is_available()
device = t.device('cuda' if USE_CUDA else 'cpu')

In [4]:
import models
def build_model():
    """Create a `models.DuelingCnnDqn` sized for `env` and move it to `device`.

    The network is constructed from the environment's observation shape and
    its number of discrete actions.
    """
    obs_shape = env.observation_space.shape
    n_actions = env.action_space.n
    network = models.DuelingCnnDqn(obs_shape, n_actions)
    return network.to(device)

In [5]:
# Rebuild the network and restore its weights from the gzip-compressed checkpoint.
model = build_model()
MODEL_PATH = f'model-{model.model_type()}.gz'
with gzip.open(MODEL_PATH, 'rb') as f:
    # map_location remaps the stored tensors onto the active device, so a
    # checkpoint written on a GPU machine still loads on a CPU-only one.
    # NOTE: torch.load unpickles the file -- only load checkpoints you trust.
    model.load_state_dict(t.load(f, map_location=device))
print(f'model weights loaded from {MODEL_PATH}')

model weights loaded from model-dueling-cnn-dqn.gz


In [6]:
# Evaluation settings.
N_EPISODES = 2       # number of episodes to play (and record)
MAX_STEPS = 100000   # hard cap on the number of steps per episode
EPSILON = 0.05       # probability of taking a random action instead of the greedy one

# Put the network into evaluation mode before running inference.
model.eval()
try:
    for _ in range(N_EPISODES):
        steps, rewards = 0, 0
        state, done = env.reset(), False
        for _ in tqdm_notebook(range(MAX_STEPS)):
            if random.random() > EPSILON:
                # Greedy action. The output is only used for argmax, so skip
                # building the autograd graph.
                with t.no_grad():
                    state_tensor = t.FloatTensor(np.array(state)).to(device)
                    action = model(state_tensor).argmax().item()
            else:
                action = env.action_space.sample()
            state, reward, done, info = env.step(action)
            steps += 1
            rewards += reward
            if done:
                break
        print(f'steps = {steps}, rewards = {rewards}')
finally:
    # Always close the environment, even if an episode raised.
    env.close()
# List the working directory, which is where the Monitor wrapper was pointed ('.').
!ls -lh

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))

steps = 100000, rewards = 0.0


HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))

steps = 100000, rewards = 0.0
total 66M
-rw-rw-r-- 1 user user  26M May 11 15:46 model-dueling-cnn-dqn.gz
-rw-rw-r-- 1 user user 3.5M May 11 16:11 model-master-dueling-cnn-dqn.gz
-rw-rw-r-- 1 user user  26M May 11 16:11 model-slave-dueling-cnn-dqn.gz
-rw-rw-r-- 1 user user 1.2K May 11 10:11 models.py
-rw-rw-r-- 1 user user  200 May 11 16:29 openaigym.episode_batch.0.30663.stats.json
-rw-rw-r-- 1 user user  330 May 11 16:29 openaigym.manifest.0.30663.manifest.json
-rw-rw-r-- 1 user user 2.1K May 11 16:23 openaigym.video.0.30663.video000000.meta.json
-rw-rw-r-- 1 user user 5.1M May 11 16:23 openaigym.video.0.30663.video000000.mp4
-rw-rw-r-- 1 user user 2.1K May 11 16:29 openaigym.video.0.30663.video000001.meta.json
-rw-rw-r-- 1 user user 5.1M May 11 16:29 openaigym.video.0.30663.video000001.mp4
drwxrwxr-x 2 user user 4.0K May 11 10:11 __pycache__
-rw-rw-r-- 1 user user  47K May 11 16:27 tennis-dueling-dqn.ipynb
-rw-rw-r-- 1 user user 7.9K May 11 16:25 tennis-dueling-dqn-make