In [None]:
import numpy as np
import tensorflow as tf

class HashDND(object):
    """differentiable neural dictionary, using LSH for approximate
    nearest neighbour lookup. Assumes keys are vectors. Also assumes we only
    use float32 and doesn't handle batched operations :("""

    sentinel_value = np.inf

    @classmethod
    def _setup_variables(cls, hash_bits, max_neighbours, key_size,
                         value_shapes):
        """setup variables with appropriate initializers given the shapes"""
        init = tf.constant_initializer(cls.sentinel_value)

        keys = tf.get_variable(name='keys',
                               shape=[2**hash_bits * max_neighbours, key_size],
                               initializer=init)

        values = []
        for i, shape in enumerate(value_shapes):
            var_shape = [2**hash_bits * max_neighbours] + shape
            var = tf.get_variable(name='value_{}'.format(i),
                                  shape=var_shape,
                                  initializer=init)
            values.append(var)
        return keys, values

    def __init__(self, hash_bits, max_neighbours, key_size, value_shapes,
                 similarity_measure=None, name='dnd'):
        """Set up the dnd.

        Args:
            hash_bits (int): how many bits for the hash. There will be
                `2**num_bits` individual buckets.
            max_neighbours (int): how many entries to store in each bucket.
                This controls the number of neighbours we check against.
                Operations will be linear in this value and it will likely
                effect learning performance significantly as well.
            key_size (int): size of the key vectors. We use the unhashed key
                vectors to compute similarities between keys we find from the
                nearest neighbour lookup.
            value_shapes (list): list of shapes for the values stored in the
                dictionary.
            similarity_measure (Optional[callable]): function which adds ops
                to compare a query key with all of the other keys in the
                bucket. If unspecified, the cosine similarity is used. Should
                be a callable which takes two input tensors: the query key
                (shaped `[key_size]`) and a  `[max_neighbours, key_size]`
                tensor  of keys to compare against. Should return a
                `[max_neighbours]` tensor of similarities, between 0 and 1
                where 1 means the two keys were identical.
            name (Optional[str]): a name under which to group ops and
                variables. Defaults to `dnd`.
        """
        self._name = name
        self._hash_size = hash_bits
        self._key_size = key_size
        self._bucket_size = max_neighbours
        with tf.variable_scope(self._name):
            self._keys, self._values = HashDND._setup_variables(hash_bits,
                                                                max_neighbours,
                                                                key_size,
                                                                value_shapes)
        self._hash_config = get_simhash_config(self._key_size,
                                               self._hash_size)

        if not similarity_measure:
            similarity_measure = cosine_similarity
        self._similarity_measure = similarity_measure

    def _get_bucket(self, key):
        """look up the contents of a bucket by hash. Also return the bucket
        index so we can create updates to the storage variables."""
        idx = simhash(key, self._hash_config)
        bucket_start = idx * self._bucket_size
        bucket_end = (idx + 1) * self._bucket_size
        keys = self._keys[bucket_start:bucket_end, ...]
        values = [val[bucket_start:bucket_end, ...] for val in self._values]
        return keys, values, idx

    def store(self, key, value):
        """Gets an op which will store the key-value pair. This involves the
        following process:
            - compute hash of `key`
            - lookup all keys and values with matching hash
            - if the bucket isn't full
                - assign the values to the values of the next empty position
            - else (if the bucket is full)
                - update, according to some update rule which may be
                  application specific.
                - TODO: figure this out (LRU?)

        Args:
            key (tensor): `[key_size]` key to store
            value (list of tensors): `[???]` tensors to
                be stored.

        Returns:
            op: an op which carries out the above steps.
        """
        with tf.name_scope(self._name + '/store'):
            bucket_keys, bucket_values, idx = self._get_bucket(key)
            # is there space?
            can_store = tf.reduce_any(tf.equal(bucket_keys[:, 0],
                                               self.sentinel_value))

            def _empty_store():
                return self._get_store_op_empty(key, value, idx, bucket_keys)

            def _full_store():
                return self._get_store_op_full(key, value, idx, bucket_keys)

            store_op = tf.cond(can_store, _empty_store, _full_store)
        return store_op

    def _flatten_index(self, index, bucket_index):
        """turn a bucket-level index into a global index"""
        return index + (bucket_index * self._bucket_size)

    def _update_at_index(self, index, new_key, new_vals):
        """make update ops to insert at the appropriate (flattened) index"""
        # update the keys
        key_update = tf.scatter_update(self._keys, [index],
                                       tf.expand_dims(new_key, 0))
        # and update the values
        value_updates = []
        for value_var, new_val in zip(self._values, new_vals):
            val_update = tf.scatter_update(value_var, [index],
                                           tf.expand_dims(new_val, 0))
            value_updates.append(val_update)

        # make sure they all happen at once
        return tf.group(key_update, *value_updates)

    def _get_store_op_empty(self, store_key, store_vals, bucket_index,
                            bucket_keys):
        """get an op to store given key and values in the first empty space.

        Returns an op with no output that will run all of the required updates.
        """
        # first find the first empty spot (assuming there is one)
        with tf.name_scope('empty_store'):
            empty_indices = tf.where(tf.equal(bucket_keys[:, 0],
                                              self.sentinel_value))
            empty_indices = tf.cast(empty_indices, tf.int32)
            store_idx = self._flatten_index(empty_indices[0, 0], bucket_index)
            return self._update_at_index(store_idx, store_key, store_vals)

    def _get_store_op_full(self, store_key, store_vals, bucket_index,
                           bucket_keys):
        """get an op to store given keys and values when there are no empty
        slots.

        Returns an op with no output that will run all of the require updates.
        """
        # TODO: what should this do? LRU? Need some accounting for that,
        # otherwise some kind of interpolation for lossily storing it in the
        # bucket?
        # for now we are just going to choose at random, which is surely a
        # terrible strategy, but at least it will run
        with tf.name_scope('store_full'):
            idx = tf.random_uniform([], minval=0, maxval=self._bucket_size,
                                    dtype=tf.int32)
            store_idx = self._flatten_index(idx, bucket_index)
            return self._update_at_index(store_idx, store_key, store_vals)

    def _get_averaged_value(self, values, similarities):
        """get a weighted sum of values."""
        weighted_values = tf.expand_dims(similarities, 1) * values
        all_values = tf.reduce_sum(weighted_values, axis=0)
        return all_values

    def get(self, key):
        """Get the values in the dictionary corresponding to a particular key,
        or zeros if the key is not present.

        The process is as follows:
            - compute hash of `key`
            - lookup all keys and values with matching hash
            - compute similarities between all matching keys and `key`
            - return average of all matching values, weighted by similarities.

        The default similarity is the cosine distance.

        Args:
            key (tensor): `[key_size]` batch of keys to look up.

        Returns:
            value (tuple): associated values.
        """
        # TODO: what to return when the bucket is empty?
        # at the moment it is all zeros
        with tf.name_scope(self._name + '/get'):
            bucket_keys, bucket_values, _ = self._get_bucket(key)
            # compute similarities
            similarities = self._similarity_measure(key, bucket_keys)
            # where the keys are sentinel, mask it out
            used_positions = tf.not_equal(bucket_keys[:, 0],
                                          self.sentinel_value)
            values = [tf.boolean_mask(val, used_positions)
                      for val in bucket_values]
            similarities = tf.boolean_mask(similarities, used_positions)
            # normalise them to sum to one, and maybe give them a kick
            similarities /= tf.reduce_sum(similarities)

            results = tuple(self._get_averaged_value(val, similarities)
                            for val in values)
        return results
    
    
def cosine_similarity(query, bucket):
    """Cosine similarity: the cosine of the angle between two vectors.
    Also the dot product, if the vectors are normalised in the l2 norm,
    which is how it is implemented here."""
    query = tf.expand_dims(query, 1)
    query = tf.nn.l2_normalize(query, dim=0)
    bucket = tf.nn.l2_normalize(bucket, dim=1)
    return tf.squeeze(tf.matmul(bucket, query), 1, name='cos_sim')


def get_simhash_config(input_size, hash_bits):
    """Gets any necessary configuration and data structures necessary for
    consistent hashing.

    For simhash, this just corresponds to the random matrix used to project the
    input down, but we also store some values used in the conversion from
    binary to integers.

    This function should be run once and the result stored and passed in to all
    subsequent calls to `simhash`, so that we use the same matrix every time.

    Args:
        input_size (int): size of the inputs we are going to hash.
        hash_bits (int): the number of bits we output.

    Returns:
        dict: dictionary with two keys: "matrix" corresponding to a variable
            used for the random projection and "bases" used in the conversion
            to integers.
    """
    with tf.variable_scope('simhash_config'):
        mat = tf.get_variable(
            'projection_matrix',
            shape=[input_size, hash_bits],
            initializer=tf.random_normal_initializer())
        bases = tf.expand_dims(2 ** tf.range(hash_bits), 0)
        return {'matrix': mat,
                'bases': bases}


def simhash(inputs, config):
    """SimHash the inputs into an integer with `num_bits` used bits.
    The process is:
        - flatten inputs into `[batch_size, ?]`
        - multiply by a (fixed) random gaussian matrix.
        - convert to bits with the sign function (and appropriate shifting and
          scaling)
        - convert from a `[batch_size, num_bits]` matrix to a `[batch_size, 1]`
          vector of integers.

    The standard process requires sign(0) = 1, as usual. However, most maths
    libraries define sign(0) = 0. At this stage we ignore it because it is
    highly unlikely, but if the inputs are sparse it's a possibility and should
    probably be adressed (it's not going to break anything, but it will lead to
    some potentially unexpected hashes which _might_ break the locality
    sensitive property).

    If inputs is a vector, then we will return a scalar, otherwise batches of
    inputs will produce batches of outputs.

    Args:
        inputs (tensor): tensor of whatever shape, with the batch on the first
            axis. Apart from the batch size, the shape does need to be defined.
        config (dict): the result of `get_simhash_config`.

    Returns:
        tensor: `[batch_size, 1]` integer tensor.
    """
    with tf.variable_scope('simhash'):
        if len(inputs.get_shape()) == 1:
            inputs = tf.expand_dims(inputs, 0)
            squeeze_output = True
        else:
            squeeze_output = False
        projected = tf.matmul(inputs, config['matrix'])
        bits = tf.sign(projected) * 0.5 + 0.5
        # return bits
        bits = tf.cast(bits, tf.int32)
        # convert to single int
        index = tf.reduce_sum(bits * config['bases'],
                              axis=1, keep_dims=True)
        if squeeze_output:
            index = tf.squeeze(index, [0])
        return index


In [None]:
class BaseDifferentiableMemory(object):
    def __init__(self

In [7]:
import os
import time
import numpy as np
import tensorflow as tf
from logging import getLogger
import random
from collections import deque

from rltensor.agents.agent import Agent
from rltensor.utils import get_shape

logger = getLogger(__name__)


class NEC(Agent):
    def __init__(self, env, conf, controller_cls, memory_cls, default_conf=None, sess=None):
        self.controller_cls = controller_cls
        self.memory_cls = memory_cls
        self.key_dim = conf["key_dim"]
        self.delay = conf["delay"]
        self.recent_rewards = deque(maxlen=self.delay)
        self.recent_terminals = deque(maxlen=self.delay)
        super(NEC, self).__init__(env, conf, default_conf, sess)
        
    def _build_graph(self):
        """Build all of the network and optimizations
        
        just for conveninece of trainig, seprate placehoder for train and target network
        critic network input: [raw_data, smoothed, downsampled]
        """
        # state shape has to be (batch, length,) + input_dim
        self.state = tf.placeholder(tf.float32,
                                     get_shape(self.state_dim, maxlen=self.window_length),
                                     name='state')
        _state = self.processor.tensor_process(self.state)
        # Employ maximal strategy
        self.controller = self.controller_cls(self.key_dim, self.conf["controller"], scope_name="controller")
        self.diff_memory = self.diff_memory_cls(self.action_dim, self.conf["diff_memory"])
        query = self.controller(_state, self.training)
        self.q_val = self.diff_memory.get_q(query)
        self.max_action = tf.argmax(self.q_val, dimension=1)
        self.max_q_val = tf.reduce_max(self.q_val, dimension=1)
        # Build action graph
        self.action = tf.placeholder(tf.int32, (None,), name='action')
        action_one_hot = tf.one_hot(self.action, depth=self.action_dim)
        self.action_q_val = tf.reduce_sum(self.q_val * action_one_hot, axis=1)
        # Build target
        self.target = tf.placeholder(tf.float32, (None,), name="target")
        self.terminal = tf.placeholder(tf.bool, (None,), name="terminal")
        # Clip error to stabilize learning
        self.error = self.target - self.action_q_val
        clipped_error = tf.where(tf.abs(self.error) < self.error_clip,
                                    0.5 * tf.square(self.error),
                                    tf.abs(self.error), name='clipped_error')
        self.weights = tf.placeholder(tf.float32, (None,), name="importance_weights")
        self.loss = tf.reduce_mean(self.weights * clipped_error, name='loss')
        # Build optimization
        self.update_op = self._get_update_op()
        self.optimizer = self._get_optimizer(self.optimizer_name, self.learning_rate_op, self.optimizer_conf)
        grads_vars = self.optimizer.compute_gradients(self.loss)
        if "grad_clip" in self.conf and self.conf["grad_clip"] is not None:
            grads_vars = [
                (tf.clip_by_norm(gv[0], clip_norm=self.conf["grad_clip"]), gv[1]) 
                    for gv in grads_vars]
        self.q_optim = self.optimizer.apply_gradients(grads_vars)

    def observe(self, observation, action, reward, terminal, training):
        # clip reward into  (min_r, max_r)
        reward = max(self.min_r, min(self.max_r, reward))
        assert len(self.memory.observations) == len(self.recent_rewards)
        # We always keep data
        self.recent_observations.append(observation)
        self.recent_rewards.append(reward)
        self.recent_terminals.append(terminal)
        target_val = self._calc_target(observation)
        # we keep target value instead of reward directly
        self.memory.append(observation, action, target_val, terminal, is_store=True)
        step = self.global_step.eval(session=self.sess)
        if (step + 1) % self.t_train_freq:
            is_update = True
        else:
            is_update = False
        if training:
            self.memory.add_weights()
            weights = self.memory.get_weights()
            experiences = self.memory.sample(self.batch_size, weights)
            weights = self.memory.get_importance_weights()
            if weights is None:
                weights = np.ones(self.batch_size)
            result = self.q_learning_minibatch(experiences, weights, is_update)
            return result
        else:
            return None

    def q_learning_minibatch(self, experiences, batch_weights, is_update=True):
        feed_dict = {
            self.state: [experience.state for experience in experiences],
            self.target: [experience.q_val for experience in experiences],
            self.action: [experience.action for experience in experiences],
            self.weights: batch_weights,
            self.training: True,
        }
        if is_update:
            self.sess.run(self.q_optim, feed_dict=feed_dict);
        q_t, loss, error = self.sess.run([self.action_q_val, self.loss, self.error],
                                     feed_dict=feed_dict)
        return q_t, loss, error, is_update
    
    def predict(self, state, ep=None):
        if ep is None:
            ep = self.epsilon.eval(session=self.sess)
        if random.random() < ep:
            action = np.random.randint(0, self.action_dim)
        else:
            action = self.sess.run(self.max_action, 
                                   feed_dict={self.state: [state],
                                              self.training: False})[0]
        return action
    
    def _calc_target(self, observation):
        target_val = 0
        backward = self.delay - 1
        for i in range(len(self.recent_rewards)):
            target_val += (self.gamma) ** i * self.recent_rewards[i]
            backward -= 1
            if recent_terminals[i]:
                break
        state = self.memory.get_delay_state(observation, backward)
        feed_dict = {
            self.state: [state],
            self.training: False}
        max_q_val = self.sess.run(self.max_q_val, feed_dict=feed_dict)[0]
        target_val += self.gamma**(self.delay - backward) * max_q_val
        return target_val
    
    def update_target_q_network(self):
        # We have no operations for updating target network
        pass

In [5]:
from collections import namedtuple

from rltensor.memories import SequentialMemory

QExperience = namedtuple('QExperience', 'state, action, q_val')

class QMemory(SequentialMemory):
    def __init__(self, delay, window_length, limit, *args, **kwargs):
        self.delay = delay
        # Take more observations to make state
        self.dleay_observations = deque(maxlen=self.delay+window_length)
        self.delay_actions = deque(maxlen=self.delay)
        self.delay_terminals = deque(maxlen=self.delay)
        super(DelayMemory, self).__init__(window_length, limit, *args, **kwargs)
        
    def sample(self, batch_size, weights=None, batch_idxs=None):
        if batch_idxs is None:
            if weights is None:
                weights = None
            # Draw random indexes such that we have at least a single entry before each
            # index. Thus, draw samples from [1, self.nb_entries)
            batch_idxs = self._sample_batch_indexes(0, self.nb_entries, batch_size, weights)
        assert np.min(batch_idxs) >= 0
        assert np.max(batch_idxs) < self.nb_entries
        assert len(batch_idxs) == batch_size

        # Create experiences
        experiences = []
        # Each idx is index for state1
        for i, idx in enumerate(batch_idxs):
            state = [self.observations[idx],]
            for offset in xrange(1, self.window_length):
                current_idx = idx - offset
                current_terminal = self.terminals[current_idx] if current_idx >= 0 else False
                if current_idx < 0 or (not self.ignore_episode_boundaries and current_terminal):
                    # The previously handled observation was terminal, don't add the current one.
                    # Otherwise we would leak into a different episode.
                    break
                state.insert(0, self.observations[current_idx])
            # Complete unobserved state with 0
            while len(state) < self.window_length:
                state.insert(0, np.zeros_like(state[0]))
            action = self.actions[idx]
            q_val = self.rewards[idx]
            assert len(state0) == self.window_length
            experiences.append(QExperience(state=state, action=action, q_val=q_val))
        assert len(experiences) == batch_size
        # Keep sampled sampled idx for prioritized sampling
        self.sampled_idx = batch_idxs
        return experiences
    
    def append(self, observation, action, reward, terminal, is_store=True):
        # Reward means Q value just for keeping compatibility
        super(QMemory).append(observation, action, reward, terminal)
        self.delay_observations.append(observation)
        self.delay_actions.append(action)
        self.delay_terminals.append(terminal)
        # This needs to be understood as follows: in `observation`, take `action`, obtain `reward`
        # and weather the next state is `terminal` or not.
        if is_store:
            self.observations.append(self.delay_observations[0])
            self.actions.append(self.delay_actions[0])
            self.rewards.append(reward)
            self.terminals.append(self.delay_terminals[0])
            
    
    def get_delay_state(self, observation=None, backward=0):
        _observations = deepcopy(self.delay_observations)
        if observation is not None:
            _observations.append(observation)
        if backward > 1:
            _observation = _observation[:-backward]
        while len(_observations) < self.window_length:
            _observations.insert(0, np.zeros_like(self.recent_observations[0]))
        # Make sure window length observations
        return np.array(_observations)[-self.window_length:]

In [3]:
for i in range(10):
    i+= 1
    print(i)

1
2
3
4
5
6
7
8
9
10


In [1]:
import tensorflow as tf
import gym

from rltensor.agents import DQN
from rltensor.processors import AtariProcessor
from rltensor.networks import DuelingModel


conf = {"q_conf":[
            {"name": "conv2d", "kernel_size":(8, 8), "num_filter":32, "stride":4,
             "padding": 'SAME', "is_batch":False, 'activation': tf.nn.relu},
            {"name": "conv2d", "kernel_size":(5, 5), "num_filter":64, "stride":2,
             "padding": 'SAME', "is_batch":True, 'activation': tf.nn.relu},
           {"name": "conv2d", "kernel_size": (3, 3), "num_filter":64, "stride":1,
             "padding": 'SAME', "is_batch":True, 'activation': tf.nn.relu},
            {"name": "dense", "is_flatten":True, "is_batch":True, "num_hidden": 512, 'activation': tf.nn.relu},
        ],
        'double_q': True,
        "memory_limit": 100000,
        "window_length": 4,
        "gamma": 0.99,
        "learning_rate": 2.5e-4,
        "learning_rate_minimum": 2.5e-4,
        "learning_rate_decay": 0.9,
        "learning_rate_decay_step": 100,
        "ep": 1e-3,
        "min_r": -np.inf,
        "max_r": np.inf,
        "batch_size": 32,
        "error_clip": 1.0,
        "processor": AtariProcessor(84, 84),
        "t_learn_start": 100,
        "t_train_freq": 4,
        "t_target_q_update_freq": 10000,
        "ep_start": 1.0,
        "ep_end": 0.1,
        "t_ep_end": int(1e6),
        "model_dir": "./dqn_logs",
        "log_freq": 1000,
        "avg_length": 10000,
        "env_name": 'DemonAttack-v0',
        "prioritized": True,
}

conf = {"q_conf":[
            {"name": "conv2d", "kernel_size":(8, 8), "num_filter":32, "stride":4,
             "padding": 'SAME', "is_batch":False, 'activation': tf.nn.relu},
            {"name": "conv2d", "kernel_size":(5, 5), "num_filter":64, "stride":2,
             "padding": 'SAME', "is_batch":False, 'activation': tf.nn.relu},
           {"name": "conv2d", "kernel_size": (3, 3), "num_filter":64, "stride":1,
             "padding": 'SAME', "is_batch":False, 'activation': tf.nn.relu},
            {"name": "dense", "is_flatten":True, "is_batch":False, "num_hidden": 512, 'activation': tf.nn.relu},
        ],
        "env_name": 'Breakout-v0',
        "processor": AtariProcessor(84, 84),
}

env = gym.make('Breakout-v0')
with tf.device('/gpu:1'):
    tf.reset_default_graph()
    dqn = DQN(env, conf, q_network_cls=DuelingModel)
    dqn.train(int(1e7), render_freq=None, save_video_path="./videos")

[2017-08-11 02:42:15,224] Making new env: Breakout-v0


ResourceExhaustedError: OOM when allocating tensor of shape [512] and type float
	 [[Node: q_network/feature_network/layer_3/fully_connected/biases/RMSProp/Initializer/ones = Const[_class=["loc:@q_network/feature_network/layer_3/fully_connected/biases"], dtype=DT_FLOAT, value=Tensor<type: float shape: [512] values: 1 1 1...>, _device="/job:localhost/replica:0/task:0/gpu:0"]()]]

Caused by op 'q_network/feature_network/layer_3/fully_connected/biases/RMSProp/Initializer/ones', defined at:
  File "/home/tomoaki/anaconda3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/tomoaki/anaconda3/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-1-8ac51d688646>", line 61, in <module>
    dqn = DQN(env, conf, q_network_cls=DuelingModel)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/rltensor-0.1.0-py3.6.egg/rltensor/agents/deep_q.py", line 21, in __init__
    super(DQN, self).__init__(env, conf, default_conf, sess)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/rltensor-0.1.0-py3.6.egg/rltensor/agents/agent.py", line 69, in __init__
    self._build_graph()
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/rltensor-0.1.0-py3.6.egg/rltensor/agents/deep_q.py", line 78, in _build_graph
    self.q_optim = self.optimizer.apply_gradients(grads_vars)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/optimizer.py", line 446, in apply_gradients
    self._create_slots([_get_variable_for(v) for v in var_list])
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/rmsprop.py", line 100, in _create_slots
    v.dtype, "rms", self._name)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/optimizer.py", line 749, in _get_or_make_slot_with_initializer
    var, initializer, shape, dtype, op_name)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/slot_creator.py", line 146, in create_slot_with_initializer
    dtype)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/slot_creator.py", line 66, in _create_slot_var
    validate_shape=validate_shape)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 1065, in get_variable
    use_resource=use_resource, custom_getter=custom_getter)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 962, in get_variable
    use_resource=use_resource, custom_getter=custom_getter)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 367, in get_variable
    validate_shape=validate_shape, use_resource=use_resource)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 352, in _true_getter
    use_resource=use_resource)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 725, in _get_single_variable
    validate_shape=validate_shape)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 200, in __init__
    expected_shape=expected_shape)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 278, in _init_from_args
    initial_value(), name="initial_value", dtype=dtype)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 701, in <lambda>
    shape.as_list(), dtype=dtype, partition_info=partition_info)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py", line 108, in __call__
    return array_ops.ones(shape, dtype)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py", line 1492, in ones
    output = constant(one, shape=shape, dtype=dtype, name=name)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/constant_op.py", line 106, in constant
    attrs={"value": tensor_value, "dtype": dtype_value}, name=name).outputs[0]
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2506, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1269, in __init__
    self._traceback = _extract_stack()

ResourceExhaustedError (see above for traceback): OOM when allocating tensor of shape [512] and type float
	 [[Node: q_network/feature_network/layer_3/fully_connected/biases/RMSProp/Initializer/ones = Const[_class=["loc:@q_network/feature_network/layer_3/fully_connected/biases"], dtype=DT_FLOAT, value=Tensor<type: float shape: [512] values: 1 1 1...>, _device="/job:localhost/replica:0/task:0/gpu:0"]()]]


In [3]:
import tensorflow as tf
import gym

from rltensor.agents import DQN
from rltensor.processors import AtariProcessor
from rltensor.networks import DuelingModel


conf = {"q_conf":[
            {"name": "conv2d", "kernel_size":(8, 8), "num_filter":32, "stride":4,
             "padding": 'SAME', "is_batch":False, 'activation': tf.nn.relu},
            {"name": "conv2d", "kernel_size":(5, 5), "num_filter":64, "stride":2,
             "padding": 'SAME', "is_batch":False, 'activation': tf.nn.relu},
           {"name": "conv2d", "kernel_size": (3, 3), "num_filter":64, "stride":1,
             "padding": 'SAME', "is_batch":False, 'activation': tf.nn.relu},
            {"name": "dense", "is_flatten":True, "is_batch":False, "num_hidden": 512, 'activation': tf.nn.relu},
        ],
        "env_name": 'Breakout-v0',
        "processor": AtariProcessor(84, 84),
}

env = gym.make('Breakout-v0')
tf.reset_default_graph()
dqn = DQN(env, conf, q_network_cls=DuelingModel)
dqn.play(num_episode=10, ep=0.05, load_file_path="./breakout_dqn_params/model.ckpt",
         save_video_path="./breakout_videos", render_freq=1)

[2017-07-30 21:57:02,794] Making new env: Breakout-v0
[2017-07-30 21:57:03,411] Finished writing results. You can upload them to the scoreboard via gym.upload('/home/tomoaki/work/Development/RL/breakout_videos')


INFO:tensorflow:Restoring parameters from ./breakout_dqn_params/model.ckpt


[2017-07-30 21:57:03,497] Restoring parameters from ./breakout_dqn_params/model.ckpt
[2017-07-30 21:57:03,531] Clearing 8 monitor files from previous run (because force=True was provided)
[2017-07-30 21:57:03,538] Starting new video recorder writing to /home/tomoaki/work/Development/RL/breakout_videos/openaigym.video.2.3241.video000000.mp4



Model restored.


[2017-07-30 21:57:35,712] Starting new video recorder writing to /home/tomoaki/work/Development/RL/breakout_videos/openaigym.video.2.3241.video000001.mp4

[2017-07-30 22:01:21,034] Starting new video recorder writing to /home/tomoaki/work/Development/RL/breakout_videos/openaigym.video.2.3241.video000008.mp4



In [None]:
if [1, 2, 3]:
    print("hello")

In [None]:
env.action_space.n

In [None]:
y.get_shape().as_list()

In [None]:
count = 4
while count < 5:
    print(count)
    count += 1

In [None]:
"%s" % True

In [None]:
a.insert(0, 2)

In [15]:
a

[2, 1]

In [11]:
np.random.randint(0, 2, 10)

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0])

In [42]:
from collections import deque

In [44]:
x = deque([1, 2, 3], maxlen=5)

In [45]:
x.append(3)
x.append(3)
x.append(3)

In [46]:
x

deque([2, 3, 3, 3, 3])

In [57]:
 result = tf.select(pred, val_if_true, val_if_false)

AttributeError: module 'tensorflow' has no attribute 'select'

In [60]:
x = tf.placeholder(tf.bool, (None,))
y = tf.cast(x, tf.int32)
z = tf.one_hot(y, 2)

In [62]:
sess = tf.InteractiveSession()
print(y.eval(feed_dict={x:[True, False, True]}))
print(z.eval(feed_dict={x:[True, False, True]}))

[1 0 1]
[[ 0.  1.]
 [ 1.  0.]
 [ 0.  1.]]


In [13]:
type(np.arange(10).astype(int)[0])

numpy.int64

In [16]:
type(np.random.choice(range(0, 5), 3)[0])

numpy.int64

In [44]:
np.random.choice([1, 2, 3, 4], 3, False)

array([1, 4, 3])

In [12]:
x = np.arange(10)
np.append(x, 10)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [13]:
x

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])