In [1]:
import rltensor

In [2]:
import numpy as np
from scipy.misc import imresize
from PIL import Image


def resize_data(data, width, height, c_dim=3, is_color=True):
    """Resize every image in ``data`` that has the expected layout.

    Images whose rank (or, for color data, channel count) does not match
    are silently dropped, so the result may hold fewer items than ``data``.

    Args:
        data: iterable of image arrays, each of shape
            (width, height, color_dim) if is_color is True,
            (width, height) otherwise.
        width: first element of the size list handed to ``imresize``.
        height: second element of the size list handed to ``imresize``.
        c_dim: required length of the last axis for color images.
        is_color: keep 3-D color images when True, 2-D gray-scale
            images when False.

    Returns:
        np.ndarray built from the resized images that passed the filter.
    """
    if is_color:
        def _accept(img):
            return len(img.shape) == 3 and img.shape[-1] == c_dim
    else:
        # gray scale data
        def _accept(img):
            return len(img.shape) == 2

    resized = []
    for img in data:
        if _accept(img):
            resized.append(imresize(img, [width, height]))
    return np.array(resized)

In [3]:
import numpy as np
from scipy.misc import imresize
from PIL import Image
from os import listdir
from os.path import join, isfile, isdir
from skimage.color import rgb2gray


class DefaultProcessor(object):
    """Pass-through processor: every hook returns its input unchanged.

    Args:
        input_shape: shape reported by ``get_input_shape``.
    """

    def __init__(self, input_shape):
        self.input_shape = input_shape

    def get_input_shape(self):
        """Return the shape given at construction time."""
        return self.input_shape

    def preprocess(self, observation, action, reward, terminal):
        """Return the transition as a 4-tuple, untouched."""
        transition = (observation, action, reward, terminal)
        return transition

    def tensor_process(self, x):
        """Return ``x`` itself (no tensor-level transformation)."""
        return x

    
class AtariProcessor(DefaultProcessor):
    """Processor that resizes frames, converts them to gray scale and
    clamps rewards to [-1, 1].

    Args:
        height: first size argument forwarded to ``resize_data``.
        width: second size argument forwarded to ``resize_data``.
    """

    def __init__(self, height, width):
        self.height, self.width = height, width
        self.input_shape = (height, width)

    def preprocess(self, observation, action, reward, terminal):
        """Resize + gray-scale the observation and clamp the reward.

        ``action`` and ``terminal`` are returned unchanged.
        """
        resized = resize_data([observation], self.height, self.width)
        gray = rgb2gray(resized[0])
        # Make the same rewards for every game: clamp into [-1, 1].
        lower_bounded = max(-1, reward)
        clamped = min(1, lower_bounded)
        return gray, action, clamped, terminal

    def tensor_process(self, x):
        """Move axis 1 of ``x`` to the last position."""
        # change to (batch, width, height, window_length)
        axis_order = [0, 2, 3, 1]
        return tf.transpose(x, axis_order)

In [4]:
import numpy as np
from six.moves import xrange

from rltensor.memories import SequentialMemory
from rltensor.memories.utils import sample_batch_indexes


class PrioritizedMemory(SequentialMemory):
    """Sequential replay memory that also tracks one priority per entry.

    Priorities are turned into sampling probabilities (``get_weights``)
    and importance-sampling corrections (``get_importance_weights``).

    Args:
        window_length: forwarded to ``SequentialMemory``.
        limit: forwarded to ``SequentialMemory``.
        alpha: exponent applied to priorities when building sampling
            probabilities.
        beta: initial importance-sampling exponent; annealed linearly
            towards 1 over ``annealing_step`` steps.
        annealing_step: number of steps over which ``beta`` is annealed.
        epsilon: small constant added to |error| so no priority is zero.

    NOTE(review): ``sampled_idx`` is never assigned in this class; it is
    presumably set by the parent's ``sample`` — TODO confirm.
    NOTE(review): if the parent overwrites old slots once ``limit`` is
    reached, the priority stored for an overwritten slot stays stale until
    that slot is sampled again — confirm against ``SequentialMemory``.
    """

    def __init__(self, window_length, limit, alpha=0.5, beta=0.5, annealing_step=1e6,
                 epsilon=1e-4, *args, **kwargs):
        super(PrioritizedMemory, self).__init__(window_length, limit, *args, **kwargs)
        self.alpha = alpha
        self.beta = beta
        self.beta_init = beta
        self.annealing_step = annealing_step
        self.epsilon = epsilon
        self.priorities = None
        self.sampled_idx = None

    def _sync_priorities(self):
        """Make ``self.priorities`` hold exactly ``nb_entries`` values.

        Existing priorities are kept. New entries get the current maximum
        priority (or ``epsilon`` when nothing has been stored yet).
        """
        nb_entries = self.nb_entries
        if self.priorities is None:
            self.priorities = np.ones(nb_entries) * self.epsilon
            return
        current = len(self.priorities)
        if current < nb_entries:
            fill = np.max(self.priorities) if current > 0 else self.epsilon
            self.priorities = np.append(
                self.priorities, np.full(nb_entries - current, fill))
        elif current > nb_entries:
            self.priorities = self.priorities[:nb_entries]

    def update_weights(self, step, error=None, *args, **kwargs):
        """Anneal ``beta`` and store new priorities for the sampled entries.

        Args:
            step: current training step, used for the beta schedule.
            error: per-sample errors of the last sampled batch, or None to
                only refresh the schedule and the priority array size.
        """
        # Keep previously learned priorities; the original re-created the
        # array on every call, which reset all of them to epsilon.
        self._sync_priorities()
        self.beta = self._calc_beta(step)
        if error is not None and self.sampled_idx is not None:
            self.priorities[self.sampled_idx] = np.abs(error) + self.epsilon

    def add_weights(self):
        """Extend the priority array to cover newly appended entries."""
        self._sync_priorities()

    def get_weights(self):
        """Return normalized sampling probabilities, or None if unset."""
        # `is None`, not `== None`: the latter compares elementwise on arrays.
        if self.priorities is None:
            return None
        weights = self.priorities ** self.alpha
        return weights / np.sum(weights)

    def get_importance_weights(self, batch_size=None):
        """Return importance-sampling weights for the last sampled batch.

        When no priorities or sampled indices are available, returns
        ``np.ones(batch_size)``, or None if ``batch_size`` is also None.
        """
        if self.priorities is None or self.sampled_idx is None:
            if batch_size is None:
                return None
            return np.ones(batch_size)
        weights = self.priorities[self.sampled_idx] ** (-self.alpha * self.beta)
        return weights / np.max(weights)

    def _calc_beta(self, step):
        """Linearly anneal beta from ``beta_init`` (step 0) to 1."""
        if self.annealing_step <= 0:
            return 1.0
        # Clamp so beta stays in [beta_init, 1] for any step value.
        fraction = min(1.0, max(0.0, float(step) / self.annealing_step))
        return self.beta_init + (1 - self.beta_init) * fraction

In [5]:
import os
import time
from collections import deque
from copy import deepcopy
from logging import getLogger

import numpy as np
import tensorflow as tf
from gym import wrappers
from six.moves import xrange
from tqdm import tqdm

from rltensor.memories import SequentialMemory

logger = getLogger(__name__)

class Agent(object):
    """Base class for agents trained step by step against a gym-style env.

    Subclasses must implement ``_build_graph``, ``predict`` and ``observe``.
    ``train`` additionally expects ``_build_graph`` to define
    ``self.step_update_op``, ``self.update_op``, ``self.epsilon`` and
    ``self.learning_rate_op`` and to call ``_build_summaries``.

    Args:
        env: environment exposing ``reset``, ``step``, ``render`` and an
            ``action_space`` with ``n`` and ``sample``.
        conf: dict of hyper-parameters; the keys read are listed in
            ``__init__``. An optional ``"load_file_path"`` key restores
            saved parameters after the graph is built.
        sess: optional ``tf.Session``; a new one is created when omitted.
    """

    def __init__(self, env, conf, sess=None):
        if sess is None:
            sess = tf.Session()
        self.sess = sess
        self.conf = conf
        self.model_dir = conf["model_dir"]
        self.limit = conf["memory_limit"]
        self.window_length = conf["window_length"]
        self.memory = self._get_memory(self.window_length, self.limit, conf["prioritized"])
        self.gamma = conf["gamma"]
        self.error_clip = conf["error_clip"]
        self.processor = conf["processor"]
        # Exploration schedule (see _get_epsilon)
        self.ep_start = conf["ep_start"]
        self.ep_end = conf["ep_end"]
        self.t_ep_end = conf["t_ep_end"]
        self.t_learn_start = conf["t_learn_start"]
        self.t_train_freq = conf["t_train_freq"]
        self.t_target_q_update_freq = conf["t_target_q_update_freq"]
        # Get input and action dim info from env
        self.env = env
        self.env_name = conf["env_name"]
        self.state_dim = self.processor.get_input_shape()
        self.action_dim = env.action_space.n
        # configure for learning schedule
        self.learning_rate = conf["learning_rate"]
        self.learning_rate_minimum = conf["learning_rate_minimum"]
        self.learning_rate_decay = conf["learning_rate_decay"]
        self.learning_rate_decay_step = conf["learning_rate_decay_step"]
        self.global_step = tf.Variable(0, trainable=False)
        # reward is in (min_r, max_r)
        self.min_r = conf["min_r"]
        self.max_r = conf["max_r"]
        self.batch_size = conf["batch_size"]
        self.log_freq = conf["log_freq"]
        self.avg_length = conf["avg_length"]
        # Build tensorflow network
        st = time.time()
        logger.debug("Building tensorflow graph...")
        with self.sess.as_default():
            self._build_graph()
            self.saver = tf.train.Saver()
        # The elapsed time needs a placeholder, otherwise logging reports a
        # formatting error instead of the message.
        logger.debug("Finished building tensorflow graph, spent time: %s", time.time() - st)
        # NOTE(review): train() runs the global variable initializer, which
        # presumably overwrites anything restored here; pass load_file_path
        # to train() as well if the restored values must survive.
        if "load_file_path" in conf:
            self.load_params(conf["load_file_path"])

    def _get_memory(self, window_length, limit, is_prioritized=True):
        """Return a PrioritizedMemory or a plain SequentialMemory."""
        if is_prioritized:
            return PrioritizedMemory(window_length, limit)
        else:
            return SequentialMemory(window_length, limit)

    def _build_graph(self):
        """Build the TensorFlow graph; must be provided by subclasses."""
        raise NotImplementedError()

    def train(self, t_max, num_max_start_steps=0, save_file_path=None,
              load_file_path=None, save_video_path=None, overwrite=True, render_freq=None):
        """Run the interaction/training loop for ``t_max`` environment steps.

        Args:
            t_max: number of loop iterations (environment steps).
            num_max_start_steps: exclusive upper bound on the number of
                random actions taken before recording starts; 0 disables.
            save_file_path: checkpoint path handed to ``save_params``.
            load_file_path: if given, parameters are restored from it after
                variable initialization.
            save_video_path: if given, the env is wrapped in
                ``gym.wrappers.Monitor`` writing to this directory.
            overwrite: forwarded to ``save_params`` and to Monitor's
                ``force`` argument.
            render_freq: render the env every ``render_freq`` steps, or
                never when None.

        A KeyboardInterrupt stops the loop; parameters are saved afterwards
        either way.
        """
        tf.global_variables_initializer().run(session=self.sess)
        if load_file_path is not None:
            self.load_params(load_file_path)
        # Save Model
        self.save_params(save_file_path, overwrite)
        # Record video
        if save_video_path is not None:
            self.env = wrappers.Monitor(self.env, save_video_path, force=overwrite)
        # initialize target network
        self.update_target_q_network()
        # initialize environment (always through self.env, never a global)
        observation = self.env.reset()
        action = self.env.action_space.sample()
        # Perform random starts at beginning of episode and do not record them into the experience.
        # This slightly changes the start position between games.
        if num_max_start_steps == 0:
            num_random_start_steps = 0
        else:
            num_random_start_steps = np.random.randint(num_max_start_steps)
        for _ in xrange(num_random_start_steps):
            action = self.env.action_space.sample()
            observation, reward, terminal, info = self.env.step(action)
            observation = deepcopy(observation)
            if terminal:
                # Do not keep stepping an episode that already ended.
                observation = self.env.reset()
        # initialize memory
        terminal = False
        reward = 0
        observation, action, reward, terminal = self.processor.preprocess(observation, action, reward, terminal)
        self.memory.append(observation, action, reward, terminal, is_store=True)
        # accumulate results
        total_reward = deque(maxlen=self.avg_length)
        total_loss = deque(maxlen=self.avg_length)
        total_q_val = deque(maxlen=self.avg_length)
        ep_rewards = []
        ep_losses = []
        ep_q_vals = []
        ep_actions = []
        num_ep = 1
        step = self.global_step.eval(session=self.sess)
        st = time.time()
        _st = st
        for t in tqdm(xrange(t_max)):
            try:
                # 1. predict
                state = self.memory.get_recent_state()
                action = self.predict(state)
                # 2. act
                observation, reward, terminal, info = self.env.step(action)
                observation, action, reward, terminal\
                    = self.processor.preprocess(observation, action, reward, terminal)
                # 3. store data and train network
                if t < self.t_learn_start:
                    # Warm-up: only fill the memory, no optimization.
                    # NOTE(review): `continue` also skips the terminal
                    # handling below, so an episode ending during warm-up
                    # is not reset — confirm t_learn_start is shorter than
                    # an episode.
                    self.observe(observation, action, reward, terminal, False)
                    self.memory.update_weights(step)
                    continue
                else:
                    result = self.observe(observation, action, reward, terminal, True)
                q, loss, error, is_update = result
                logger.debug("a: %d, r:%f, t:%s, q:%.4f, l: %.4f",
                             action, reward, terminal, np.mean(q), loss)
                # Update step
                self.sess.run(self.step_update_op)
                step = self.global_step.eval(session=self.sess)
                self.memory.update_weights(step, error)
                # Update target network
                if (step + 1) % self.t_target_q_update_freq == 0:
                    self.update_target_q_network()
                # update statistics
                total_reward.append(reward)
                total_loss.append(loss)
                total_q_val.append(np.mean(q))
                ep_actions.append(action)
                ep_rewards.append(reward)
                ep_losses.append(loss)
                ep_q_vals.append(np.mean(q))
                # Visualize results
                if render_freq is not None:
                    if step % render_freq == 0:
                        self.env.render()
                # Write summary
                if step % self.log_freq == 0:
                    num_per_sec = self.log_freq / (time.time() - _st)
                    _st = time.time()
                    epsilon = self.epsilon.eval(session=self.sess)
                    learning_rate = self.learning_rate_op.eval(session=self.sess)
                    avg_r = np.mean(total_reward)
                    avg_loss = np.mean(total_loss)
                    avg_q_val = np.mean(total_q_val)
                    tag_dict = {'episode.num_of_game': num_ep,
                                'average.reward': avg_r,
                                'average.loss': avg_loss,
                                'average.q': avg_q_val,
                                'training.epsilon': epsilon,
                                'training.learning_rate': learning_rate,
                                'training.num_step_per_sec': num_per_sec,
                                'training.time': time.time() - st
                                }
                    self._inject_summary(tag_dict, step)

                if terminal:
                    try:
                        cum_ep_reward = np.sum(ep_rewards)
                        max_ep_reward = np.max(ep_rewards)
                        min_ep_reward = np.min(ep_rewards)
                        avg_ep_reward = np.mean(ep_rewards)
                    except ValueError:
                        # np.max/np.min reject an empty reward list; a bare
                        # except here would also swallow KeyboardInterrupt.
                        cum_ep_reward, max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0, 0
                    # NOTE(review): 'episode.actions' is fed the memory
                    # priorities rather than ep_actions; this requires a
                    # memory object exposing `priorities`.
                    tag_dict = {'episode.cumulative_reward': cum_ep_reward,
                                'episode.max_reward': max_ep_reward,
                                'episode.min_reward': min_ep_reward,
                                'episode.avg_reward': avg_ep_reward,
                                'episode.rewards': ep_rewards,
                                # 'episode.actions': ep_actions}
                                'episode.actions': self.memory.priorities}
                    self._inject_summary(tag_dict, num_ep)
                    observation = self.env.reset()
                    observation, action, reward, terminal\
                        = self.processor.preprocess(observation, None, 0, False)
                    self.memory.reset()
                    self.memory.append(observation, action, reward, terminal, is_store=False)
                    ep_rewards = []
                    ep_losses = []
                    ep_q_vals = []
                    ep_actions = []
                    num_ep += 1
            except KeyboardInterrupt:
                break
        # Update parameters before finishing
        self.save_params(save_file_path, True)

    def predict(self, s_t, ep=None):
        """Choose an action for state ``s_t``; must be provided by subclasses.

        ``ep`` is optional so that the single-argument call made by
        ``train`` matches this signature.
        """
        raise NotImplementedError()

    def _get_learning_rate(self):
        """Return the staircase-decayed learning rate, floored at the minimum."""
        learning_rate_op = tf.maximum(self.learning_rate_minimum,
          tf.train.exponential_decay(
              self.learning_rate,
              self.global_step,
              self.learning_rate_decay_step,
              self.learning_rate_decay,
              staircase=True))
        return learning_rate_op

    def _get_epsilon(self):
        """Return epsilon, decayed linearly from ep_start to ep_end.

        The decay spans ``t_ep_end`` global steps counted from
        ``t_learn_start``; afterwards epsilon stays at ``ep_end``.
        """
        rest_steps  = tf.maximum(0.,
            self.t_ep_end - tf.maximum(0., tf.cast(self.global_step - self.t_learn_start, tf.float32)))
        delta_ep = max(0, self.ep_start - self.ep_end)
        epsilon = self.ep_end + delta_ep * rest_steps / self.t_ep_end
        return epsilon

    def update_target_q_network(self):
        """Run ``self.update_op`` (defined by the subclass graph)."""
        self.sess.run(self.update_op)

    def _build_summaries(self):
        """Create the summary writer plus one placeholder/op per logged tag."""
        self.writer = tf.summary.FileWriter(self.model_dir, self.sess.graph)
        self.summary_placeholders = {}
        self.summary_ops = {}
        scalar_summary_tags = [
                'average.reward', 'average.loss', 'average.q', 'episode.cumulative_reward',
                'episode.max_reward', 'episode.min_reward', 'episode.avg_reward',
                'episode.num_of_game', 'training.epsilon', 'training.learning_rate',
                'training.num_step_per_sec', 'training.time']
        for tag in scalar_summary_tags:
            self.summary_placeholders[tag] = tf.placeholder('float32', None, name=tag.replace(' ', '_'))
            self.summary_ops[tag] =\
                tf.summary.scalar("%s/%s" % (self.env_name, tag), self.summary_placeholders[tag])

        histogram_summary_tags = ['episode.rewards', 'episode.actions']
        for tag in histogram_summary_tags:
            self.summary_placeholders[tag] = tf.placeholder('float32', None, name=tag.replace(' ', '_'))
            self.summary_ops[tag]  = tf.summary.histogram(tag, self.summary_placeholders[tag])

    def _inject_summary(self, tag_dict, step):
        """Evaluate the summary ops for ``tag_dict`` and write them at ``step``.

        Args:
            tag_dict: mapping from summary tag to the value to log; every
                tag must have been registered in ``_build_summaries``.
            step: x-axis value recorded with each summary.
        """
        summary_str_lists = self.sess.run([self.summary_ops[tag] for tag in tag_dict.keys()], {
          self.summary_placeholders[tag]: value for tag, value in tag_dict.items()
        })
        for summary_str in summary_str_lists:
            self.writer.add_summary(summary_str, step)

    def load_params(self, file_path):
        """Loads parameters of an estimator from a file.

        Args:
            file_path: str, The path to the file.
        """
        self.saver.restore(self.sess, file_path)
        print("Model restored.")

    def save_params(self, file_path=None, overwrite=True):
        """Saves parameters of an estimator as a file.

        Args:
            file_path: str, The path to where the parameters should be saved.
                Defaults to "params/model.ckpt" (the directory is created
                if missing).
            overwrite: bool, If `False` and `file_path` already exists, raises an error.

        Raises:
            NameError: if ``overwrite`` is False and "<file_path>.meta"
                already exists.
        """
        if file_path is None:
            if not os.path.isdir("params"):
                os.mkdir("params")
            file_path = "params/model.ckpt"
        if not overwrite:
            _path = ".".join([file_path, "meta"])
            if os.path.isfile(_path):
                raise NameError("%s already exists." % file_path)
        save_path = self.saver.save(self.sess, file_path)
        print("Model saved in file: %s" % save_path)

In [6]:
import os
import time
import numpy as np
import tensorflow as tf
from logging import getLogger
import random

# from .agent import Agent
from rltensor.utils import get_shape

logger = getLogger(__name__)


class DQN(Agent):
    """Deep Q-Network agent with optional Double-Q targets.

    Args:
        env: forwarded to ``Agent``.
        conf: forwarded to ``Agent``; additionally reads ``"double_q"``
            and ``"q_conf"``.
        q_network_cls: callable ``(action_dim, q_conf, scope_name=...)``
            returning a network object that maps ``(state, training)`` to
            Q-values and exposes ``variables``.
        sess: forwarded to ``Agent``.
    """

    def __init__(self, env, conf, q_network_cls, sess=None):
        # Both attributes must exist before Agent.__init__, which calls
        # _build_graph().
        self.q_network_cls = q_network_cls
        # Strategy configure
        self.double_q = conf["double_q"]
        super(DQN, self).__init__(env, conf, sess)

    def _build_graph(self):
        """Build the online/target networks, the loss and the optimizer.

        For convenience of training, the online and target networks get
        separate state placeholders (``state`` / ``target_state``).
        """
        self.step_update_op = tf.assign(self.global_step, self.global_step + 1)
        # training flag
        self.training = tf.placeholder(tf.bool, name="training")
        # state shape has to be (batch, length,) + input_dim
        self.state = tf.placeholder(tf.float32,
                                     get_shape(self.state_dim, maxlen=self.window_length),
                                     name='state')
        _state = self.processor.tensor_process(self.state)
        self.target_state = tf.placeholder(tf.float32,
                                            get_shape(self.state_dim, maxlen=self.window_length),
                                            name='target_state')
        _target_state = self.processor.tensor_process(self.target_state)
        # Employ maximal strategy
        self.q_network = self.q_network_cls(self.action_dim, self.conf["q_conf"], scope_name="q_network")
        self.q_val = self.q_network(_state, self.training)
        assert self.q_val.get_shape().as_list()[-1] == self.action_dim
        self.max_action = tf.argmax(self.q_val, axis=1)
        # Build action graph
        self.action = tf.placeholder(tf.int32, (None,), name='action')
        action_one_hot = tf.one_hot(self.action, depth=self.action_dim)
        self.action_q_val = tf.reduce_sum(self.q_val * action_one_hot, axis=1)
        # Build target network
        self.target_q_network = self.q_network_cls(self.action_dim, self.conf["q_conf"],
                                                   scope_name="target_q_network")
        target_q_val = self.target_q_network(_target_state, self.training)
        self.reward = tf.placeholder(tf.float32, (None,), name='reward')
        if self.double_q:
            # Double Q-learning: the online network selects the action for
            # the NEXT state, the target network evaluates it. The original
            # reused the argmax over the current state.
            # NOTE(review): assumes the network object can be called a
            # second time with variable reuse, as DuelingModel in this
            # notebook does — confirm for other q_network_cls.
            next_q_val = self.q_network(_target_state, self.training)
            next_max_action = tf.argmax(next_q_val, axis=1)
            max_one_hot = tf.one_hot(next_max_action, depth=self.action_dim)
            max_q_val = tf.reduce_sum(target_q_val * max_one_hot, axis=1)
        else:
            max_q_val = tf.reduce_max(target_q_val, axis=1)
        # Use only the reward as the signal when terminal=True
        reward_q = self.reward  + self.gamma * max_q_val
        self.terminal = tf.placeholder(tf.bool, (None,), name="terminal")
        # Column 0 (terminal=False) is the bootstrapped target, column 1
        # (terminal=True) is the bare reward; the one-hot picks one per row.
        _target_val = tf.concat([tf.expand_dims(reward_q, 1), tf.expand_dims(self.reward, 1)], axis=1)
        onehot_terminal = tf.one_hot(tf.cast(self.terminal, tf.int32),  2)
        target_val = tf.reduce_sum(_target_val * onehot_terminal, axis=1)
        # Clip error to stabilize learning (Huber loss: quadratic inside
        # the clip range, linear outside, continuous at the boundary).
        self.error = target_val - self.action_q_val
        clipped_error = tf.where(tf.abs(self.error) < self.error_clip,
                                    0.5 * tf.square(self.error),
                                    self.error_clip * (tf.abs(self.error) - 0.5 * self.error_clip),
                                    name='clipped_error')
        self.weights = tf.placeholder(tf.float32, (None,), name="importance_weights")
        # This weighted, clipped loss is the one that is optimized; the
        # original overwrote it with an unweighted squared error, so the
        # importance weights and the clipping had no effect.
        self.loss = tf.reduce_mean(self.weights * clipped_error, name='loss')
        # Build optimization
        self.learning_rate_op = self._get_learning_rate()
        self.epsilon = self._get_epsilon()
        self.q_optim = tf.train.AdamOptimizer(self.learning_rate_op)\
            .minimize(self.loss, var_list=self.q_network.variables)
        self.update_op = self._get_update_op()
        with tf.name_scope('summaries'):
            self._build_summaries()

    def observe(self, observation, action, reward, terminal, training):
        """Store a transition and, when ``training``, run one minibatch.

        Returns:
            ``(q, loss, error, is_update)`` from ``q_learning_minibatch``
            when ``training`` is true, otherwise None.
        """
        # clip reward into  (min_r, max_r)
        reward = max(self.min_r, min(self.max_r, reward))
        # We always keep data
        self.memory.append(observation, action, reward, terminal, is_store=True)
        if not training:
            return None
        step = self.global_step.eval(session=self.sess)
        # Optimize once every t_train_freq steps (the original test was
        # inverted and optimized on every step EXCEPT those).
        is_update = (step + 1) % self.t_train_freq == 0
        self.memory.add_weights()
        weights = self.memory.get_weights()
        experiences = self.memory.sample(self.batch_size, weights)
        weights = self.memory.get_importance_weights(self.batch_size)
        if weights is None:
            weights = np.ones(self.batch_size)
        return self.q_learning_minibatch(experiences, weights, is_update)

    def q_learning_minibatch(self, experiences, batch_weights, is_update=True):
        """Evaluate (and optionally optimize) the loss on one minibatch.

        Args:
            experiences: iterable of objects with ``state0``, ``state1``,
                ``reward``, ``action`` and ``terminal1`` attributes.
            batch_weights: per-sample importance weights.
            is_update: run the optimizer before evaluating when True.

        Returns:
            Tuple ``(q_t, loss, error, is_update)``; the first three are
            evaluated after the optional optimizer step.
        """
        # NOTE(review): the training flag is fed as False even for the
        # optimizer step — confirm this is intended for batch-norm layers.
        feed_dict = {
            self.state: [experience.state0 for experience in experiences],
            self.target_state: [experience.state1 for experience in experiences],
            self.reward: [experience.reward for experience in experiences],
            self.action: [experience.action for experience in experiences],
            self.terminal: [experience.terminal1 for experience in experiences],
            self.weights: batch_weights,
            self.training: False,
        }
        if is_update:
            self.sess.run(self.q_optim, feed_dict=feed_dict)
        q_t, loss, error = self.sess.run([self.action_q_val, self.loss, self.error],
                                     feed_dict=feed_dict)
        return q_t, loss, error, is_update

    def predict(self, state):
        """Epsilon-greedy action: random with probability epsilon, else argmax Q."""
        ep = self.epsilon.eval(session=self.sess)
        if random.random() < ep:
            action = np.random.randint(0, self.action_dim)
        else:
            action = self.sess.run(self.max_action,
                                   feed_dict={self.state: [state],
                                              self.training: False})[0]
        return action

    def _get_update_op(self):
        """Return assign ops copying each online variable to its target twin."""
        return [tf.assign(target_var, var)
                for target_var, var in zip(self.target_q_network.variables,
                                           self.q_network.variables)]

In [7]:
from rltensor.networks import FeedForward

class DuelingModel(FeedForward):
    """Dueling head: shared features feed a state-value stream and an
    advantage stream, combined as ``V + (A - mean(A))``.

    Args:
        output_dim: number of outputs of the advantage stream.
        model_params: layer configuration of the feature network;
            defaults to ``mlp_conf["model"]``.
        scope_name: variable scope name; defaults to "dueling".
    """

    def __init__(self, output_dim, model_params=None, scope_name=None, *args, **kwargs):
        self.output_dim = output_dim
        model_params = mlp_conf["model"] if model_params is None else model_params
        scope_name = "dueling" if scope_name is None else scope_name
        super().__init__(model_params, scope_name)
        advantage_layers = [{"name": "dense", "num_hidden": 512},
                            {"name": "dense", "num_hidden": output_dim},]
        state_layers = [{"name": "dense", "num_hidden": 512},
                        {"name": "dense", "num_hidden": 1},]
        self.feature_model = FeedForward(model_params, scope_name="feature_network")
        self.advantage_model = FeedForward(advantage_layers, scope_name="advantage_network")
        self.state_model = FeedForward(state_layers, scope_name="state_network")

    def __call__(self, x, training=True):
        """Return the combined output for input ``x``.

        On the first call (``self.reuse is False``) the trainable variables
        created under this scope are recorded in ``self.variables``.
        """
        with tf.variable_scope(self.scope_name, reuse=self.reuse):
            features = self.feature_model(x, training)
            advantage = self.advantage_model(features, training)
            state = self.state_model(features, training)
            if self.reuse is False:
                self.global_scope_name = tf.get_variable_scope().name
                self.variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.global_scope_name)
            # Repeat the per-row mean / state value across all outputs.
            repeats = [1, self.output_dim]
            row_mean = tf.reduce_mean(advantage, axis=1, keep_dims=True)
            advantage = advantage - tf.tile(row_mean, repeats)
            state = tf.tile(state, repeats)
        self.reuse = True
        return state + advantage
            
            
class MLPModel(FeedForward):
    """Feed-forward network with a final dense layer of ``output_dim`` units.

    Args:
        output_dim: number of units of the appended output layer.
        model_params: list of layer configurations placed before the
            output layer; defaults to ``mlp_conf["model"]``. The list is
            not modified.
        scope_name: variable scope name; defaults to "mlp".
    """

    def __init__(self, output_dim, model_params=None, scope_name=None, *args, **kwargs):
        self.output_dim = output_dim
        if model_params is None:
            model_params = mlp_conf["model"]
        if scope_name is None:
            scope_name = "mlp"
        # Build a new list instead of appending in place: the caller's
        # list is typically a shared config reused for several networks,
        # and in-place appends would stack one extra output layer per
        # instantiation.
        layers = list(model_params) + [{"name": "dense", "num_hidden": output_dim}]
        super().__init__(layers, scope_name)

In [None]:
from rltensor.networks import FeedForward
import gym

# Build the Atari environment and train a Double-DQN agent with a dueling
# network and prioritized replay. The final call runs for int(1e7) steps.
env = gym.make('DemonAttack-v0')

# "q_conf" is the layer list handed to the network class; every other key
# is read by Agent.__init__ / DQN.__init__.
conf = {"q_conf":[
            {"name": "conv2d", "kernel_size":(8, 8), "num_filter":32, "stride":4,
             "padding": 'SAME', "is_batch":False, 'activation': tf.nn.relu},
            {"name": "conv2d", "kernel_size":(5, 5), "num_filter":64, "stride":2,
             "padding": 'SAME', "is_batch":True, 'activation': tf.nn.relu},
           {"name": "conv2d", "kernel_size": (3, 3), "num_filter":64, "stride":1,
             "padding": 'SAME', "is_batch":True, 'activation': tf.nn.relu},
            {"name": "dense", "is_flatten":True, "is_batch":True, "num_hidden": 512, 'activation': tf.nn.relu},
        ],
        'double_q': True,
        "memory_limit": 100000,
        "window_length": 4,
        "gamma": 0.99,
        # learning_rate_minimum equals learning_rate, so the tf.maximum floor
        # in Agent._get_learning_rate keeps the rate constant at 2.5e-4.
        "learning_rate": 2.5e-4,
        "learning_rate_minimum": 2.5e-4,
        "learning_rate_decay": 0.9,
        "learning_rate_decay_step": 100,
        # NOTE(review): "ep" is not read by the Agent/DQN code visible in
        # this notebook (only ep_start / ep_end / t_ep_end are).
        "ep": 1e-3,
        "min_r": -1,
        "max_r": 1,
        "batch_size": 32,
        "error_clip": 1.0,
        "processor": AtariProcessor(84, 84),
        "t_learn_start": 100,
        "t_train_freq": 4,
        "t_target_q_update_freq": 10000,
        "ep_start": 1.0,
        "ep_end": 0.1,
        "t_ep_end": int(1e6),
        "model_dir": "./logs/dqn",
        "log_freq": 1000,
        "avg_length": 10000,
        "env_name": 'DemonAttack-v0',
        "prioritized": True,
}
# logger.setLevel("DEBUG")
# Start from an empty default graph so re-running this cell does not pile
# new ops onto the graph left by a previous run.
tf.reset_default_graph()
dqn = DQN(env, conf, q_network_cls=DuelingModel)
dqn.train(int(1e7), render_freq=None, save_video_path="./videos")

[2017-07-20 01:29:18,539] Making new env: DemonAttack-v0
[2017-07-20 01:29:19,632] Finished writing results. You can upload them to the scoreboard via gym.upload('/home/tomoaki/work/Development/RL/videos')
[2017-07-20 01:29:19,976] Clearing 6 monitor files from previous run (because force=True was provided)
[2017-07-20 01:29:19,991] Starting new video recorder writing to /home/tomoaki/work/Development/RL/videos/openaigym.video.1.21792.video000000.mp4


  0%|          | 0/10000000 [00:00<?, ?it/s][A
  0%|          | 67/10000000 [00:00<4:09:42, 667.43it/s]

Model saved in file: params/model.ckpt



  0%|          | 101/10000000 [00:00<6:13:22, 446.37it/s][A
  0%|          | 123/10000000 [00:00<20:05:07, 138.30it/s][A
  0%|          | 2775/10000000 [00:49<56:24:29, 49.23it/s][2017-07-20 01:30:09,713] Starting new video recorder writing to /home/tomoaki/work/Development/RL/videos/openaigym.video.1.21792.video000001.mp4
  0%|          | 15077/10000000 [04:43<54:01:53, 51.33it/s][2017-07-20 01:34:04,151] Starting new video recorder writing to /home/tomoaki/work/Development/RL/videos/openaigym.video.1.21792.video000008.mp4
  0%|          | 44030/10000000 [14:44<59:51:31, 46.20it/s][2017-07-20 01:44:04,700] Starting new video recorder writing to /home/tomoaki/work/Development/RL/videos/openaigym.video.1.21792.video000027.mp4
  1%|          | 93585/10000000 [34:31<72:14:58, 38.09it/s][2017-07-20 02:03:51,946] Starting new video recorder writing to /home/tomoaki/work/Development/RL/videos/openaigym.video.1.21792.video000064.mp4
  2%|▏         | 171929/10000000 [1:10:29<78:26:44, 34.80

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Inspect the distribution of replay priorities held by the trained agent.
plt.hist(dqn.memory.priorities, bins=100)

In [None]:
# Scratch check: a non-empty list is truthy, so this prints.
if [1, 2, 3]:
    print("hello")

In [None]:
env.action_space.n

In [None]:
y.get_shape().as_list()

In [None]:
# Scratch check: the loop body runs once (count goes 4 -> 5).
count = 4
while count < 5:
    print(count)
    count += 1

In [None]:
"%s" % True

In [None]:
a.insert(0, 2)

In [15]:
a

[2, 1]

In [11]:
np.random.randint(0, 2, 10)

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0])

In [42]:
from collections import deque

In [44]:
x = deque([1, 2, 3], maxlen=5)

In [45]:
x.append(3)
x.append(3)
x.append(3)

In [46]:
x

deque([2, 3, 3, 3, 3])

In [57]:
 result = tf.select(pred, val_if_true, val_if_false)

AttributeError: module 'tensorflow' has no attribute 'select'

In [60]:
# Scratch check of the bool -> int -> one-hot chain that DQN._build_graph
# uses to select between the terminal and non-terminal target.
x = tf.placeholder(tf.bool, (None,))
y = tf.cast(x, tf.int32)
z = tf.one_hot(y, 2)

In [62]:
# Evaluate the tensors above: True maps to index 1, False to index 0.
sess = tf.InteractiveSession()
print(y.eval(feed_dict={x:[True, False, True]}))
print(z.eval(feed_dict={x:[True, False, True]}))

[1 0 1]
[[ 0.  1.]
 [ 1.  0.]
 [ 0.  1.]]


In [13]:
type(np.arange(10).astype(int)[0])

numpy.int64

In [16]:
type(np.random.choice(range(0, 5), 3)[0])

numpy.int64

In [44]:
np.random.choice([1, 2, 3, 4], 3, False)

array([1, 4, 3])

In [12]:
# Scratch check: np.append returns a new array; x itself is left unchanged
# (the next cell shows x still ends at 9).
x = np.arange(10)
np.append(x, 10)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [13]:
x

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])