In [None]:
from tqdm import tqdm
import tensorflow as tf
import time
from collections import deque
from six.moves import xrange
from gym import wrappers
from copy import deepcopy
import numpy as np


class Runner(object):
    """Run an agent against a gym-style environment and log to TensorBoard.

    The runner owns the interaction loop (`fit` for training, `play` for
    evaluation) and the TensorBoard summary plumbing. The learning logic
    itself lives in ``agent``.
    """

    def __init__(self, agent, env, env_name="env",
                 tensorboard_dir="./logs", scalar_summary_tags=None,
                 histogram_summary_tags=None, load_file_path=None,
                 *args, **kwargs):
        """Store the collaborators and the summary tag lists.

        Args:
            agent: object exposing the agent API used by `fit`/`play`
                (``sess``, ``observe``, ``predict``, ``memory``, ...).
            env: gym-style environment (``reset``, ``step``, ``render``,
                ``action_space.sample``).
            env_name: prefix used for scalar summary names.
            tensorboard_dir: directory handed to the summary FileWriter.
            scalar_summary_tags: tags to create scalar summaries for;
                ``None`` selects the built-in list below.
            histogram_summary_tags: tags to create histogram summaries for;
                ``None`` selects the built-in list below.
            load_file_path: if not ``None``, parameters are loaded from here
                at the start of `fit`.
            *args, **kwargs: accepted and ignored.
        """
        self.agent = agent
        self.env = env
        self.env_name = env_name
        self.tensorboard_dir = tensorboard_dir

        if scalar_summary_tags is None:
            scalar_summary_tags = [
                'average.reward', 'average.loss', 'average.q',
                'episode.cumulative_reward', 'episode.max_reward',
                'episode.min_reward', 'episode.avg_reward',
                'episode.num_of_game', 'training.epsilon',
                'training.learning_rate',
                'training.num_step_per_sec', 'training.time']
        self.scalar_summary_tags = scalar_summary_tags

        if histogram_summary_tags is None:
            histogram_summary_tags = ['episode.rewards', 'episode.actions']
        self.histogram_summary_tags = histogram_summary_tags

        self.load_file_path = load_file_path

        # Summary objects are created by _build_summaries(); keeping explicit
        # "not built yet" values lets play() detect that it must build them.
        self.writer = None
        self.summary_placeholders = {}
        self.summary_ops = {}

    def fit(self, t_max, num_max_start_steps=0,
            save_file_path=None,
            save_video_path=None, overwrite=True,
            render_freq=None,
            log_freq=10,
            avg_length=1000):
        """Train the agent for ``t_max`` environment steps.

        Args:
            t_max: number of iterations of the main interaction loop.
            num_max_start_steps: exclusive upper bound on the number of
                random actions taken before the loop starts; 0 disables
                the random start.
            save_file_path: forwarded to ``agent.save_params`` before and
                after training.
            save_video_path: if not ``None``, the environment is wrapped in
                ``gym.wrappers.Monitor`` writing to this directory.
            overwrite: forwarded to the initial ``save_params`` call and used
                as ``force`` for the Monitor wrapper.
            render_freq: render whenever ``global_step % render_freq == 0``;
                ``None`` disables rendering.
            log_freq: write running-average summaries whenever
                ``global_step % log_freq == 0``; ``None`` disables them.
            avg_length: window size of the running averages.

        A ``KeyboardInterrupt`` stops the loop early; parameters are still
        saved afterwards.
        """
        self._initialization()
        # Save model
        self.agent.save_params(save_file_path, overwrite)
        # Record video
        if save_video_path is not None:
            self.env = wrappers.Monitor(self.env,
                                        save_video_path,
                                        force=overwrite)
        # Initialize target network
        self.agent.init_update()
        # Initialize environment
        observation = self.env.reset()
        action = self.env.action_space.sample()
        terminal = False
        reward = 0
        # A random number of random actions slightly changes the start
        # position between games.
        if num_max_start_steps == 0:
            num_random_start_steps = 0
        else:
            num_random_start_steps = np.random.randint(num_max_start_steps)
        for _ in xrange(num_random_start_steps):
            action = self.env.action_space.sample()
            observation, reward, terminal, _info = self.env.step(action)
            observation = deepcopy(observation)
            if terminal:
                observation = self.env.reset()
        self.agent.observe(observation, action, reward, terminal,
                           training=False, is_store=False)

        # Running windows for the periodic summaries.
        total_reward = deque(maxlen=avg_length)
        total_loss = deque(maxlen=avg_length)
        total_q_val = deque(maxlen=avg_length)
        # Rewards of the episode currently in progress.
        ep_rewards = []
        num_ep = 1
        start_time = time.time()
        last_log_time = start_time
        try:
            for t in tqdm(xrange(t_max)):
                # Update step
                self.agent.update_step()
                # 1. predict
                state = self.agent.get_recent_state()
                action = self.agent.predict(state)
                # 2. act
                observation, reward, terminal, _info = self.env.step(action)
                # 3. store data and train network
                if t < self.agent.t_learn_start:
                    # Warm-up: only fill the memory, no training, no stats.
                    self.agent.observe(observation, action, reward,
                                       terminal, training=False,
                                       is_store=True)
                    if terminal:
                        observation = self.env.reset()
                    continue
                q, loss, _error, _is_update = self.agent.observe(
                    observation, action, reward, terminal,
                    training=True, is_store=True)
                step = self.agent.global_step
                # Update statistics
                total_reward.append(reward)
                total_loss.append(loss)
                total_q_val.append(np.mean(q))
                ep_rewards.append(reward)
                # Visualize results
                if render_freq is not None and step % render_freq == 0:
                    self.env.render()
                # Write summary
                if log_freq is not None and step % log_freq == 0:
                    now = time.time()
                    # Clamp so a coarse clock cannot cause division by zero.
                    elapsed = max(now - last_log_time, 1e-8)
                    last_log_time = now
                    tag_dict = {
                        'episode.num_of_game': num_ep,
                        'average.reward': np.mean(total_reward),
                        'average.loss': np.mean(total_loss),
                        'average.q': np.mean(total_q_val),
                        'training.epsilon': self.agent.epsilon,
                        'training.learning_rate': self.agent.learning_rate,
                        'training.num_step_per_sec': log_freq / elapsed,
                        'training.time': now - start_time}
                    self._inject_summary(tag_dict, step)
                if terminal:
                    tag_dict = self._episode_summary(ep_rewards)
                    # NOTE(review): the memory priorities are logged under
                    # the 'episode.actions' tag; the actions taken are not
                    # logged anywhere. Kept as-is -- confirm the intent.
                    if hasattr(self.agent.memory, "priorities"):
                        tag_dict['episode.actions'] =\
                            self.agent.memory.priorities
                    self._inject_summary(tag_dict, num_ep)
                    observation = self.env.reset()
                    self.agent.observe(observation, None, 0, False,
                                       training=False,
                                       is_store=False)
                    self.agent.memory.reset()
                    ep_rewards = []
                    num_ep += 1
        except KeyboardInterrupt:
            # Manual stop: fall through and save what has been learned.
            pass
        # Update parameters before finishing
        self.agent.save_params(save_file_path, True)

    def play(self, num_episode=1, ep=0.05, overwrite=True,
             load_file_path=None, save_video_path=None, render_freq=None):
        """Run ``num_episode`` evaluation episodes without storing or training.

        Args:
            num_episode: number of episodes to play.
            ep: forwarded as the second argument of ``agent.predict``.
            overwrite: used as ``force`` for the Monitor wrapper.
            load_file_path: if not ``None``, parameters are loaded from here
                after variable initialization.
            save_video_path: if not ``None``, the environment is wrapped in
                ``gym.wrappers.Monitor`` writing to this directory.
            render_freq: render every ``render_freq`` steps of an episode;
                ``None`` disables rendering.

        NOTE(review): this method assumes the agent exposes ``sess``,
        ``load_params``, ``processor.preprocess``, ``memory.append``,
        ``memory.get_recent_state`` and ``predict(state, ep)`` -- confirm
        against the agent class in use.
        """
        agent = self.agent
        # Initialize first, then restore, so loaded values are not reset.
        tf.global_variables_initializer().run(session=agent.sess)
        if load_file_path is not None:
            agent.load_params(load_file_path)
        # play() may be called without fit(); make sure summaries exist.
        if self.writer is None:
            with agent.sess.as_default(), tf.name_scope("summary"):
                self._build_summaries()
        # Record video
        if save_video_path is not None:
            self.env = wrappers.Monitor(self.env, save_video_path,
                                        force=overwrite)
        for num_ep in range(1, num_episode + 1):
            # Initialize environment
            observation = self.env.reset()
            agent.memory.reset()
            action = self.env.action_space.sample()
            reward = 0
            terminal = False
            observation, action, reward_, terminal = \
                agent.processor.preprocess(observation, action,
                                           reward, terminal)
            agent.memory.append(observation, action, reward_, terminal,
                                is_store=False)
            ep_rewards = []
            step = 1
            while not terminal:
                # 1. predict
                state = agent.memory.get_recent_state()
                action = agent.predict(state, ep)
                # 2. act
                observation, reward, terminal, _info = self.env.step(action)
                # 3. feed the transition to the memory without storing it
                observation, action, reward_, terminal = \
                    agent.processor.preprocess(observation, action,
                                               reward, terminal)
                agent.memory.append(observation, action, reward_, terminal,
                                    is_store=False)
                # Accumulate the raw (unprocessed) reward
                ep_rewards.append(reward)
                # Visualize results
                if render_freq is not None and step % render_freq == 0:
                    self.env.render()
                if terminal:
                    self._inject_summary(
                        self._episode_summary(ep_rewards), num_ep)
                step += 1

    @staticmethod
    def _episode_summary(ep_rewards):
        """Return the per-episode summary dict for a list of rewards.

        All four statistics are 0 when ``ep_rewards`` is empty.
        """
        if len(ep_rewards) > 0:
            cum_ep_reward = np.sum(ep_rewards)
            max_ep_reward = np.max(ep_rewards)
            min_ep_reward = np.min(ep_rewards)
            avg_ep_reward = np.mean(ep_rewards)
        else:
            cum_ep_reward = max_ep_reward = 0
            min_ep_reward = avg_ep_reward = 0
        return {'episode.cumulative_reward': cum_ep_reward,
                'episode.max_reward': max_ep_reward,
                'episode.min_reward': min_ep_reward,
                'episode.avg_reward': avg_ep_reward,
                'episode.rewards': ep_rewards}

    def _initialization(self):
        """Build summaries, initialize variables, then optionally restore.

        Restoring must come after the global initializer; running the
        initializer last would reset the restored values.
        """
        with self.agent.sess.as_default():
            with tf.name_scope("summary"):
                self._build_summaries()
            tf.global_variables_initializer().run()
            if self.load_file_path is not None:
                # NOTE(review): assumes the agent provides load_params as the
                # counterpart of save_params -- confirm.
                self.agent.load_params(self.load_file_path)

    def _build_summaries(self):
        """Create the FileWriter plus one placeholder/summary op per tag."""
        self.writer = tf.summary.FileWriter(
            self.tensorboard_dir, self.agent.sess.graph)
        self.summary_placeholders = {}
        self.summary_ops = {}
        # Scalar summaries are namespaced by the environment name.
        for tag in self.scalar_summary_tags:
            self.summary_placeholders[tag] =\
                tf.placeholder('float32', None, name=tag.replace(' ', '_'))
            self.summary_ops[tag] =\
                tf.summary.scalar("%s/%s" % (self.env_name, tag),
                                  self.summary_placeholders[tag])

        # Histogram summaries use the bare tag as their name.
        for tag in self.histogram_summary_tags:
            self.summary_placeholders[tag] =\
                tf.placeholder('float32', None, name=tag.replace(' ', '_'))
            self.summary_ops[tag] = tf.summary.histogram(
                tag,
                self.summary_placeholders[tag])

    def _inject_summary(self, tag_dict, step):
        """Evaluate the summary ops for ``tag_dict`` and write them at ``step``.

        Args:
            tag_dict: mapping from tag to the value fed to its placeholder.
                Every tag must have been registered by `_build_summaries`
                (``KeyError`` otherwise).
            step: global step recorded with each summary.
        """
        tags = list(tag_dict)
        fetches = [self.summary_ops[tag] for tag in tags]
        feed_dict = {self.summary_placeholders[tag]: tag_dict[tag]
                     for tag in tags}
        for summary_str in self.agent.sess.run(fetches, feed_dict):
            self.writer.add_summary(summary_str, step)


  return f(*args, **kwds)


In [None]:
import tensorflow as tf
import gym

from rltensor.agents import DQN
from rltensor.processors import AtariProcessor
# from rltensor.executions import Runner
from rltensor.configs import dqn_config, fit_config


# Create the environment once and reuse it for both the action spec and the
# runner.
env_name = 'Breakout-v0'
env = gym.make(env_name)

# Start from the library defaults, then set the env-derived action spec last
# so a default entry with the same key cannot override it.
default_config = dqn_config()
conf = dict(default_config)
conf["action_spec"] = {"type": "int", "shape": env.action_space.n}

_fit_config = fit_config()

# Reset the graph BEFORE entering the device scope: tf.device() applies to the
# graph that is the default when the scope is entered, so resetting inside the
# scope would build the agent on a fresh graph without the CPU placement.
tf.reset_default_graph()
with tf.device('/cpu:0'):
    dqn = DQN(**conf)
    runner = Runner(agent=dqn, env=env, env_name=env_name, tensorboard_dir="./logs")
    runner.fit(save_video_path="./video", **_fit_config)

[2017-12-19 09:25:31,256] Making new env: Breakout-v0
[2017-12-19 09:25:31,392] Making new env: Breakout-v0


Building tensorflow graph...


In [None]:
from collections import deque

# Convert an empty bounded deque (maxlen=3) into a plain list.
x = list(deque(maxlen=3))

In [2]:
# Inspect what get_config returns for the "fit" key (get_config is defined
# outside this view).
get_config("fit")

mappingproxy({'__dict__': <attribute '__dict__' of 'FitConfig' objects>,
              '__doc__': None,
              '__module__': 'rltensor.configs',
              '__weakref__': <attribute '__weakref__' of 'FitConfig' objects>,
              'log_freq': (1001,),
              'num_max_start_steps': (30,),
              't_max': (50000000,)})

In [None]:
from functools

In [3]:
import tensorflow as tf
import gym

from rltensor.agents import DQN
from rltensor.processors import AtariProcessor
from rltensor.networks import DuelingModel


# Agent configuration: a Q-network made of three conv layers followed by one
# dense layer, plus the environment name and the frame processor.
conf = dict(
    q_conf=[
        dict(name="conv2d", kernel_size=(8, 8), num_filter=32, stride=4,
             padding='SAME', is_batch=False, activation=tf.nn.relu),
        dict(name="conv2d", kernel_size=(5, 5), num_filter=64, stride=2,
             padding='SAME', is_batch=False, activation=tf.nn.relu),
        dict(name="conv2d", kernel_size=(3, 3), num_filter=64, stride=1,
             padding='SAME', is_batch=False, activation=tf.nn.relu),
        dict(name="dense", is_flatten=True, is_batch=False, num_hidden=512,
             activation=tf.nn.relu),
    ],
    env_name='Breakout-v0',
    processor=AtariProcessor(84, 84),
)

env = gym.make('Breakout-v0')
tf.reset_default_graph()
dqn = DQN(env, conf, q_network_cls=DuelingModel)
# Play 10 episodes from the saved checkpoint, rendering every step and
# recording videos.
dqn.play(
    num_episode=10,
    ep=0.05,
    load_file_path="./breakout_dqn_params/model.ckpt",
    save_video_path="./breakout_videos",
    render_freq=1,
)

[2017-07-30 21:57:02,794] Making new env: Breakout-v0
[2017-07-30 21:57:03,411] Finished writing results. You can upload them to the scoreboard via gym.upload('/home/tomoaki/work/Development/RL/breakout_videos')


INFO:tensorflow:Restoring parameters from ./breakout_dqn_params/model.ckpt


[2017-07-30 21:57:03,497] Restoring parameters from ./breakout_dqn_params/model.ckpt
[2017-07-30 21:57:03,531] Clearing 8 monitor files from previous run (because force=True was provided)
[2017-07-30 21:57:03,538] Starting new video recorder writing to /home/tomoaki/work/Development/RL/breakout_videos/openaigym.video.2.3241.video000000.mp4



Model restored.


[2017-07-30 21:57:35,712] Starting new video recorder writing to /home/tomoaki/work/Development/RL/breakout_videos/openaigym.video.2.3241.video000001.mp4

[2017-07-30 22:01:21,034] Starting new video recorder writing to /home/tomoaki/work/Development/RL/breakout_videos/openaigym.video.2.3241.video000008.mp4



In [None]:
# A non-empty list is truthy, so the branch is taken.
if [1, 2, 3]:
    print("hello")

In [None]:
# Display the `n` attribute of the environment's action space
# (presumably the number of discrete actions -- confirm against gym docs).
env.action_space.n

In [None]:
# Display the static shape of `y` as a Python list (`y` is defined in
# another cell).
y.get_shape().as_list()

In [None]:
# The body runs exactly once (prints 4); the loop ends when count reaches 5.
count = 4
while count < 5:
    print(count)
    count += 1

In [None]:
# %s formats the bool via str(), producing 'True'.
"%s" % True

In [None]:
# Insert 2 at the front of `a` (`a` is defined in a cell not shown here).
a.insert(0, 2)

In [15]:
# Display the current contents of `a`.
a

[2, 1]

In [11]:
# Ten random integers from {0, 1}; the upper bound 2 is exclusive.
np.random.randint(0, 2, 10)

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0])

In [42]:
from collections import deque

In [44]:
# Bounded deque holding at most 5 items, seeded with three.
x = deque([1, 2, 3], maxlen=5)

In [45]:
# Three appends exceed maxlen=5 by one, so the oldest (leftmost) item is
# dropped.
x.append(3)
x.append(3)
x.append(3)

In [46]:
# Display the current contents of the deque.
x

deque([2, 3, 3, 3, 3])

In [57]:
 # NOTE(review): tf.select is not available in this TensorFlow version (see
 # the AttributeError below); tf.where is the TF >= 1.0 replacement.
 result = tf.select(pred, val_if_true, val_if_false)

AttributeError: module 'tensorflow' has no attribute 'select'

In [60]:
# Boolean vector -> int32 (False=0, True=1) -> one-hot rows over 2 classes.
x = tf.placeholder(tf.bool, (None,))
y = tf.cast(x, tf.int32)
z = tf.one_hot(y, 2)

In [62]:
# InteractiveSession installs itself as the default session, so .eval() can
# be called without passing a session explicitly.
sess = tf.InteractiveSession()
# Evaluate the int cast and the one-hot encoding for a sample boolean batch.
print(y.eval(feed_dict={x:[True, False, True]}))
print(z.eval(feed_dict={x:[True, False, True]}))

[1 0 1]
[[ 0.  1.]
 [ 1.  0.]
 [ 0.  1.]]


In [13]:
# Element type of an int-cast arange array: a NumPy scalar type, not the
# built-in int.
type(np.arange(10).astype(int)[0])

numpy.int64

In [16]:
# Element type of a sample drawn by np.random.choice from a Python range.
type(np.random.choice(range(0, 5), 3)[0])

numpy.int64

In [44]:
# Draw 3 items without replacement (third positional argument is `replace`).
np.random.choice([1, 2, 3, 4], 3, False)

array([1, 4, 3])

In [12]:
x = np.arange(10)
# np.append returns a new array; `x` itself is not modified.
np.append(x, 10)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [13]:
# Display the current value of `x`.
x

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])