In [1]:
import pandas as pd

names = ["Open", "High", "Low", "Close", "Volume"]
df = pd.read_excel("data/stock_data.xlsx", sheetname=names, index_col=0)
panel_stock = pd.Panel.from_dict(df)

Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  """


In [115]:
from tqdm import tqdm
import tensorflow as tf
from logging import getLogger
import time
from collections import deque
from six.moves import xrange
from gym import wrappers 
from copy import deepcopy
import os
import numpy as np

from rltensor.memories import SequentialMemory, PrioritizedMemory
from rltensor.params import default_conf as d_conf

logger = getLogger(__name__)

class Agent(object):
    def __init__(self, env, conf, default_conf=None, sess=None, *args, **kwargs):
        if sess is None:
            sess = tf.Session()
        self.sess = sess
        if default_conf is None:
            default_conf = d_conf
        self.default_conf = default_conf
        conf = self._set_conf(conf)
        self.conf = conf
        self.model_dir = conf["model_dir"]
        self.limit = conf["memory_limit"]
        self.window_length = conf["window_length"]
        self.memory = self._get_memory(self.window_length, self.limit, conf["prioritized"])
        self.gamma = conf["gamma"]
        self.error_clip = conf["error_clip"]
        self.processor = conf["processor"]
        self.ep_start = conf["ep_start"]
        self.ep_end = conf["ep_end"]
        self.t_ep_end = conf["t_ep_end"]
        self.t_learn_start = conf["t_learn_start"]
        self.t_train_freq = conf["t_train_freq"]
        self.t_target_q_update_freq = conf["t_target_q_update_freq"]
        # Get input and action dim info from env
        self.env = env
        self.env_name = conf["env_name"]
        self.state_dim = self.processor.get_input_shape()
        self.action_dim = env.action_space.n
        # configure for learning schedule 
        self.learning_rate = conf["learning_rate"]
        self.learning_rate_minimum = conf["learning_rate_minimum"]
        self.learning_rate_decay = conf["learning_rate_decay"]
        self.learning_rate_decay_step = conf["learning_rate_decay_step"]
        self.global_step = tf.Variable(0, trainable=False)
        # reward is in (min_r, max_r)
        self.min_r = conf["min_r"]
        self.max_r = conf["max_r"]
        self.batch_size = conf["batch_size"]
        self.log_freq = conf["log_freq"]
        self.avg_length = conf["avg_length"]
        self.update_op = None
        # Build tensorflow network
        st = time.time()
        logger.debug("Building tensorflow graph...")
        with self.sess.as_default():
            self.learning_rate_op = self._get_learning_rate()
            self.epsilon = self._get_epsilon()
            self.step_update_op = tf.assign(self.global_step, self.global_step + 1)
            self._build_graph()
            self.saver = tf.train.Saver()
        logger.debug("Finished building tensorflow graph, spent time:", time.time() - st)
        if "load_file_path" in conf:
            self.load_params(conf["load_file_path"])
        
    def _get_memory(self, window_length, limit, is_prioritized=True):
        if is_prioritized:
            return PrioritizedMemory(window_length, limit)
        else:
            return SequentialMemory(window_length, limit)
        
    def _build_graph(self):
        raise NotImplementedError()

    def train(self, t_max, num_max_start_steps=0, save_file_path=None, 
              load_file_path=None, save_video_path=None, overwrite=True, render_freq=None):
        tf.global_variables_initializer().run(session=self.sess);
        if load_file_path is not None:
            self.load_params(load_file_path)
        # Save Model
        self.save_params(save_file_path, overwrite)
        # Record Viodeo
        if save_video_path is not None:
            self.env = wrappers.Monitor(self.env, save_video_path, force=overwrite)
        # initialize target netwoork
        self.update_target_q_network()
        # initialize enviroment
        observation = self.env.reset()
        action = self.env.action_space.sample()
        # Perform random starts at beginning of episode and do not record them into the experience.
        # This slightly changes the start position between games.
        if num_max_start_steps == 0:
            num_random_start_steps = 0
        else:
            num_random_start_steps = np.random.randint(num_max_start_steps)
        for _ in xrange(num_random_start_steps):
            action = self.env.action_space.sample()
            observation, reward, terminal, info = self.env.step(action)
            observation = deepcopy(observation)
        # initialize memory
        terminal = False
        reward = 0
        observation, action, reward_, terminal = self.processor.preprocess(observation, action, reward, terminal)
        self.memory.append(observation, action, reward_, terminal, is_store=False)
        # accumulate results
        total_reward = deque(maxlen=self.avg_length)
        total_loss = deque(maxlen=self.avg_length)
        total_q_val = deque(maxlen=self.avg_length)
        ep_rewards = []
        ep_losses = []
        ep_q_vals = []
        ep_actions = []
        num_ep = 1
        step = self.global_step.eval(session=self.sess)
        # for t in tqdm(xrange(t_max)):
        st = time.time()
        _st = st
        for t in tqdm(xrange(t_max)): 
            try:
                # 1. predict
                state = self.memory.get_recent_state()
                action = self.predict(state)
                # 2. act
                observation, reward, terminal, info = self.env.step(action)
                observation, action, reward_, terminal\
                    = self.processor.preprocess(observation, action, reward, terminal)
                # 3. store data and train network
                if t < self.t_learn_start:
                    result = self.observe(observation, action, reward_, terminal, False)
                    self.memory.update_weights(step)
                    continue
                else:
                    result = self.observe(observation, action, reward_, terminal, True)
                q, loss, error, is_update = result
                # Update step
                if self.update_op is not None:
                    self.sess.run(self.step_update_op);
                step = self.global_step.eval(session=self.sess)
                self.memory.update_weights(step, error)
                # Update target network
                if (step + 1) % self.t_target_q_update_freq == 0:
                    self.update_target_q_network()
                # update statistics
                total_reward.append(reward)
                total_loss.append(loss)
                total_q_val.append(np.mean(q))
                ep_actions.append(action)
                ep_rewards.append(reward)
                ep_losses.append(loss)
                ep_q_vals.append(np.mean(q))
                # Visualize reuslts
                if render_freq is not None:
                    if step % render_freq == 0:
                        self.env.render()
                # Write summary
                if self.log_freq is not None and step % self.log_freq == 0:
                    num_per_sec = self.log_freq / (time.time() - _st)
                    _st = time.time()
                    epsilon = self.epsilon.eval(session=self.sess)
                    learning_rate = self.learning_rate_op.eval(session=self.sess)
                    avg_r = np.mean(total_reward)
                    avg_loss = np.mean(total_loss)
                    avg_q_val = np.mean(total_q_val)
                    tag_dict = {'episode.num_of_game': num_ep,
                                'average.reward': avg_r,
                                'average.loss': avg_loss,
                                'average.q': avg_q_val, 
                                'training.epsilon': epsilon,
                                'training.learning_rate': learning_rate,
                                'training.num_step_per_sec': num_per_sec,
                                'training.time': time.time() - st}
                    self._inject_summary(tag_dict, step)
            
                if terminal:
                    try:
                        cum_ep_reward = np.sum(ep_rewards)
                        max_ep_reward = np.max(ep_rewards)
                        min_ep_reward = np.min(ep_rewards)
                        avg_ep_reward = np.mean(ep_rewards)
                    except:
                        cum_ep_reward, max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0, 0
                        
                    tag_dict = {'episode.cumulative_reward': cum_ep_reward,
                                'episode.max_reward': max_ep_reward, 
                                'episode.min_reward': min_ep_reward,
                                'episode.avg_reward': avg_ep_reward, 
                                'episode.rewards': ep_rewards}
                    if hasattr(self.memory, "priorities"):
                        self.memory['episode.actions'] = self.memory.priorities
                    self._inject_summary(tag_dict, num_ep)
                    observation = self.env.reset()
                    observation, action, reward_, terminal\
                        = self.processor.preprocess(observation, None, 0, False)
                    self.memory.reset()
                    self.memory.append(observation, action, reward_, terminal, is_store=False)
                    ep_rewards = []
                    ep_losses = []
                    ep_q_vals = []
                    ep_actions = []
                    num_ep += 1
            except KeyboardInterrupt:
                break
        # Update parameters before finishing
        self.save_params(save_file_path, True)
        
    def play(self, num_episode=1, ep=0.05, overwrite=True, load_file_path=None, save_video_path=None, render_freq=None):
        tf.global_variables_initializer().run(session=self.sess);
        if load_file_path is not None:
            self.load_params(load_file_path)
        # Record Viodeo
        if save_video_path is not None:
            self.env = wrappers.Monitor(self.env, save_video_path, force=overwrite)
        for num_ep in range(1, num_episode + 1):
            # initialize enviroment
            observation = self.env.reset()
            self.memory.reset()
            action = self.env.action_space.sample()
            reward = 0
            terminal = False
            observation, action, reward_, terminal = self.processor.preprocess(observation, action, reward, terminal)
            self.memory.append(observation, action, reward_, terminal, is_store=False)
            ep_rewards = []
            step = 1
            while not terminal:
                # 1. predict
                state = self.memory.get_recent_state()
                action = self.predict(state, ep)
                # 2. act
                observation, reward, terminal, info = self.env.step(action)
                # initialize memory
                observation, action, reward_, terminal = self.processor.preprocess(observation, action, reward, terminal)
                self.memory.append(observation, action, reward_, terminal, is_store=False)
                # accumulate results
                ep_rewards.append(reward)
                # Visualize reuslts
                if render_freq is not None:
                    if step % render_freq == 0:
                        self.env.render()
                if terminal:
                    try:
                        cum_ep_reward = np.sum(ep_rewards)
                        max_ep_reward = np.max(ep_rewards)
                        min_ep_reward = np.min(ep_rewards)
                        avg_ep_reward = np.mean(ep_rewards)
                    except:
                        cum_ep_reward, max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0, 0
                    tag_dict = {'episode.cumulative_reward': cum_ep_reward,
                                'episode.max_reward': max_ep_reward, 
                                'episode.min_reward': min_ep_reward,
                                'episode.avg_reward': avg_ep_reward, 
                                'episode.rewards': ep_rewards}
                    self._inject_summary(tag_dict, num_ep)
                step += 1

    def predict(self, s_t, ep):
        raise NotImplementedError()
        
    def _set_conf(self, conf):
        conf = deepcopy(conf)
        for key in self.default_conf.keys():
            if key not in conf:
                conf[key] = self.default_conf[key]
        return conf
    
    def _get_learning_rate(self):
        learning_rate_op = tf.maximum(self.learning_rate_minimum,
          tf.train.exponential_decay(
              self.learning_rate,
              self.global_step,
              self.learning_rate_decay_step,
              self.learning_rate_decay,
              staircase=True))
        return learning_rate_op
    
    def _get_epsilon(self):
        rest_steps  = tf.maximum(0., 
            self.t_ep_end - tf.maximum(0., tf.cast(self.global_step - self.t_learn_start, tf.float32)))
        delta_ep = max(0, self.ep_start - self.ep_end)
        epsilon = self.ep_end + delta_ep * rest_steps / self.t_ep_end
        return epsilon

    def update_target_q_network(self):
        self.sess.run(self.update_op);
        
    def _build_summaries(self):
        self.writer = tf.summary.FileWriter(self.model_dir, self.sess.graph)
        self.summary_placeholders = {}
        self.summary_ops = {}
        scalar_summary_tags = [
                'average.reward', 'average.loss', 'average.q', 'episode.cumulative_reward', 
                'episode.max_reward', 'episode.min_reward', 'episode.avg_reward', 
                'episode.num_of_game', 'training.epsilon', 'training.learning_rate',
                'training.num_step_per_sec', 'training.time']
        for tag in scalar_summary_tags:
            self.summary_placeholders[tag] = tf.placeholder('float32', None, name=tag.replace(' ', '_'))
            self.summary_ops[tag] =\
                tf.summary.scalar("%s/%s" % (self.env_name, tag), self.summary_placeholders[tag])
        
        histogram_summary_tags = ['episode.rewards', 'episode.actions']
        for tag in histogram_summary_tags:
            self.summary_placeholders[tag] = tf.placeholder('float32', None, name=tag.replace(' ', '_'))
            self.summary_ops[tag]  = tf.summary.histogram(tag, self.summary_placeholders[tag])
    
    def _inject_summary(self, tag_dict, step):
        summary_str_lists = self.sess.run([self.summary_ops[tag] for tag in tag_dict.keys()], {
          self.summary_placeholders[tag]: value for tag, value in tag_dict.items()
        })
        for summary_str in summary_str_lists:
            self.writer.add_summary(summary_str, step)

    def load_params(self, file_path):
        """Loads parameters of an estimator from a file.
        
        Args:
            file_path: str, The path to the file.
        """
        self.saver.restore(self.sess, file_path)
        print("Model restored.")

    def save_params(self, file_path=None, overwrite=True):
        """Saves parameters of an estimator as a file.
        
        Args:
            file_path: str, The path to where the parameters should be saved.
            overwrite: bool, If `False` and `file_path` already exists, raises an error.
        """
        if file_path is None:
            if not os.path.isdir("params"):
                os.mkdir("params")
            file_path = "params/model.ckpt"
        if not overwrite:
            _path = ".".join([file_path, "meta"])
            if os.path.isfile(_path):
                raise NameError("%s already exists." % file_path)
        save_path = self.saver.save(self.sess, file_path)
        print("Model saved in file: %s" % save_path)

In [116]:
from rltensor.environments.core import Env
from time import time
import numpy as np

class ActionSpace(object):
    n = None
    def sample(self):
        action = np.random.randn(self.n) ** 2
        action = action / np.sum(action)
        return action

class TradingEnv(Env):
    metadata = {'render.modes': []}
    reward_range = (-np.inf, np.inf)
    
    def __init__(self, data,  st=None, end=None):
        self.data = data
        self.action_space = ActionSpace()
        self.action_space.n = data.axes[2].values.shape[0]
        if st is None:
            st = data.axes[0]
        if end is None:
            end = data.axes[-1]
        self.st = st
        self.end = end
        self._sepc = None
        self.volume = data.ix["Volume"]
        self.close = data.ix["Close"]
        self.open = data.ix["Open"]
        self.high  = data.ix["High"]
        self.low = data.ix["Low"]
        self.time_idx = self.open.index
        self._preprocess()
        self.returns = self._calc_returns(self.close)
        
        
    def _step(self, action):
        self.time_step += 1
        r_t = self.returns.values[self.time_step]
        reward = np.dot(r_t, action)
        # OHLC x num_stock
        observation = np.stack((self.open.values[self.time_step],
                                 self.high.values[self.time_step],
                                 self.low.values[self.time_step],
                                 self.close.values[self.time_step],
                                 self.volume.values[self.time_step]), axis=1)
        if self.low.shape[0] - 1 <= self.time_step:
            done = True
        else:
            done = False
        return observation, reward, done, None
        
    def _reset(self):
        self.time_step = 0
        observation = np.stack((self.open.values[self.time_step],
                                 self.high.values[self.time_step],
                                 self.low.values[self.time_step],
                                 self.close.values[self.time_step],
                                 self.volume.values[self.time_step]), axis=1)
        return observation
        
    def _render(self, *args, **kwargs):
        pass
    
    def _close(self):
        pass
    
    def _seed(self, seed=None):
        return time()
    
    def _calc_returns(self, df):
        returns = df.pct_change(1)
        returns.values[0] = np.zeros_like(df.values[0])
        return returns
        
    def _preprocess(self):
        self.volume = self.volume.replace(np.nan, 0)
        cols = self.open.columns.values
        init_val = np.array([self.open[c].dropna().values[0] for c in cols])
        self.open = self._normalize(self.open, init_val)
        self.high = self._normalize(self.high, init_val)
        self.low = self._normalize(self.low, init_val)
        self.close = self._normalize(self.close, init_val)
        
    def _normalize(self, df, init_val):
        df = df / init_val
        df = df.replace(np.nan, 1)
        return  df

In [117]:
env = TradingEnv(panel_stock)

In [118]:
from rltensor.memories import SequentialMemory
# from rltensor.agents import Agent

class RandomTradingAgent(Agent):
    def __init__(self, env, conf, default_conf=None, sess=None, *args, **kwargs):
        super().__init__(env, conf, default_conf, sess, *args, **kwargs)
        
    def _build_graph(self):
        """Build all of the network and optimizations
        
        just for conveninece of trainig, seprate placehoder for train and target network
        critic network input: [raw_data, smoothed, downsampled]
        """
        with tf.name_scope('summaries'):
            self._build_summaries()

    def observe(self, observation, action, reward, terminal, training):
        # clip reward into  (min_r, max_r)
        reward = max(self.min_r, min(self.max_r, reward))
        # We always keep data
        self.memory.append(observation, action, reward, terminal, is_store=True)
        step = self.global_step.eval(session=self.sess)
        return np.zeros(self.batch_size), np.zeros(self.batch_size), np.zeros(self.batch_size), False
    
    def predict(self, state, ep=None):
        action = np.random.randn(self.action_dim) ** 2
        action = action / np.sum(action)
        return action
    
    def update_target_q_network(self, *args, **kwargs):
        pass

In [122]:
class DefaultProcessor(object):
    def __init__(self, input_shape=None):
        self.input_shape = input_shape
    
    def preprocess(self, observation, action, reward, terminal):
        return observation, action, reward, terminal
    
    def tensor_process(self, x):
        return x
    
    def get_input_shape(self):
        return self.input_shape
    
    def get_action_value(self, *args, **kwargs):
        raise NotImplementedError()
    
class TradingProcessor(DefaultProcessor):
    def __init__(self, num_stock, num_feature):
        self.height = num_stock
        self.width = num_feature
        self.input_shape = (num_stock, num_feature)
    
    def preprocess(self, observation, action, reward, terminal):
        reward = np.log(1 + reward)
        # we use high, low close
        observation = observation[:, 1:]
        return observation, action, reward, terminal
    
    def tensor_process(self, x):
        return x
    
    def get_reward(self, state0,  state1, action):
        # feature 2 has to be close prices
        returns = state1[:, -1, :, 2] / state0[:, -1, :, 2]
        return tf.log(tf.reduce_sum(returns * action, axis=1))

In [123]:
from rltensor.memories import SequentialMemory
# from rltensor.agents.agent import Agent
from rltensor.utils import get_shape
from time import time
import time

class PolicyGradient(Agent):
    def __init__(self, env, conf, action_network_cls, default_conf=None, sess=None, *args, **kwargs):
        self.action_network_cls = action_network_cls
        super().__init__(env, conf, default_conf, sess, *args, **kwargs)
        
    def _build_graph(self):
        """Build all of the network and optimizations
        
        just for conveninece of trainig, seprate placehoder for train and target network
        critic network input: [raw_data, smoothed, downsampled]
        """
        # training flag
        self.training = tf.placeholder(tf.bool, name="training")
        # state shape has to be (batch, length,) + input_dim
        self.state = tf.placeholder(tf.float32,
                                     get_shape(self.state_dim, maxlen=self.window_length),
                                     name='state')
        _state = self.processor.tensor_process(self.state)
        self.target_state = tf.placeholder(tf.float32,
                                            get_shape(self.state_dim, maxlen=self.window_length),
                                            name='target_state')
        _target_state = self.processor.tensor_process(self.target_state)
        # Employ maximal strategy
        self.action_network = self.action_network_cls(self.action_dim, self.conf["action_conf"],
                                              scope_name="action_network")
        self.action = self.action_network(_state, self.training)
        reward = self.processor.get_reward(_state, _target_state, self.action)
        self.terminal = tf.placeholder(tf.bool, (None,), name="terminal")
        self.loss = tf.reduce_mean(-reward, name='loss')
        # Build optimization
        self.action_optim = tf.train.AdamOptimizer(self.learning_rate_op)\
            .minimize(self.loss, var_list=self.action_network.variables)
        with tf.name_scope('summaries'):
            self._build_summaries()

    def observe(self, observation, action, reward, terminal, training):
        # clip reward into  (min_r, max_r)
        reward = max(self.min_r, min(self.max_r, reward))
        # We always keep data
        self.memory.append(observation, action, reward, terminal, is_store=True)
        step = self.global_step.eval(session=self.sess)
        if (step + 1) % self.t_train_freq:
            is_update = True
        else:
            is_update = False
        if training:
            experiences = self.memory.sample(self.batch_size)
            result = self.action_learning_minibatch(experiences, is_update)
            return result
        else:
            return None

    def action_learning_minibatch(self, experiences, batch_weights, is_update=True):
        feed_dict = {
            self.state: [experience.state0 for experience in experiences],
            self.target_state: [experience.state1 for experience in experiences],
            self.terminal: [experience.terminal1 for experience in experiences],
            self.training: True}
        if is_update:
            self.sess.run(self.action_optim, feed_dict=feed_dict);
        loss = self.sess.run([self.loss], feed_dict=feed_dict)
        # To have compatibility with other q-learning, we return pseudo values q, loss, error, is_udpate
        return np.zeros(self.batch_size), loss, np.zeros(self.batch_size), is_update
    
    def predict(self, state, ep=None):
        action = self.sess.run(self.action, feed_dict={self.state: [state], self.training: False})[0]
        return action
    
    def update_target_q_network(self, *args, **kwargs):
        pass

In [124]:
import tensorflow as tf
from rltensor.networks import MLPModel

shape = (env.open.shape[1], 4)
conf = {"action_conf":[
            {"name": "conv2d", "kernel_size":(4, 1), "num_filter":32, "stride":(2, 1),
             "padding": 'SAME', "is_batch":False, 'activation': tf.nn.relu},
            {"name": "conv2d", "kernel_size":(4, 1), "num_filter":64, "stride":(2, 1),
             "padding": 'SAME', "is_batch":False, 'activation': tf.nn.relu},
            {"name": "dense", "is_flatten":True, "is_batch":False, "num_hidden": 512, 'activation': tf.nn.relu},
        ],
        'double_q': True,
        "memory_limit": 100000,
        "window_length": 20,
        "learning_rate": 2.5e-4,
        "learning_rate_minimum": 2.5e-4,
        "learning_rate_decay": 0.9,
        "learning_rate_decay_step": 100,
        "min_r": -np.inf,
        "max_r": np.inf,
        "batch_size": 32,
        "t_learn_start": 100,
        "t_train_freq": 1,
        "model_dir": "./logs/trading",
        "processor": TradingProcessor(shape[0], shape[1]),
        "log_freq": 1000,
        "avg_length": 10000,
        "env_name": 'Trading',
        "prioritized": False
}
tf.reset_default_graph()
agent = PolicyGradient(env, conf, action_network_cls=MLPModel)
agent.train(int(1e7), render_freq=None)

Exception ignored in: <bound method Env.__del__ of <__main__.TradingEnv object at 0x7f1e0791ce80>>
Traceback (most recent call last):
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/rltensor-0.1.0-py3.6.egg/rltensor/environments/core.py", line 179, in __del__
TypeError: 'DataFrame' object is not callable
Exception ignored in: <bound method Env.__del__ of <__main__.TradingEnv object at 0x7f1e03a1a1d0>>
Traceback (most recent call last):
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/rltensor-0.1.0-py3.6.egg/rltensor/environments/core.py", line 179, in __del__
TypeError: 'DataFrame' object is not callable


  0%|          | 34/10000000 [00:00<8:14:33, 336.99it/s]

Model saved in file: params/model.ckpt
(501, 4)


[A
  0%|          | 68/10000000 [00:00<8:13:52, 337.47it/s][A
  0%|          | 101/10000000 [00:00<12:02:39, 230.63it/s][A
  0%|          | 118/10000000 [00:01<75:07:32, 36.97it/s] [A
  0%|          | 130/10000000 [00:02<118:40:38, 23.41it/s][A
  0%|          | 372/10000000 [00:22<224:45:03, 12.36it/s]

KeyboardInterrupt: 

            0%|          | 372/10000000 [00:33<246:40:54, 11.26it/s]

In [None]:
env.volume.shape

In [42]:
ts = panel_stock.axes[1]

In [49]:
np.max(df.loc[ts[0]])

1.0

In [47]:
df = env.open

In [52]:
np.vstack((np.arange(10), np.arange(10))).shape

(2, 10)

In [62]:
panel_stock.axes[2].values.shape[0]

501

In [7]:
x = tf.Variable(np.ones((3, 3)))

In [10]:
x.get_shape().as_list()

[3, 3]

In [98]:
np.stack((np.arange(10), np.arange(10)), axis=1).shape

(10, 2)

In [3]:
import tensorflow as tf
import gym

from rltensor.agents import DQN
from rltensor.processors import AtariProcessor
from rltensor.networks import DuelingModel


conf = {"q_conf":[
            {"name": "conv2d", "kernel_size":(8, 8), "num_filter":32, "stride":4,
             "padding": 'SAME', "is_batch":False, 'activation': tf.nn.relu},
            {"name": "conv2d", "kernel_size":(5, 5), "num_filter":64, "stride":2,
             "padding": 'SAME', "is_batch":True, 'activation': tf.nn.relu},
           {"name": "conv2d", "kernel_size": (3, 3), "num_filter":64, "stride":1,
             "padding": 'SAME', "is_batch":True, 'activation': tf.nn.relu},
            {"name": "dense", "is_flatten":True, "is_batch":True, "num_hidden": 512, 'activation': tf.nn.relu},
        ],
        'double_q': True,
        "memory_limit": 100000,
        "window_length": 4,
        "gamma": 0.99,
        "learning_rate": 2.5e-4,
        "learning_rate_minimum": 2.5e-4,
        "learning_rate_decay": 0.9,
        "learning_rate_decay_step": 100,
        "ep": 1e-3,
        "min_r": -1,
        "max_r": 1,
        "batch_size": 32,
        "error_clip": 1.0,
        "processor": AtariProcessor(84, 84),
        "t_learn_start": 100,
        "t_train_freq": 4,
        "t_target_q_update_freq": 10000,
        "ep_start": 1.0,
        "ep_end": 0.1,
        "t_ep_end": int(1e6),
        "model_dir": "./logs/dqn",
        "log_freq": 1000,
        "avg_length": 10000,
        "env_name": 'DemonAttack-v0',
        "prioritized": True,
}

conf = {"q_conf":[
            {"name": "conv2d", "kernel_size":(8, 8), "num_filter":32, "stride":4,
             "padding": 'SAME', "is_batch":False, 'activation': tf.nn.relu},
            {"name": "conv2d", "kernel_size":(5, 5), "num_filter":64, "stride":2,
             "padding": 'SAME', "is_batch":False, 'activation': tf.nn.relu},
           {"name": "conv2d", "kernel_size": (3, 3), "num_filter":64, "stride":1,
             "padding": 'SAME', "is_batch":False, 'activation': tf.nn.relu},
            {"name": "dense", "is_flatten":True, "is_batch":False, "num_hidden": 512, 'activation': tf.nn.relu},
        ],
        "env_name": 'DemonAttack-v0',
        "processor": AtariProcessor(84, 84),
}

env = gym.make('Breakout-v0')
tf.reset_default_graph()
dqn = DQN(env, conf, q_network_cls=DuelingModel)
dqn.train(int(1e7), render_freq=None, save_video_path="./videos")

[2017-07-26 19:23:53,161] Making new env: Breakout-v0
Exception ignored in: <bound method Monitor.__del__ of <Monitor<TimeLimit<AtariEnv<Breakout-v0>>>>>
Traceback (most recent call last):
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/gym/wrappers/monitoring.py", line 239, in __del__
    self.close()
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/gym/wrappers/monitoring.py", line 145, in close
    self.stats_recorder.close()
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/gym/monitoring/stats_recorder.py", line 89, in close
    self.flush()
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/gym/monitoring/stats_recorder.py", line 96, in flush
    with atomic_write.atomic_write(self.path) as f:
  File "/home/tomoaki/anaconda3/lib/python3.6/contextlib.py", line 82, in __enter__
    return next(self.gen)
  File "/home/tomoaki/anaconda3/lib/python3.6/site-packages/gym/utils/atomic_write.py", line 45, in atomic_write
    with open(tmppath, 'wb

Model saved in file: params/model.ckpt


[A
  0%|          | 101/10000000 [00:00<5:22:55, 516.12it/s][A
  0%|          | 126/10000000 [00:00<18:38:44, 148.98it/s][A
  0%|          | 145/10000000 [00:00<27:55:37, 99.46it/s] [A
  0%|          | 160/10000000 [00:01<34:28:29, 80.57it/s][A
  0%|          | 172/10000000 [00:01<40:20:48, 68.85it/s][A
  0%|          | 182/10000000 [00:01<43:22:30, 64.04it/s][A
  0%|          | 199/10000000 [00:01<45:48:05, 60.65it/s][2017-07-26 19:23:56,161] Starting new video recorder writing to /home/tomoaki/work/Development/RL/videos/openaigym.video.2.3383.video000001.mp4
  0%|          | 1869/10000000 [00:31<47:04:30, 59.00it/s][2017-07-26 19:24:26,189] Starting new video recorder writing to /home/tomoaki/work/Development/RL/videos/openaigym.video.2.3383.video000008.mp4
  0%|          | 6499/10000000 [01:55<50:48:13, 54.64it/s][2017-07-26 19:25:49,843] Starting new video recorder writing to /home/tomoaki/work/Development/RL/videos/openaigym.video.2.3383.video000027.mp4
  0%|          | 149

Model saved in file: params/model.ckpt


In [4]:
import tensorflow as tf
import gym

from rltensor.agents import DQN
from rltensor.processors import AtariProcessor
from rltensor.networks import DuelingModel


conf = {"q_conf":[
            {"name": "conv2d", "kernel_size":(8, 8), "num_filter":32, "stride":4,
             "padding": 'SAME', "is_batch":False, 'activation': tf.nn.relu},
            {"name": "conv2d", "kernel_size":(5, 5), "num_filter":64, "stride":2,
             "padding": 'SAME', "is_batch":False, 'activation': tf.nn.relu},
           {"name": "conv2d", "kernel_size": (3, 3), "num_filter":64, "stride":1,
             "padding": 'SAME', "is_batch":False, 'activation': tf.nn.relu},
            {"name": "dense", "is_flatten":True, "is_batch":False, "num_hidden": 512, 'activation': tf.nn.relu},
        ],
        "env_name": 'Breakout-v0',
        "processor": AtariProcessor(84, 84),
}

env = gym.make('Breakout-v0')
tf.reset_default_graph()
dqn = DQN(env, conf, q_network_cls=DuelingModel)
dqn.play(num_episode=10, ep=0.05, load_file_path="./breakout_dqn_params/model.ckpt",
         save_video_path="./breakout_videos", render_freq=1)

[2017-07-28 00:48:07,087] Making new env: Breakout-v0
[2017-07-28 00:48:07,764] Finished writing results. You can upload them to the scoreboard via gym.upload('/home/tomoaki/work/Development/RL/breakout_videos')


INFO:tensorflow:Restoring parameters from ./breakout_dqn_params/model.ckpt


[2017-07-28 00:48:07,834] Restoring parameters from ./breakout_dqn_params/model.ckpt
[2017-07-28 00:48:07,868] Clearing 8 monitor files from previous run (because force=True was provided)
[2017-07-28 00:48:07,876] Starting new video recorder writing to /home/tomoaki/work/Development/RL/breakout_videos/openaigym.video.3.20170.video000000.mp4


Model restored.


[2017-07-28 00:48:27,366] Starting new video recorder writing to /home/tomoaki/work/Development/RL/breakout_videos/openaigym.video.3.20170.video000001.mp4
[2017-07-28 00:52:21,563] Starting new video recorder writing to /home/tomoaki/work/Development/RL/breakout_videos/openaigym.video.3.20170.video000008.mp4


In [None]:
if [1, 2, 3]:
    print("hello")

In [None]:
env.action_space.n

In [None]:
y.get_shape().as_list()

In [None]:
count = 4
while count < 5:
    print(count)
    count += 1

In [None]:
"%s" % True

In [None]:
a.insert(0, 2)

In [15]:
a

[2, 1]

In [11]:
np.random.randint(0, 2, 10)

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0])

In [42]:
from collections import deque

In [44]:
x = deque([1, 2, 3], maxlen=5)

In [45]:
x.append(3)
x.append(3)
x.append(3)

In [46]:
x

deque([2, 3, 3, 3, 3])

In [57]:
 result = tf.select(pred, val_if_true, val_if_false)

AttributeError: module 'tensorflow' has no attribute 'select'

In [60]:
x = tf.placeholder(tf.bool, (None,))
y = tf.cast(x, tf.int32)
z = tf.one_hot(y, 2)

In [62]:
sess = tf.InteractiveSession()
print(y.eval(feed_dict={x:[True, False, True]}))
print(z.eval(feed_dict={x:[True, False, True]}))

[1 0 1]
[[ 0.  1.]
 [ 1.  0.]
 [ 0.  1.]]


In [13]:
type(np.arange(10).astype(int)[0])

numpy.int64

In [16]:
type(np.random.choice(range(0, 5), 3)[0])

numpy.int64

In [44]:
np.random.choice([1, 2, 3, 4], 3, False)

array([1, 4, 3])

In [12]:
x = np.arange(10)
np.append(x, 10)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [13]:
x

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])