In [1]:
import rltensor

In [2]:
import numpy as np
from scipy.misc import imresize
from PIL import Image


def resize_data(data, width, height, c_dim=3, is_color=True):
    """Resize every image in `data` whose rank matches the requested mode.

    Images that do not match are silently dropped, so the result may hold
    fewer entries than `data`.

    Args:
        data: iterable of image arrays (anything exposing `.shape`).
        width: first entry of the size passed to `imresize`.
        height: second entry of the size passed to `imresize`.
        c_dim: required size of the last axis when `is_color` is True.
        is_color: if True keep only 3-D arrays whose last axis equals
            `c_dim`; otherwise keep only 2-D (gray scale) arrays.

    Returns:
        np.ndarray built from the resized images that passed the filter.
    """
    target_size = [width, height]

    if is_color:
        def _is_wanted(img):
            return len(img.shape) == 3 and img.shape[-1] == c_dim
    else:
        # gray scale data
        def _is_wanted(img):
            return len(img.shape) == 2

    resized = []
    for img in data:
        if _is_wanted(img):
            resized.append(imresize(img, target_size))
    return np.array(resized)

In [3]:
import numpy as np
from scipy.misc import imresize
from PIL import Image
from os import listdir
from os.path import join, isfile, isdir
from skimage.color import rgb2gray


class DefaultProcessor(object):
    """Pass-through processor that only remembers the input shape."""

    def __init__(self, input_shape):
        """Store the shape later reported by `get_input_shape`."""
        self.input_shape = input_shape

    def get_input_shape(self):
        """Return the shape given at construction time."""
        return self.input_shape

    def preprocess(self, x):
        """Convert a raw observation to a NumPy array (always a copy)."""
        observation = np.array(x)
        return observation

    def tensor_process(self, x):
        """Return the given tensor unchanged."""
        return x
    
class AtariProcessor(DefaultProcessor):
    """Processor that resizes a frame and converts it to gray scale."""

    def __init__(self, height, width):
        """Remember the target frame size; the input shape is (height, width)."""
        super(AtariProcessor, self).__init__((height, width))
        self.height = height
        self.width = width

    def preprocess(self, x):
        """Resize one frame with `resize_data`, then apply `rgb2gray`."""
        resized_batch = resize_data([x], self.height, self.width)
        return rgb2gray(resized_batch[0])

    def tensor_process(self, x):
        """Move axis 1 of a 4-D tensor to the last position.

        NOTE(review): presumably the input is (batch, window_length, H, W)
        and the output is channels-last for the conv layers; confirm.
        """
        return tf.transpose(x, perm=[0, 2, 3, 1])

In [4]:
import os
import time
from collections import deque
from copy import deepcopy
from logging import getLogger

import numpy as np
import tensorflow as tf
from six.moves import xrange
from tqdm import tqdm

from rltensor.memories import SequentialMemory

logger = getLogger(__name__)

class Agent(object):
    """Base class holding the training loop, schedules and checkpointing.

    Subclasses must implement `_build_graph`, `predict` and `observe`, and
    `_build_graph` is expected to create `step_update_op`, `update_op`,
    `epsilon`, `learning_rate_op` and to call `_build_summaries`, because
    `train` reads all of them.
    """

    def __init__(self, env, conf, sess=None):
        """Read hyper-parameters from `conf` and build the TensorFlow graph.

        Args:
            env: environment exposing `reset`, `step` and `action_space`.
            conf: mapping with the keys read below; an optional
                "load_file_path" entry restores a checkpoint after the
                graph is built.
            sess: TensorFlow session; a new `tf.Session` is created when
                omitted.
        """
        if sess is None:
            sess = tf.Session()
        self.sess = sess
        self.conf = conf
        self.model_dir = conf["model_dir"]
        self.limit = conf["memory_limit"]
        self.window_length = conf["window_length"]
        self.memory = self._get_memory(self.window_length, self.limit)
        self.gamma = conf["gamma"]
        self.error_clip = conf["error_clip"]
        self.processor = conf["processor"]
        self.ep_start = conf["ep_start"]
        self.ep_end = conf["ep_end"]
        self.t_ep_end = conf["t_ep_end"]
        self.t_learn_start = conf["t_learn_start"]
        self.t_train_freq = conf["t_train_freq"]
        self.t_target_q_update_freq = conf["t_target_q_update_freq"]
        # Get input and action dim info from env
        self.env = env
        self.env_name = conf["env_name"]
        self.state_dim = self.processor.get_input_shape()
        self.action_dim = env.action_space.n
        # configure for learning schedule
        self.learning_rate = conf["learning_rate"]
        self.learning_rate_minimum = conf["learning_rate_minimum"]
        self.learning_rate_decay = conf["learning_rate_decay"]
        self.learning_rate_decay_step = conf["learning_rate_decay_step"]
        self.global_step = tf.Variable(0, trainable=False)
        # reward is in (min_r, max_r)
        self.min_r = conf["min_r"]
        self.max_r = conf["max_r"]
        self.batch_size = conf["batch_size"]
        self.log_freq = conf["log_freq"]
        self.avg_length = conf["avg_length"]
        # Build tensorflow network
        st = time.time()
        logger.debug("Building tensorflow graph...")
        with self.sess.as_default():
            self._build_graph()
            self.saver = tf.train.Saver()
        # The elapsed time needs its own format specifier; passing it as a
        # bare extra argument makes the logging module report an error.
        logger.debug("Finished building tensorflow graph, spent time: %s",
                     time.time() - st)
        if "load_file_path" in conf:
            self.load_params(conf["load_file_path"])

    def _get_memory(self, window_length, limit):
        """Create the replay memory used by `train`."""
        return SequentialMemory(window_length, limit)

    def _build_graph(self):
        """Build the network and optimisation ops; implemented by subclasses."""
        raise NotImplementedError()

    def train(self, t_max, num_max_start_steps=0, save_file_path=None,
              load_file_path=None, overwrite=True):
        """Run the interaction/training loop for at most `t_max` steps.

        Args:
            t_max: number of environment steps to run.
            num_max_start_steps: exclusive upper bound for the number of
                random actions taken before the first recorded observation;
                0 disables random starts.
            save_file_path: checkpoint path handed to `save_params`.
            load_file_path: checkpoint to restore after variables are
                initialised. When omitted, `conf["load_file_path"]` is used
                if present, because the initialiser below would otherwise
                discard what `__init__` restored.
            overwrite: forwarded to the initial `save_params` call.

        A KeyboardInterrupt stops the loop early; parameters are saved
        either way.
        """
        tf.global_variables_initializer().run(session=self.sess)
        if load_file_path is None and "load_file_path" in self.conf:
            load_file_path = self.conf["load_file_path"]
        if load_file_path is not None:
            self.load_params(load_file_path)
        # Save Model
        self.save_params(save_file_path, overwrite)
        # initialize target network
        self.update_target_q_network()
        # initialize environment
        observation = self.env.reset()
        action = self.env.action_space.sample()
        # Perform random starts at beginning of episode and do not record
        # them into the experience. This slightly changes the start
        # position between games.
        if num_max_start_steps == 0:
            num_random_start_steps = 0
        else:
            num_random_start_steps = np.random.randint(num_max_start_steps)
        for _ in range(num_random_start_steps):
            action = self.env.action_space.sample()
            observation, reward, done, info = self.env.step(action)
            observation = deepcopy(observation)
            if done:
                # The episode ended during the random starts; begin again.
                observation = deepcopy(self.env.reset())
        # initialize memory
        observation = self.processor.preprocess(observation)
        self.memory.append(observation=observation, reward=0,
                           action=action, terminal=False, training=True)
        # accumulate results
        total_reward = deque(maxlen=self.avg_length)
        total_loss = deque(maxlen=self.avg_length)
        total_q_val = deque(maxlen=self.avg_length)
        ep_rewards = []
        ep_actions = []
        num_ep = 1
        try:
            for t in tqdm(xrange(t_max)):
                # 1. predict
                state = self.memory.get_recent_state()
                action = self.predict(state)
                # 2. act
                observation, reward, terminal, info = self.env.step(action)
                observation = self.processor.preprocess(observation)
                # 3. store data and (after the warm-up) train the network
                is_training = t >= self.t_learn_start
                result = self.observe(observation, reward, action,
                                      terminal, is_training)
                if is_training:
                    q, loss, is_update = result
                    mean_q = np.mean(q)
                    logger.debug("a: %d, r:%f, t:%s, q:%.4f, l: %.4f",
                                 action, reward, terminal, mean_q, loss)
                    # Update step
                    self.sess.run(self.step_update_op)
                    step = self.global_step.eval(session=self.sess)
                    # Update target network
                    if (step + 1) % self.t_target_q_update_freq == 0:
                        self.update_target_q_network()
                    # update statistics
                    total_reward.append(reward)
                    total_loss.append(loss)
                    total_q_val.append(mean_q)
                    ep_actions.append(action)
                    ep_rewards.append(reward)
                    # Write summary
                    if step % self.log_freq == 0:
                        if ep_rewards:
                            max_ep_reward = np.max(ep_rewards)
                            min_ep_reward = np.min(ep_rewards)
                            avg_ep_reward = np.mean(ep_rewards)
                        else:
                            max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0
                        tag_dict = {
                            'average.reward': np.mean(total_reward),
                            'average.loss': np.mean(total_loss),
                            'average.q': np.mean(total_q_val),
                            'episode.max_reward': max_ep_reward,
                            'episode.min_reward': min_ep_reward,
                            'episode.avg_reward': avg_ep_reward,
                            'episode.num_of_game': num_ep,
                            'training.epsilon': self.epsilon.eval(session=self.sess),
                            'training.learning_rate':
                                self.learning_rate_op.eval(session=self.sess),
                            'episode.rewards': ep_rewards,
                            'episode.actions': ep_actions,
                        }
                        self._inject_summary(tag_dict)

                # Episode ends must be handled during the warm-up as well,
                # otherwise the loop keeps stepping a finished environment.
                if terminal:
                    observation = self.env.reset()
                    observation = self.processor.preprocess(observation)
                    # NOTE(review): if `memory.reset()` clears the whole
                    # replay buffer, experience is discarded every episode;
                    # confirm against rltensor.memories.
                    self.memory.reset()
                    self.memory.append(observation=observation, reward=0,
                                       action=action, terminal=False, training=True)
                    ep_rewards = []
                    ep_actions = []
                    num_ep += 1
        except KeyboardInterrupt:
            # Let the user stop training early and still save below.
            pass
        # Update parameters before finishing
        self.save_params(save_file_path, True)

    def predict(self, s_t, ep=None):
        """Choose an action for state `s_t`; implemented by subclasses.

        `ep` is optional because `train` calls `predict` with the state only.
        """
        raise NotImplementedError()

    def _get_learning_rate(self):
        """Return a staircase exponentially decayed learning-rate tensor.

        The decayed value is floored at `learning_rate_minimum`.
        """
        decayed = tf.train.exponential_decay(
            self.learning_rate,
            self.global_step,
            self.learning_rate_decay_step,
            self.learning_rate_decay,
            staircase=True)
        return tf.maximum(self.learning_rate_minimum, decayed)

    def _get_epsilon(self):
        """Return the exploration-rate tensor, annealed linearly to `ep_end`.

        epsilon = ep_end + max(0, ep_start - ep_end) * rest / t_ep_end, where
        rest = max(0, t_ep_end - max(0, global_step - t_learn_start)).
        """
        elapsed = tf.maximum(
            0., tf.cast(self.global_step - self.t_learn_start, tf.float32))
        rest_steps = tf.maximum(0., self.t_ep_end - elapsed)
        delta_ep = max(0, self.ep_start - self.ep_end)
        epsilon = self.ep_end + delta_ep * rest_steps / self.t_ep_end
        return epsilon

    def update_target_q_network(self):
        """Run `update_op`, which the subclass builds in `_build_graph`."""
        self.sess.run(self.update_op)

    def _build_summaries(self):
        """Create one placeholder and summary op per logged tag."""
        self.writer = tf.summary.FileWriter(self.model_dir, self.sess.graph)
        self.summary_placeholders = {}
        self.summary_ops = {}
        scalar_summary_tags = [
            'average.reward', 'average.loss', 'average.q',
            'episode.max_reward', 'episode.min_reward', 'episode.avg_reward',
            'episode.num_of_game', 'training.epsilon', 'training.learning_rate']
        for tag in scalar_summary_tags:
            placeholder = tf.placeholder('float32', None, name=tag.replace(' ', '_'))
            self.summary_placeholders[tag] = placeholder
            self.summary_ops[tag] = tf.summary.scalar(
                "%s/%s" % (self.env_name, tag), placeholder)

        histogram_summary_tags = ['episode.rewards', 'episode.actions']
        for tag in histogram_summary_tags:
            placeholder = tf.placeholder('float32', None, name=tag.replace(' ', '_'))
            self.summary_placeholders[tag] = placeholder
            self.summary_ops[tag] = tf.summary.histogram(tag, placeholder)

    def _inject_summary(self, tag_dict):
        """Evaluate the summary op of every tag in `tag_dict` and write it.

        Args:
            tag_dict: maps a tag created in `_build_summaries` to the value
                fed into its placeholder.
        """
        step = self.global_step.eval(session=self.sess)
        tags = list(tag_dict.keys())
        feed = {self.summary_placeholders[tag]: tag_dict[tag] for tag in tags}
        summary_str_lists = self.sess.run(
            [self.summary_ops[tag] for tag in tags], feed)
        for summary_str in summary_str_lists:
            self.writer.add_summary(summary_str, step)

    def load_params(self, file_path):
        """Loads parameters of an estimator from a file.

        Args:
            file_path: str, The path to the file.
        """
        self.saver.restore(self.sess, file_path)
        print("Model restored.")

    def save_params(self, file_path=None, overwrite=True):
        """Saves parameters of an estimator as a file.

        Args:
            file_path: str, The path to where the parameters should be saved.
                Defaults to "params/model.ckpt"; the "params" directory is
                created when missing.
            overwrite: bool, If `False` and `file_path` already exists, raises an error.

        Raises:
            NameError: if `overwrite` is False and "<file_path>.meta" exists.
        """
        if file_path is None:
            if not os.path.isdir("params"):
                os.mkdir("params")
            file_path = "params/model.ckpt"
        if not overwrite:
            # The ".meta" file is used as the marker of an existing checkpoint.
            _path = ".".join([file_path, "meta"])
            if os.path.isfile(_path):
                raise NameError("%s already exists." % file_path)
        save_path = self.saver.save(self.sess, file_path)
        print("Model saved in file: %s" % save_path)

In [5]:
import os
import time
import numpy as np
import tensorflow as tf
from logging import getLogger
import random

# from .agent import Agent
from rltensor.utils import get_shape

logger = getLogger(__name__)


class DQN(Agent):
    """Deep Q-network agent with a separate target network."""

    def __init__(self, env, conf, q_network_cls, sess=None):
        """Store the network class, then let `Agent` build the graph.

        Args:
            env: environment passed through to `Agent`.
            conf: configuration mapping; `conf["q_conf"]` is handed to
                `q_network_cls`.
            q_network_cls: callable building a network from
                `(conf["q_conf"], scope_name=...)`; the result is called as
                `network(state, training)` and must expose `.variables`.
            sess: optional TensorFlow session.
        """
        # Must be set before Agent.__init__, which calls _build_graph.
        self.q_network_cls = q_network_cls
        super(DQN, self).__init__(env, conf, sess)

    def _build_graph(self):
        """Build the online/target networks, the loss and the training ops.

        Separate placeholders are used for the current state (online
        network) and the next state (target network).
        """
        self.step_update_op = tf.assign(self.global_step, self.global_step + 1)
        # training flag
        self.training = tf.placeholder(tf.bool, name="training")
        # state shape has to be (batch, length,) + input_dim
        self.state = tf.placeholder(tf.float32,
                                    get_shape(self.state_dim, maxlen=self.window_length),
                                    name='state')
        _state = self.processor.tensor_process(self.state)
        self.target_state = tf.placeholder(tf.float32,
                                           get_shape(self.state_dim, maxlen=self.window_length),
                                           name='target_state')
        _target_state = self.processor.tensor_process(self.target_state)
        # Online network and greedy action
        self.q_network = self.q_network_cls(self.conf["q_conf"], scope_name="q_network")
        self.q_val = self.q_network(_state, self.training)
        assert self.q_val.get_shape().as_list()[-1] == self.action_dim
        self.max_action = tf.argmax(self.q_val, axis=1)
        # Q-value of the action that was actually taken
        self.action = tf.placeholder(tf.int32, (None,), name='action')
        action_one_hot = tf.one_hot(self.action, depth=self.action_dim)
        self.action_q_val = tf.reduce_sum(self.q_val * action_one_hot, axis=1)
        # Build target network
        self.target_q_network = self.q_network_cls(self.conf["q_conf"], scope_name="target_q_network")
        target_q_val = self.target_q_network(_target_state, self.training)
        self.reward = tf.placeholder(tf.float32, (None,), name='reward')
        # Bootstrap from the target network evaluated on the next state.
        # The target is treated as a constant for the gradient.
        max_q_val = tf.stop_gradient(tf.reduce_max(target_q_val, axis=1))
        # NOTE(review): terminal transitions are not masked out of the
        # bootstrap term because the experience fields carrying that flag
        # are not visible here; confirm and add (1 - terminal) if available.
        target_val = self.reward + self.gamma * max_q_val
        # Clip error to stabilize learning: quadratic inside the clip
        # range, linear (and continuous at the boundary) outside.
        delta = target_val - self.action_q_val
        abs_delta = tf.abs(delta)
        clipped_error = tf.where(abs_delta < self.error_clip,
                                 0.5 * tf.square(delta),
                                 self.error_clip * (abs_delta - 0.5 * self.error_clip),
                                 name='clipped_error')
        self.loss = tf.reduce_mean(clipped_error, name='loss')
        # Build optimization
        self.learning_rate_op = self._get_learning_rate()
        self.epsilon = self._get_epsilon()
        self.q_optim = tf.train.AdamOptimizer(self.learning_rate_op)\
            .minimize(self.loss, var_list=self.q_network.variables)
        self.update_op = self._get_update_op()
        with tf.name_scope('summaries'):
            self._build_summaries()

    def observe(self, observation, reward, action, terminal, training):
        """Store one transition and, when training, evaluate a minibatch.

        Args:
            observation: preprocessed observation after the action.
            reward: raw reward; clipped into [min_r, max_r] before storage.
            action: action that was taken.
            terminal: whether the episode ended with this step.
            training: if False the transition is only stored.

        Returns:
            `(q_values, loss, is_update)` from `q_learning_minibatch` when
            `training` is True, otherwise None.
        """
        # clip reward into  (min_r, max_r)
        reward = max(self.min_r, min(self.max_r, reward))
        # We always keep data
        self.memory.append(observation, action, reward, terminal, training=training)
        if not training:
            return None
        step = self.global_step.eval(session=self.sess)
        # Apply a gradient step only once every `t_train_freq` steps.
        is_update = (step + 1) % self.t_train_freq == 0
        experiences = self.memory.sample(self.batch_size)
        return self.q_learning_minibatch(experiences, is_update)

    def q_learning_minibatch(self, experiences, is_update=True):
        """Evaluate Q-values and loss on a batch, optionally training on it.

        Args:
            experiences: sequence of items exposing `state0`, `state1`,
                `reward` and `action`.
            is_update: run the optimiser on this batch when True.

        Returns:
            Tuple `(q_t, loss, is_update)`.
        """
        feed_dict = {
            self.state: [experience.state0 for experience in experiences],
            self.target_state: [experience.state1 for experience in experiences],
            self.reward: [experience.reward for experience in experiences],
            self.action: [experience.action for experience in experiences],
            self.training: True,
        }
        # One session call serves both the statistics and the update.
        fetches = [self.action_q_val, self.loss]
        if is_update:
            fetches.append(self.q_optim)
        results = self.sess.run(fetches, feed_dict=feed_dict)
        q_t, loss = results[0], results[1]
        return q_t, loss, is_update

    def predict(self, state):
        """Pick an action epsilon-greedily for a single state."""
        ep = self.epsilon.eval(session=self.sess)
        if random.random() < ep:
            action = np.random.randint(0, self.action_dim)
        else:
            action = self.sess.run(self.max_action,
                                   feed_dict={self.state: [state],
                                              self.training: False})[0]
        return action

    def _get_update_op(self):
        """Return ops copying each online variable into its target twin.

        Variables are paired by position in the two `.variables` lists.
        """
        return [tf.assign(target_var, var)
                for target_var, var in zip(self.target_q_network.variables,
                                           self.q_network.variables)]

In [None]:
from rltensor.networks import FeedForward
import gym
# Single source of truth for the environment id (also used as summary prefix).
ENV_NAME = 'DemonAttack-v0'
env = gym.make(ENV_NAME)

conf = {"q_conf":[
            {"name": "conv2d", "kernel_size":(8, 8), "num_filter":32, "stride":4,
             "padding": 'SAME', "is_batch":False, 'activation': tf.nn.relu},
            {"name": "conv2d", "kernel_size":(5, 5), "num_filter":64, "stride":2,
             "padding": 'SAME', "is_batch":True, 'activation': tf.nn.relu},
           {"name": "conv2d", "kernel_size": (3, 3), "num_filter":64, "stride":1,
             "padding": 'SAME', "is_batch":True, 'activation': tf.nn.relu},
            {"name": "dense", "is_flatten":True, "is_batch":True, "num_hidden": 512, 'activation': tf.nn.relu},
            # One linear output per action: Q-values are unbounded returns, so
            # they must not be squashed into a probability distribution.
            # NOTE(review): assumes FeedForward simply calls `activation(x)`;
            # confirm that tf.identity is accepted here.
            {"name": "dense", 'num_hidden': env.action_space.n, 'activation': tf.identity}
        ],
        "memory_limit": 1000000,
        "window_length": 4,
        "gamma": 0.99,
        "learning_rate": 2.5e-4,
        # Equal to learning_rate, so the decay schedule below has no effect.
        "learning_rate_minimum": 2.5e-4,
        "learning_rate_decay": 0.9,
        "learning_rate_decay_step": 100,
        # Not read by Agent or DQN in this notebook.
        "ep": 1e-3,
        "min_r": -1,
        "max_r": 1,
        "batch_size": 32,
        "error_clip": 1.0,
        "processor": AtariProcessor(84, 84),
        "t_learn_start": 100,
        "t_train_freq": 4,
        "t_target_q_update_freq": 10000,
        "ep_start": 1.0,
        "ep_end": 0.1,
        "t_ep_end": int(1e6),
        "model_dir": "./logs/dqn",
        "log_freq": 1000,
        "avg_length": 10000,
        "env_name": ENV_NAME
}
# logger.setLevel("DEBUG")
# Start from an empty default graph so re-running this cell does not clash
# with ops left over from a previous run.
tf.reset_default_graph()
dqn = DQN(env, conf, q_network_cls=FeedForward)
dqn.train(int(1e7))

[2017-07-12 23:30:05,296] Making new env: DemonAttack-v0
  0%|          | 84/10000000 [00:00<3:19:52, 833.83it/s]

Model saved in file: params/model.ckpt


  7%|▋         | 657351/10000000 [8:19:21<481:37:35,  5.39it/s]  

In [77]:
# Scratch check: inspect `nb_entries` of the replay memory after the run
# (presumably the number of stored entries; confirm in rltensor.memories).
dqn.memory.nb_entries

2103

In [29]:
# Scratch check: draw 100 integers to see which values randint(0, 5) yields.
np.random.randint(0, 5, 100)

array([1, 2, 4, 0, 2, 2, 3, 1, 0, 1, 1, 0, 3, 0, 4, 0, 3, 0, 2, 0, 1, 1, 2,
       2, 3, 4, 2, 3, 3, 3, 0, 0, 1, 2, 4, 2, 2, 1, 0, 0, 2, 1, 4, 4, 2, 0,
       4, 4, 2, 3, 3, 1, 4, 4, 3, 3, 0, 2, 0, 4, 1, 2, 3, 4, 4, 3, 2, 2, 0,
       1, 4, 4, 1, 3, 1, 3, 2, 1, 2, 2, 4, 0, 2, 2, 0, 3, 3, 2, 0, 1, 0, 0,
       3, 2, 4, 3, 4, 3, 1, 2])

In [19]:
# Scratch: open an interactive session and build a tiny graph, y = x + 1.
sess = tf.InteractiveSession()
x = tf.Variable(0)
y = x + 1
# Variable initialisation is left disabled in this cell.
# tf.global_variables_initializer().run()

In [38]:
# Scratch check: a non-empty list is truthy, so the message is printed.
if [1, 2, 3]:
    print("hello")

hello


In [25]:
# Scratch check: read `n` from the environment's action space
# (the same value Agent stores as `action_dim`).
env.action_space.n

6

In [23]:
# Scratch check: static shape of `y` as a Python list.
y.get_shape().as_list()

[]

In [11]:
# Scratch check: the loop body runs while count is below 5, i.e. once here.
count = 4
while not count >= 5:
    print(count)
    count = count + 1

4


In [6]:
# Scratch check: how %s renders a bool.
"%s" % True

'True'

In [14]:
# `a` was never defined in this notebook (leftover kernel state); the
# recorded output [2, 1] implies it held [1] before the insert.
a = [1]
a.insert(0, 2)

In [15]:
# Display the list to confirm the new element went to the front.
a

[2, 1]

In [11]:
# Scratch check: draw 10 integers to see which values randint(0, 2) yields.
np.random.randint(0, 2, 10)

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0])

In [42]:
from collections import deque

In [44]:
# Bounded deque holding three items, with room for two more.
x = deque(maxlen=5)
x.extend([1, 2, 3])

In [45]:
# Push three more items onto the right end of the deque.
x.extend([3, 3, 3])

In [46]:
# Display the deque after the appends to see the effect of maxlen=5.
x

deque([2, 3, 3, 3, 3])