In [1]:
import rltensor

In [2]:
from skimage.color import rbg2gray

ImportError: cannot import name 'rbg2gray'

In [3]:
import numpy as np
from scipy.misc import imresize
from PIL import Image


def resize_data(data, width, height, c_dim=3, is_color=True):
    """resize data for trainining dcgan
    Args:
        data: list of image data, each of which has a shape,
            (width, height, color_dim) if is_color==True
            (width, height) otherwisei
    """
    if is_color:
        converted_data = np.array([imresize(d, [width, height]) for d in data
                                if (len(d.shape)==3 and d.shape[-1] == c_dim)])
    else:
        # gray scale data
        converted_data = np.array([imresize(d, [width, height]) for d in data
                                if (len(d.shape)==2)])
    return converted_data

In [21]:
import numpy as np
from scipy.misc import imresize
from PIL import Image
from os import listdir
from os.path import join, isfile, isdir
from skimage.color import rgb2gray


class DefaultProcessor(object):
    def __init__(self, input_shape):
        self.input_shape = input_shape
    
    def preprocess(self, x):
        return np.array(x)
    
    def tensor_process(self, x):
        return x
    
    def get_input_shape(self):
        return self.input_shape
    
class AtariProcessor(DefaultProcessor):
    def __init__(self, height, width):
        self.height = height
        self.width = width
        self.input_shape = (height, width)
    
    def preprocess(self, x):
        x = resize_data([x], self.height, self.width)[0]
        x = rgb2gray(x)
        return x
    
    def tensor_process(self, x):
        # change to (batch, width, hight, window_length)
        return tf.transpose(x, [0, 2, 3, 1])

In [22]:
from PIL import Image

f = Image.open("/home/tomoaki/work/Development/web_scraping/idle_images/cont_img20130_464.jpg")

In [23]:
processor = AtariProcessor(84, 84)

np.max(processor.preprocess(np.array(f)))

0.9949701960784314

In [25]:
processor.get_input_shape()

(84, 84)

In [26]:
from tqdm import tqdm
import tensorflow as tf
from logging import getLogger
import time
from six.moves import xrange

from rltensor.memories import SequentialMemory

logger = getLogger(__name__)

class Agent(object):
    def __init__(self, env, conf, sess=None):
        if sess is None:
            sess = tf.Session()
        self.sess = sess
        self.conf = conf
        self.limit = conf["memory_limit"]
        self.window_length = conf["window_length"]
        self.memory = self._get_memory(self.window_length, self.limit)
        self.gamma = conf["gamma"]
        self.error_clip = conf["error_clip"]
        self.processor = conf["processor"]
        # Get input and action dim info from env
        self.env = env
        self.state_dim = processor.get_input_shape()
        self.action_dim = env.action_space.n
        # configure for learning schedule 
        self.learning_rate = conf["learning_rate"]
        self.learning_rate_minimum = conf["learning_rate_minimum"]
        self.learning_rate_decay = conf["learning_rate_decay"]
        self.learning_rate_decay_step = conf["learning_rate_decay_step"]
        self.global_step = tf.Variable(0, trainable=False)
        # reward is in (min_r, max_r)
        self.min_r = conf["min_r"]
        self.max_r = conf["max_r"]
        self.batch_size = conf["batch_size"]
        # Build tensorflow network
        st = time.time()
        logger.debug("Building tensorflow graph...")
        with self.sess.as_default():
            self._build_graph()
            self.saver = tf.train.Saver()
        logger.debug("Finished building tensorflow graph, spent time:", time.time() - st)
        if "load_file_path" in conf:
            self.load_params(conf["load_file_path"])
        
    def _get_memory(self, window_length, limit):
        return SequentialMemory(window_length, limit)
        
    def _build_graph(self):
        raise NotImplementedError()

    def train(self, t_max):
        tf.global_variables_initializer().run(session=self.sess)
        # initialize target netwoork
        self.update_target_q_network()

        observation = self.env.reset()
        for t in tqdm(xrange(t_max)):
            # 1. predict
            if t > 0:
                state = self.memory.get_recent_state()
                action = self.predict(state)
            else:
                action = self.random_action()
            # 2. act
            observation, reward, terminal, info = self.env.step(action)
            observation = self.processor.preprocess(observation)
            # 3. store data and train network
            if t < 10:
                result = self.observe(observation, reward, action, terminal, False)
                continue
            else:
                result = self.observe(observation, reward, action, terminal, True)
            q, loss, is_update = result
            logger.debug("a: %d, r:%f, t:%s, q:%.4f, l: %.4f" % \
                (action, reward, terminal, np.mean(q), loss))

    def predict(self, s_t, ep):
        raise NotImplementedError()

    def update_target_q_network(self):
        self.sess.run(self.update_op)

In [35]:
import os
import time
import numpy as np
import tensorflow as tf
from logging import getLogger
import random

# from .agent import Agent
from rltensor.utils import get_shape

logger = getLogger(__name__)


class DQN(Agent):
    def __init__(self, env, conf, q_network_cls, sess=None):
        self.q_network_cls = q_network_cls
        super(DQN, self).__init__(env, conf, sess)
        
    def _build_graph(self):
        """Build all of the network and optimizations
        
        just for conveninece of trainig, seprate placehoder for train and target network
        critic network input: [raw_data, smoothed, downsampled]
        """
        # training flag
        self.training = tf.placeholder(tf.bool, name="training")
        # state shape has to be (batch, length,) + input_dim
        self.state = tf.placeholder(tf.float32,
                                     get_shape(self.state_dim, maxlen=self.window_length),
                                     name='state')
        _state = self.processor.tensor_process(self.state)
        self.target_state = tf.placeholder(tf.float32,
                                            get_shape(self.state_dim, maxlen=self.window_length),
                                            name='target_state')
        _target_state = self.processor.tensor_process(self.target_state)
        # Employ maximal strategy
        self.q_network = self.q_network_cls(self.conf["q_conf"], scope_name="q_network")
        self.q_val = self.q_network(_state, self.training)
        assert self.q_val.get_shape().as_list()[-1] == self.action_dim
        self.max_action = tf.argmax(self.q_val, dimension=1)
        # Build action graph
        self.action = tf.placeholder(tf.int32, (None,), name='action')
        action_one_hot = tf.one_hot(self.action, depth=self.action_dim)
        self.action_q_val = tf.reduce_sum(self.q_val * action_one_hot, axis=1)
        # Build target network
        self.target_q_network = self.q_network_cls(self.conf["q_conf"], scope_name="target_q_network")
        target_q_val = self.target_q_network(_target_state, self.training)
        self.reward = tf.placeholder(tf.float32, (None,), name='reward')
        max_one_hot = tf.one_hot(self.max_action, depth=self.action_dim)
        max_q_val = tf.reduce_sum(self.q_val * max_one_hot, axis=1)
        target_val = self.reward  + self.gamma * max_q_val
        # Clip error to stabilize learning
        delta = target_val - self.action_q_val
        clipped_error = tf.where(tf.abs(delta) < self.error_clip,
                                    0.5 * tf.square(delta),
                                    tf.abs(delta), name='clipped_error')
        self.loss = tf.reduce_mean(clipped_error, name='loss')
        # Build optimization
        self.learning_rate_op = tf.maximum(self.learning_rate_minimum,
          tf.train.exponential_decay(
              self.learning_rate,
              self.global_step,
              self.learning_rate_decay_step,
              self.learning_rate_decay,
              staircase=True))
        self.loss = tf.reduce_mean(tf.square(target_val - self.action_q_val), name='loss')
        self.q_optim = tf.train.AdamOptimizer(self.learning_rate_op)\
            .minimize(self.loss, var_list=self.q_network.variables)
        self.update_op = self._get_update_op()

    def observe(self, observation, reward, action, terminal, training):
        # clip reward into  (min_r, max_r)
        reward = max(self.min_r, min(self.max_r, reward))
        # We always keep data
        self.memory.append(observation, action, reward, terminal, training=True)
        if training:
            experiences = self.memory.sample(self.batch_size)
            result = self.q_learning_minibatch(experiences)
            self.sess.run(tf.assign(self.global_step, self.global_step + 1))
            return result
        else:
            return None

    def q_learning_minibatch(self, experiences):
        feed_dict = {
            self.state: [experience.state0 for experience in experiences],
            self.target_state: [experience.state1 for experience in experiences],
            self.reward: [experience.reward for experience in experiences],
            self.action: [experience.action for experience in experiences],
            self.training: True,
        }
        _, q_t, loss = self.sess.run([self.q_optim, self.action_q_val, self.loss],
                                     feed_dict=feed_dict)
        return q_t, loss, True
    
    def predict(self, state, ep=1e-3):
        if random.random() < ep:
            action = random.randint(0, self.action_dim)
        else:
            action = self.sess.run(self.max_action, 
                                   feed_dict={self.state: [state],
                                              self.training: False})[0]
        return action
    
    def _get_update_op(self):
        update_op = []
        for target_var, var in zip(self.target_q_network.variables, self.q_network.variables):
            update_op.append(tf.assign(target_var, var))
        return update_op
    
    def random_action(self):
        return np.random.randint(0, self.action_dim)

In [36]:
from rltensor.networks import FeedForward
import gym
env = gym.make('DemonAttack-v0')

conf = {"q_conf":[
            {"name": "conv2d", "kernel_size":(8, 8), "num_filter":32, "stride":4,
             "padding": 'SAME', "is_batch":False, 'activation': tf.nn.relu},
            {"name": "conv2d", "kernel_size":(5, 5), "num_filter":64, "stride":2,
             "padding": 'SAME', "is_batch":True, 'activation': tf.nn.relu},
           {"name": "conv2d", "kernel_size": (3, 3), "num_filter":64, "stride":1,
             "padding": 'SAME', "is_batch":True, 'activation': tf.nn.relu},
            {"name": "dense", "is_flatten":True, "is_batch":True, "num_hidden": 512, 'activation': tf.nn.relu},
            {"name": "dense", 'num_hidden': 6, 'activation': tf.nn.softmax}
        ],
        "memory_limit": 10000,
        "window_length": 4,
        "gamma": 0.9,
        "learning_rate": 0.1,
        "learning_rate_minimum": 1e-3,
        "learning_rate_decay": 0.9,
        "learning_rate_decay_step": 100,
        "ep": 1e-3,
        "min_r": -1,
        "max_r": 1,
        "batch_size": 32,
        "error_clip": 1.0,
        "processor": AtariProcessor(84, 84)
}
tf.reset_default_graph()
dqn = DQN(env, conf, q_network_cls=FeedForward)
dqn.train(1000)

[2017-07-10 01:51:16,331] Making new env: DemonAttack-v0
100%|██████████| 1000/1000 [00:48<00:00, 20.54it/s]


In [12]:
dqn.memory.nb_entries

1

In [36]:
get_shape(3)

(None, None, 3)

In [19]:
sess = tf.InteractiveSession()
x = tf.Variable(0)
y = x + 1
# tf.global_variables_initializer().run()

In [38]:
if [1, 2, 3]:
    print("hello")

hello


In [25]:
env.action_space.n

6

In [23]:
y.get_shape().as_list()

[]

In [11]:
count = 4
while count < 5:
    print(count)
    count += 1

4


In [6]:
"%s" % True

'True'

In [14]:
a.insert(0, 2)

In [15]:
a

[2, 1]

In [11]:
np.random.randint(0, 2, 10)

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0])