In [1]:
import random
import numpy as np
import math

import gym

import tensorflow as tf
import os

In [2]:
env = gym.make('CartPole-v0')

STATE_DIM  = env.observation_space.shape[0]
NUM_ACTIONS = env.action_space.n
NUM_EPISODES = 1000
MAX_T = 250
STREAK_TO_END = 50
SOLVED_T = 199

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [3]:
def simulate():
    agent = Agent()
    solved = False

    for episode in range(NUM_EPISODES):
        state = env.reset()

        for t in range(MAX_T):
            # Gym allows for rendering pictures and animations of the environment,
            # but due to additional configuration needed for a remote server will not run on the collab
            #env.render()

            action = agent.act(state)

            # Execute the action
            next_state, reward, done, _ = env.step(action)
            if done: # terminal state
                next_state = None

            agent.observe((state, action, reward, next_state))
            agent.replay()

            # Setting up for the next iteration
            state = next_state
            if done:
                print("Episode %d finished after %f time steps" % (episode, t))
                if (t >= SOLVED_T) and evaluate(agent, STREAK_TO_END - 1):
                    solved = True
                break
        if solved:
            print("Task successfully solved after %d episodes" % episode)
            break
        agent.signal_episode_end()

def evaluate(agent, streak):
    for _ in range(streak):
        state = env.reset()

        for t in range(MAX_T):
            action = agent.act(state, explore=False)
            next_state, _, done, _ = env.step(action)
            state = next_state
            if done:
                if (t < SOLVED_T):
                    return False
                break
    return True


In [4]:
MEMORY_CAPACITY = 100000
DISCOUNT_FACTOR = 0.95
MAX_EXPLORATION_RATE = 1.0
MIN_EXPLORATION_RATE = 0.01
DECAY_RATE = 0.001

In [6]:
class Agent:
    def __init__(self):
        self.explore_rate = MAX_EXPLORATION_RATE
        self.brain = Brain()
        self.memory = Memory(MEMORY_CAPACITY)
        self.steps = 0

    def act(self, s, explore=True):
        if explore and random.random() < self.explore_rate:
            return random.randint(0, NUM_ACTIONS - 1)
        else:
            return np.argmax(self.brain.predict([s])[0])

    def observe(self, sample):
        self.steps += 1
        self.memory.add(sample)

        # Reduces exploration rate linearly
        self.explore_rate = MIN_EXPLORATION_RATE + (MAX_EXPLORATION_RATE - MIN_EXPLORATION_RATE) \
                            * math.exp(-DECAY_RATE * self.steps)

    def replay(self):
        batch = self.memory.get_random_samples(self.brain.BATCH_SIZE)
        batchLen = len(batch)

        states = np.array([sample[0] for sample in batch], dtype=np.float32)
        no_state = np.zeros(STATE_DIM)
        resultant_states = np.array([(no_state if sample[3] is None else sample[3]) for sample in batch], dtype=np.float32)

        q_values_batch = self.brain.target_predict(states)
        future_q_values_batch = self.brain.target_predict(resultant_states)
        # q_values_batch = self.brain.predict(states)
        # future_q_values_batch = self.brain.predict(resultant_states)

        x = np.zeros((batchLen, STATE_DIM)).astype(np.float32)
        y = np.zeros((batchLen, NUM_ACTIONS)).astype(np.float32)

        for i in range(batchLen):
            state, action, reward, resultant_state = batch[i]

            q_values = q_values_batch[i]
            if resultant_state is None:
                q_values[action] = reward
            else:
                q_values[action] = reward + DISCOUNT_FACTOR * np.amax(future_q_values_batch[i])

            x[i] = state
            y[i] = q_values

        self.brain.train(x, y)
        if not self.steps%50:
            self.brain.transfer_variables()

    def signal_episode_end(self):
        pass


In [7]:

from collections import deque

class Brain:
    def __init__(self):
        self.BATCH_SIZE = 50
        
        self.__observation = tf.placeholder(tf.float32, [None, STATE_DIM])
        self.__q_target = tf.placeholder(tf.float32, [None, NUM_ACTIONS])

        self.model = self.create_multi_layer_neural_network(self.__observation, NUM_ACTIONS, 3, "train")
        self.target_net = self.create_multi_layer_neural_network(self.__observation, NUM_ACTIONS, 3, "target_net")
        with tf.variable_scope("None"):
            self.learning_rate = 0.05
            self.loss = tf.reduce_mean(tf.square(self.model-self.__q_target))
            self.trainer = tf.train.AdamOptimizer(learning_rate=self.learning_rate/self.BATCH_SIZE).minimize(self.loss)

        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        self.transfer_variables()

    def train(self, x, y):
        self.session.run([self.trainer], {self.__observation: x, self.__q_target: y})

    def predict(self, s):
        return self.session.run(self.model, {self.__observation: s})

    def target_predict(self, s):
        return self.session.run(self.target_net, {self.__observation: s})

    def transfer_variables(self):
        col1 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='train')
        col2 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
        assign_op = []
        assert(len(col1)==len(col2))
        assert([v.name.split("/")[-1] for v in col1]==[v.name.split("/")[-1] for v in col2])
        assign_op = [v2.assign(v1) for v1, v2 in zip(col1, col2)]
        self.session.run(assign_op)

    @staticmethod
    def create_multi_layer_neural_network(input_vars, out_dims, num_hidden_layers, namescope):
        model = None
        with tf.variable_scope(namescope):
            input_dims = input_vars.shape[1].value
            num_hidden_neurons = 50
            last_layer = input_vars

            for k in range(num_hidden_layers):
                last_layer = tf.contrib.layers.fully_connected(last_layer, num_hidden_neurons,\
                            activation_fn=tf.nn.relu, biases_initializer=tf.zeros_initializer())

            model = tf.contrib.layers.fully_connected(last_layer, out_dims,\
                            activation_fn=None, biases_initializer=tf.zeros_initializer())
        return model

    @staticmethod
    def create_single_layer_neural_network(input_vars, out_dims):
        return Brain.create_multi_layer_neural_network(input_vars, out_dims, 1)


class Memory:
    def __init__(self, capacity):
        self.examplers = deque(maxlen=capacity)
        self.capacity = capacity

    def add(self, sample):
        self.examplers.append(sample)

    def get_random_samples(self, num_samples):
        num_samples = min(num_samples, len(self.examplers))
        return random.sample(tuple(self.examplers), num_samples)

In [8]:
simulate()

Episode 0 finished after 16.000000 time steps
Episode 1 finished after 35.000000 time steps
Episode 2 finished after 13.000000 time steps
Episode 3 finished after 21.000000 time steps
Episode 4 finished after 10.000000 time steps
Episode 5 finished after 68.000000 time steps
Episode 6 finished after 35.000000 time steps
Episode 7 finished after 18.000000 time steps
Episode 8 finished after 25.000000 time steps
Episode 9 finished after 21.000000 time steps
Episode 10 finished after 14.000000 time steps
Episode 11 finished after 22.000000 time steps
Episode 12 finished after 17.000000 time steps
Episode 13 finished after 8.000000 time steps
Episode 14 finished after 18.000000 time steps
Episode 15 finished after 9.000000 time steps
Episode 16 finished after 14.000000 time steps
Episode 17 finished after 13.000000 time steps
Episode 18 finished after 12.000000 time steps
Episode 19 finished after 12.000000 time steps
Episode 20 finished after 14.000000 time steps
Episode 21 finished after

In [5]:

from collections import deque

NUM_BINS = 8
MIN_VALUE = -10
MAX_VALUE = 10

def bin_mids():
    return np.array([MIN_VALUE+(i+0.5)*(MAX_VALUE-MIN_VALUE)/NUM_BINS for i in range(NUM_BINS)])

def values_to_bin_probs(values, probs):
    new_probs = []
    right_end = MIN_VALUE
    for i in range(NUM_BINS):
        left_end = right_end
        right_end = left_end + (MAX_VALUE-MIN_VALUE)/NUM_BINS
        prob = sum(p for p, v in zip(probs, values) if right_end > v >= left_end)
        new_probs.append(prob)
    return new_probs

class Agent:
    def __init__(self):
        self.explore_rate = MAX_EXPLORATION_RATE
        self.brain = Brain()
        self.memory = Memory(MEMORY_CAPACITY)
        self.steps = 0

    def act(self, s, explore=True):
        if explore and random.random() < self.explore_rate:
            return random.randint(0, NUM_ACTIONS - 1)
        else:
            probs = self.brain.predict([s])[0]
            return np.argmax(np.sum(probs*bin_mids(), axis=1))

    def observe(self, sample):
        self.steps += 1
        self.memory.add(sample)

        # Reduces exploration rate linearly
        self.explore_rate = MIN_EXPLORATION_RATE + (MAX_EXPLORATION_RATE - MIN_EXPLORATION_RATE) \
                            * math.exp(-DECAY_RATE * self.steps)

    def replay(self):
        batch = self.memory.get_random_samples(self.brain.BATCH_SIZE)
        batchLen = len(batch)

        states = np.array([sample[0] for sample in batch], dtype=np.float32)
        no_state = np.zeros(STATE_DIM)
        resultant_states = np.array([(no_state if sample[3] is None else sample[3]) for sample in batch], dtype=np.float32)

        # q_values_batch = self.brain.target_predict(states)
        # future_q_values_batch = self.brain.target_predict(resultant_states)
        q_values_batch = self.brain.predict(states)
        future_q_values_batch = self.brain.predict(resultant_states)

        x = np.zeros((batchLen, STATE_DIM)).astype(np.float32)
        y = np.zeros((batchLen, NUM_ACTIONS, NUM_BINS)).astype(np.float32)

        for i in range(batchLen):
            state, action, reward, resultant_state = batch[i]

            q_values = q_values_batch[i]
            if resultant_state is None:
                q_values[action] = reward
            else:
                next_action = self.act(resultant_state)
                next_probs = future_q_values_batch[i][next_action]
                next_vals = reward + DISCOUNT_FACTOR * bin_mids()
                q_values[action] = values_to_bin_probs(next_vals, next_probs)

            x[i] = state
            y[i] = q_values

        self.brain.train(x, y)
        if not self.steps%50:
            self.brain.transfer_variables()

    def signal_episode_end(self):
        pass

class Brain:
    def __init__(self):
        self.BATCH_SIZE = 50
        
        self.__observation = tf.placeholder(tf.float32, [None, STATE_DIM])
        self.__q_target = tf.placeholder(tf.float32, [None, NUM_ACTIONS, NUM_BINS])

        self.model = self.create_multi_layer_neural_network(self.__observation, NUM_ACTIONS*NUM_BINS, 3, "train")
        self.target_net = self.create_multi_layer_neural_network(self.__observation, NUM_ACTIONS*NUM_BINS, 3, "target_net")
        with tf.variable_scope("None"):
            self.learning_rate = 0.05
            self.loss = tf.reduce_mean(tf.square(self.model-self.__q_target))
            self.trainer = tf.train.AdamOptimizer(learning_rate=self.learning_rate/self.BATCH_SIZE).minimize(self.loss)

        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        self.transfer_variables()

    def train(self, x, y):
        self.session.run([self.trainer], {self.__observation: x, self.__q_target: y})

    def predict(self, s):
        return self.session.run(self.model, {self.__observation: s})

    def target_predict(self, s):
        return self.session.run(self.target_net, {self.__observation: s})

    def transfer_variables(self):
        col1 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='train')
        col2 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
        assign_op = []
        assert(len(col1)==len(col2))
        assert([v.name.split("/")[-1] for v in col1]==[v.name.split("/")[-1] for v in col2])
        assign_op = [v2.assign(v1) for v1, v2 in zip(col1, col2)]
        self.session.run(assign_op)

    @staticmethod
    def create_multi_layer_neural_network(input_vars, out_dims, num_hidden_layers, namescope):
        model = None
        with tf.variable_scope(namescope):
            input_dims = input_vars.shape[1].value
            num_hidden_neurons = 50
            last_layer = input_vars

            for k in range(num_hidden_layers):
                last_layer = tf.contrib.layers.fully_connected(last_layer, num_hidden_neurons,\
                            activation_fn=tf.nn.relu, biases_initializer=tf.zeros_initializer())

            last_layer = tf.contrib.layers.fully_connected(last_layer, out_dims,\
                            activation_fn=None, biases_initializer=tf.zeros_initializer())
            divided = tf.split(last_layer, NUM_ACTIONS, axis=1)
            model = tf.reshape(tf.convert_to_tensor([tf.nn.softmax(part) for part in divided]), (-1, NUM_ACTIONS, NUM_BINS))
            print(model.shape)
        return model

    @staticmethod
    def create_single_layer_neural_network(input_vars, out_dims):
        return Brain.create_multi_layer_neural_network(input_vars, out_dims, 1)


class Memory:
    def __init__(self, capacity):
        self.examplers = deque(maxlen=capacity)
        self.capacity = capacity

    def add(self, sample):
        self.examplers.append(sample)

    def get_random_samples(self, num_samples):
        num_samples = min(num_samples, len(self.examplers))
        return random.sample(tuple(self.examplers), num_samples)