In [1]:
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim # slim is a wrapper that makes building networks easier
import matplotlib.pyplot as plot
from matplotlib import animation
from matplotlib.patches import Rectangle, Circle
from collections import deque 
from IPython.display import display, HTML

# Linear problem introduction

We'll start with a very simple problem: A linear track with 6 different locations. At each location, the agent will be able to take the action "left," which will move it to the position to the left, or "right," which will move it to the right. (If it is at the left end of the track, the "left" action will just leave the agent in the same position.) The agent will start at the left-most end of the track, and we'll give it a reward of +1 for reaching the right end of the track, which we'll consider a terminal state. 

<img src="linear.png">

Clearly the optimal policy is for the agent to move "Right" until it reaches the end of the track. In this part of the homework, we'll explore learning this task with a tabular Q-learning system.

## Mathematical questions:

(There are 16 questions across 5 sections in this homework, some with code chunks interspersed. Make sure you answer all of them! Please answer the questions in a separate document.)

1\. Assuming an optimal, completely greedy policy, and a discount factor of gamma = 0.9, calculate the Q-value of each (state, action) pair. 

2\. Under the same assumptions, calculate the value of every state (this shouldn't be much work given the last part).


## Linear problem, random, and tabular Q controller implementations

In [None]:
class linear_problem(object):
    """Class implementing the linear problem: a track of `length` positions.

    The agent starts at position 0 and moves one position "left" or "right"
    per step. The right-most position is terminal. The reward for a step
    depends only on the position the agent ends up in.
    """
    def __init__(self, length=6, rewards=None, max_lifetime=100):
        """length: number of positions on the track
           rewards: sequence holding one reward per position (put zeros on the
                    positions where you don't want a reward); defaults to +1
                    at the right end of the track and 0 everywhere else
           max_lifetime: maximum number of steps taken in a single trial

           Raises ValueError if len(rewards) != length."""
        if rewards is None:
            # Built per instance so the default always matches `length` and no
            # mutable default list is shared between instances.
            rewards = [0] * (length - 1) + [1]
        if len(rewards) != length:
            raise ValueError("The number of rewards does not match the length... Put zeros on the positions where you don't want a reward")
        self.min_state = 0
        self.max_state = length - 1
        self.rewards = list(rewards) # assume that the reward depends only on the state you end up in
        self.max_lifetime = max_lifetime

        self.reset_state()

    def get_state(self):
        """Returns tuple of current state, which in this problem is just position"""
        return (self.x,)

    def reset_state(self):
        """Resets state variables to initial conditions"""
        self.x = 0

    def update_state(self, action):
        """Updates state, returns reward of the state the agent ends up in.

        "left" moves one position left; any other action is treated as
        "right". Moves that would leave the track keep the agent in place."""
        if action == "left":
            if self.x > self.min_state:
                self.x -= 1
        elif self.x < self.max_state: # action == "right"
            # clamped so that stepping right from the end of the track cannot
            # index past the rewards list
            self.x += 1

        return self.rewards[self.x]

    def terminal(self):
        """Checks if state is end"""
        return self.x == self.max_state

    def run_trial(self, controller, testing=False):
        """Runs one trial from the initial state.

        The controller's update(state, action, next state, reward) method is
        called after every step unless testing=True, in which case a summary
        line is printed instead.

        Returns (total reward, number of steps taken)."""
        self.reset_state()
        total_reward = 0.
        steps = 0
        while steps < self.max_lifetime:
            this_state = self.get_state()
            this_action = controller.choose_action(this_state)
            reward = self.update_state(this_action)
            total_reward += reward
            new_state = self.get_state()
            steps += 1 # count the step just taken, so 5 moves report as 5 steps

            if not testing:
                controller.update(this_state, this_action, new_state, reward)

            if self.terminal():
                break

        if testing:
            print("Ran testing trial with %s controller, achieved a total reward of %.2f in %i steps" % (controller.name, total_reward, steps)) 

        return total_reward, steps

    def run_k_trials(self, controller, k):
        """Runs k training trials, using the specified controller. Controller
           must have a choose_action(state) method which returns one of "left"
           and "right," and must have an update(state, action, next state,
           reward) method, which is called after every step.

           Prints and returns (average total reward, average number of steps).
           Raises ValueError if k is not positive."""
        if k <= 0:
            raise ValueError("k must be a positive number of trials")

        sum_reward = 0.
        sum_steps = 0
        for _ in range(k):
            (trial_reward, trial_steps) = self.run_trial(controller)
            sum_reward += trial_reward
            sum_steps += trial_steps

        avg_tr = sum_reward / k
        avg_time = sum_steps / k
        # run_trial is called with testing=False above, so these are training trials
        print("Ran %i training trials with %s controller, achieved an average total reward of %.2f in an average of %.2f steps" % (k, controller.name, avg_tr, avg_time)) 
        return avg_tr, avg_time

            

class random_controller(object):
    """Random controller/base class for fancier ones."""
    def __init__(self):
        self.name = "Random"
        self.testing = False # when True, subclasses switch off exploration

    def set_testing(self):
        """Switch to testing mode (subclasses use this to disable exploration)."""
        self.testing = True

    def set_training(self):
        """Switch back to training mode (subclasses use this to enable exploration)."""
        self.testing = False

    def choose_action(self, state):
        """Takes a state and returns an action, "left" or "right," to take.
           This method ignores the state and chooses randomly; it should be
           overridden by fancy controllers."""
        return np.random.choice(["left", "right"])

    def update(self, prev_state, action, new_state, reward):
        """Learning hook called after each training step; does nothing here
           and is meant to be overridden."""
        pass

class linear_tabular_Q_controller(random_controller):
    """Tabular Q-learning controller for the linear problem.

    States are the 1-tuples (x,) returned by linear_problem.get_state(), and
    the Q-table maps each of them to a {"left": Q, "right": Q} dict."""
    def __init__(self, possible_states=range(6), epsilon=0.05, gamma=0.9, eta=0.1):
        """possible_states: iterable of positions; the last one is terminal
           Epsilon: exploration probability (epsilon-greedy)
           gamma: discount factor
           eta: update rate

           Raises ValueError if possible_states is empty."""
        super().__init__()
        self.name = "Tabular Q"
        # materialised once so that any iterable (not only sequences) works
        self.possible_states = list(possible_states)
        if not self.possible_states:
            raise ValueError("possible_states must contain at least one state")
        # small random initial values in (-0.01, 0.01]
        self.Q_table = {(x,):  {"left": 0.01-np.random.rand()/50, "right": 0.01-np.random.rand()/50} for x in self.possible_states} 
        self.str_possible_states = [str(x) for x in self.possible_states] # for printing
        self.terminal_state = self.possible_states[-1]
        self.eta = eta
        self.gamma = gamma
        self.epsilon = epsilon

    def _is_terminal(self, state):
        """True if `state` is the terminal position, given either as the
           (x,) tuple used for Q-table keys or as the bare position."""
        return state == (self.terminal_state,) or state == self.terminal_state

    def choose_action(self, state):
        """Epsilon-greedy w.r.t the current Q-table (purely greedy when
           testing; ties go to "right")."""
        if not self.testing and np.random.rand() < self.epsilon:
            return np.random.choice(["left", "right"])

        curr_Q_vals = self.Q_table[state]
        if curr_Q_vals["left"] > curr_Q_vals["right"]:
            return "left"
        return "right"

    def update(self, prev_state, action, new_state, reward):
        """Update Q table with one Q-learning step towards the target
           reward + gamma * max_a Q(new_state, a)."""
        if self._is_terminal(new_state):
            # Nothing follows a terminal state, so there is nothing to
            # bootstrap from. States arrive as (x,) tuples, so comparing
            # against the bare terminal position would never match.
            target = reward 
        else:
            target = reward + self.gamma * max(self.Q_table[new_state].values())

        self.Q_table[prev_state][action] = (1 - self.eta) * self.Q_table[prev_state][action] + self.eta * target

    def print_pretty_Q_table(self):
        """Prints a Q-table where the L-R dimension represents state and the
           top row represents the Q-value of the "right" action, the bottom row
           represents the Q-value of the "left" action. The terminal state is
           shown as "end" since no action is ever taken from it."""
        non_terminal_states = self.possible_states[:-1]
        print("x:\t" + "\t".join(self.str_possible_states))
        right_Qs = ["%.2f" % self.Q_table[(x,)]["right"] for x in non_terminal_states]
        print("right:\t"+ "\t".join(right_Qs) + "\tend") 
        left_Qs = ["%.2f" % self.Q_table[(x,)]["left"] for x in non_terminal_states]
        print("left:\t"+ "\t".join(left_Qs) + "\tend") 


## Linear problem questions

(To answer these questions, run both code chunks below, which print the Q tables over different time scales of learning.)

3\. About how long (how many training episodes) does it take the tabular Q-system to converge to the optimal Q values you calculated above?

4\. For which states do the Q-values converge earlier? For which actions? Why? 

5\. How does changing epsilon affect this? (Try epsilon = 0.2)

In [None]:
# environment used by this cell and by the linear-problem cells that follow
lp = linear_problem()

# seed first: the controller draws its initial Q-table from np.random
np.random.seed(1)
tq = linear_tabular_Q_controller(epsilon=0.05)

num_train_per_cycle = 20 # how many training episodes to run between tests
num_train_cycles = 5 # how many times train/test cycles to run

# baseline: greedy test trial before any learning
tq.set_testing()
lp.run_trial(tq, testing=True)

for cycle in range(1, num_train_cycles + 1):
    # train with exploration switched on ...
    tq.set_training()
    lp.run_k_trials(tq, num_train_per_cycle)

    # ... then evaluate greedily and show what has been learned so far
    tq.set_testing()
    print("After %i training episodes" % (cycle * num_train_per_cycle))
    lp.run_trial(tq, testing=True)
    print("Q-values:")
    tq.print_pretty_Q_table()
    print()


In [None]:

# Same experiment as the previous cell, but with many more training episodes
# between tests. Uses the `lp` environment created in the previous cell.

# fresh controller; seed first so its random initial Q-table is reproducible
np.random.seed(1)
tq = linear_tabular_Q_controller(epsilon=0.05)

num_train_per_cycle = 500 # how many training episodes to run between tests
num_train_cycles = 5 # how many times train/test cycles to run

# baseline: greedy test trial before any learning
tq.set_testing()
lp.run_trial(tq, testing=True)

for cycle in range(1, num_train_cycles + 1):
    # train with exploration switched on ...
    tq.set_training()
    lp.run_k_trials(tq, num_train_per_cycle)

    # ... then evaluate greedily and show what has been learned so far
    tq.set_testing()
    print("After %i training episodes" % (cycle * num_train_per_cycle))
    lp.run_trial(tq, testing=True)
    print("Q-values:")
    tq.print_pretty_Q_table()
    print()

6\. Why does the random controller do better than a randomly initialized tabular Q-learner (before learning)? (see code chunk below for a few comparisons):

In [None]:
# Compare, for a few seeds, a random controller against a tabular Q
# controller that has not been trained at all. Uses the `lp` environment
# created in an earlier cell.
for trial_seed in range(5):
    # random controller: one testing trial
    rc = random_controller()
    np.random.seed(trial_seed)
    print("Random")
    lp.run_trial(rc, testing=True)

    # tabular Q controller with only its random initial Q-table: one greedy
    # testing trial, no training beforehand
    np.random.seed(trial_seed)
    tq = linear_tabular_Q_controller()
    tq.set_testing()
    print("Tabular (pre-training)")
    lp.run_trial(tq, testing=True)
    print()

# Cartpole problem introduction

Now we'll explore something a little more interesting: the cartpole problem:

<img src="cartpole.png">

A pole is attached to a pivot on top of a cart which moves along a one-dimensional track. The goal of the task is to keep the pole balanced (standing upright) by moving the cart side to side. To make this into a MDP like we've discussed, we need the following elements:

* *Agent:* the controller of the cart
* *Environment:* the cart/world/physics
* *State:* we'll define the state to be a tuple of (x position of cart, x velocity of cart, angle of pole, angular velocity of pole).
* *Terminal states:* we'll end the episode when the pole tips too far over (> 15 degrees, in this implementation) or when the cart goes too far to either side (> 2.5 units).
* *Actions:* to keep it simple, we'll have only two actions: apply a force of +F toward the right, or -F toward the left, which we'll call "right" and "left," respectively.
* *Rewards:* To keep things simple and clear, we'll only give a reward in terminal states. Since all terminal states are losing, the reward will be -1.

We'll compare two Q-learning approaches to this task in this homework: 

* *Tabular:* "standard" Q-learning
* *DQN:* A deep-Q network that approximates the Q-function, loosely inspired by the Atari game playing paper.

We'll also compare to a baseline controller that takes random actions at every step.

Some of the code chunks in this part of the document have been run for you already since they take a non-trivial amount of time (especially the DQN training), or because they require [ffmpeg](https://www.ffmpeg.org/) to generate the animations. However, we encourage you to play around with the code and get your hands dirty, and install ffmpeg if you want to animate other trials!

## Conceptual questions

7\. Since the reward for every *episode* (not every action!) will be -1, why would a Q-learning system learn any interesting behavior on this task?

8\. Why might a DQN (or some other function approximator) be an appropriate choice here?

## Cartpole problem and random controller implementation

In [2]:
class cartpole_problem(object):
    """Class implementing the cartpole world -- you may want to glance at the
       methods to see if you can understand what's going on.

       The state is the tuple (x, x_dot, phi, phi_dot): cart position and
       velocity, pole angle from the vertical (radians) and its angular
       velocity. A trial is lost once the cart or the pole leaves the bounds
       checked in loses()."""
    def __init__(self, max_lifetime=1000):
        """max_lifetime: maximum number of time steps in a single trial"""
        # physics constants
        self.delta_t = 0.05 # length of one time step
        self.gravity = 9.8
        self.force = 1. # magnitude of the force applied by either action
        self.cart_mass = 1.
        self.pole_mass = 0.2
        self.mass = self.cart_mass + self.pole_mass
        self.pole_half_length = 1.
        self.max_lifetime = max_lifetime

        self.reset_state()

        # animation constants
        self.cart_half_width = 0.25
        self.cart_height = 0.2
        self.pole_half_width = 0.025
        self.cart_wheel_radius = 0.05
        self.pole_offset = self.cart_height + 2 * self.cart_wheel_radius - self.pole_half_width 
        self.cart_wheel_offset = self.cart_half_width - self.cart_wheel_radius

    def get_state(self):
        """Returns current state as a tuple"""
        return (self.x, self.x_dot, self.phi, self.phi_dot)

    def reset_state(self):
        """Reset state variables to initial conditions"""
        self.x = 0.
        self.x_dot = 0.
        self.phi = 0.
        self.phi_dot = 0.

    def tick(self, action):
        """Time step according to EOM and action.

        Positions are advanced with the current velocities first; the
        velocities are then advanced with accelerations computed from the
        updated positions."""

        # NOTE(review): "left" applies +force here, i.e. it accelerates the
        # cart toward +x, while the notebook text describes "right" as the +F
        # action. The task is symmetric so learning is unaffected, but the
        # labels look swapped relative to the text -- confirm before changing,
        # since swapping them alters every learned trajectory.
        if action == "left":
            action_force = self.force
        else:
            action_force = -self.force

        dt = self.delta_t
        self.x += dt * self.x_dot 
        self.phi += dt * self.phi_dot 

        sin_phi = np.sin(self.phi)
        cos_phi = np.cos(self.phi)

        # applied force plus the centrifugal term from the swinging pole
        F = action_force + sin_phi * self.pole_mass * self.pole_half_length * (self.phi_dot**2)
        phi_2_dot = (sin_phi * self.gravity - cos_phi * F/ self.mass) / (0.5 * self.pole_half_length * (4./3 - self.pole_mass * cos_phi**2 / self.mass))
        x_2_dot = (F - self.pole_mass * self.pole_half_length * phi_2_dot) / self.mass 
        
        self.x_dot += dt * x_2_dot 
        self.phi_dot += dt * phi_2_dot 

    def loses(self):
        """Loses if not within 2.5 m of start and 15 deg. of vertical"""
        return not (-2.5 < self.x < 2.5 and -0.262 < self.phi < 0.262)

    def animate(self, trial_state_history, ticks_per_second=20):
        """Makes a simple video showing the trial

        trial_state_history: sequence of state tuples, one per frame
        ticks_per_second: playback speed in frames per second"""
        fig, ax = plot.subplots()

        ax.set_xlim([-2.5, 2.5])
        ax.get_yaxis().set_visible(False)
        ax.set_ylim([-1, 3])

        # create patches, draw first frame
        x = trial_state_history[0][0]
        wheel_y = self.cart_wheel_radius
        cart_y = 2 * self.cart_wheel_radius

        # fg
        fg_p = Rectangle((-2.5, -1), 5, 1, facecolor="#ccaa99")
        ax.add_patch(fg_p)

        # pole
        pole_p = Rectangle((x-self.pole_half_width, self.pole_offset), 2*self.pole_half_width, 2*self.pole_half_length, facecolor="#777788")
        ax.add_patch(pole_p)
        # cart
        cart_p = Rectangle((x-self.cart_half_width, cart_y), 2*self.cart_half_width, self.cart_height, facecolor="k")
        ax.add_patch(cart_p)

        wheel1_p = Circle((x-self.cart_wheel_offset, wheel_y), self.cart_wheel_radius, facecolor="k")
        ax.add_patch(wheel1_p)

        wheel2_p = Circle((x+self.cart_wheel_offset, wheel_y), self.cart_wheel_radius, facecolor="k")
        ax.add_patch(wheel2_p)

        def draw_frame(state):
            x, _, phi, _ = state
            pole_p.set_xy((x-self.pole_half_width, self.pole_offset))
            # Rectangle angles are measured anti-clockwise, but in tick() a
            # positive phi is a lean toward +x (clockwise on screen, where +x
            # points right), so the sign has to be flipped for the pole to
            # lean the way the physics says it does.
            pole_p.angle = -np.degrees(phi)
            cart_p.set_xy((x-self.cart_half_width, cart_y))
            wheel1_p.center = (x-self.cart_wheel_offset, wheel_y)
            wheel2_p.center = (x+self.cart_wheel_offset, wheel_y)
            
        anim = animation.FuncAnimation(fig, draw_frame,
                                       frames=trial_state_history,
                                       interval=1000./ticks_per_second,
                                       repeat=False)
        display(HTML(anim.to_jshtml()))
        # the animation has been rendered to HTML; release the figure so open
        # figures do not pile up across animated trials
        plot.close(fig)

    def run_trial(self, controller, testing=False, animate=False):
        """Runs one trial from the initial state and returns its lifetime in
           steps.

           The reward is -1 on the step that loses and 0 otherwise. The
           controller's update(state, action, next state, reward) method is
           called after every step unless testing=True, in which case a
           summary line is printed instead. With animate=True the visited
           states are recorded and shown as a video at the end."""
        self.reset_state()
        i = 0
        if animate:
            trial_state_history = [self.get_state()]
        while i < self.max_lifetime:
            i += 1
            this_state = self.get_state()
            this_action = controller.choose_action(this_state)
            self.tick(this_action)
            new_state = self.get_state()

            loss = self.loses()
            reward = -1. if loss else 0.
            if not testing:
                controller.update(this_state, this_action, new_state, reward)

            if animate:
                trial_state_history.append(new_state)

            if loss:
                break

        if testing:
            print("Ran testing trial with %s Controller, achieved a lifetime of %i steps" % (controller.name, i))

        if animate:
            self.animate(trial_state_history)

        return i

    def run_k_trials(self, controller, k):
        """Runs k training trials, using the specified controller. Controller
           must have a choose_action(state) method which returns one of "left"
           and "right," and must have an update(state, action, next state,
           reward) method, which is called after every step.

           Prints and returns the average lifetime in steps.
           Raises ValueError if k is not positive."""
        if k <= 0:
            raise ValueError("k must be a positive number of trials")

        total_lifetime = 0.
        for _ in range(k):
            total_lifetime += self.run_trial(controller)

        avg_lifetime = total_lifetime / k
        print("Ran %i trials with %s Controller, (average lifetime of %f steps)" % (k,  controller.name, avg_lifetime))
        return avg_lifetime

In [3]:
class random_controller(object):
    """Random controller/base class for fancier ones."""
    def __init__(self):
        self.name = "Random"
        self.testing = False # when True, subclasses switch off exploration

    def set_testing(self):
        """Switch to testing mode (subclasses use this to disable exploration)."""
        self.testing = True

    def set_training(self):
        """Switch back to training mode (subclasses use this to enable exploration)."""
        self.testing = False

    def choose_action(self, state):
        """Takes a state and returns an action, "left" or "right," to take.
           This method ignores the state and chooses randomly; it should be
           overridden by fancy controllers."""
        return np.random.choice(["left", "right"])

    def update(self, prev_state, action, new_state, reward):
        """Learning hook called after each training step; does nothing here
           and is meant to be overridden."""
        pass
    
class alternating_controller(random_controller):
    """Just alternates left and right. Try this out if you think it's a good idea!

    Derives from random_controller so that it also provides the update(),
    set_testing() and set_training() methods that trials call on a
    controller."""
    def __init__(self):
        super().__init__()
        self.name = "Alternating"
        self.left = True # direction of the previous action; flipped before each choice

    def choose_action(self, state):
        """Takes a state and returns an action, "left" or "right," to take.
           The state is ignored: the action is simply the opposite of the
           previous one, starting with "right"."""
        self.left = not self.left
        return "left" if self.left else "right"


In [4]:
# cartpole environment shared by this cell and the cells that follow
cpp = cartpole_problem()

# Baseline for comparison: one testing trial per random seed with a
# controller that picks its actions at random.
for baseline_seed in range(10):
    np.random.seed(baseline_seed)
    cpc = random_controller()
    cpp.run_trial(cpc, testing=True)
    
# One more trial with the last controller, this time animated.
cpp.run_trial(cpc, testing=True, animate=True)


Ran testing trial with Random Controller, achieved a lifetime of 16 steps
Ran testing trial with Random Controller, achieved a lifetime of 15 steps
Ran testing trial with Random Controller, achieved a lifetime of 40 steps
Ran testing trial with Random Controller, achieved a lifetime of 18 steps
Ran testing trial with Random Controller, achieved a lifetime of 21 steps
Ran testing trial with Random Controller, achieved a lifetime of 26 steps
Ran testing trial with Random Controller, achieved a lifetime of 33 steps
Ran testing trial with Random Controller, achieved a lifetime of 16 steps
Ran testing trial with Random Controller, achieved a lifetime of 17 steps
Ran testing trial with Random Controller, achieved a lifetime of 14 steps
Ran testing trial with Random Controller, achieved a lifetime of 28 steps


28

Notice how the random controller quickly loses control of the pole and lets it tip over.

## Tabular Q learning

There is a difficulty in making this a tabular Q-learning problem: it's not a finite MDP! Since the space of x values, angles, and velocities is continuous, it's actually infinite. In order to avoid trying to make an infinite table, we'll discretize the space (actually quite drastically), by chopping the position and angle dimensions into 3 bins each, and the two velocity dimensions into 5 bins each, thus reducing the continuous state space to 3 × 5 × 3 × 5 = 225 discrete states. It's not perfect by any stretch of the imagination, but as you'll see below, it offers quite an improvement over the random controller. 

In [None]:

class tabular_Q_controller(random_controller):
    """Tabular Q-learning controller for the cartpole problem.

    Continuous (x, x_dot, phi, phi_dot) states are mapped onto a small grid
    of discrete states, each of which owns a {"left": Q, "right": Q} entry
    in the Q-table."""

    def __init__(self, epsilon=0.05, gamma=0.95, eta=0.1):
        """Epsilon: exploration probability (epsilon-greedy)
           gamma: discount factor
           eta: update rate"""
        super().__init__()
        self.name = "Tabular Q"
        self.eta = eta
        self.gamma = gamma
        self.epsilon = epsilon

        position_bins = (-1, 0, 1) # used for x and phi
        velocity_bins = (-2, -1, 0, 1, 2) # used for x_dot and phi_dot
        # every discrete state starts with small random Q-values in (-0.01, 0.01]
        self.Q_table = {}
        for x in position_bins:
            for x_dot in velocity_bins:
                for phi in position_bins:
                    for phi_dot in velocity_bins:
                        self.Q_table[(x, x_dot, phi, phi_dot)] = {
                            "left": 0.01-np.random.rand()/50,
                            "right": 0.01-np.random.rand()/50,
                        }

    @staticmethod
    def _three_way_bin(value, threshold):
        """Returns 1 above +threshold, -1 below -threshold, 0 otherwise."""
        if value > threshold:
            return 1
        if value < -threshold:
            return -1
        return 0

    @staticmethod
    def _five_way_bin(value):
        """Returns -2/2 beyond +-0.1, -1/1 beyond +-0.03, 0 otherwise."""
        if value < -0.1:
            return -2
        if value > 0.1:
            return 2
        if value < -0.03:
            return -1
        if value > 0.03:
            return 1
        return 0

    def discretize_state(self, state):
        """Convert continuous state into discrete with 3 possible values of each
           position, 5 possible values of each derivative."""
        x, x_dot, phi, phi_dot = state
        return (self._three_way_bin(x, 1.),
                self._five_way_bin(x_dot),
                self._three_way_bin(phi, 0.1),
                self._five_way_bin(phi_dot))

    def choose_action(self, state):
        """Epsilon-greedy w.r.t the current Q-table (purely greedy when
           testing; ties go to "right")."""
        table_key = self.discretize_state(state)
        explore = (not self.testing) and np.random.rand() < self.epsilon
        if explore:
            return np.random.choice(["left", "right"])

        q_here = self.Q_table[table_key]
        return "left" if q_here["left"] > q_here["right"] else "right"

    def update(self, prev_state, action, new_state, reward):
        """Update Q table with one Q-learning step for the discretized
           transition."""
        prev_key = self.discretize_state(prev_state)
        new_key = self.discretize_state(new_state)

        if reward != 0.:
            # reward states are terminal in this task, so nothing to bootstrap from
            target = reward
        else:
            target = self.gamma * max(self.Q_table[new_key].values())

        q_prev = self.Q_table[prev_key]
        q_prev[action] = (1 - self.eta) * q_prev[action] + self.eta * target


In [None]:
# seed first: the controller draws its initial Q-table from np.random
np.random.seed(0)
tqc = tabular_Q_controller()

# untrained baseline, evaluated without exploration
tqc.set_testing()
cpp.run_trial(tqc, testing=True)

# for trainable controllers, we'll run a few testing trials during
# training to see how they evolve
for training_round in range(5):
    tqc.set_training()
    cpp.run_k_trials(tqc, 1000)
    tqc.set_testing()
    cpp.run_trial(tqc, testing=True)
    
# finally, animate one testing trial of the trained controller
cpp.run_trial(tqc, testing=True, animate=True)

Notice how the tabular Q system gets the balance pretty well, but is unable to keep the cart within bounds while doing it (it tries toward the end, but then the pole tips over...)

## Tabular Q-learning questions

9\. The tabular Q-learning system does much better than a random controller, but it still only lives about 5 times as long. What could we do to improve the tabular Q system's performance on this task further? For whatever you propose, how would it affect training? 

10\. Try setting gamma = 0.0 (living in the moment). What happens? Why?

In [None]:
# Question 10: same train/test schedule as before, but with no discounting
# of... anything beyond the immediate reward (gamma = 0).
np.random.seed(0)
tqc = tabular_Q_controller(gamma=0.)

# untrained baseline, evaluated without exploration
tqc.set_testing()
cpp.run_trial(tqc, testing=True)

# alternate 1000 training trials with one greedy testing trial
for training_round in range(5):
    tqc.set_training()
    cpp.run_k_trials(tqc, 1000)
    tqc.set_testing()
    cpp.run_trial(tqc, testing=True)
    
# finally, animate one testing trial of the trained controller
cpp.run_trial(tqc, testing=True, animate=True)

11\. What happens if we set gamma = 1 (living in all moments at once)? Naively, one might expect to get random behavior, since all trials get the same total reward, and gamma = 1 is essentially saying that the total reward is all that matters, not when the reward appears. However, this is not what actually happens. Why?

In [None]:
# Question 11: same train/test schedule as before, but without any
# discounting of future rewards (gamma = 1).
np.random.seed(0)
tqc = tabular_Q_controller(gamma=1.)

# untrained baseline, evaluated without exploration
tqc.set_testing()
cpp.run_trial(tqc, testing=True)

# alternate 1000 training trials with one greedy testing trial
for training_round in range(5):
    tqc.set_training()
    cpp.run_k_trials(tqc, 1000)
    tqc.set_testing()
    cpp.run_trial(tqc, testing=True)
    
# finally, animate one testing trial of the trained controller
cpp.run_trial(tqc, testing=True, animate=True)

12\. What happens if you set epsilon = 1 (random behavior while training)? Why?

In [None]:
# Question 12: same train/test schedule as before, but every training action
# is chosen at random (epsilon = 1). Testing trials remain greedy.
np.random.seed(0)
tqc = tabular_Q_controller(epsilon=1.)

# untrained baseline, evaluated without exploration
tqc.set_testing()
cpp.run_trial(tqc, testing=True)

# alternate 1000 training trials with one greedy testing trial
for training_round in range(5):
    tqc.set_training()
    cpp.run_k_trials(tqc, 1000)
    tqc.set_testing()
    cpp.run_trial(tqc, testing=True)
    
# finally, animate one testing trial of the trained controller
cpp.run_trial(tqc, testing=True, animate=True)

13\. What happens if you set epsilon = 0 (no exploration)? Why does this happen here, and what might be different about other tasks that makes exploration important?

In [None]:
# Question 13: same train/test schedule as before, but with exploration
# switched off during training as well (epsilon = 0).
np.random.seed(0)
tqc = tabular_Q_controller(epsilon=0.)

# untrained baseline, evaluated without exploration
tqc.set_testing()
cpp.run_trial(tqc, testing=True)

# alternate 1000 training trials with one greedy testing trial
for training_round in range(5):
    tqc.set_training()
    cpp.run_k_trials(tqc, 1000)
    tqc.set_testing()
    cpp.run_trial(tqc, testing=True)
    
# finally, animate one testing trial of the trained controller
cpp.run_trial(tqc, testing=True, animate=True)

Food for thought (no answer necessary): Are the discretization values very important? (The current values were picked by a few quick rounds of trial and error.) If we discretized the space more finely, would we see better results? Is it better to space the breaks linearly or quadratically?

## DQN

In some ways, creating the DQN is simpler than creating the tabular Q-learning system. Neural nets can accept continuous input, so we can simply pass the current state to the network without discretizing. We implemented a simple DQN below, with two hidden layers, and a replay buffer that at each time step stores the current experience and samples one of the previous 1000 time steps to replay. (The buffer persists across episodes.)

As you'll see below, this system does quite a bit better. In fact, it reaches the time limit at which the cartpole code stops by default (1000 steps).

In [None]:
class dqn_controller(random_controller):
    """Simple deep-Q network controller -- 4 inputs (one for each state
       variable), two hidden layers, two outputs (Q-left, Q-right), and an
       optional replay buffer."""

    def __init__(self, epsilon=0.05, gamma=0.95, eta=1e-4, nh1=100, nh2=100, replay_buffer=True):
        """Epsilon: exploration probability (epsilon-greedy)
           gamma: discount factor
           eta: learning rate,
           nh1: number of hidden units in first hidden layer,
           nh2: number of hidden units in second hidden layer,
           replay_buffer: whether to use a replay buffer"""
        super().__init__()
        self.name = "DQN"
        self.eta = eta
        self.gamma = gamma
        self.epsilon = epsilon

        # experience replay: a FIFO of (S, A, S', R) tuples, or None when disabled
        self.replay_buffer = deque() if replay_buffer else None
        if replay_buffer:
            self.replay_buffer_max_size = 1000

        # network creation: state -> tanh hidden 1 -> tanh hidden 2 -> tanh (Q-left, Q-right)
        self.input = tf.placeholder(tf.float32, [1, 4])
        h1 = slim.layers.fully_connected(self.input, nh1, activation_fn=tf.nn.tanh)
        h2 = slim.layers.fully_connected(h1, nh2, activation_fn=tf.nn.tanh)
        self.Q_vals = slim.layers.fully_connected(h2, 2, activation_fn=tf.nn.tanh)

        # training stuff: squared error between the predicted and target Q-values
        self.target =  tf.placeholder(tf.float32, [1, 2])
        self.loss = tf.nn.l2_loss(self.Q_vals - self.target)
        optimizer = tf.train.AdamOptimizer(self.eta, epsilon=1e-3) # (this is an unrelated epsilon)
        self.train = optimizer.minimize(self.loss)

        # session and init
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def _predict_Q_vals(self, state):
        """Runs the network on one state; returns a [1, 2] array holding
           (Q-left, Q-right)."""
        return self.sess.run(self.Q_vals, feed_dict={self.input: np.array(state, ndmin=2)})

    def choose_action(self, state):
        """Takes a state and returns an action, "left" or "right," to take.
           epsilon-greedy w.r.t current Q-function approx. (purely greedy
           when testing; ties go to "right")."""
        explore = (not self.testing) and np.random.rand() < self.epsilon
        if explore:
            return np.random.choice(["left", "right"])

        q_left, q_right = self._predict_Q_vals(state)[0]
        return "left" if q_left > q_right else "right"

    def update(self, prev_state, action, new_state, reward):
        """Takes one gradient step on a single (S, A, S', R) transition.

           With a replay buffer the given transition is stored and a randomly
           drawn stored transition (possibly the same one) is trained on
           instead; without one the given transition is used directly."""
        buffer = self.replay_buffer
        if buffer is not None:
            # put this (S, A, S, R) tuple in buffer
            buffer.append((prev_state, action, new_state, reward))
            buffer_size = len(buffer)
            # pick a random (S, A, S, R) tuple from buffer
            sampled_index = np.random.randint(0, buffer_size)
            (prev_state, action, new_state, reward) = buffer[sampled_index]

            # remove a memory if getting too full
            if buffer_size > self.replay_buffer_max_size:
                buffer.popleft()

        if reward != 0.:
            target_val = reward # reward states are terminal in this task
        else:
            target_val = self.gamma * np.max(self._predict_Q_vals(new_state))

        # hacky way to update only the correct Q value: make the target for the
        # other its current value
        target_Q_vals = self._predict_Q_vals(prev_state)
        action_column = 0 if action == "left" else 1
        target_Q_vals[0, action_column] = target_val

        training_feed = {
            self.input: np.array(prev_state, ndmin=2),
            self.target: target_Q_vals.reshape([1,2]),
        }
        self.sess.run(self.train, feed_dict=training_feed)


In [None]:
# seed both NumPy (exploration, replay sampling) and TensorFlow (weight
# initialisation) before building the controller
np.random.seed(0)
tf.set_random_seed(0)
dqn = dqn_controller(replay_buffer=True)

# untrained baseline, evaluated without exploration
dqn.set_testing()
cpp.run_trial(dqn, testing=True)

# alternate 1000 training trials with one greedy testing trial
for training_round in range(8):
    dqn.set_training()
    cpp.run_k_trials(dqn, 1000)
    dqn.set_testing()
    cpp.run_trial(dqn, testing=True)
    
# finally, animate one testing trial of the trained controller
cpp.run_trial(dqn, testing=True, animate=True)

Notice how the DQN solves both problems: it is able to keep the pole balanced, and stop moving towards the left when it gets too close to the edge of the screen.

## DQN questions

14\. Why does the DQN take more episodes to train than the tabular Q-learning system? 

15\. In my implementation, I used the tanh activation function at the output layer. Why might this be an appropriate choice here? More specifically, what are some activation functions that would probably NOT yield good results at the output layer?

16\. What happens if we turn off the replay buffer? Why might it be important?

In [None]:
# Question 16: same schedule as the previous DQN run, but each update trains
# on the transition just experienced instead of one drawn from a replay buffer.
np.random.seed(0)
tf.set_random_seed(0)
dqn = dqn_controller(replay_buffer=False)

# untrained baseline, evaluated without exploration
dqn.set_testing()
cpp.run_trial(dqn, testing=True)

# alternate 1000 training trials with one greedy testing trial
for training_round in range(8):
    dqn.set_training()
    cpp.run_k_trials(dqn, 1000)
    dqn.set_testing()
    cpp.run_trial(dqn, testing=True)
    
# finally, animate one testing trial of the trained controller
cpp.run_trial(dqn, testing=True, animate=True)

Food for thought: If you gave the DQN the same discretized states that the tabular Q-learner gets, would it do any better than the tabular system does? (Try it out if you're curious!)