In [1]:
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future

In [2]:
import gym
import os
import sys
import random
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from gym import wrappers
from datetime import datetime
from scipy.misc import imresize

if '../carpole' not in sys.path:
    sys.path.append('../carpole')
from q_learning_bins import plot_running_avg

In [3]:
# constants
IM_WIDTH = 80
IM_HEIGHT = 80
#MAX_EXPERIENCES = 500000
MIN_EXPERIENCES = 5000

In [4]:
def downsample_image(A):
    B = A[31:195] # select the import parts of the image
    B = B.mean(axis=2) # convert to grayscale
    B = B / 255.0 # scale to 0..1
    
    # downsample image
    # changing aspect ratio doen't significantly distort the image
    # nearest neighbor interpolation produce a much sharper image
    # than default bilinear
    B = imresize(B, size= (IM_HEIGHT, IM_WIDTH), interp= 'nearest')
    return B

In [5]:
class DQN:
    def __init__(self, K, conv_layer_sizes, hidden_layer_sizes, gamma, scope, max_experiences= 10000, min_experiences= 100, batch_sz= 32):
         # K = number of actions
        self.K = K
        self.scope = scope
        
        with tf.variable_scope(scope):
            # input and targets
            self.X = tf.placeholder(tf.float32, shape= (None, 4, IM_HEIGHT, IM_WIDTH), name= 'X')
            # tensorflow convolution needs the order to be:
            # (num_samples, height, width, "color")
            # so we need to tranpose later
            self.G = tf.placeholder(tf.float32, shape= (None,), name= 'G')
            self.actions = tf.placeholder(tf.int32, shape= (None, ), name= 'actions')
            
            # calculate output and cost
            # convolutional layers
            # these built-in layers are faster and don't require us to
            # calculate the size of the output of the final conv layer!
            Z = self.X
            Z = tf.transpose(Z, [0, 2, 3, 1])
            for num_output_filters, filtersz, poolsz in conv_layer_sizes:
                Z = tf.contrib.layers.conv2d(
                    Z,
                    num_output_filters,
                    filtersz,
                    poolsz,
                    activation_fn= tf.nn.relu
                )
            
            # fully connected layers
            Z = tf.contrib.layers.flatten(Z)
            for M in hidden_layer_sizes:
                Z = tf.contrib.layers.fully_connected(Z,M)
                
            # final output layer
            self.predict_op = tf.contrib.layers.fully_connected(Z,K)
            
            selected_action_values = tf.reduce_sum(
                self.predict_op * tf.one_hot(self.actions, K),
                reduction_indices = [1]
            )
            
            cost = tf.reduce_sum(tf.square(self.G - selected_action_values))
            # self.train_op = tf.train.AdamOptimizer(10e-3).minimize(cost)
            # self.train_op = tf.train.AdagradOptimizer(10e-3).minimize(cost)
            self.train_op = tf.train.RMSPropOptimizer(2.5e-4, decay= 0.99, epsilon= 10e-3).minimize(cost)
            # self.train_op = tf.train.MomentumOptimizer(10e-4, momentum= 0.9).minimize(cost)
            # self.train_op = tf.train.GradientDescentOptimizer(10e-5).minimize(cost)
            
            # create replay memory
            self.experience = []
            self.max_experiences = max_experiences
            self.min_experiences = min_experiences
            self.batch_sz = batch_sz
            self.gamma = gamma
            
    def copy_from(self, other):
        print("coping to the target_network")
        mine = [t for t in tf.trainable_variables() if t.name.startswith(self.scope)]
        mine = sorted(mine, key= lambda v: v.name)
        theirs = [t for t in tf.trainable_variables() if t.name.startswith(other.scope)]
        theirs = sorted(theirs, key= lambda v: v.name)
        
        ops = []
        for p, q in zip(mine, theirs):
            actual = self.session.run(q)
            op = p.assign(actual)
            ops.append(op)
        
        self.session.run(ops)
        
    def set_session(self, session):
        self.session = session
        
    def predict(self, states):
        return self.session.run(self.predict_op, feed_dict= {self.X: list(states)})
    
    def train(self, target_network):
        # sample a random batch from buffer, do an iteration of GD
        if len(self.experience) < self.min_experiences:
            # don't do anything if we don't have enough experience
            return
        
        # randomly select a batch
        sample = random.sample(self.experience, self.batch_sz)
        states, actions, rewards, next_states = map(np.array, zip(*sample))
        next_Q = np.max(target_network.predict(next_states), axis= 1)
        targets = [r + self.gamma*next_q for r, next_q in zip(rewards, next_Q)]
        
        # call optimizer
        self.session.run(
            self.train_op,
            feed_dict ={
                self.X: states,
                self.G: targets,
                self.actions: list(actions)
            }
        )
        
    def add_experience(self, s, a, r, s2):
        if len(self.experience) >= self.max_experiences:
            self.experience.pop(0)
        self.experience.append((s, a, r, s2))
        
    def sample_action(self, x, eps):
        if np.random.random() < eps:
            return np.random.choice(self.K)
        else:
            return np.argmax(self.predict([x])[0])
        


In [6]:
def update_state(state, observation):
    # downsample and grayscale observation
    observation_small= downsample_image(observation)
    state.append(observation_small)
    if len(state) > 4:
        state.pop(0)
    return state
    #return np.append(state[1:], np.expand_dims(observation_small, 0), axis=0)
        

In [7]:
def play_one(env, model, tmodel, eps, eps_step, gamma, copy_period):
    observation = env.reset()
    done = False
    totalreward = 0
    iters = 0
    state = []
    prev_state = []
    update_state(state, observation) # add the first observation
    while not done: 
    
    #and iters < 2000:
        # if we reach 2000, just quit, don't want this going forever
        # the 200 limit seems a bit early
        
        if len(state) < 4:
            # we can't choose an action base on model
            action = env.action_space.sample()
        else:
            action = model.sample_action(state, eps)
            
        # copy state to prev state
        prev_state.append(state[-1])
        if len(prev_state) > 4:
            prev_state.pop(0)
            
        # perform the action
        observation, reward, done, info = env.step(action)
        
        # add the new frame to the state
        update_state(state, observation)
        
        totalreward += reward
        if done:
            reward = -200
            
        # update the model
        model.add_experience(prev_state, action, reward, state)
        model.train(tmodel)
        
        iters += 1
        eps = max(eps - eps_step, 0.1) # decrease linealy until 0.1
        
        if iters % copy_period == 0:
            tmodel.copy_from(model)
            
    return totalreward, eps, iters


In [8]:
def main():
    env = gym.make('Breakout-v0')
    gamma = 0.99
    copy_period = 400
    
    D = len(env.observation_space.sample())
    K = env.action_space.n
    conv_sizes = [(32, 8, 4), (64, 4, 2), (64, 3, 1)]
    hidden_sizes = [512]
    model = DQN(K, conv_sizes, hidden_sizes, gamma, scope= 'main')
    target_model = DQN(K, conv_sizes, hidden_sizes, gamma, scope= 'target')
    init = tf.global_variables_initializer()
    session = tf.InteractiveSession()
    session.run(init)
    model.set_session(session)
    target_model.set_session(session)
    
    batch_sz = 32
    num_episodes = 10000
    total_t = 0
    episode_rewards = np.zeros(num_episodes)
    
    # epsilon
    # decays linearly until 0.1
    eps = 1.0
    eps_step = 0.1
    
    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)
        
    print("Populating experience replay buffer...")
    obs = env.reset()
    obs_small = downsample_image(obs)
    #state = []
    state = np.stack([obs_small] * 4, axis=0).tolist()
    # assert(state.shape == (4, 80, 80))
    
    for i in range(MIN_EXPERIENCES):

        action = np.random.choice(K)
        obs, reward, done, _ = env.step(action)
        next_state = update_state(state, obs)
        # assert(state.shape == (4, 80, 80))
        model.add_experience(state, action, reward, next_state)

        if done:
            obs = env.reset()
            obs_small = downsample_image(obs)
            state = np.stack([obs_small] * 4, axis=0).tolist()
            # assert(state.shape == (4, 80, 80))
        else:
            state = next_state
            
    
    # Play a number of episodes and learn!
    for i in range(num_episodes):
        total_t, episode_reward, num_steps_in_episode= play_one(
            env,
            model,
            target_model,
            eps,
            eps_step,
            gamma,
            copy_period
        )
        episode_rewards[i] = episode_reward

        last_100_avg = episode_rewards[max(0, i - 100):i + 1].mean()
        print("Episode:", i,
            #"Duration:", duration,
            "Num steps:", num_steps_in_episode,
            "Reward:", episode_reward,
            #"Training time per step:", "%.3f" % time_per_step,
            "Avg Reward (Last 100):", "%.3f" % last_100_avg,
            #"Epsilon:", "%.3f" % epsilon
        )
        sys.stdout.flush()


In [9]:
if __name__ == '__main__':
    main()

Populating experience replay buffer...
Episode: 0 Num steps: 354 Reward: 0.1 Avg Reward (Last 100): 0.100
Episode: 1 Num steps: 275 Reward: 0.1 Avg Reward (Last 100): 0.100
Episode: 2 Num steps: 315 Reward: 0.1 Avg Reward (Last 100): 0.100
coping to the target_network
Episode: 3 Num steps: 605 Reward: 0.1 Avg Reward (Last 100): 0.100
Episode: 4 Num steps: 384 Reward: 0.1 Avg Reward (Last 100): 0.100
Episode: 5 Num steps: 452 Reward: 0.1 Avg Reward (Last 100): 0.100
Episode: 6 Num steps: 422 Reward: 0.1 Avg Reward (Last 100): 0.100
Episode: 7 Num steps: 311 Reward: 0.1 Avg Reward (Last 100): 0.100
Episode: 8 Num steps: 382 Reward: 0.1 Avg Reward (Last 100): 0.100
Episode: 9 Num steps: 330 Reward: 0.1 Avg Reward (Last 100): 0.100
Episode: 10 Num steps: 312 Reward: 0.1 Avg Reward (Last 100): 0.100
Episode: 11 Num steps: 329 Reward: 0.1 Avg Reward (Last 100): 0.100
Episode: 12 Num steps: 469 Reward: 0.1 Avg Reward (Last 100): 0.100
Episode: 13 Num steps: 399 Reward: 0.1 Avg Reward (Last 10

Episode: 119 Num steps: 430 Reward: 0.1 Avg Reward (Last 100): 0.100
Episode: 120 Num steps: 397 Reward: 0.1 Avg Reward (Last 100): 0.100
Episode: 121 Num steps: 499 Reward: 0.1 Avg Reward (Last 100): 0.100
Episode: 122 Num steps: 568 Reward: 0.1 Avg Reward (Last 100): 0.100
Episode: 123 Num steps: 550 Reward: 0.1 Avg Reward (Last 100): 0.100
Episode: 124 Num steps: 197 Reward: 0.1 Avg Reward (Last 100): 0.100


ValueError: setting an array element with a sequence.

In [None]:
list([1,2,3])

In [None]:
list(1,2,3)

In [None]:
list((1,2,3))