In [1]:
import tensorflow as tf
import numpy as np
from collections import deque
import gym
import random

from actor import Actor
from critic import Critic
from ou import OUNoise

# Single seed shared by every random number generator the notebook uses.
RANDOM_SEED = 1234
# The stdlib generator must be seeded too: replay minibatches are drawn with
# random.sample, so leaving it unseeded makes runs non-reproducible.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)


In [2]:
# ---- Environment ----------------------------------------------------------
env_name = 'LunarLanderContinuous-v2'
env = gym.make(env_name)
env.seed(RANDOM_SEED)

# Problem dimensions and action limits, read from the environment's spaces.
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]
low = env.action_space.low
high = env.action_space.high

# ---- Training hyperparameters ---------------------------------------------
n_episodes = 100000
batch_size = 10   # transitions sampled per replay update
gamma = 0.98      # discount applied to the target critic's estimate

# NOTE(review): tau, actor_lr and critic_lr are defined here but are not passed
# to Actor/Critic in the visible cells — confirm how (or whether) they are used.
tau = 0.99
actor_lr = 0.0001
critic_lr = 0.001

# ---- Replay memory --------------------------------------------------------
# Bounded deque: once full, appending drops the oldest entry automatically.
mem_size = 100000
D = deque(maxlen=mem_size)

# TensorFlow session handed to both networks.
sess = tf.Session()


[2019-04-19 22:23:36,109] Making new env: LunarLanderContinuous-v2


In [3]:
# Build the agent components. Actor and Critic share the same TensorFlow
# session; the critic takes both the state and the action size as its input
# description, and the noise process is sized to the action dimension.
actor = Actor(
    sess,
    input_size=state_size,
    output_size=action_size,
    action_bounds=high,
)
critic = Critic(
    sess,
    input_size=[state_size, action_size],
    output_size=1,
)
ou = OUNoise(action_dimension=action_size)

In [4]:
def experience_replay():
    """Run one actor/critic update from a random minibatch of replay memory.

    Reads the notebook-level globals ``D`` (replay memory), ``batch_size``,
    ``state_size``, ``action_size``, ``gamma``, ``actor`` and ``critic``.
    Does nothing until ``D`` holds at least ``batch_size`` transitions.

    Steps:
      1. Sample ``batch_size`` transitions ``(state, action, reward,
         new_state, done)`` from ``D``.
      2. Build the critic targets: ``reward`` for terminal transitions,
         otherwise ``reward + gamma * Q_target(new_state, actor_target(new_state))``.
      3. Train the critic on ``(states, actions, targets)``.
      4. Train the actor with the critic's gradients w.r.t. the actor's
         current actions for the sampled states.
      5. Update both target networks.

    Returns:
        None.
    """
    if len(D) < batch_size:
        return

    minibatch = random.sample(D, batch_size)

    states = np.zeros((batch_size, state_size))
    actions = np.zeros((batch_size, action_size))
    targets = np.zeros((batch_size, 1))

    for i, (state, action, reward, new_state, done) in enumerate(minibatch):
        states[i] = state
        # Stored actions keep the leading batch axis from actor.predict;
        # flatten so each one fills a single row.
        actions[i] = np.ravel(action)

        if done:
            # Terminal transition: no bootstrapped future value, so the target
            # networks do not need to be evaluated at all.
            targets[i] = reward
        else:
            n_state = np.expand_dims(new_state, axis=0)
            n_action = actor.predict_target(n_state)
            target_q = critic.predict_target(n_state, n_action)
            targets[i] = reward + gamma * target_q

    critic.train(states, actions, targets)

    # Policy gradient: critic gradients w.r.t. the actions the current actor
    # would take in the sampled states.
    a_for_grads = actor.predict(states)
    grads = critic.gradients(states, a_for_grads)
    # critic.gradients is expected to return a leading singleton axis
    # (assumption carried over from the original code) which is dropped here.
    actor.train(states, np.squeeze(grads, axis=0))

    # These must be *called*: a bare `actor.train_target` only references the
    # attribute and leaves the target networks frozen at their initial weights.
    # NOTE(review): assumes train_target is a method on Actor/Critic — confirm.
    actor.train_target()
    critic.train_target()


In [10]:


# Main training loop: one iteration per episode. Each environment step stores
# the transition in replay memory and triggers one replay update.
for e in range(n_episodes):
    total_reward = 0
    timestep = 0

    state = env.reset()

    while True:
        env.render()

        # Actor's action for the current state (batch of one) plus exploration
        # noise, clipped to the environment's action bounds.
        action = actor.predict(np.expand_dims(state, axis=0))
        action += ou.noise()
        action = np.clip(action, low, high)

        new_state, reward, done, info = env.step(np.squeeze(action))

        # D is a deque created with maxlen=mem_size, so appending at capacity
        # already discards the oldest transition; no manual eviction is needed
        # (and pop() would have removed the newest entry, not the oldest).
        D.append([state, action, reward, new_state, done])

        experience_replay()

        state = new_state

        total_reward += reward
        timestep += 1

        if done:
            # The environment itself is reset at the top of the next episode;
            # only the noise process needs resetting here.
            ou.reset()
            break

    # Mean reward per step within this episode (total reward / steps taken).
    avg_reward = total_reward/timestep

    print('Average reward per timestep: ', avg_reward)

Average reward per episode:  -2.548988116727502


KeyboardInterrupt: 

In [None]:
# NOTE(review): bare attribute reference with no call parentheses — as the last
# expression of a cell this only displays the attribute, it does not invoke it.
# Looks like a leftover scratch cell; confirm whether it can be deleted.
actor.train_target