In [1]:
import tensorflow as tf
import numpy as np
from collections import deque
import gym
import random

from actor import Actor
from critic import Critic
from ou import OUNoise

RANDOM_SEED = 1234
# Seed every random number generator this notebook draws from so that a
# Restart & Run All is repeatable. The stdlib `random` module must be seeded
# as well: experience_replay() selects its minibatch with random.sample().
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)


In [2]:
# Hyperparameters and shared training state.
env_name = 'LunarLanderContinuous-v2'
n_episodes = 100000                   # number of passes of the outer training loop
batch_size, mem_size = 10, 100000     # minibatch size / replay-buffer capacity
gamma = 0.98                          # discount applied to the target critic's value
# NOTE(review): actor_lr, critic_lr and tau are defined here but are not passed
# to Actor/Critic in the visible cells - confirm whether they are meant to be.
actor_lr, critic_lr = 0.0001, 0.001
tau = 0.99

# Create the environment and seed it with the notebook-wide seed.
env = gym.make(env_name)
env.seed(RANDOM_SEED)

# Problem dimensions and action limits are read from the environment spaces.
state_size, action_size = env.observation_space.shape[0], env.action_space.shape[0]
low, high = env.action_space.low, env.action_space.high

# Bounded replay buffer of [state, action, reward, new_state, done] records.
D = deque(maxlen=mem_size)

# TensorFlow session handed to both networks.
sess = tf.Session()


[2019-04-19 22:23:36,109] Making new env: LunarLanderContinuous-v2


In [3]:
# Both networks are built on the same TensorFlow session; the actor receives
# the environment's upper action limits as its action bounds.
actor = Actor(
    sess,
    input_size=state_size,
    output_size=action_size,
    action_bounds=high,
)
# The critic takes a (state, action) pair and produces a single value.
critic = Critic(
    sess,
    input_size=[state_size, action_size],
    output_size=1,
)
# Exploration noise source with one component per action dimension.
ou = OUNoise(action_dimension=action_size)

In [4]:
def _apply_target_update(network):
    """Run the target-network update exposed as ``network.train_target``.

    The original code only evaluated ``network.train_target`` as a bare
    expression, which invokes nothing, so the target networks were never
    updated.
    """
    update = network.train_target
    if callable(update):
        update()
    else:
        # NOTE(review): assumes a non-callable train_target is a TensorFlow op
        # (or list of ops) runnable in the shared session - confirm against
        # actor.py / critic.py.
        sess.run(update)


def experience_replay():
    """Perform one training step from a random minibatch of the replay buffer.

    Reads the module-level ``D``, ``batch_size``, ``state_size``,
    ``action_size``, ``gamma``, ``actor`` and ``critic``.

    Steps:
      1. Return immediately while ``D`` holds fewer than ``batch_size`` records.
      2. Sample ``batch_size`` records of
         ``[state, action, reward, new_state, done]``.
      3. Build critic targets: ``reward`` for terminal records, otherwise
         ``reward + gamma * Q_target(new_state, actor_target(new_state))``.
      4. Train the critic on ``(states, actions, targets)``.
      5. Train the actor with the critic's gradients evaluated at the actor's
         current actions for the sampled states.
      6. Trigger the target-network update of both networks.

    Returns:
        None.
    """
    if len(D) < batch_size:
        return

    minibatch = random.sample(D, batch_size)

    # Stack each field once. Actions are stored by the training loop exactly
    # as produced by actor.predict (with a leading batch axis of 1), so they
    # are flattened to (batch_size, action_size) here.
    states = np.asarray([t[0] for t in minibatch], dtype=np.float64)
    states = states.reshape(batch_size, state_size)
    actions = np.asarray([t[1] for t in minibatch], dtype=np.float64)
    actions = actions.reshape(batch_size, action_size)
    rewards = np.asarray([t[2] for t in minibatch], dtype=np.float64)
    rewards = rewards.reshape(batch_size, 1)
    new_states = np.asarray([t[3] for t in minibatch], dtype=np.float64)
    new_states = new_states.reshape(batch_size, state_size)
    dones = np.asarray([bool(t[4]) for t in minibatch]).reshape(batch_size, 1)

    # Query the target networks once for the whole minibatch rather than once
    # per record; predict/train/gradients below are already called with
    # batched inputs.
    next_actions = actor.predict_target(new_states)
    next_q = np.reshape(critic.predict_target(new_states, next_actions),
                        (batch_size, 1))

    # Terminal transitions do not bootstrap from the next state.
    targets = np.where(dones, rewards, rewards + gamma*next_q)

    critic.train(states, actions, targets)

    a_for_grads = actor.predict(states)
    grads = critic.gradients(states, a_for_grads)
    # critic.gradients is squeezed on axis 0 exactly as in the original call.
    actor.train(states, np.squeeze(grads, axis=0))

    _apply_target_update(actor)
    _apply_target_update(critic)


In [9]:


# Main training loop: each iteration plays one episode to termination,
# storing every transition in the replay buffer D and running one
# experience_replay() training step per environment step.
for e in range(n_episodes):
    total_reward = 0
    timestep = 0

    # The environment is reset exactly once per episode, here.
    state = env.reset()
    done = False

    while not done:
        env.render()

        # Policy output for the current state plus exploration noise,
        # clipped to the environment's action limits.
        action = actor.predict(np.expand_dims(state, axis=0))
        action += ou.noise()
        action = np.clip(action, low, high)

        new_state, reward, done, info = env.step(np.squeeze(action))

        # D is a deque(maxlen=mem_size): appending to a full buffer already
        # discards the oldest record, so no manual eviction is needed.
        D.append([state, action, reward, new_state, done])

        experience_replay()

        state = new_state

        total_reward += reward
        timestep += 1

    # Restart the exploration noise process for the next episode.
    ou.reset()

    # The loop body runs at least once, so timestep >= 1 here.
    avg_reward = total_reward/timestep

    print('Average reward per step: ', avg_reward)

[0.86579258 0.73694203]
[0.87167086 0.69911429]
[0.82056632 0.69838827]
[0.73960564 0.78914221]
[0.73171911 0.79965186]
[0.65205022 0.83367722]
[0.76131768 0.8159976 ]
[0.81880288 0.85272785]
[0.76154916 0.81235203]
[0.73645792 0.73657873]
[0.78874137 0.64955562]
[0.7915532  0.68342439]
[0.84819634 0.72179489]
[0.794085   0.72239678]
[0.7975787  0.75585726]
[0.78840283 0.67541117]
[0.74971116 0.55180848]
[0.78609421 0.60018315]
[0.8500394  0.64629305]
[0.82741021 0.69616482]
[0.78396555 0.69374627]
[0.77725697 0.68948877]
[0.74306191 0.75506611]
[0.8249539  0.82541663]
[0.82110039 0.7768428 ]
[0.82145628 0.80309682]
[0.82985375 0.71155186]
[0.79911459 0.73874415]
[0.82481438 0.70631954]
[0.774829   0.71479479]
[0.75051689 0.71660027]
[0.74571629 0.64202768]
[0.74496736 0.73159872]
[0.73907538 0.78420028]
[0.70535599 0.76281433]
[0.68055522 0.68812211]
[0.63369501 0.74323022]
[0.70334603 0.79264832]
[0.65804347 0.82089289]
[0.64207351 0.82390333]
[0.72919509 0.8136163 ]
[0.78485903 0.74

[0.69743955 0.65387125]
[0.69261464 0.71956536]
[0.67532282 0.68707113]
[0.63657108 0.72413562]
[0.78873809 0.72414969]
[0.78484945 0.74062122]
[0.81414121 0.7733371 ]
[0.80306365 0.74626672]
[0.81952868 0.83935299]
[0.81890619 0.81779329]
[0.77928641 0.83438627]
[0.72014544 0.78384692]
[0.73940715 0.7354625 ]
[0.65008935 0.71875874]
[0.76275915 0.80647085]
[0.78064819 0.78751225]
[0.70627637 0.78395466]
[0.57434953 0.79956778]
[0.6717026  0.85010645]
[0.76206603 0.83130766]
[0.77579171 0.84738729]
[0.76979999 0.83592987]
[0.75482151 0.82776911]
[0.6881203  0.77039149]
[0.67211853 0.78708137]
[0.61404973 0.66942134]
[0.6881909  0.69749337]
[0.66719565 0.75485416]
[0.62296392 0.73164414]
[0.56506995 0.64943594]
[0.61592642 0.74107448]
[0.62158162 0.74307827]
[0.66707931 0.83471911]
[0.75159039 0.7771783 ]
[0.8050171  0.76279953]
[0.81509691 0.80125373]
[0.85418504 0.75730296]
[0.7961204 0.7221368]
[0.67986933 0.74913361]
[0.62967446 0.67344797]
[0.5931875  0.60121897]
[0.67888535 0.7666

KeyboardInterrupt: 

In [None]:
# Scratch inspection cell: this evaluates the attribute without calling it, so
# the notebook only displays the object and nothing is invoked.
# NOTE(review): if train_target is meant to run the target-network update it
# has to be called - confirm against actor.py.
actor.train_target