In [5]:
import numpy as np
import ParallelEnvironment
import Worker
import Estimator_Cliff_Walk
from ParallelEnvironment import *
from Worker import *
from Estimator_Cliff_Walk import *


In [2]:
import tensorflow as tf
import numpy as np


In [3]:
from lib.envs.cliff_walking import *

In [4]:
# Reload the project modules automatically before each cell runs, so edits to
# their source files are picked up without restarting the kernel. In mode 1,
# autoreload only reloads the modules registered through %aimport below.
%load_ext autoreload
%autoreload 1
%aimport ParallelEnvironment
%aimport Worker
%aimport Estimator_Cliff_Walk

In [6]:
class Provider:
    """Factory that builds wrapped cliff-walking environments one at a time.

    Each call to :meth:`create` passes the current value of ``counter`` to the
    new ``Environment`` and then advances the counter by one, so successive
    environments receive 0, 1, 2, ...
    """

    def __init__(self):
        # Value handed to the next environment built by create().
        self.counter = 0

    def create(self):
        """Return a new ``Environment`` wrapping a fresh ``CliffWalkingEnv``.

        The second constructor argument is the current counter value
        (presumably an environment index - confirm against ``Environment``);
        the third is always ``False``.
        """
        next_index = self.counter
        wrapped = Environment(CliffWalkingEnv(), next_index, False)
        # Only advance the counter once the environment was built successfully.
        self.counter = next_index + 1
        return wrapped

In [20]:
# Clear TensorFlow's default graph before building a new estimator, so that
# re-running this cell starts from an empty graph (presumably Estimator adds
# its ops to the default graph - confirm in Estimator_Cliff_Walk).
tf.reset_default_graph()
provider = Provider()
# NOTE(review): the meaning of the positional arguments (32, 8, 4) is defined in
# ParallelEnvironment, which is not visible here - replace them with named
# constants once confirmed.
parallel = ParallelEnvironment(provider, 32, 8, 4)
# NOTE(review): presumably (number of actions, number of states, hidden units,
# learning rate) for the 4x12 cliff-walking grid - confirm against
# Estimator_Cliff_Walk before relying on this reading.
estimator = Estimator(4, 48, 64, 0.001)

In [8]:
def get_epsilon(step):
    """Return the exploration rate to use at the given global step.

    The schedule is constant: ``step`` is accepted for interface purposes but
    is not used, and the result is always 0.01.
    """
    constant_epsilon = 0.01
    return constant_epsilon

In [9]:
def calculate_q_targets(estimator, states, rewards, done, discount_factor=1.0):
    """Compute one-step Q-learning targets ``r + gamma * max_a Q(s', a)``.

    Args:
        estimator: object whose ``predict_q_values(states)`` returns one row of
            action values per state.
        states: the states to bootstrap from. ``train`` passes the states
            observed *after* the environment step, i.e. the successor states.
        rewards: rewards received for the transitions, one per state.
        done: per-transition termination flags; entries equal to 1 mark
            terminal transitions. May be an ndarray or any array-like.
        discount_factor: gamma applied to the bootstrapped value. The default
            of 1.0 reproduces the original undiscounted targets.

    Returns:
        Array of targets, one per transition. For terminal transitions the
        bootstrapped value is dropped and the target is the reward alone.
    """
    next_max_q_values = np.max(estimator.predict_q_values(states), axis=1)
    # Coerce to an array first: comparing a plain list with 1 yields the scalar
    # False, which as an index selects nothing and would silently leave the
    # terminal states unmasked.
    terminal = np.asarray(done) == 1
    next_max_q_values[terminal] = 0
    q_targets = rewards + discount_factor * next_max_q_values
    return q_targets

In [10]:
def train(env, estimator, num_global_steps):
    """Run the Q-learning loop against a batched environment.

    Every global step selects epsilon-greedy actions for the current shared
    states, steps the environment with them, computes the Q-targets from the
    states/rewards/done flags visible afterwards, and updates the estimator on
    the pre-step states.

    Args:
        env: environment exposing ``get_shared_variables()`` (returning states,
            rewards, done flags and actions) and ``step(actions)``.
        estimator: object providing ``choose_e_greedy_actions``,
            ``predict_q_values`` (used via ``calculate_q_targets``) and
            ``update``.
        num_global_steps: number of iterations to run.

    Returns:
        The same ``estimator`` object that was passed in.
    """
    # The arrays are fetched once and re-read every iteration; presumably
    # env.step() refreshes them in place. The fourth array (actions) is unused.
    shared_states, shared_rewards, shared_done, _ = env.get_shared_variables()

    for step in range(num_global_steps):
        completed = step + 1
        if completed % 1000 == 0:
            print("curr step: {}".format(completed))

        actions = estimator.choose_e_greedy_actions(shared_states, get_epsilon(step))
        # Snapshot the states the actions were chosen for before the
        # environment moves on.
        previous_states = np.copy(shared_states)
        env.step(actions)

        targets = calculate_q_targets(estimator, shared_states, shared_rewards, shared_done)
        estimator.update(previous_states, actions, targets)

    return estimator
        
        
        
        
    
    
    

In [37]:
estimator = train(parallel, estimator, 25000)
# About 3000 steps are usually sufficient with 32 envs, but training sometimes
# takes longer. Don't fiddle with the hyperparameters.

curr step: 1000
curr step: 2000
curr step: 3000
curr step: 4000
curr step: 5000
curr step: 6000
curr step: 7000
curr step: 8000
curr step: 9000
curr step: 10000
curr step: 11000
curr step: 12000
curr step: 13000
curr step: 14000
curr step: 15000
curr step: 16000
curr step: 17000
curr step: 18000
curr step: 19000
curr step: 20000
curr step: 21000
curr step: 22000
curr step: 23000
curr step: 24000
curr step: 25000


In [38]:
estimator.predict_q_values([36])

array([[-15.6539259 , -91.29176331, -20.53508949, -21.19099426]], dtype=float32)

In [23]:
def eval_model(estimator, env, num_episodes, max_steps=None):
    """Roll out the greedy policy and print per-episode length and return.

    Args:
        estimator: object whose ``choose_greedy_actions([state])`` returns a
            sequence with the greedy action for each given state.
        env: environment with ``reset()`` returning the initial state and
            ``step(action)`` returning ``(next_state, reward, done, info)``.
        num_episodes: number of episodes to run.
        max_steps: optional cap on the number of steps per episode. With the
            default ``None`` an episode only ends when the environment reports
            ``done``, so a greedy policy that cycles between states never
            terminates; pass a positive integer to cut such episodes short.

    Returns:
        None. Results are reported through ``print`` only.
    """
    for i in range(num_episodes):
        print("episode nr: {}".format(i))
        state = env.reset()
        done = False
        # Only the rewards are needed for the reported length and return.
        rewards = []
        while not done and (max_steps is None or len(rewards) < max_steps):
            action = estimator.choose_greedy_actions([state])[0]
            state, reward, done, _ = env.step(action)
            rewards.append(reward)

        print("episodenlänge: {}".format(len(rewards)))
        print("episode reward: {}".format(sum(rewards)))
            
            

In [39]:
eval_model(estimator, CliffWalkingEnv(), 10)

episode nr: 0
episodenlänge: 15
episode reward: -15.0
episode nr: 1
episodenlänge: 15
episode reward: -15.0
episode nr: 2
episodenlänge: 15
episode reward: -15.0
episode nr: 3
episodenlänge: 15
episode reward: -15.0
episode nr: 4
episodenlänge: 15
episode reward: -15.0
episode nr: 5
episodenlänge: 15
episode reward: -15.0
episode nr: 6
episodenlänge: 15
episode reward: -15.0
episode nr: 7
episodenlänge: 15
episode reward: -15.0
episode nr: 8
episodenlänge: 15
episode reward: -15.0
episode nr: 9
episodenlänge: 15
episode reward: -15.0


In [40]:
estimator.predict_q_values([24])

array([[-20.12506104, -15.20067406, -19.74595451, -19.5741291 ]], dtype=float32)

In [280]:
env = CliffWalkingEnv()

In [289]:
state = env.reset()
print(state)
env.render()

36
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C  C  C  C  C  C  C  C  C  T



In [296]:
# Take one greedy step from the current state, store the resulting state back
# into `state` so the next step starts from it, and draw the grid.
action = estimator.choose_greedy_actions([state])
state,reward, done, info = env.step(action[0])
env.render()

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  x  C  C  C  C  C  T



In [151]:
# Take one greedy step and print the chosen action. The result of env.step()
# is stored back into `state`; discarding it leaves `state` stale, so every
# later greedy action would be chosen for a position the agent already left.
action = estimator.choose_greedy_actions([state])
print(action)
state, reward, done, info = env.step(action[0])
env.render()

[0]
x  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T



In [114]:
# Take one greedy step and print the chosen action. The result of env.step()
# is stored back into `state`; discarding it leaves `state` stale, so every
# later greedy action would be chosen for a position the agent already left.
action = estimator.choose_greedy_actions([state])
print(action)
state, reward, done, info = env.step(action[0])
env.render()

[0]
x  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T



In [20]:
states, rewards, done ,actions = parallel.get_shared_variables()


In [21]:
states

array([36, 36, 36, 36, 36, 36, 36, 36], dtype=uint32)

In [22]:
rewards

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.], dtype=float32)

In [23]:
done

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.], dtype=float32)

In [24]:
actions

array([[ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.]], dtype=float32)