# 1. Load libraries and create the environment

In [16]:
import tensorflow as tf
import numpy as np
import gym
import matplotlib.pyplot as plt
# Build the Acrobot environment and take the underlying env object
# (`.unwrapped`) instead of the wrapped one returned by gym.make.
# NOTE(review): presumably this is done to drop the episode step limit -- confirm.
env = gym.make('Acrobot-v1')
env = env.unwrapped

# Seed every source of randomness this notebook relies on, not only the
# environment: actions are sampled with NumPy's global RNG and the network
# weights are drawn by TensorFlow initializers. The TF graph-level seed must
# be set before the graph ops are created, so it is done here.
SEED = 1
np.random.seed(SEED)
tf.set_random_seed(SEED)
env.seed(SEED)

[1]

# 2. Helper functions

In [17]:
def discounted_episode_reward_func(episode_reward,discount):
    """Compute standardised discounted returns for one episode.

    For every step ``t`` the discounted return
    ``G_t = r_t + discount * r_{t+1} + discount**2 * r_{t+2} + ...`` is
    accumulated backwards over the episode, and the resulting vector is then
    shifted to zero mean and scaled to unit standard deviation.

    Args:
        episode_reward: sequence of per-step rewards (ints or floats).
        discount: multiplicative discount factor applied per step.

    Returns:
        A float64 NumPy array with one entry per step. An empty input yields
        an empty array. If all discounted returns are (numerically) equal the
        result is only mean-centred (all zeros) instead of being divided by a
        zero standard deviation.
    """
    # Always work in floating point: np.zeros_like on a list of integer
    # rewards would create an integer array and silently truncate the
    # fractional discounted returns.
    rewards = np.asarray(episode_reward, dtype=np.float64)
    discounted_episode_reward = np.zeros_like(rewards)

    if rewards.size == 0:
        # Nothing to discount; avoid mean/std of an empty array (NaN + warnings).
        return discounted_episode_reward

    # Backward pass so each return reuses the one computed for the next step.
    cumulative = 0.0
    for i in reversed(range(len(rewards))):
        cumulative = cumulative * discount + rewards[i]
        discounted_episode_reward[i] = cumulative

    mean = np.mean(discounted_episode_reward)
    std = np.std(discounted_episode_reward)
    centred = discounted_episode_reward - mean

    # Guard against a zero spread (e.g. single-step episode), which would
    # otherwise produce NaN/inf values that poison the network update.
    if std > 1e-8:
        return centred / std
    return centred

# 3. Parameters

In [18]:
# Problem dimensions are read from the environment instead of being
# hard-coded, so they stay in sync if the environment is swapped.
state_size = env.observation_space.shape[0]  # dimension of the state vector
action_size = env.action_space.n  # number of discrete actions

# Training hyper-parameters
max_episodes = 500    # number of episodes run by the training loop
learning_rate = 0.01  # step size passed to the Adam optimiser
discount = 0.95       # per-step reward discount factor

# 4. Neural network: softmax policy over actions given the state

In [19]:
# Policy network and policy-gradient loss, built in the default TF1 graph.
# The network maps a state to a softmax distribution over the discrete actions;
# the loss is the mean of -log pi(a|s) weighted by the (standardised)
# discounted return of each step.
# Note: every sub-scope below is nested under the "inputs" name scope; this
# only affects op names, not behaviour.
with tf.name_scope("inputs"):

    # Placeholders fed at run time.
    # state_: batch of states, one row per step.
    state_ = tf.placeholder(tf.float32, [None, state_size], name="state_")
    # action_: one-hot encoding of the action taken at each step. Declared as
    # float32 (not int32) because softmax_cross_entropy_with_logits_v2
    # documents that labels must have the same floating dtype as the logits.
    action_ = tf.placeholder(tf.float32, [None, action_size], name="action_")
    # discounted_episode_reward_: one discounted return per step.
    discounted_episode_reward_ = tf.placeholder(tf.float32, [None,], name="discounted_episode_reward_")

    with tf.name_scope("fc1"):  # hidden layer 1: 10 ReLU units
        fc1 = tf.contrib.layers.fully_connected(inputs=state_,
                                                num_outputs=10,
                                                activation_fn=tf.nn.relu,
                                                weights_initializer=tf.contrib.layers.xavier_initializer())

    with tf.name_scope("fc2"):  # hidden layer 2: action_size ReLU units
        fc2 = tf.contrib.layers.fully_connected(inputs=fc1,
                                                num_outputs=action_size,
                                                activation_fn=tf.nn.relu,
                                                weights_initializer=tf.contrib.layers.xavier_initializer())

    with tf.name_scope("fc3"):  # linear layer producing one logit per action
        fc3 = tf.contrib.layers.fully_connected(inputs=fc2,
                                                num_outputs=action_size,
                                                activation_fn=None,
                                                weights_initializer=tf.contrib.layers.xavier_initializer())

    with tf.name_scope("softmax"):  # output layer: action probabilities
        soft_action = tf.nn.softmax(fc3)

    with tf.name_scope("loss"):
        # With one-hot labels the cross entropy equals -log pi(a|s) of the
        # action that was actually taken.
        neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits=fc3, labels=action_)
        # Weight each step's negative log-probability by its return.
        loss = tf.reduce_mean(neg_log_prob * discounted_episode_reward_)

    with tf.name_scope("train"):
        optimiser = tf.train.AdamOptimizer(learning_rate).minimize(loss)

# 5. Training process

In [20]:
# Reward bookkeeping:
#   episode_reward_list - total reward of each episode in the current
#                         100-episode window (reset after every window)
#   ave_reward_list     - mean total reward of each completed window
episode_reward_list = []
ave_reward_list = []

# Checkpointing is disabled. To enable it, create `saver = tf.train.Saver()`
# here and call saver.save(sess, "./models/model.ckpt") periodically in the
# episode loop below.

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for i in range(max_episodes):  # one iteration == one full episode
        state = env.reset()
        done = False
        episode_state, episode_action, episode_reward = [], [], []

        # Roll out the current policy until the environment reports `done`.
        while not done:
            # Action distribution predicted by the network for this state.
            action_probs = sess.run(
                soft_action,
                feed_dict={state_: state.reshape([1, state_size])},
            )
            # Sample an action index according to that distribution.
            action = np.random.choice(range(action_probs.shape[1]),
                                      p=action_probs.ravel())

            new_state, reward, done, info = env.step(action)  # transition

            # One-hot encode the chosen action for the loss.
            one_hot_action = np.zeros(action_size)
            one_hot_action[action] = 1

            episode_state.append(state)
            episode_action.append(one_hot_action)
            episode_reward.append(reward)

            state = new_state

        # End of episode: record the total reward and compute the
        # standardised discounted return of every step.
        episode_reward_list.append(np.sum(episode_reward))
        discounted_episode_reward = discounted_episode_reward_func(episode_reward, discount)

        # One gradient step on the whole episode.
        feed = {
            state_: np.vstack(episode_state),
            action_: np.vstack(episode_action),
            discounted_episode_reward_: discounted_episode_reward,
        }
        _, loss_output = sess.run([optimiser, loss], feed_dict=feed)

        # Every 100 episodes, report the window's mean reward and start a new window.
        if (i + 1) % 100 == 0:
            ave_reward = np.mean(episode_reward_list)
            ave_reward_list.append(ave_reward)
            episode_reward_list = []
            print('Episode {} Average Reward:{}'.format(i+1, ave_reward))

KeyboardInterrupt: 

In [None]:
# Plot the mean episode reward of each 100-episode window against the
# episode number at which the window ended, save the figure to disk and
# close it (so nothing is rendered inline).
fig = plt.figure(figsize=(10, 5))

window_end_episodes = 100 * np.arange(1, len(ave_reward_list) + 1)
plt.plot(window_end_episodes, ave_reward_list)

plt.xlabel('Episode')
plt.ylabel('Average Reward')

plt.savefig('Acrobot_policy optimization_neural network')
plt.close(fig)