In [1]:
import numpy as np
import gym
from gym import wrappers
import tensorflow as tf
import json, sys, os
from os import path
import random
from collections import deque


env_to_use = 'Pendulum-v0'
#env_to_use = 'CarRacing-v0'


# hyperparameters

h1_actor = 8
h2_actor = 8
h3_actor = 8
h1_critic = 8
h2_critic = 8
h3_critic = 8
gamma = 0.99
lr_actor = 1e-3
lr_critic = 1e-3
lr_decay = 1
l2_reg_actor = 1e-6
l2_reg_critic = 1e-6
dropout_actor = 0
dropout_critic = 0
num_episodes = 150
max_steps_ep = 10000
tau = 1e-2
train_every = 1
replay_memory_capacity = int(1e5)
minibatch_size = 1024
initial_noise_scale = 0.1
noise_decay = 0.99
exploration_mu = 0.0
exploration_theta = 0.15
exploration_sigma = 0.2

# game parameters
env = gym.make(env_to_use)
state_dim = np.prod(np.array(env.observation_space.shape))
action_dim = np.prod(np.array(env.action_space.shape))

# set seeds to 0
env.seed(0)
np.random.seed(0)


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
replay_memory = deque(maxlen=replay_memory_capacity)

def add_to_memory(experience):
    replay_memory.append(experience)

def sample_from_memory(minibatch_size):
    return random.sample(replay_memory, minibatch_size)

In [3]:
import tensorflow as tf
class ANN():
    tf.reset_default_graph()
    state_ph  =  tf.placeholder(dtype=tf.float32, shape=[None,state_dim])
    action_ph = tf.placeholder(dtype=tf.float32, shape=[None,action_dim])
    reward_ph = tf.placeholder(dtype=tf.float32, shape=[None])
    next_state_ph = tf.placeholder(dtype=tf.float32, shape=[None,state_dim])
    is_not_terminal_ph = tf.placeholder(dtype=tf.float32, shape=[None])
    

    
    episodes = tf.Variable(0.0, trainable=False, name='episodes')
    episode_inc_op = episodes.assign_add(1)
    
    def __init__(self):
        with tf.variable_scope('actor'):
            self.actor_net_value = ANN.generate_actor_network(self,trainable = True, reuse = False)

        
        with tf.variable_scope('slow_target_actor', reuse=False):
            self.target_actor_net_value = tf.stop_gradient(ANN.generate_actor_network(self,trainable = False, reuse = False))

        with tf.variable_scope('critic') as scope:
            self.critic_net_value = ANN.generate_critic_network(self,trainable = True, reuse = False)
            self.q_value_for_actor_net = ANN.generate_critic_network(self,trainable = True, reuse = True,mode=2)

        
        with tf.variable_scope('slow_target_critic', reuse=False):
            self.target_critic_net_value = tf.stop_gradient(ANN.generate_critic_network(self,trainable = False, reuse = False,mode=3))
        
        
        self.actor_net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor')
        self.target_actor_net_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='slow_target_actor')
        self.critic_net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='critic')
        self.target_critic_net_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='slow_target_critic')

        
    def predict_graph(self):
        return self.actor_net_value
    def generate_actor_network(self,trainable, reuse):
        hidden = tf.layers.dense(ANN.state_ph, h1_actor, activation = tf.nn.relu, trainable = trainable, name = 'dense', reuse = reuse)
        hidden_2 = tf.layers.dense(hidden, h2_actor, activation = tf.nn.relu, trainable = trainable, name = 'dense_1', reuse = reuse)
        hidden_3 = tf.layers.dense(hidden_2, h3_actor, activation = tf.nn.relu, trainable = trainable, name = 'dense_2', reuse = reuse)
        actions_unscaled = tf.layers.dense(hidden_3, action_dim, trainable = trainable, name = 'dense_3', reuse = reuse)
        actions = env.action_space.low + tf.nn.sigmoid(actions_unscaled)*(env.action_space.high - env.action_space.low) # bound the actions to the valid range
        return actions

   
    
    def generate_critic_network(self,trainable, reuse,mode=1):
        if mode==1:
            state_action = tf.concat([ANN.state_ph, ANN.action_ph], axis=1)
        if mode==2:
            state_action = tf.concat([ANN.state_ph,self.actor_net_value], axis=1)
        if mode==3:
            state_action = tf.concat([ANN.next_state_ph,self.target_actor_net_value], axis=1)
        hidden = tf.layers.dense(state_action, h1_critic, activation = tf.nn.relu, trainable = trainable, name = 'dense', reuse = reuse)
        hidden_2 = tf.layers.dense(hidden, h2_critic, activation = tf.nn.relu, trainable = trainable, name = 'dense_1', reuse = reuse)
        hidden_3 = tf.layers.dense(hidden_2, h3_critic, activation = tf.nn.relu, trainable = trainable, name = 'dense_2', reuse = reuse)
        q_values = tf.layers.dense(hidden_3, 1, trainable = trainable, name = 'dense_3', reuse = reuse)
        return q_values
    def train_graph(self):
        updated_q_values = tf.expand_dims(ANN.reward_ph, 1) + tf.expand_dims(ANN.is_not_terminal_ph, 1) * gamma * self.target_critic_net_value
        td_errors = updated_q_values - self.critic_net_value


        critic_loss = tf.reduce_mean(tf.square(td_errors))
        for var in self.critic_net_vars:
            if not 'bias' in var.name:
                critic_loss += l2_reg_critic * 0.5 * tf.nn.l2_loss(var)


        critic_train_op = tf.train.AdamOptimizer(lr_critic).minimize(critic_loss)
        actor_loss = -1*tf.reduce_mean(self.q_value_for_actor_net)
        for var in self.actor_net_vars:
            if not 'bias' in var.name:
                actor_loss += l2_reg_actor * 0.5 * tf.nn.l2_loss(var)

        actor_train_op = tf.train.AdamOptimizer(lr_actor).minimize(actor_loss, var_list=self.actor_net_vars)
        return actor_train_op,critic_train_op
        
    def update_wts_graph(self):
            update_slow_target_ops = []
            for i, target_actor_var in enumerate(self.target_actor_net_vars):
                update_slow_target_actor_op = target_actor_var.assign(tau*self.actor_net_vars[i]+(1-tau)*target_actor_var)
                update_slow_target_ops.append(update_slow_target_actor_op)

            for i, slow_target_var in enumerate(self.target_critic_net_vars):
                update_slow_target_critic_op = slow_target_var.assign(tau*self.critic_net_vars[i]+(1-tau)*slow_target_var)
                update_slow_target_ops.append(update_slow_target_critic_op)

            update_slow_targets_op = tf.group(*update_slow_target_ops, name='update_slow_targets')
            return update_slow_targets_op 

In [4]:
Model=ANN()
actor_net_value=Model.predict_graph()
actor_train_op,critic_train_op=Model.train_graph()
update_wts_op=Model.update_wts_graph()

In [5]:



# initialize session
sess = tf.Session()	
sess.run(tf.global_variables_initializer())



In [6]:
#####################################################################################################
## Training

total_steps = 0
log_rewards=[]
for ep in range(num_episodes):

    total_reward = 0
    steps_in_ep = 0

    
    noise_process = np.zeros(action_dim)
    noise_scale = (initial_noise_scale * noise_decay**ep) * (env.action_space.high - env.action_space.low)

    
    observation = env.reset()
    for t in range(max_steps_ep):

        action_for_state, = sess.run(actor_net_value, 
            feed_dict = {Model.state_ph: observation[None]})

        
        noise_process = exploration_theta*(exploration_mu - noise_process) + exploration_sigma*np.random.randn(action_dim)
        action_for_state += noise_scale*noise_process

       
        next_observation, reward, done, _info = env.step(action_for_state)
        total_reward += reward

        add_to_memory((observation, action_for_state, reward, next_observation, 
            0.0 if done else 1.0))
        
        if total_steps%train_every == 0 and len(replay_memory) >= minibatch_size:

           
            minibatch = sample_from_memory(minibatch_size)

            _, _ = sess.run([critic_train_op, actor_train_op], 
                feed_dict = {
                    Model.state_ph: np.asarray([elem[0] for elem in minibatch]),
                    Model.action_ph: np.asarray([elem[1] for elem in minibatch]),
                    Model.reward_ph: np.asarray([elem[2] for elem in minibatch]),
                    Model.next_state_ph: np.asarray([elem[3] for elem in minibatch]),
                    Model.is_not_terminal_ph: np.asarray([elem[4] for elem in minibatch]),
                    })


            _ = sess.run(update_wts_op)

        observation = next_observation
        total_steps += 1
        steps_in_ep += 1
        
        if done: 
            
            _ = sess.run(Model.episode_inc_op)
            break
    log_rewards.append([ep,total_reward])
    print('Episode %2i, Reward: %7.3f, Steps: %i, noise: %7.3f'%(ep,total_reward,steps_in_ep, noise_scale))


env.close()

Episode  0, Reward: -1543.501, Steps: 200, noise:   0.400
Episode  1, Reward: -1405.853, Steps: 200, noise:   0.396
Episode  2, Reward: -1347.651, Steps: 200, noise:   0.392
Episode  3, Reward: -1338.051, Steps: 200, noise:   0.388
Episode  4, Reward: -1565.448, Steps: 200, noise:   0.384
Episode  5, Reward: -1381.468, Steps: 200, noise:   0.380
Episode  6, Reward: -1594.927, Steps: 200, noise:   0.377
Episode  7, Reward: -1606.043, Steps: 200, noise:   0.373
Episode  8, Reward: -1559.958, Steps: 200, noise:   0.369
Episode  9, Reward: -1411.514, Steps: 200, noise:   0.365
Episode 10, Reward: -1550.277, Steps: 200, noise:   0.362
Episode 11, Reward: -1410.535, Steps: 200, noise:   0.358
Episode 12, Reward: -1429.524, Steps: 200, noise:   0.355
Episode 13, Reward: -937.257, Steps: 200, noise:   0.351
Episode 14, Reward: -649.325, Steps: 200, noise:   0.347
Episode 15, Reward: -902.520, Steps: 200, noise:   0.344
Episode 16, Reward: -1091.342, Steps: 200, noise:   0.341
Episode 17, Rewar

Episode 143, Reward: -119.584, Steps: 200, noise:   0.095
Episode 144, Reward: -118.971, Steps: 200, noise:   0.094
Episode 145, Reward: -121.931, Steps: 200, noise:   0.093
Episode 146, Reward:  -0.951, Steps: 200, noise:   0.092
Episode 147, Reward: -118.764, Steps: 200, noise:   0.091
Episode 148, Reward:  -1.737, Steps: 200, noise:   0.090
Episode 149, Reward: -387.560, Steps: 200, noise:   0.089


In [7]:
env.close()

In [16]:
import time
import cv2
env=gym.make(env_to_use)
env.seed(0)
obs=env.reset()

#os.mkdir(env_to_use+'Test')
#os.mkdir(env_to_use+'Test/img/')
for i in range(200):
    
    _action, = sess.run(actor_net_value, 
                    feed_dict = {Model.state_ph: obs[None]})
    obs,rew,done,info=env.step(_action)
    img=env.render(mode='rgb_array')
    time.sleep(0.1)
    #cv2.imwrite(env_to_use+'Test/img/'+str(i)+'.jpg',img)
env.close()

In [9]:
from matplotlib import pyplot as plt
rws=np.array(log_rewards)
fig=plt.figure(figsize=(10,10))
plt.plot(rws[:,0],rws[:,1])
plt.title('epoch vs sum of reward')
plt.savefig(env_to_use+'Test/'+'rewards.jpg')
plt.show()

<Figure size 1000x1000 with 1 Axes>

In [10]:
import pandas as pd
df=pd.DataFrame(rws[:,1],columns=['sum_of_rewards'])

In [11]:
df.to_csv(env_to_use+'Test/rewards.csv')