In [1]:
import numpy as np
import gym
from gym import wrappers
import tensorflow as tf
import json, sys, os
from os import path
import random
from collections import deque
from tqdm import tqdm,trange

# Configuration cell: environment choice, hyperparameters, environment
# construction and RNG seeding. Everything below is read as module-level
# globals by the later cells.

#env_to_use = 'Pendulum-v0'
env_to_use = 'CarRacing-v0'


# hyperparameters

# Hidden-layer widths; in this notebook they are only referenced by the
# commented-out dense layers of the actor/critic builders.
h1_actor = 8
h2_actor = 8
h3_actor = 8
h1_critic = 8
h2_critic = 8
h3_critic = 8
gamma = 0.99                        # discount factor applied to the target Q-value
lr_actor = 1e-3                     # Adam learning rate for the actor
lr_critic = 1e-3                    # Adam learning rate for the critic
lr_decay = 1
l2_reg_actor = 1e-6                 # L2 coefficient on non-bias actor variables
l2_reg_critic = 1e-6                # L2 coefficient on non-bias critic variables
dropout_actor = 0
dropout_critic = 0
num_episodes = 100                  # number of training episodes
max_steps_ep = 3000                 # hard cap on steps per episode
tau = 1e-2                          # mixing rate of the slow target update
train_every = 1                     # run a training step every N environment steps
replay_memory_capacity = int(1e5)   # maximum number of stored experiences
minibatch_size = 64#1024
initial_noise_scale = 0.1           # exploration noise scale for episode 0
noise_decay = 0.99                  # per-episode multiplicative decay of the noise scale
exploration_mu = 0.0                # mean of the exploration noise process
exploration_theta = 0.15            # mean-reversion rate of the exploration noise
exploration_sigma = 0.2             # scale of the Gaussian term of the exploration noise

# game parameters
env = gym.make(env_to_use)
state_dim = env.observation_space.shape
# Cast to a plain Python int so it can be used as a shape entry and an array size.
action_dim = int(np.prod(np.array(env.action_space.shape)))

# set seeds to 0
env.seed(0)
np.random.seed(0)
# The replay buffer samples with the stdlib `random` module, so seed it as well.
random.seed(0)


In [2]:
# Per-dimension width of the action range (high - low); the same expression is
# used later to scale the actor output and the exploration noise.
env.action_space.high - env.action_space.low

array([2., 1., 1.], dtype=float32)

In [3]:
# Bounded FIFO of experiences: once `replay_memory_capacity` entries are held,
# appending a new one drops the oldest.
replay_memory = deque(maxlen=replay_memory_capacity)


def add_to_memory(experience):
    """Push one experience onto the right end of the replay buffer."""
    replay_memory.append(experience)


def sample_from_memory(minibatch_size):
    """Return `minibatch_size` distinct experiences drawn uniformly from the buffer.

    Raises ValueError (from `random.sample`) when the buffer holds fewer than
    `minibatch_size` entries.
    """
    batch = random.sample(replay_memory, minibatch_size)
    return batch

In [4]:
#import tensorflow as tf
from tensorflow.python.framework import ops
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

class ANN():
    """TF1 graph of a convolutional DDPG actor-critic with slow target copies.

    The placeholders and the episode counter are class attributes created when
    the class body executes, so callers feed them through ``ANN.state_ph`` /
    ``Model.state_ph`` and friends. The class reads the module-level globals
    ``state_dim``, ``action_dim``, ``env``, ``gamma``, ``tau``, ``lr_actor``,
    ``lr_critic``, ``l2_reg_actor`` and ``l2_reg_critic``.
    """

    # Clear the default graph before any placeholder or variable is created.
    tf.reset_default_graph()
    state_ph = tf.placeholder(dtype=tf.float32, shape=[None, state_dim[0], state_dim[1], state_dim[2]])
    action_ph = tf.placeholder(dtype=tf.float32, shape=[None, action_dim])
    reward_ph = tf.placeholder(dtype=tf.float32, shape=[None])
    next_state_ph = tf.placeholder(dtype=tf.float32, shape=[None, state_dim[0], state_dim[1], state_dim[2]])
    is_not_terminal_ph = tf.placeholder(dtype=tf.float32, shape=[None])

    # Non-trainable counter and the op that increments it by one.
    episodes = tf.Variable(0.0, trainable=False, name='episodes')
    episode_inc_op = episodes.assign_add(1)

    def __init__(self):
        """Build the four networks and collect their variables by scope."""
        with tf.variable_scope('actor'):
            self.actor_net_value = ANN.generate_actor_network(self, trainable=True, reuse=False)

        # The target networks supply the bootstrap value, so they are evaluated
        # on the *next* state; stop_gradient keeps them out of both losses.
        with tf.variable_scope('slow_target_actor', reuse=False):
            self.target_actor_net_value = tf.stop_gradient(
                ANN.generate_actor_network(self, trainable=False, reuse=False,
                                           state=ANN.next_state_ph))

        with tf.variable_scope('critic'):
            # Q(s, a) for the actions stored in the replay buffer.
            self.critic_net_value = ANN.generate_critic_network(self, trainable=True, reuse=False)
            # Q(s, actor(s)) sharing the same critic weights; drives the actor loss.
            self.q_value_for_actor_net = ANN.generate_critic_network(self, trainable=True, reuse=True, mode=2)

        with tf.variable_scope('slow_target_critic', reuse=False):
            self.target_critic_net_value = tf.stop_gradient(
                ANN.generate_critic_network(self, trainable=False, reuse=False, mode=3,
                                            state=ANN.next_state_ph))

        # Collected before any optimizer exists, so the lists hold network
        # weights only and the online/target lists line up index by index.
        self.actor_net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor')
        self.target_actor_net_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='slow_target_actor')
        self.critic_net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='critic')
        self.target_critic_net_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='slow_target_critic')

    def predict_graph(self):
        """Return the online actor's output tensor (actions for ``state_ph``)."""
        return self.actor_net_value

    def generate_actor_network(self, trainable, reuse, state=None):
        """Build an actor network in the current variable scope.

        Args:
            trainable: whether the created variables are trainable.
            reuse: forwarded to every ``tf.layers`` call.
            state: input image tensor; defaults to ``ANN.state_ph``.

        Returns:
            Tensor of shape [batch, action_dim] squashed into the environment's
            action bounds.
        """
        if state is None:
            state = ANN.state_ph
        layer1_out = tf.layers.conv2d(state, filters=16, kernel_size=[8, 8],
                                      strides=[4, 4], padding='same', activation=tf.nn.relu,
                                      data_format='channels_last', trainable=trainable,
                                      name='actor_layer1_out', reuse=reuse)
        layer2_out = tf.layers.conv2d(layer1_out, filters=32, kernel_size=[4, 4],
                                      strides=[2, 2], padding='same', activation=tf.nn.relu,
                                      data_format='channels_last', trainable=trainable,
                                      name='actor_layer2_out', reuse=reuse)
        layer3_out = tf.layers.dense(tf.layers.flatten(layer2_out), 256, activation=tf.nn.relu,
                                     trainable=trainable, name='actor_layer3_out', reuse=reuse)
        actions_unscaled = tf.layers.dense(layer3_out, action_dim, trainable=trainable,
                                           name='dense_3', reuse=reuse)
        # Bound the actions to the valid range: sigmoid maps to (0, 1), which is
        # then stretched and shifted to (low, high) per action dimension.
        action_low = env.action_space.low
        action_span = env.action_space.high - env.action_space.low
        actions = tf.nn.sigmoid(actions_unscaled) * action_span + action_low
        return actions

    def generate_critic_network(self, trainable, reuse, mode=1, state=None):
        """Build a critic network in the current variable scope.

        Args:
            trainable: whether the created variables are trainable.
            reuse: forwarded to every ``tf.layers`` call (True shares weights
                with a critic already built in this scope).
            mode: which action tensor is concatenated to the image features:
                1 -> ``ANN.action_ph``, 2 -> online actor output,
                3 -> target actor output.
            state: input image tensor; defaults to ``ANN.state_ph``.

        Returns:
            Tensor of shape [batch, 1] with the Q-value estimate.

        Raises:
            ValueError: if ``mode`` is not 1, 2 or 3.
        """
        if state is None:
            state = ANN.state_ph
        if mode == 1:
            action_input = ANN.action_ph
        elif mode == 2:
            action_input = self.actor_net_value
        elif mode == 3:
            action_input = self.target_actor_net_value
        else:
            raise ValueError('mode must be 1, 2 or 3, got %r' % (mode,))

        layer1_out = tf.layers.conv2d(state, filters=16, kernel_size=[8, 8],
                                      strides=[4, 4], padding='same', activation=tf.nn.relu,
                                      data_format='channels_last', trainable=trainable,
                                      name='critic_layer1_out', reuse=reuse)
        layer2_out = tf.layers.conv2d(layer1_out, filters=32, kernel_size=[4, 4],
                                      strides=[2, 2], padding='same', activation=tf.nn.relu,
                                      data_format='channels_last', trainable=trainable,
                                      name='critic_layer2_out', reuse=reuse)
        layer2_flat = tf.layers.flatten(layer2_out)
        state_action = tf.concat([layer2_flat, action_input], axis=1)
        layer3_out = tf.layers.dense(state_action, 256, activation=tf.nn.relu,
                                     trainable=trainable, name='critic_layer3_out', reuse=reuse)
        q_values = tf.layers.dense(layer3_out, 1, trainable=trainable, name='dense_3', reuse=reuse)
        return q_values

    def train_graph(self):
        """Create the training ops.

        Returns:
            Tuple ``(actor_train_op, critic_train_op)``.
        """
        # One-step TD target: r + not_terminal * gamma * Q_target(s', actor_target(s')).
        updated_q_values = (tf.expand_dims(ANN.reward_ph, 1)
                            + tf.expand_dims(ANN.is_not_terminal_ph, 1) * gamma * self.target_critic_net_value)
        td_errors = updated_q_values - self.critic_net_value

        critic_loss = tf.reduce_mean(tf.square(td_errors))
        for var in self.critic_net_vars:
            if 'bias' not in var.name:
                critic_loss += l2_reg_critic * 0.5 * tf.nn.l2_loss(var)
        # Restrict the update to the online critic's own variables.
        critic_train_op = tf.train.AdamOptimizer(lr_critic).minimize(
            critic_loss, var_list=self.critic_net_vars)

        # The actor ascends the critic's estimate of its own actions.
        actor_loss = -1 * tf.reduce_mean(self.q_value_for_actor_net)
        for var in self.actor_net_vars:
            if 'bias' not in var.name:
                actor_loss += l2_reg_actor * 0.5 * tf.nn.l2_loss(var)
        actor_train_op = tf.train.AdamOptimizer(lr_actor).minimize(
            actor_loss, var_list=self.actor_net_vars)
        return actor_train_op, critic_train_op

    def update_wts_graph(self):
        """Return one grouped op that moves every target variable toward its
        online counterpart: target <- tau * online + (1 - tau) * target.

        Raises:
            ValueError: if an online/target variable list pair differs in length.
        """
        if len(self.target_actor_net_vars) != len(self.actor_net_vars):
            raise ValueError('actor and target actor have different numbers of variables')
        if len(self.target_critic_net_vars) != len(self.critic_net_vars):
            raise ValueError('critic and target critic have different numbers of variables')

        update_slow_target_ops = []
        for online_var, target_var in zip(self.actor_net_vars, self.target_actor_net_vars):
            update_slow_target_ops.append(
                target_var.assign(tau * online_var + (1 - tau) * target_var))
        for online_var, target_var in zip(self.critic_net_vars, self.target_critic_net_vars):
            update_slow_target_ops.append(
                target_var.assign(tau * online_var + (1 - tau) * target_var))

        update_slow_targets_op = tf.group(*update_slow_target_ops, name='update_slow_targets')
        return update_slow_targets_op

    def load_weights(self, _actor_net_vars, _critic_net_vars):
        """Return one grouped op that assigns the given values to the online
        actor and critic variables, matched by position.

        Args:
            _actor_net_vars: sequence of values, one per entry of ``self.actor_net_vars``.
            _critic_net_vars: sequence of values, one per entry of ``self.critic_net_vars``.

        Raises:
            ValueError: if a sequence length does not match its variable list.
        """
        if len(_actor_net_vars) != len(self.actor_net_vars):
            raise ValueError('expected %d actor weights, got %d'
                             % (len(self.actor_net_vars), len(_actor_net_vars)))
        if len(_critic_net_vars) != len(self.critic_net_vars):
            raise ValueError('expected %d critic weights, got %d'
                             % (len(self.critic_net_vars), len(_critic_net_vars)))

        update_wts_ops = []
        for actor_var, value in zip(self.actor_net_vars, _actor_net_vars):
            update_wts_ops.append(actor_var.assign(value))
        for critic_var, value in zip(self.critic_net_vars, _critic_net_vars):
            update_wts_ops.append(critic_var.assign(value))

        update_wts_ops = tf.group(*update_wts_ops, name='update_wts_ops')
        return update_wts_ops
    

Instructions for updating:
non-resource variables are not supported in the long term


In [5]:
# Build the graph: instantiating ANN creates the actor, critic and both target
# networks; the remaining calls add the inference tensor, the two training ops
# (returned as actor first, critic second) and the grouped target-update op.
Model=ANN()
actor_net_value=Model.predict_graph()
actor_train_op,critic_train_op=Model.train_graph()
update_wts_op=Model.update_wts_graph()

Instructions for updating:
Use `tf.keras.layers.Conv2D` instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use keras.layers.Dense instead.


In [6]:



# initialize session
# log_device_placement=True makes TensorFlow print the device mapping.
#sess = tf.Session()
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
# Run only after all graph-building cells above, so that every variable that
# exists in the graph at this point gets initialized.
sess.run(tf.global_variables_initializer())



Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: GeForce GTX 1660 Ti, pci bus id: 0000:01:00.0, compute capability: 7.5



In [7]:
#####################################################################################################
## Training
#
# For each episode: act with the current actor plus exploration noise, store the
# transition in the replay buffer, train actor and critic on a sampled
# minibatch, then nudge the target networks toward the online ones.

total_steps = 0
log_rewards = []          # one [episode index, total reward] entry per episode
action_low = env.action_space.low
action_high = env.action_space.high

try:
    for ep in range(num_episodes):

        total_reward = 0
        steps_in_ep = 0

        # Exploration noise state, reset every episode; its scale decays with
        # the episode index and is proportional to each action's range.
        noise_process = np.zeros(action_dim)
        noise_scale = (initial_noise_scale * noise_decay**ep) * (action_high - action_low)

        observation = env.reset()
        for t in range(max_steps_ep):

            # Frames are scaled by 1/255 before they reach the network.
            action_for_state, = sess.run(actor_net_value,
                feed_dict = {Model.state_ph: observation[None]/255.0})

            # Ornstein-Uhlenbeck step: the increment is *added* to the previous
            # state so consecutive noise samples stay correlated.
            noise_process = noise_process + exploration_theta*(exploration_mu - noise_process) \
                + exploration_sigma*np.random.randn(action_dim)
            # Keep the noisy action inside the environment's bounds.
            action_for_state = np.clip(action_for_state + noise_scale*noise_process,
                                       action_low, action_high)

            next_observation, reward, done, _info = env.step(action_for_state)
            total_reward += reward

            # Raw frames are stored; the last field is 0.0 on terminal steps so
            # the bootstrap term is dropped from the TD target.
            add_to_memory((observation, action_for_state, reward, next_observation,
                0.0 if done else 1.0))

            if total_steps%train_every == 0 and len(replay_memory) >= minibatch_size:

                minibatch = sample_from_memory(minibatch_size)
                states, actions, rewards, next_states, not_terminal = zip(*minibatch)

                # Apply the same 1/255 scaling used when acting, otherwise the
                # networks are trained on a different input range than they are
                # evaluated on.
                sess.run([critic_train_op, actor_train_op],
                    feed_dict = {
                        Model.state_ph: np.asarray(states, dtype=np.float32)/255.0,
                        Model.action_ph: np.asarray(actions),
                        Model.reward_ph: np.asarray(rewards),
                        Model.next_state_ph: np.asarray(next_states, dtype=np.float32)/255.0,
                        Model.is_not_terminal_ph: np.asarray(not_terminal),
                        })

                # Soft update of the target networks.
                sess.run(update_wts_op)

            observation = next_observation
            total_steps += 1
            steps_in_ep += 1

            if done:
                sess.run(Model.episode_inc_op)
                break
        log_rewards.append([ep,total_reward])
        print('Episode %2i, Reward: %7.3f, Steps: %i, noise: %7.3f'%(ep,total_reward,steps_in_ep, noise_scale[0]))
finally:
    # Release the environment even when training is interrupted.
    env.close()

Track generation: 1143..1442 -> 299-tiles track


KeyboardInterrupt: 

In [None]:
#conda install -c conda-forge tqdm
#conda install -c conda-forge ipywidgets
#conda install -c conda-forge nodejs#

In [None]:
#tqdm.write('Episode %2i, Reward: %7.3f, Steps: %i, noise: %7.3f'%(ep,total_reward,steps_in_ep, noise_scale[0]))


In [7]:
def save_weights(actor_net_var,critic_net_vars):
    """Pickle the given actor and critic weights to the file 'net_wts'.

    Args:
        actor_net_var: actor weight values (stored first in the pickled list).
        critic_net_vars: critic weight values (stored second).

    The file is written to the current working directory as a two-element
    list ``[actor weights, critic weights]``.
    """
    import pickle
    # Dump the arguments themselves rather than same-named notebook globals,
    # and let the context manager close the file even if pickling fails.
    with open('net_wts','wb') as f:
        pickle.dump([actor_net_var,critic_net_vars],f)
def load_weights(Model,sess):
    """Read weights from the file 'net_wts' and assign them to the model.

    Args:
        Model: object exposing ``load_weights(actor_wts, critic_wts)`` that
            returns something ``sess.run`` can execute.
        sess: session used to run the assignment.

    Returns:
        Tuple ``(actor_wts, critic_wts)`` as read from the file.

    NOTE: unpickling executes arbitrary code embedded in the file; only load
    'net_wts' files you created yourself.
    """
    import pickle
    # The context manager closes the file even if unpickling raises.
    with open('net_wts','rb') as f:
        [actor_wts,critic_wts]=pickle.load(f)
    sess.run(Model.load_weights(actor_wts,critic_wts))
    return actor_wts,critic_wts


In [None]:
# Toggle: when enabled, read the current online actor and critic weights out of
# the session and hand them to save_weights.
save = True
if save:
    actor_wts = sess.run(Model.actor_net_vars)
    critic_wts = sess.run(Model.critic_net_vars)
    save_weights(actor_wts, critic_wts)

In [8]:
# Toggle: when enabled, restore the weights from disk into the model and read
# them back from the session.
load = True
if load:
    load_weights(Model, sess)
    a_wts = sess.run(Model.actor_net_vars)
    c_wts = sess.run(Model.critic_net_vars)

In [None]:
import time
import cv2
# Roll out the current actor, without exploration noise, for up to 1000 steps
# on a freshly seeded environment, rendering each frame.
env=gym.make(env_to_use)
env.seed(0)
obs=env.reset()

#os.mkdir(env_to_use+'Test')
#os.mkdir(env_to_use+'Test/img/')
try:
    for i in range(1000):
        # Same 1/255 scaling that is applied when acting during training.
        img=obs[None]/255.0
        _action, = sess.run(actor_net_value,
                        feed_dict = {Model.state_ph: img})
        # Keep the action inside the environment's bounds before stepping.
        _action = np.clip(_action, env.action_space.low, env.action_space.high)
        print(_action)
        obs,rew,done,info=env.step(_action)
        img=env.render(mode='rgb_array')
        time.sleep(0.1)
        #cv2.imwrite(env_to_use+'Test/img/'+str(i)+'.jpg',img)
        if done:
            # Do not keep stepping an episode the environment has ended.
            break
finally:
    # Close the environment even if the rollout is interrupted.
    env.close()

Track generation: 1143..1442 -> 299-tiles track
[0.99999976 1.         0.999992  ]
[1.        1.        0.9999999]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1. 1.]
[1. 1

In [None]:
from matplotlib import pyplot as plt
# Plot total reward per episode and save the figure under '<env_to_use>Test/'.
# reshape(-1, 2) keeps the array two-dimensional even when no episode was logged.
rws=np.array(log_rewards).reshape(-1, 2)
# The output directory is not created anywhere else in the notebook.
os.makedirs(env_to_use+'Test', exist_ok=True)
fig=plt.figure(figsize=(10,10))
plt.plot(rws[:,0],rws[:,1])
plt.title('epoch vs sum of reward')
plt.xlabel('episode')
plt.ylabel('sum of rewards')
plt.savefig(env_to_use+'Test/'+'rewards.jpg')
plt.show()

In [None]:
import pandas as pd
# One-column table holding the per-episode reward totals (second column of rws).
df = pd.DataFrame({'sum_of_rewards': rws[:, 1]})

In [None]:
# Write the reward table to '<env_to_use>Test/rewards.csv', creating the
# directory first because nothing else in the notebook guarantees it exists.
os.makedirs(env_to_use+'Test', exist_ok=True)
df.to_csv(env_to_use+'Test/rewards.csv')