In [None]:
#%tensorflow_version 2.x
import tensorflow as tf
print("Tensorflow version " + tf.__version__)

In [None]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [4]:
#remove " > /dev/null 2>&1" to see what is going on under the hood
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

In [5]:
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[box2d] > /dev/null 2>&1

Requirement already up-to-date: setuptools in /usr/local/lib/python3.6/dist-packages (47.3.1)


In [6]:
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

<pyvirtualdisplay.display.Display at 0x7f37d0167390>

In [7]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [8]:
root_dir = "/content/gdrive/My Drive/"
base_dir = root_dir + 'Colab Notebooks/DDPG/BipedalWalker/'

In [9]:
#import gym
#from gym import logger as gymlogger
from gym.wrappers import Monitor
#gymlogger.set_level(40) #error only
#import tensorflow as tf
#import numpy as np
#import random
#import matplotlib
#import matplotlib.pyplot as plt
#%matplotlib inline
#import math
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay


In [10]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)""
"""

def show_video():
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    video = io.open(mp4, 'r+b').read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")
    

def wrap_env(env):
  env = Monitor(env, './video', force=True)
  return env

In [11]:
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt

import numpy as np
import gym
import random
from collections import deque
import tensorflow as tf
import tensorflow.keras as tfk
import tensorflow.keras.layers as tfkl
import pickle
import time
import sys
from IPython.core.debugger import set_trace


In [12]:
#utils.py
class ReplayMemory(object):
    def __init__(self, list, max_size):
        self.max_size = max_size
        self.mem = deque(list, maxlen = self.max_size)
        self.cntr = len(self.mem)

    def store(self, state, action, reward, next_state, done):
        self.mem.append((state, action, reward, next_state, done))
        self.cntr += 1

    def sample(self, batch_size):
        batch_size = min(batch_size, self.cntr)
        return batch_size, random.sample(self.mem, batch_size)

# Ornstein-Ulhenbeck Process
# Taken from #https://github.com/vitchyr/rlkit/blob/master/rlkit/exploration_strategies/ou_strategy.py
class OUNoise(object):
    def __init__(self, action_dim, mu=0.0, theta=0.15, max_sigma=0.2, min_sigma=0.2, decay_period=100000):
        self.mu           = mu
        self.theta        = theta
        self.sigma        = max_sigma
        self.max_sigma    = max_sigma
        self.min_sigma    = min_sigma
        self.decay_period = decay_period
        self.action_dim   = action_dim #action_space.shape[0]
        self.reset()
        
    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu
        
    def evolve_state(self):
        x  = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state
    
    def get_ounoise(self, t=0):
        ou_state = self.evolve_state()
        #self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
        return ou_state

In [13]:
#Hyper parameters
gamma = 0.99
max_mem_len = 1000000 #15000
batch_size = 128 #64
actor_input_shape = (24, ) #state shape
actor_hidden1_n = 600 #400   
actor_hidden2_n = 300 
actor_output_n = 4 
critic_input1_shape = (24, )  #state shape 
critic_input2_shape = (4, )   #action shape
critic_hidden1_n = 600 #300
critic_hidden2_n = 300 #200
critic_output_n = 1
actor_lr = 0.00005 #0.0001 #10e-4
critic_lr = 0.0005 #0.001 #10e-3
WD = 0.001 #weight decay
tau = 0.001

In [14]:
#models

class Actor(object):
    def __init__(self, input_shape, hidden1_n, hidden2_n, output_n, lr):
        super(Actor, self).__init__()
        self.lr = lr
        self.input_shape = input_shape
        self.hidden1_n = hidden1_n
        self.hidden2_n = hidden2_n
        self.output_n = output_n
        self.model = self.generate_model("actor_w.h5")
        self.target_model = self.generate_model("actor_target_w.h5")
        self.target_model.set_weights(self.model.get_weights())
        self.optimizer = tfk.optimizers.Adam(learning_rate=self.lr)
        #self.epoch_loss_avg = tf.keras.metrics.Mean()
        
    def generate_model(self, model_filename):
      #k_init1 = tfk.initializers.RandomUniform(-1.5e-3, 1.5e-3)
      k_init1 = tfk.initializers.GlorotUniform()
      k_init2 = tfk.initializers.RandomUniform(-3e-3, 3e-3)

      model = tfk.models.Sequential([
            tfkl.Input(self.input_shape),
            tfkl.BatchNormalization(),
            tfkl.Dense(self.hidden1_n, activation='relu', kernel_initializer=k_init1),
            tfkl.BatchNormalization(),
            tfkl.Dense(self.hidden2_n, activation='relu', kernel_initializer=k_init1),
            tfkl.BatchNormalization(),
            tfkl.Dense(self.output_n, activation='tanh', kernel_initializer=k_init2) 
            ])
      
      try:
        model.load_weights(base_dir + model_filename)
        print(model_filename, " loaded.")
      except Exception as e: 
        print(model_filename, "not loaded.")
      
      return model
      
    def get_action(self, state, target=False):
        if target:
            action = self.target_model.predict(state)
        else:
            action = self.model.predict(state)
            #nan_ = np.isnan(action)
            #if len(action[nan_]) > 0 : 
            #  print("actor predicted nan action ", action)
              #set_trace()
        return action    

    def get_target_actions(self, states):
        actions = self.target_model.predict(states)
        return actions #np.clip(action, np.repeat([self.action_space.low], states.shape[0], axis=0), \
                       #np.repeat([self.action_space.high], states.shape[0], axis=0))
        
    def optimize(self, states, qval_grad):
        with tf.GradientTape() as tape:
            #tape.watch(states_t)
            actions = self.model(states)
        policy_grads = tape.gradient(actions, self.model.trainable_weights, \
                                     output_gradients= -1*tf.cast(qval_grad, tf.float32))
        #del tape
        #set_trace()
        #old_w = self.model.get_weights()
        self.optimizer.apply_gradients(zip(policy_grads, self.model.trainable_variables))
        #nan_ = [tf.reduce_sum(tf.cast(tf.math.is_nan(w), dtype=tf.int32)) for w in self.model.trainable_variables]
        #if any(x > 0 for x in nan_) : 
        #  print("update gave nan weight")
        #  set_trace()
        #self.epoch_loss_avg.update_state(policy_grads)
        return 
    
    def update_target_params(self):
        pass
        w = self.model.get_weights()
        target_w = self.target_model.get_weights()
        target_w1 = [tau * w1 + (1 - tau) * w2 for w1, w2 in
                          zip(w, target_w)]
        self.target_model.set_weights(target_w1)
        
    def save_models(self):
        self.model.save_weights(base_dir + "actor_w.h5")
        self.target_model.save_weights(base_dir + "actor_target_w.h5")
        
class Critic(object): 
    def __init__(self, input1_shape, input2_shape, hidden1_n, hidden2_n, output_n, lr):
        super(Critic, self).__init__()
        self.lr = lr
        self.input1_shape = input1_shape
        self.input2_shape = input2_shape
        self.hidden1_n = hidden1_n
        self.hidden2_n = hidden2_n
        self.output_n = output_n
        self.model = self.generate_model("critic_w.h5")
        self.target_model = self.generate_model("critic_target_w.h5")
        self.target_model.set_weights(self.model.get_weights())
        self.model.compile(tfk.optimizers.Adam(learning_rate=self.lr), loss='mse')
        self.optimizer = tfk.optimizers.Adam(learning_rate=self.lr)
        self.mse_loss = tf.keras.losses.MeanSquaredError()
        #self.epoch_loss_avg = tf.keras.metrics.Mean()
        
    def generate_model(self, model_filename):
        #k_init1 = tfk.initializers.RandomUniform(-1.5e-3, 1.5e-3)
        k_init1 = tfk.initializers.GlorotUniform()
        k_init2 = tfk.initializers.RandomUniform(-3e-3, 3e-3)

        state_input_layer = tfkl.Input(shape=self.input1_shape)
        s_bn1 = tfkl.BatchNormalization()(state_input_layer)
        s_hidden1 = tfkl.Dense(self.hidden1_n, activation='relu', kernel_initializer=k_init1, \
                               kernel_regularizer=tfk.regularizers.l2(WD))(s_bn1)
        s_bn2 = tfkl.BatchNormalization()(s_hidden1)
        s_hidden2 = tfkl.Dense(self.hidden1_n, activation='linear', kernel_initializer=k_init1, \
                               kernel_regularizer=tfk.regularizers.l2(WD))(s_bn2)
        s_bn3 = tfkl.BatchNormalization()(s_hidden2)

        action_input_layer = tfkl.Input(shape=self.input2_shape)
        a_hidden1 = tfkl.Dense(self.hidden1_n, activation='relu', kernel_initializer=k_init1, \
                               kernel_regularizer=tfk.regularizers.l2(WD))(action_input_layer)   #linear

        hidden_merge = tfkl.Add()([s_bn3, a_hidden1])

        hidden3 = tfkl.Dense(self.hidden2_n, activation='relu', kernel_initializer=k_init1, \
                             kernel_regularizer=tfk.regularizers.l2(WD))(hidden_merge)
        output_layer = tfkl.Dense(1, activation='linear', kernel_initializer=k_init2, \
                                  kernel_regularizer=tfk.regularizers.l2(WD))(hidden3)
        model = tfk.Model(inputs=[state_input_layer, action_input_layer],
                      outputs=output_layer)
          
        try:
          model.load_weights(base_dir + model_filename)
          print(model_filename, " loaded.")
        except Exception as e:
          print(model_filename, "not loaded.")
        
        #model.compile(loss='mse', optimizer=Adam(lr=self._learning_rate))
        #return model, state_input_layer, action_input_layer
        return model
    
    def get_target_qvals(self, states, actions):
        #inpt = np.hstack((states, actions))
        #return self.target_model.predict(inpt)
        return self.target_model.predict([states, actions])
    
    def optimize(self, states, actions, y):
        #self.critic.fit(np.hstack((states, actions)), y, epochs=1, verbose=1)
        actions_t = tf.convert_to_tensor(actions)
        with tf.GradientTape(persistent=True) as tape:
            tape.watch(actions_t)
            qvals = self.model([states, actions_t])
            loss_value=self.mse_loss(y, qvals)
        grads = tape.gradient(loss_value, self.model.trainable_weights)
        self.qval_grads = tape.gradient(qvals, actions_t) 
        del tape
        
        #set_trace()
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
        #self.epoch_loss_avg.update_state(loss_value)

        #print("Step: {}, Initial Loss: {}".format(optimizer.iterations.numpy(),
        #                                  loss_value.numpy()))

        #print("Step: {},         Loss: {}".format(optimizer.iterations.numpy(),
        #                                  loss(model, features, labels, training=True).numpy()))
        return tf.reduce_mean(qvals), loss_value
    
    def update_target_params(self):
        w = self.model.get_weights()
        target_w = self.target_model.get_weights()
        target_w1 = [tau * w1 + (1 - tau) * w2 for w1, w2 in
                          zip(w, target_w)]
        self.target_model.set_weights(target_w1)
    
    def save_models(self):
        self.model.save_weights(base_dir + "critic_w.h5")
        self.target_model.save_weights(base_dir + "critic_target_w.h5")

In [16]:
#agent

class ActorCritic(object):
    def __init__(self, action_space):
        super(ActorCritic, self).__init__()
        self.gamma = gamma
        self.actor = Actor(actor_input_shape, actor_hidden1_n, actor_hidden2_n, actor_output_n, actor_lr)
        self.critic = Critic(critic_input1_shape, critic_input2_shape, critic_hidden1_n, \
                             critic_hidden2_n, critic_output_n, critic_lr)
        self.max_mem_len = max_mem_len        
        self.batch_size = batch_size
        self.action_space = action_space
        self.noise = OUNoise(action_space.shape[0])
        try:
          self.reward_list = pickle.load(open(base_dir+"reward_list.file", "rb"))
          self.mem = pickle.load(open(base_dir+"replay_mem.file", "rb"))          
        except Exception as e:
          self.mem = ReplayMemory([], max_mem_len)
          self.reward_list = []

    def get_action_noisy(self, state):
        action = self.actor.get_action(state[np.newaxis, :], target=False).squeeze(axis=0)
        return np.clip(action + self.noise.get_ounoise(), self.action_space.low, self.action_space.high)
        
    def train(self):
        batch_size1, states, actions, rewards, next_states, dones = self.sample_transitions()
        #if batch_size1 < self.batch_size: return
        
        target_actions = self.actor.get_target_actions(next_states)
        target_qvals = self.critic.get_target_qvals(next_states, target_actions)
        y = rewards + gamma*target_qvals*(1-dones)
        qval, qloss = self.critic.optimize(states, actions, y)
        self.actor.optimize(states, self.critic.qval_grads)
        self.critic.update_target_params()
        self.actor.update_target_params()        
        return qval, qloss
        #return tf.reshape(qval, []), tf.reshape(qloss, [])
        

    def store_transition(self, state, action, reward, next_state, done):
        self.mem.store(state, action, reward, next_state, float(done))

    def sample_transitions(self):
        batch_size1, samples = self.mem.sample(self.batch_size)
        states, actions, rewards, next_states, dones = zip(*samples)
        states = np.stack(states)
        actions = np.stack(actions)
        rewards = np.stack(rewards)
        next_states = np.stack(next_states)
        dones = np.stack(dones)
        return batch_size1, states, actions, rewards, next_states, dones

    def save_models(self):
        self.actor.save_models()
        self.critic.save_models()
        with open(base_dir+"reward_list.file", "wb") as f:
          pickle.dump(self.reward_list, f, pickle.HIGHEST_PROTOCOL)

    def append_reward(self, s, rp, rn):
      self.reward_list.append((s, rp, rn))

    def save_replay_mem(self):
      with open(base_dir+"replay_mem.file", "wb") as f:
        pickle.dump(self.mem, f, pickle.HIGHEST_PROTOCOL)

In [None]:
env = gym.make ('BipedalWalker-v3') 
#env = wrap_env(gym.make('BipedalWalker-v3'))
agent = ActorCritic(env.action_space)
no_of_episodes = 6000
epi_start = 119
for episode in range(epi_start, no_of_episodes+1):
    state = env.reset()
    done = False
    epoch = 0
    tot_rewardp = 0
    tot_rewardn = 0
    tot_qval = 0
    tot_qloss = 0
    reward = 0
    while not done:
        epoch += 1
        action = agent.get_action_noisy(state)
        #action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        agent.store_transition(state, action, reward, next_state, done)
        state = next_state
        qval, qloss = agent.train()
        tot_qval += qval
        tot_qloss += qloss
        if reward >= 0: tot_rewardp += reward
        else: tot_rewardn += reward
    print("Epi: {}\tSteps: {}\tQV: {:.3e}\tQL: {:.3e}\tPR: {:.3f}\tNR: {:.3f}\tR: {:.3f}\tLR: {:.3f}".format(\
                          episode, epoch, tot_qval/epoch, tot_qloss/epoch, tot_rewardp, tot_rewardn, tot_rewardp+tot_rewardn-reward, reward))
    agent.append_reward(epoch, tot_rewardp, tot_rewardn)
    if episode % 10 == 0 :
        agent.save_models()
    if episode % 100 == 0 :
        agent.save_replay_mem()
        
env.close()
#agent.close()



actor_w.h5  loaded.
actor_target_w.h5  loaded.
critic_w.h5  loaded.
critic_target_w.h5  loaded.
Epi: 119	Steps: 88	QV: -1.450e+01	QL: 5.971e+01	PR: 2.609	NR: -116.024	R: -13.415	LR: -100.000
Epi: 120	Steps: 141	QV: -1.457e+01	QL: 5.883e+01	PR: 2.138	NR: -117.296	R: -15.158	LR: -100.000
Epi: 121	Steps: 110	QV: -1.467e+01	QL: 6.059e+01	PR: 0.733	NR: -115.691	R: -14.958	LR: -100.000
Epi: 122	Steps: 197	QV: -1.474e+01	QL: 5.649e+01	PR: 0.260	NR: -137.126	R: -36.867	LR: -100.000
Epi: 123	Steps: 175	QV: -1.475e+01	QL: 5.071e+01	PR: 2.323	NR: -112.793	R: -10.470	LR: -100.000
Epi: 124	Steps: 1600	QV: -1.532e+01	QL: 5.332e+01	PR: 18.872	NR: -133.196	R: -114.389	LR: 0.065
Epi: 125	Steps: 1600	QV: -1.617e+01	QL: 5.219e+01	PR: 11.372	NR: -128.042	R: -116.574	LR: -0.096
Epi: 126	Steps: 1600	QV: -1.702e+01	QL: 5.050e+01	PR: 34.244	NR: -96.902	R: -62.606	LR: -0.052
Epi: 127	Steps: 1600	QV: -1.784e+01	QL: 4.794e+01	PR: 31.041	NR: -131.591	R: -100.386	LR: -0.165
Epi: 128	Steps: 648	QV: -1.841e+01	QL: 4

In [1]:
print(agent.mem.cntr)

NameError: ignored

In [31]:
#show_video()
