# **Twin-Delayed DDPG**

## **INSTALLING LIBRARIES**

In [1]:
# Module for physics simulation
!pip install pybullet

Collecting pybullet
[?25l  Downloading https://files.pythonhosted.org/packages/cb/94/9fdf9b24dd1ebdf2b7920d97c124d2df56edd0bdc8dff942e5a585c0fd86/pybullet-2.5.6.tar.gz (77.4MB)
[K     |████████████████████████████████| 77.4MB 50kB/s 
[?25hBuilding wheels for collected packages: pybullet
  Building wheel for pybullet (setup.py) ... [?25l[?25hdone
  Created wheel for pybullet: filename=pybullet-2.5.6-cp36-cp36m-linux_x86_64.whl size=88636101 sha256=1421b7833a246e6f13a3169a9585f8208f0c9c3b2a23048f302e8a8a6116fe1d
  Stored in directory: /root/.cache/pip/wheels/74/22/24/936718f593d621ad167815e1b54e69135191954a9f52024328
Successfully built pybullet
Installing collected packages: pybullet
Successfully installed pybullet-2.5.6


## **IMPORTING LIBRARIES**

In [0]:
import os
import gym # Library for Enviornment
import time
import random
import torch # PyTorch
import numpy as np
import torch.nn as nn
from gym import wrappers
import pybullet_envs 
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt

## **DEFINING EXPERIENCE REPLAY**

In [0]:
class ReplayBuffer(object):
  """
    Fixed-capacity store of environment transitions; once full, the oldest
    transition is overwritten first (circular buffer).
  """

  def __init__(self, max_size=1e6):
    """
      ARGS:
        max_size: maximum number of transitions kept in the buffer. Ints and floats
                  (e.g. 1e6) are both accepted; the value is stored as an int.
      PARAMS:
        storage: list of transitions, each one being
                 (state, next_state, action, reward, done)
        ptr: index of the oldest stored transition, i.e. the cell that will be
             overwritten next once the buffer is full
      RAISES:
        ValueError: if max_size is smaller than 1
    """
    max_size = int(max_size)
    if max_size < 1:
      raise ValueError("max_size must be at least 1")
    self.storage = []
    self.max_size = max_size
    self.ptr = 0

  def add(self, transition):
    """
      ARGS:
        transition: (state, next_state, action, reward, done)
      ADD:
        Appends the transition while there is room, otherwise overwrites the oldest one
    """
    if len(self.storage) >= self.max_size:
      self.storage[self.ptr] = transition
      self.ptr = (self.ptr + 1) % self.max_size # advance to the (new) oldest cell
    else:
      self.storage.append(transition)

  def sample(self, batch_size):
    """
      Draws batch_size transitions uniformly at random (with replacement).
      ARGS:
        batch_size: number of transitions to sample
      RETURNS:
        (states, next_states, actions, rewards, dones) as numpy arrays whose first
        dimension is batch_size; rewards and dones are reshaped to (batch_size, 1)
      RAISES:
        ValueError: if the buffer is empty
    """
    if not self.storage:
      raise ValueError("Cannot sample from an empty replay buffer")
    ind = np.random.randint(0, len(self.storage), size=batch_size)   # random indices into storage
    batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = [], [], [], [], []
    for i in ind:
      state, next_state, action, reward, done = self.storage[i]
      # np.asarray avoids a copy when possible and, unlike np.array(..., copy=False),
      # does not raise on NumPy >= 2 when a copy is unavoidable (e.g. Python scalars)
      batch_states.append(np.asarray(state))
      batch_next_states.append(np.asarray(next_state))
      batch_actions.append(np.asarray(action))
      batch_rewards.append(np.asarray(reward))
      batch_dones.append(np.asarray(done))
    return np.array(batch_states), np.array(batch_next_states), np.array(batch_actions), np.array(batch_rewards).reshape(-1, 1), np.array(batch_dones).reshape(-1, 1)

## **NEURAL NETWORK FOR ACTOR MODEL**

In [0]:
def Linear_Layer(input_dim,output_dim,activation='ReLU'):
  """
    Builds a fully connected layer, optionally followed by an activation.

    ARGS:
      input_dim: number of input features
      output_dim: number of output features
      activation: 'ReLU' (in-place ReLU) or 'Tanh'; any other value means no activation

    RETURNS:
      nn.Sequential(linear, activation) for 'ReLU' / 'Tanh', otherwise the bare nn.Linear
  """
  linear = nn.Linear(input_dim,output_dim)
  if activation=='Tanh':
    return nn.Sequential(linear,nn.Tanh())
  if activation=='ReLU':
    return nn.Sequential(linear,nn.ReLU(True))
  # Unrecognised activation name: hand back the linear layer on its own
  return linear

In [0]:
class Actor(nn.Module):
  """
    Policy network: maps a state to an action whose components lie in
    [-max_action, max_action].
  """

  def __init__(self, state_dim, action_dim, max_action):
    """
      ARGS:
        state_dim: size of the state vector fed to the network
        action_dim: size of the action vector produced by the network
        max_action: scale applied to the tanh output, bounding every action component
    """
    super().__init__()
    self.layer_1 = nn.Linear(state_dim, 400)
    self.layer_2 = nn.Linear(400, 300)
    self.layer_3 = nn.Linear(300, action_dim)
    self.max_action = max_action

  def forward(self, x):
    """
      ARGS:
        x: batch of states
      RETURNS:
        batch of actions: two ReLU hidden layers followed by a tanh output scaled by max_action
    """
    hidden = self.layer_1(x).relu()
    hidden = self.layer_2(hidden).relu()
    squashed = self.layer_3(hidden).tanh()
    return self.max_action * squashed

## **NEURAL NETWORK FOR TWIN CRITIC MODEL**

In [0]:
class Critic(nn.Module):
  """
    Twin Q-networks held in one module: layers 1-3 form the first critic and
    layers 4-6 the second. Both take the concatenated (state, action) as input.
  """

  def __init__(self, state_dim, action_dim):
    """
      ARGS:
        state_dim: size of the state vector
        action_dim: size of the action vector
    """
    super().__init__()
    joint_dim = state_dim + action_dim
    # First critic
    self.layer_1 = nn.Linear(joint_dim, 400)
    self.layer_2 = nn.Linear(400, 300)
    self.layer_3 = nn.Linear(300, 1)
    # Second critic
    self.layer_4 = nn.Linear(joint_dim, 400)
    self.layer_5 = nn.Linear(400, 300)
    self.layer_6 = nn.Linear(300, 1)

  def _first_head(self, xu):
    # First critic: two ReLU hidden layers, linear output
    hidden = F.relu(self.layer_1(xu))
    hidden = F.relu(self.layer_2(hidden))
    return self.layer_3(hidden)

  def _second_head(self, xu):
    # Second critic: same shape as the first, separate weights
    hidden = F.relu(self.layer_4(xu))
    hidden = F.relu(self.layer_5(hidden))
    return self.layer_6(hidden)

  def forward(self, x, u):
    """
        ARGS:
          x,u: batch of states and batch of actions (concatenated along dim 1)
        RETURNS:
          (Q1, Q2): the value estimates of the first and second critic
    """
    xu = torch.cat([x, u], 1)
    return self._first_head(xu), self._second_head(xu)

  def Q1(self, x, u):
    """
      ARGS:
          x,u: batch of states and batch of actions (concatenated along dim 1)
      RETURNS:
        value estimate of the first critic only
    """
    return self._first_head(torch.cat([x, u], 1))

## **POLICY CLASS**

In [0]:
# Compute device shared by the networks: the GPU when CUDA is usable, otherwise the CPU
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

In [0]:
# Selecting the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class TD3(object):
  """
    Twin-Delayed DDPG agent: one actor and a twin critic, each with a slowly
    tracking target copy, plus the Adam optimizers that train them.
  """

  def __init__(self, state_dim, action_dim, max_actions):
    """
      Initialize the complete model with actor model, actor target, critic model and critic target.
      ARGS:
        state_dim: Dimensions of input state
        action_dim: Dimensions of actions (output)
        max_actions: bound used to scale / clip actions
    """
    self.actor=Actor(state_dim,action_dim,max_actions).to(device) # Actor model
    self.actor_target=Actor(state_dim,action_dim,max_actions).to(device) # Actor target model
    self.actor_target.load_state_dict(self.actor.state_dict())  # Target starts as an exact copy of the actor
    self.actor_optimizer=torch.optim.Adam(self.actor.parameters())  # Adam optimizer (default settings)
    self.critic=Critic(state_dim,action_dim).to(device)  # Twin critic model
    self.critic_target=Critic(state_dim,action_dim).to(device) # Twin critic target model
    self.critic_target.load_state_dict(self.critic.state_dict())  # Target starts as an exact copy of the critic
    self.critic_optimizer=torch.optim.Adam(self.critic.parameters())   # Adam optimizer (default settings)
    # Both spellings are kept: train() reads max_action, max_actions is kept for external readers
    self.max_actions=max_actions
    self.max_action=max_actions

  def select_action(self, state):
    """
      ARGS:
        state: a single observation exposing .reshape (e.g. a numpy array)
      RETURNS:
        the actor's (noise-free) action as a flat numpy array
    """
    state = torch.Tensor(state.reshape(1, -1)).to(device)
    # Inference only: no autograd graph is needed
    with torch.no_grad():
      return self.actor(state).cpu().numpy().flatten()

  @staticmethod
  def _soft_update(source, target, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target
    for param, target_param in zip(source.parameters(), target.parameters()):
      target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

  def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
    """
      Trainer Function
      ARGS:
        replay_buffer: buffer of previous transitions, sampled via replay_buffer.sample(batch_size)
        iterations: Number of gradient iterations to run
        batch_size: Batch Size for training
        discount: discount factor to reduce value of later rewards
        tau: interpolation factor used in Polyak averaging of the target networks
        policy_noise: std of the Gaussian noise added to the target action
        noise_clip: absolute bound the target-action noise is clipped to
        policy_freq: the actor and both target networks are updated once every policy_freq iterations
    """
    for it in range(iterations):

      # Convert training data into torch tensors
      batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
      state = torch.Tensor(batch_states).to(device)
      next_state = torch.Tensor(batch_next_states).to(device)
      action = torch.Tensor(batch_actions).to(device)
      reward = torch.Tensor(batch_rewards).to(device)
      done = torch.Tensor(batch_dones).to(device)

      # The bootstrap target is a constant for the critic update, so it is
      # computed without recording an autograd graph through the target networks
      with torch.no_grad():
        # Clipped Gaussian noise with the same shape as the sampled actions
        noise = (torch.randn_like(action) * policy_noise).clamp(-noise_clip, noise_clip)
        # Next action from the actor target, perturbed and clipped to the valid range
        next_action = (self.actor_target(next_state) + noise).clamp(-self.max_action, self.max_action)
        # Two target critics evaluate (s', a'); the smaller estimate is used
        target_Q1, target_Q2 = self.critic_target(next_state, next_action)
        # Qt = r + gamma * min(Qt1, Qt2), with the bootstrap term dropped when done == 1
        target_Q = reward + (1 - done) * discount * torch.min(target_Q1, target_Q2)

      # Current estimates of both critics for the sampled (s, a)
      current_Q1, current_Q2 = self.critic(state, action)

      # Sum of the two critics' MSE losses against the shared target
      critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

      # Backpropagate through the critic
      self.critic_optimizer.zero_grad()
      critic_loss.backward()
      self.critic_optimizer.step()

      # Delayed update: actor and targets move only every policy_freq iterations
      if it % policy_freq == 0:
        # Gradient ascent on Q1(s, actor(s)), written as descent on its negation
        actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Polyak averaging for actor and critic targets
        self._soft_update(self.actor, self.actor_target, tau)
        self._soft_update(self.critic, self.critic_target, tau)

  # Making a save method to save a trained model
  def save(self, filename, directory):
    """
      Save Actor and critic weights as <directory>/<filename>_actor.pth and <directory>/<filename>_critic.pth
      ARGS:
        filename: Name prefix of the weight files
        directory: Directory for saving
    """
    torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
    torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))

  # Making a load method to load a pre-trained model
  def load(self, filename, directory):
    """
      Load Actor and critic weights written by save() and re-synchronise the target networks
      ARGS:
        filename: Name prefix of the weight files
        directory: Directory the weights were saved in
    """
    # map_location lets weights saved on a GPU be loaded on a CPU-only machine (and vice versa)
    self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename), map_location=device))
    self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename), map_location=device))
    # Targets are not saved; copy the loaded weights so further training does not
    # bootstrap from the stale target weights
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.critic_target.load_state_dict(self.critic.state_dict())

In [0]:
def evaluate_policy(policy, eval_episodes=10, eval_env=None):
  """
    Runs the policy for eval_episodes full episodes and reports the mean episode return.
    ARGS:
      policy: object exposing select_action(observation)
      eval_episodes: number of episodes to average over (must be positive)
      eval_env: environment to evaluate on; when None the notebook-level `env` is used
    RETURNS:
      total reward collected divided by eval_episodes
    RAISES:
      ValueError: if eval_episodes is not positive
  """
  if eval_episodes <= 0:
    raise ValueError("eval_episodes must be a positive integer")
  if eval_env is None:
    eval_env = env  # fall back to the global environment, looked up at call time
  avg_reward = 0.
  for _ in range(eval_episodes):
    obs = eval_env.reset()
    done = False
    # Roll the episode out until the environment reports termination
    while not done:
      action = policy.select_action(np.array(obs))
      obs, reward, done, _ = eval_env.step(action)
      avg_reward += reward
  avg_reward /= eval_episodes
  print(8*'-','\n','Average Reward: {}'.format(avg_reward),' \n',8*'-','\n')
  return avg_reward

## **SET PARAMETERS**

In [0]:
env_name = "AntBulletEnv-v0" # Name of the environment (set it to any continuous-action environment you want)
seed = 0 # Random seed number
# Step counts are plain ints so that counters derived from them (e.g. via %=) stay integers
start_timesteps = 10000 # Number of timesteps during which actions are sampled at random before the policy network is used
eval_freq = 5000 # How often the evaluation step is performed (after how many timesteps)
max_timesteps = 500000 # Total number of timesteps
save_models = True # Whether or not to save the trained model
expl_noise = 0.1 # Exploration noise - STD of the Gaussian noise added to actions taken in the environment
batch_size = 100 # Size of the batch
discount = 0.99 # Discount factor gamma, used in the calculation of the total discounted reward
tau = 0.005 # Target network update rate
policy_noise = 0.2 # STD of the Gaussian noise added to the target actions during the critic update
noise_clip = 0.5 # Maximum absolute value of the Gaussian noise added to the target actions
policy_freq = 2 # Number of iterations to wait before the policy network (Actor model) is updated

## We create a file name for the two saved models: the Actor and Critic models

In [9]:
# Prefix shared by the saved actor/critic weights and the results file
file_name = "%s_%s_%s" % ("TD3", env_name, str(seed))
# Print the run settings framed by two rule lines
for banner_line in ("---------------------------------------", "Settings: %s" % (file_name), "---------------------------------------"):
  print (banner_line)

---------------------------------------
Settings: TD3_AntBulletEnv-v0_0
---------------------------------------


In [0]:
def mkdir(base, name):
    """
    Ensure the directory base/name exists (creating parents as needed).

    ARGS:
      base: parent directory
      name: name of the directory to create inside base
    RETURNS:
      the joined path
    """
    path = os.path.join(base, name)
    # exist_ok avoids the check-then-create race of testing os.path.exists first
    os.makedirs(path, exist_ok=True)
    return path
# Working directories for the (optional) video monitor
work_dir = mkdir('exp', 'brs')
monitor_dir = mkdir(work_dir, 'monitor')
# The environment has to exist before its attributes are read; without this line the
# cell raises NameError on a fresh kernel because `env` is only assigned further down.
env = gym.make(env_name)
max_episode_steps = env._max_episode_steps
save_env_vid = False
if save_env_vid:
  # NOTE(review): a later cell assigns `env = gym.make(env_name)` again, which
  # replaces this wrapped instance - confirm where the wrapper should be applied.
  env = wrappers.Monitor(env, monitor_dir, force = True)
  env.reset()

In [0]:
# Create the folders that receive the evaluation results and the saved weights
for output_folder in ('results', 'pytorch_models'):
  mkdir('./', output_folder)

### SETTING THE ENVIRONMENT

In [11]:
# Instantiate the environment named by env_name.
# NOTE(review): this rebinds `env`; a Monitor wrapper applied to an earlier `env`
# object is not carried over to this new instance.
env = gym.make(env_name)



In [0]:
# Seed every source of randomness used by the notebook so runs are repeatable
env.seed(seed)
env.action_space.seed(seed)  # action_space.sample() draws from the space's own RNG
random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
# Problem dimensions read from the environment
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
# Only the first upper bound is read - assumes all action components share it (TODO confirm)
max_action = float(env.action_space.high[0])

### CREATING THE MODEL AND EXPERIENCE REPLAY





In [15]:
# Agent and experience replay memory
policy = TD3(state_dim, action_dim, max_action)
replay_buffer = ReplayBuffer()
# Score of the untrained policy becomes the first entry of the evaluation history
initial_score = evaluate_policy(policy)
evaluations = [initial_score]

---------------------------------------
Average Reward over the Evaluation Step: 9.804045
---------------------------------------


### SETTING STARTING VARIABLES

In [0]:
# Step / episode counters all start from zero
total_timesteps = timesteps_since_eval = episode_num = 0
# done starts as True so the training loop begins by resetting the environment
done, avg_reward_list = True, []
# Wall-clock time at which training starts
t0 = time.time()

## **TRAINING**

In [18]:
# TRAINING
while total_timesteps < max_timesteps:

  # If the episode is done
  if done:

    # Train on the finished episode: one gradient iteration per timestep it lasted
    # (skipped on the very first pass, before any episode has been played)
    if total_timesteps != 0:
      print("Total Timesteps: {} Episode Num: {} Reward: {}".format(total_timesteps, episode_num, episode_reward))
      policy.train(replay_buffer, episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)

    # Evaluate the policy once at least eval_freq timesteps have passed since the last evaluation
    if timesteps_since_eval >= eval_freq:
      timesteps_since_eval %= eval_freq
      temp=evaluate_policy(policy)
      avg_reward_list.append(temp)
      evaluations.append(temp)
      # Periodic checkpoints honour the save_models switch, like the final save below
      if save_models:
        policy.save(file_name, directory="./pytorch_models")
      np.save("./results/%s" % (file_name), evaluations)

    # Reset the state of the environment
    obs = env.reset()

    # Set the Done to False
    done = False

    # Set rewards and episode timesteps to zero
    episode_reward = 0
    episode_timesteps = 0
    episode_num += 1

  # Before start_timesteps timesteps, we play random actions
  if total_timesteps < start_timesteps:
    action = env.action_space.sample()
  else: # Afterwards, we switch to the model
    action = policy.select_action(np.array(obs))
    # If the exploration noise is not 0, we add noise to the action and we clip it
    if expl_noise != 0:
      action = (action + np.random.normal(0, expl_noise, size=env.action_space.shape[0])).clip(env.action_space.low, env.action_space.high)

  # The agent performs the action in the environment, then reaches the next state and receives the reward
  new_obs, reward, done, _ = env.step(action)

  # An episode cut off by the step limit is stored as not-done so the critic target still bootstraps from it
  done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)

  # Accumulate the total reward
  episode_reward += reward

  # We store the new transition into the Experience Replay memory (ReplayBuffer)
  replay_buffer.add((obs, new_obs, action, reward, done_bool))

  # Update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
  obs = new_obs
  episode_timesteps += 1
  total_timesteps += 1
  timesteps_since_eval += 1

# Add the last policy evaluation to our list of evaluations and we save our model
evaluations.append(evaluate_policy(policy))
if save_models: policy.save("%s" % (file_name), directory="./pytorch_models")
np.save("./results/%s" % (file_name), evaluations)

Total Timesteps: 1000 Episode Num: 1 Reward: 496.2954657611786
Total Timesteps: 1172 Episode Num: 2 Reward: 79.91545047840339
Total Timesteps: 2172 Episode Num: 3 Reward: 518.3324693367457
Total Timesteps: 3172 Episode Num: 4 Reward: 537.836161280911
Total Timesteps: 4172 Episode Num: 5 Reward: 510.8818134303453
Total Timesteps: 5172 Episode Num: 6 Reward: 516.4083629260031
---------------------------------------
Average Reward over the Evaluation Step: 136.943563
---------------------------------------
Total Timesteps: 6172 Episode Num: 7 Reward: 461.26159425501857
Total Timesteps: 7172 Episode Num: 8 Reward: 496.2660554826422
Total Timesteps: 8172 Episode Num: 9 Reward: 486.4106359269747
Total Timesteps: 8354 Episode Num: 10 Reward: 82.01943954108575
Total Timesteps: 8374 Episode Num: 11 Reward: 4.667669991715229
Total Timesteps: 8513 Episode Num: 12 Reward: 53.858602006848365
Total Timesteps: 9167 Episode Num: 13 Reward: 305.0356408144698
Total Timesteps: 10167 Episode Num: 14 Rewar

In [0]:
import matplotlib.pyplot as plt
# Build the tick labels first: the i-th periodic evaluation is labelled with
# (i+1)*eval_freq timesteps, shown in thousands
x_axis = [str((i+1)*eval_freq/1000)+'k' for i in range(len(avg_reward_list))]
fig, ax = plt.subplots()
ax.plot(x_axis, avg_reward_list)
ax.set_xlabel('Timesteps')
ax.set_ylabel('Average evaluation reward')
ax.set_title('TD3 evaluation reward during training')
plt.show()