<a href="https://colab.research.google.com/github/krishnagorrepati/DeepLearningProjects/blob/master/T3D_Algorithm_EVAP2S9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Initialization

In [0]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import pybullet_envs
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from gym import wrappers
from torch.autograd import Variable
from collections import deque


Step 1: Initialize the Experience Replay Memory with a maximum size of 1e6 transitions.
Then we populate it with new transitions.

In [0]:
class ReplayBuffer(object):
  """Fixed-capacity store of transitions with uniform random sampling.

  Each transition is a 5-tuple ``(state, next_state, action, reward, done)``.
  Once ``max_size`` transitions are stored, new ones overwrite existing slots
  in a circular fashion, starting from slot 0.
  """

  def __init__(self, max_size=1e6):
    """Create an empty buffer.

    Args:
      max_size: Maximum number of transitions kept. Floats such as ``1e6`` are
        accepted and converted to ``int`` so they can be compared with
        ``len()`` and used as list indices.

    Raises:
      ValueError: If ``max_size`` is not positive.
    """
    max_size = int(max_size)
    if max_size <= 0:
      raise ValueError("max_size must be a positive number")
    self.storage = []
    self.max_size = max_size
    self.ptr = 0  # next slot to overwrite once the buffer is full

  def add(self, transition):
    """Store one transition, overwriting a previously stored one when full."""
    if len(self.storage) == self.max_size:
      self.storage[self.ptr] = transition
      self.ptr = (self.ptr + 1) % self.max_size
    else:
      self.storage.append(transition)

  def sample(self, batch_size):
    """Draw ``batch_size`` transitions uniformly at random, with replacement.

    Returns:
      A tuple ``(states, next_states, actions, rewards, dones)`` of NumPy
      arrays. ``rewards`` and ``dones`` are reshaped to column vectors of
      shape ``(batch_size, 1)``.

    Raises:
      ValueError: If the buffer is empty.
    """
    if not self.storage:
      raise ValueError("cannot sample from an empty ReplayBuffer")
    ind = np.random.randint(0, len(self.storage), batch_size)
    batch_states, batch_next_states, batch_actions, batch_rewards, \
      batch_dones = [], [], [], [], []
    for i in ind:
      state, next_state, action, reward, done = self.storage[i]
      # np.asarray avoids a copy when possible and, unlike
      # np.array(..., copy=False), never raises when a copy is required.
      batch_states.append(np.asarray(state))
      batch_next_states.append(np.asarray(next_state))
      batch_actions.append(np.asarray(action))
      batch_rewards.append(np.asarray(reward))
      batch_dones.append(np.asarray(done))
    return np.array(batch_states), np.array(batch_next_states), \
      np.array(batch_actions), np.array(batch_rewards).reshape(-1, 1), \
      np.array(batch_dones).reshape(-1, 1)
           

Step 2: Build one DNN for the Actor model and one for Actor Target

In [0]:
class Actor(nn.Module):
  """Policy network: maps a state to an action bounded by ``max_action``.

  Three fully connected layers (state_dims -> 400 -> 300 -> action_dim) with
  ReLU activations on the hidden layers and a scaled tanh on the output.
  """

  def __init__(self, state_dims, action_dim, max_action):
    """Build the layers.

    Args:
      state_dims: Number of input features (size of a state vector).
      action_dim: Number of output features (size of an action vector).
      max_action: Scale applied to the tanh output, so every action component
        lies in ``[-max_action, max_action]``.
    """
    super(Actor, self).__init__()  # register parameters with nn.Module
    self.layer_1 = nn.Linear(state_dims, 400)
    self.layer_2 = nn.Linear(400, 300)
    self.layer_3 = nn.Linear(300, action_dim)
    self.max_action = max_action

  def forward(self, x):
    """Return the action for state batch ``x``."""
    x = F.relu(self.layer_1(x))
    x = F.relu(self.layer_2(x))
    # tanh squashes to [-1, 1]; scaling maps that to the action range.
    x = self.max_action * torch.tanh(self.layer_3(x))
    return x

Step 3: Build two DNNs for the two Critic models and two DNNs for the two Critic Targets.

In [0]:
class Critic(nn.Module):
  """Twin Q-networks: two independent estimators of Q(state, action).

  Each network is state_dims + action_dim -> 400 -> 300 -> 1. Both take the
  same concatenated (state, action) input and return one scalar Q-value per
  sample.
  """

  def __init__(self, state_dims, action_dim):
    """Build both critic networks.

    Args:
      state_dims: Size of a state vector.
      action_dim: Size of an action vector.
    """
    super(Critic, self).__init__()  # register parameters with nn.Module
    # First critic network
    self.layer_1 = nn.Linear(state_dims + action_dim, 400)
    self.layer_2 = nn.Linear(400, 300)
    # A Q-value is a single scalar per (state, action) pair, hence 1 output.
    self.layer_3 = nn.Linear(300, 1)
    # Second critic network
    self.layer_4 = nn.Linear(state_dims + action_dim, 400)
    self.layer_5 = nn.Linear(400, 300)
    self.layer_6 = nn.Linear(300, 1)

  def forward(self, x, u):
    """Return ``(Q1, Q2)`` for state batch ``x`` and action batch ``u``."""
    # dim=1 joins along the feature axis: each row becomes [state, action].
    xu = torch.cat([x, u], 1)
    # Forward pass through the first critic
    x1 = F.relu(self.layer_1(xu))
    x1 = F.relu(self.layer_2(x1))
    x1 = self.layer_3(x1)
    # Forward pass through the second critic
    x2 = F.relu(self.layer_4(xu))
    x2 = F.relu(self.layer_5(x2))
    x2 = self.layer_6(x2)

    return x1, x2

  def Q1(self, x, u):
    """Return only the first critic's Q-value (used for the actor update)."""
    xu = torch.cat([x, u], 1)
    x1 = F.relu(self.layer_1(xu))
    x1 = F.relu(self.layer_2(x1))
    x1 = self.layer_3(x1)
    return x1

# Step 4 - 15: Training process. Create a T3D class, initialize variables.

In [0]:
# Select the device: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Building the whole training process into a class

class T3D(object):
  """Twin Delayed DDPG agent: an actor and a twin critic, each with a target copy.

  The online networks are trained with Adam; the target networks track them
  through Polyak averaging inside ``train``.
  """

  def __init__(self, state_dims, action_dim, max_action):
    """Create the networks, their targets and the optimizers.

    Args:
      state_dims: Size of a state vector.
      action_dim: Size of an action vector.
      max_action: Bound on each action component; also used to clamp the
        noisy target action in ``train``.
    """
    # Actor (trained by gradient steps) and actor target (Polyak averaged).
    self.actor = Actor(state_dims, action_dim, max_action).to(device)
    self.actor_target = Actor(state_dims, action_dim, max_action).to(device)
    # state_dict() must be called: load_state_dict expects the mapping itself.
    # Starting the target with the same weights keeps both in sync initially.
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters())

    # Critic (trained by gradient steps) and critic target (Polyak averaged).
    self.critic = Critic(state_dims, action_dim).to(device)
    self.critic_target = Critic(state_dims, action_dim).to(device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
    self.max_action = max_action

  def select_action(self, state):
    """Return the actor's action for a single ``state`` as a flat NumPy array."""
    # The actor expects a batch, so the state is reshaped to one row.
    state = torch.Tensor(state.reshape(1, -1)).to(device)
    return self.actor(state).cpu().data.numpy().flatten()

  @staticmethod
  def _soft_update(source, target, tau):
    """Polyak-average ``source`` parameters into ``target`` in place."""
    for param, target_param in zip(source.parameters(), target.parameters()):
      target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

  def train(self, replay_buffer, iterations, batch_size=100, discount=0.99,
            tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
    """Run ``iterations`` training steps on batches drawn from ``replay_buffer``.

    Args:
      replay_buffer: Object whose ``sample(batch_size)`` returns
        ``(states, next_states, actions, rewards, dones)``.
      iterations: Number of gradient steps on the critic.
      batch_size: Number of transitions per step.
      discount: Discount factor applied to the target Q-value.
      tau: Polyak averaging rate for the target networks.
      policy_noise: Standard deviation of the noise added to the target action.
      noise_clip: Bound applied to that noise.
      policy_freq: The actor and both targets are updated once every
        ``policy_freq`` iterations.
    """
    for it in range(iterations):
      # Step 4: sample a batch of transitions (s, s', a, r, done) from memory.
      batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones \
        = replay_buffer.sample(batch_size)
      state = torch.Tensor(batch_states).to(device)
      next_state = torch.Tensor(batch_next_states).to(device)
      action = torch.Tensor(batch_actions).to(device)
      reward = torch.Tensor(batch_rewards).to(device)
      done = torch.Tensor(batch_dones).to(device)

      # The target is a fixed regression label, so no graph is built for it.
      with torch.no_grad():
        # Step 5: from the next state s', the actor target plays a'.
        next_action = self.actor_target(next_state)

        # Step 6: add clipped Gaussian noise to a' and clamp the result to the
        # range of actions supported by the environment.
        noise = torch.randn_like(action) * policy_noise
        noise = noise.clamp(-noise_clip, noise_clip)
        next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

        # Step 7: the two critic targets evaluate (s', a').
        target_Q1, target_Q2 = self.critic_target(next_state, next_action)

        # Step 8: keep the minimum of the two Q-values.
        target_Q = torch.min(target_Q1, target_Q2)

        # Step 9: Qt = r + gamma * min(Qt1, Qt2), with the bootstrap term
        # dropped when the episode is over (done == 1).
        target_Q = reward + (1 - done) * discount * target_Q

      # Step 10: the two critics evaluate the sampled (s, a).
      current_Q1, current_Q2 = self.critic(state, action)

      # Step 11: sum of both critics' losses against the shared target.
      critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

      # Step 12: backpropagate the critic loss and update both critics.
      self.critic_optimizer.zero_grad()
      critic_loss.backward()
      self.critic_optimizer.step()

      # Steps 13-15 are delayed: they run once every policy_freq iterations.
      if it % policy_freq == 0:
        # Step 13: gradient ascent on the first critic's output, written as
        # descent on its negation.
        actor_loss = -(self.critic.Q1(state, self.actor(state)).mean())
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Step 14: update the actor target by Polyak averaging.
        self._soft_update(self.actor, self.actor_target, tau)

        # Step 15: update the critic target by Polyak averaging.
        self._soft_update(self.critic, self.critic_target, tau)







