In [9]:
### Importing all the required python modules ###

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim
import numpy as np
import gym
from torch.autograd import Variable
import copy
import time

In [10]:
### Setting hyperparameters ###

#Number of episodes
M = 1000

# Optimizer learning rates #
critic_learning_rate = 0.001
actor_learning_rate = 0.0001

# Magnitude of action noise #
noise = 0.05

# Size of batch #
batch_size = 50

# Discount for calculating state value #
gamma = 0.9

# Control whether env renders # 
render = False

In [11]:
### Class that stores trajectory data ###

class Step():
    def __init__(self,state,action,reward,next_state):
        self.state = state
        self.action = action
        self.reward = reward
        self.next_state = next_state

In [12]:
### Generating the neural networks ###

# The Critic #

class Critic(nn.Module):
    def __init__(self):
        super(Critic, self).__init__()
        self.layer1 = nn.Linear(4,64)
        self.layer2 = nn.Linear(64,64)
        self.layer3 = nn.Linear(64,1)
    
    def forward(self,x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = self.layer3(x)
        return x
     
# The Actor #
        
class Actor(nn.Module):
    def __init__(self):
        super(Actor, self).__init__()
        self.layer1 = nn.Linear(3,64)
        self.layer2 = nn.Linear(64,64)
        self.layer3 = nn.Linear(64,1)
        
    def forward(self,x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = F.tanh(self.layer3(x))
        return x

In [13]:
### Initializing the networks ###

# Initialize actor and critic # 

critic = Critic()
actor = Actor()

# Initialize target actor and target critic with the same parameters as actor and critic #

target_critic = copy.deepcopy(critic)
target_actor = copy.deepcopy(actor)

# Initialize the ADAM optimizers for actor and critic #

critic_optimizer = torch.optim.Adam(critic.parameters(), lr = critic_learning_rate)
actor_optimizer = torch.optim.Adam(actor.parameters(), lr = actor_learning_rate)

In [14]:
### Store the undiscounted return ###

scores = []

In [15]:
### The DDPG ###

# Store initial time #

t = time.time()

# Initialize environment #

env = gym.make('Pendulum-v0')

# Initialize the memory #

episode = []

# For each episode: #

for n in range(M):
    
    # Initialize undiscounted return at 0 #
    
    total_reward = 0
    
    # Initialize starting state as a Tensor #
    
    state = env.reset()
    state = torch.from_numpy(state).float()
    
    # While current state isn't terminal: #
    
    terminal = False
    while not terminal:
        
        # Have the actor network generate an action, then add random noise #
        
        #print(f"Start {time.time() - t}")
        action = actor(state) * 2  
        action = action.item()
        action += (noise * np.random.randn())

        # Take the action generated by the network #
        
        next_state, reward, terminal, _ = env.step([action])
        
        # Record the returns #
        
        total_reward += reward
        
        # Store the transition as a Step object in the memory #
        
        transition = Step(state, action, reward, next_state)
        episode.append(transition)
        
        # Render the environment
        
        if render:
            env.render()
        
        # If the memory can produce a batch, do so #
        
        #print(f"Before batch {time.time() - t}")
        
        if batch_size < len(episode):
            
            # Select random samples from the memory #
            
            batch = np.random.choice(episode,batch_size)
            
            # Assemble the batch data into a matrix #
            
            batch_matrix = []
            
            for step in batch:
                temp_list = []
                temp_list.append(step.state)
                temp_list.append(step.action)
                temp_list.append(step.reward)
                temp_list.append(step.next_state)
                batch_matrix.append(temp_list)
            
            batch_matrix = np.array(batch_matrix)
            
            #print(f"After batch {time.time() - t}")
            
            # Arrange the matrix into Tensors of each individual attribute #
            
            states = batch_matrix[:,0]
            
            actions = torch.from_numpy(np.array(batch_matrix[:,1],dtype = np.float16))

            rewards = torch.from_numpy(np.array(batch_matrix[:,2],dtype = np.float16))

            next_states = torch.Tensor((batch_matrix[:,3]).tolist())
            
            # Input the next_state data into the target actor network to get the action generated by the policy #

            target_policy_output = target_actor(next_states)
            
            # Generate the state value using the target critic network #
            
            target_critic_input = torch.hstack([next_states,target_policy_output])

            y = rewards + (gamma * torch.flatten(target_critic(target_critic_input)))
            
            # Optimize the target critic network #

            for i in range(batch_size):
                critic_optimizer.zero_grad()
                state_tensor = torch.Tensor(states[i])
                critic_input = torch.hstack([state_tensor,actions[i]])
                critic_output = critic(critic_input)

                # Loss function is the MSE of the state value and the critic network output #
                
                loss = (y[i]  - critic_output) ** 2
                
                # Perform gradient descent #
                
                loss.backward(retain_graph=True)
                critic_optimizer.step()
            
            #print(f"After opt1 {time.time() - t}")
            
            # Optimize the target actor network #
            
            qvalues = []
            
            for i in range(batch_size):
                actor_optimizer.zero_grad()
                state_tensor = torch.Tensor(states[i])
                critic_input = torch.hstack([state_tensor,actor(state_tensor)])
                critic_output = critic(critic_input))
                qvalues.append(critic_output)
            
            qvalues = torch.cat(qvalues,0)
            
            # Calculate the expected state value # 
            
            loss = -qvalues.mean()

            # Maximise the expected state value #
            
            loss.backward(retain_graph=True)
            actor_optimizer.step()
            
            #print(f"After opt2 {time.time() - t}")
            
            # Perform an update step on the parameters of the actor and critic networks #
            
            with torch.no_grad():
                for index,parameter in enumerate(target_critic.parameters()):
                    critic_parameter = list(critic.parameters())[index]
                    new_value = (parameter * (1-critic_learning_rate)) + (critic_parameter * critic_learning_rate)
                    parameter.copy_(new_value)
            
            with torch.no_grad():
                for index,parameter in enumerate(target_actor.parameters()):
                    actor_parameter = list(actor.parameters())[index]
                    new_value = (parameter * (1-actor_learning_rate)) + (actor_parameter * actor_learning_rate)
                    parameter.copy_(new_value)
            
        # Update the current state #
            
        state = torch.from_numpy(next_state).float()
    
    # Print the outcome of the episode #
    
    print(f"Episode {n+1}: Total reward: {round(total_reward,1)}, Time elapsed: {round(time.time() - t,2)}s")
    
    # Record the final undiscounted return #
    
    scores.append(total_reward)



Episode 1: -1253.4, Time elapsed: 17.74s
Episode 2: -1346.5, Time elapsed: 39.1s
Episode 3: -1494.4, Time elapsed: 62.39s
Episode 4: -1703.6, Time elapsed: 85.12s
Episode 5: -1631.2, Time elapsed: 107.26s
Episode 6: -1558.4, Time elapsed: 129.57s
Episode 7: -1258.3, Time elapsed: 152.98s
Episode 8: -1528.6, Time elapsed: 176.71s
Episode 9: -1361.1, Time elapsed: 200.09s
Episode 10: -1431.8, Time elapsed: 223.89s
Episode 11: -1510.7, Time elapsed: 248.32s
Episode 12: -1388.7, Time elapsed: 272.97s
Episode 13: -1378.6, Time elapsed: 297.07s
Episode 14: -1375.9, Time elapsed: 321.78s
Episode 15: -1525.3, Time elapsed: 346.22s
Episode 16: -1388.9, Time elapsed: 370.65s
Episode 17: -1445.2, Time elapsed: 394.85s
Episode 18: -1505.3, Time elapsed: 419.28s
Episode 19: -1491.5, Time elapsed: 443.76s
Episode 20: -1518.8, Time elapsed: 470.11s


KeyboardInterrupt: 