# DDPG in the Pendulum environment

In [None]:
import random
from collections import deque

import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils as utils
import torch.optim as optim
from torch.autograd import grad

# Number of training episodes; also sets the exploration-noise decay
# horizon (noise stops shrinking after EPISODE_COUNT / 2 episodes).
EPISODE_COUNT = 10000
# Maximum number of environment steps taken in a single episode.
EPISODE_LENGTH = 200
# Length of the observation vector fed to the actor and critic networks.
OBS_SIZE = 3
# Number of transitions sampled from replay memory per gradient step.
MINIBATCH_SIZE = 16
# Maximum number of transitions kept in replay memory (oldest are dropped).
EXPERIENCE_CAPACITY = 5000

In [None]:
class Critic(nn.Module):
    """Q-network mapping a concatenated (action, observation) row to a scalar.

    The input width is ``OBS_SIZE + 1``: one action component followed by
    the observation, matching how the training loop builds critic inputs.
    """

    def __init__(self):
        super().__init__()
        # Two hidden layers of 64 units followed by a single linear output.
        self.fc1 = nn.Linear(OBS_SIZE + 1, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        """Return the Q-value estimate for each row of ``x``."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        # No activation on the output: Q-values are unbounded.
        return self.fc3(hidden)


class Actor(nn.Module):
    """Deterministic policy network producing one action inside [low, high].

    Args:
        low: Lower bound of the action range.
        high: Upper bound of the action range.
    """

    def __init__(self, low, high):
        super().__init__()
        self.fc1 = nn.Linear(OBS_SIZE, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 1)
        # Affine map taking tanh's (-1, 1) output onto (low, high).
        span = high - low
        self.action_scale = span / 2
        self.action_avg = (high + low) / 2

    def forward(self, x):
        """Return the action for observation(s) ``x``, scaled to the range."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        squashed = torch.tanh(self.fc3(hidden))
        return squashed * self.action_scale + self.action_avg

       
def transfer_state(src, dst):
    """Copy every parameter and buffer of ``src`` into ``dst`` in place.

    Both modules must share the same architecture so that their state-dict
    keys match.
    """
    snapshot = src.state_dict()
    dst.load_state_dict(snapshot)
    

def empty_transition_block():
    """Return a zeroed ``(1, 2 * OBS_SIZE + 3)`` tensor for one transition.

    The training loop fills the row as: action (1), observation (OBS_SIZE),
    next observation (OBS_SIZE), reward (1), not-done flag (1).
    """
    row_width = OBS_SIZE * 2 + 3
    return torch.zeros(1, row_width)


def decreasing_uniform_noise(i):
    """Sample zero-centred uniform noise whose width shrinks with episode ``i``.

    The width decays linearly from 4 at episode 0 to 0.5 at episode
    ``EPISODE_COUNT / 2`` and stays at 0.5 afterwards; the sample is drawn
    from ``[-width / 2, width / 2)``.
    """
    widest = 4
    narrowest = 0.5
    decay_end = int(EPISODE_COUNT / 2)
    # Fraction of the decay schedule still ahead (1 at start, 0 once done).
    remaining = (decay_end - min(i, decay_end)) / decay_end
    width = remaining * (widest - narrowest) + narrowest
    return (random.random() - 0.5) * width


def clip_with_noise(value, i, low, high):
    """Add episode-dependent exploration noise to ``value`` and clamp it.

    Args:
        value: Tensor holding the action proposed by the actor.
        i: Episode index, used to pick the noise width.
        low: Lower clamp bound.
        high: Upper clamp bound.
    """
    perturbed = value + decreasing_uniform_noise(i)
    return torch.clamp(perturbed, min=low, max=high)


def polyak(src, dst, p):
    """Blend ``src`` parameters into ``dst`` in place (Polyak averaging).

    Each parameter of ``dst`` becomes ``p * dst + (1 - p) * src``, so ``p``
    close to 1 makes ``dst`` track ``src`` slowly.
    """
    online_params = src.parameters()
    target_params = dst.parameters()
    for online, target in zip(online_params, target_params):
        retained = p * target.data
        incoming = (1 - p) * online.data
        target.data.copy_(retained + incoming)


In [None]:
# Environment
env = gym.make('Pendulum-v0')

low = env.action_space.low.item()
high = env.action_space.high.item()

# Online networks (q, a) and their target copies (q_, a_), which start out
# identical and then track the online networks via Polyak averaging.
q = Critic()
q_ = Critic()
transfer_state(q, q_)

a = Actor(low, high)
a_ = Actor(low, high)
transfer_state(a, a_)

criterion = nn.MSELoss()
q_optimizer = optim.Adam(q.parameters(), lr=0.005)
a_optimizer = optim.Adam(a.parameters(), lr=0.005)

# Discount factor
gamma = 0.99

# Replay memory; each entry is a (1, 2 * OBS_SIZE + 3) row laid out as
# [action | observation | next observation | reward | not-done flag].
transitions = deque(maxlen=EXPERIENCE_CAPACITY)

# Number of transitions collected so far, step counter (currently unused),
# and the Polyak averaging coefficient for the target networks.
next_i = 0
c = 0
p = 0.99

# Transitions to collect before any gradient step or evaluation happens.
warmup_transitions = 500

# Train, test, visualise
for i_episode in range(EPISODE_COUNT):
    observation = torch.Tensor(env.reset())
    for t in range(EPISODE_LENGTH):
        # Select action a_t from the actor network, add noise and clip
        with torch.no_grad():
            action = clip_with_noise(a(observation), i_episode, low, high)

        # Execute action a_t in emulator and observe reward r_t and observation x_t+1
        new_observation, reward, done, info = env.step(action.numpy())
        new_observation = torch.Tensor(new_observation)

        # Store transition (x_t, a_t, r_t, x_t+1, d)
        transition_block = empty_transition_block()
        transition_block[0][0] = action[0]
        transition_block[0][1:OBS_SIZE + 1] = observation
        transition_block[0][OBS_SIZE + 1: 2 * OBS_SIZE + 1] = new_observation
        transition_block[0][2 * OBS_SIZE + 1] = reward
        # Flag is 1 while the episode continues and 0 on its last step.
        transition_block[0][2 * OBS_SIZE + 2] = 0 if (done or t == EPISODE_LENGTH - 1) else 1
        transitions.append(transition_block)
        observation = new_observation
        next_i += 1
        c += 1

        # If enough memory stored
        if next_i > warmup_transitions:
            # Sample random minibatch of transitions (x_j, a_j, r_j, x_j+1, d_j)
            batch = torch.cat(random.sample(transitions, MINIBATCH_SIZE), 0)

            state_actions = batch[:, :OBS_SIZE + 1]
            states = batch[:, 1:OBS_SIZE + 1]
            next_states = batch[:, OBS_SIZE + 1: 2 * OBS_SIZE + 1]
            rs = batch[:, 2 * OBS_SIZE + 1].view(MINIBATCH_SIZE, 1)
            ds = batch[:, 2 * OBS_SIZE + 2].view(MINIBATCH_SIZE, 1)

            # Set targets y_j = r_j + gamma * d_j * Q'(x_j+1, mu'(x_j+1)).
            # The not-done flag d_j stops bootstrapping past the end of an
            # episode, and no_grad keeps the target a constant so that the
            # critic loss does not backpropagate into the target networks.
            # NOTE(review): the stored flag is also 0 on the time-limit step,
            # so truncation is treated as termination — confirm that is intended.
            with torch.no_grad():
                next_actions = a_(next_states)
                next_qs = q_(torch.cat((next_actions, next_states), 1))
                ys = rs + gamma * ds * next_qs

            # Perform a gradient descent step for critic on (y_j - Q(x_j, a_j))^2
            outputs = q(state_actions)
            loss = criterion(outputs, ys)

            q_optimizer.zero_grad()
            loss.backward()
            utils.clip_grad_norm_(q.parameters(), 5)  # Gradients tend to explode otherwise
            q_optimizer.step()

            # Perform a gradient ascent step for actor based on gradient of Q w.r.t action.
            # This backward pass also fills the critic's gradients; they are
            # discarded by q_optimizer.zero_grad() before the next critic step.
            a_optimizer.zero_grad()

            actions = a(states)
            loss = -1 * q(torch.cat((actions, states), 1)).sum()
            loss.backward()
            a_optimizer.step()

            # Update target network weights
            polyak(q, q_, p)
            polyak(a, a_, p)

        if done:
            break

    # Every 10th episode, evaluate the noise-free policy over 100 episodes.
    if i_episode % 10 == 0 and next_i > warmup_transitions:
        allrewards = 0
        count = 100
        with torch.no_grad():
            for j in range(count):
                observation = torch.Tensor(env.reset())
                rewards = 0
                for t in range(EPISODE_LENGTH):
                    # Uncomment for rendering:
                    # env.render()

                    action = a(observation)
                    # Hand the environment a NumPy array, as in the training loop.
                    observation, reward, done, info = env.step(action.numpy())
                    rewards += reward
                    observation = torch.Tensor(observation)
                    if done:
                        break
                allrewards += rewards/count
        print("trained from %d episodes, current avg test reward: %d" % (i_episode, allrewards))
        if allrewards > -300:
            torch.save(a.state_dict(), "DDPGPendulum")
            print("Pendulum solved, saved actor")
            break

env.close()
