In [1]:
import gym
import random
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch
import pickle
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from IPython import display

In [12]:
env_name = "LunarLanderContinuous-v2"
env = gym.make(env_name)

In [13]:
env.observation_space

Box(-inf, inf, (8,), float32)

In [2]:
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


class ReplayMemory:
    def __init__(self, length):
        self.memory = deque(maxlen=length)

    def __len__(self):
        return len(self.memory)

    def push(self, transition):
        self.memory.append(transition)

    def sample(self, size):
        sample = random.sample(self.memory, size)

        state = [i[0] for i in sample]
        action = [i[1] for i in sample]
        reward = [i[2] for i in sample]
        next_state = [i[3] for i in sample]
        terminal = [i[4] for i in sample]

        state = np.stack(state)
        state = torch.Tensor(state).squeeze()

        next_state = np.stack(next_state)
        next_state = torch.Tensor(next_state).squeeze()

        reward = np.array(reward)
        reward = torch.tensor(reward, dtype=torch.float32).reshape(-1, 1)

        terminal = np.array(terminal).astype(int)
        terminal = torch.tensor(terminal).reshape(-1, 1)

        action = np.array(action)
        action = torch.tensor(action, dtype=torch.float32)

        return state, action, reward, next_state, terminal

In [3]:
class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=[64, 64]):
        super(Actor, self).__init__()
        self.layers = nn.ModuleList()
        self.activations = nn.ModuleList()

        input_dims = [state_dim] + hidden_dim
        output_dims = hidden_dim + [action_dim]

        for in_dim, out_dim in zip(input_dims, output_dims):
            self.layers.append(nn.Linear(in_dim, out_dim))

        for i in range(len(hidden_dim)):
            self.activations.append(nn.LeakyReLU())

        self.activations.append(nn.Tanh())

    def forward(self, state):
        x = state
        for l, activation in zip(self.layers, self.activations):
            x = l(x)
            x = activation(x)
        return x

In [4]:
class Critic(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=64, output_dim=1):
        super(Critic, self).__init__()
        self.l1 = nn.Linear(state_dim, hidden_dim)
        self.l2 = nn.Linear(hidden_dim + action_dim, hidden_dim)
        self.l3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, state, action):
        x = F.relu(self.l1(state))
        x = F.relu(self.l2(torch.cat([x, action], dim=-1)))
        x = self.l3(x)

        return x

In [5]:
class OUNoise:
    """
    Taken from https://github.com/vitchyr/rlkit/blob/master/rlkit/exploration_strategies/ou_strategy.py
    """

    def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=1000):
        self.mu = mu
        self.theta = theta
        self.sigma = max_sigma
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.action_dim = action_space
        self.reset()

        self.epsilon = 1.0
        self.epsilon_decay = 0.00001
        self.epsilon_min = 0.05

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def evolve_state(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state

    def get_action(self, action, t=0):
        ou_state = self.evolve_state() * self.epsilon

        self.epsilon -= self.epsilon_decay
        if self.epsilon < self.epsilon_min:
            self.epsilon = self.epsilon_min

        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
        return np.clip(action + ou_state, -1.0, 1.0)

In [10]:
class DDPGAgent(nn.Module):
    def __init__(self, state_dim, action_dim, action_min, action_max, gamma=0.99):
        super(DDPGAgent, self).__init__()
        self.action_min = np.array(action_min)
        self.action_max = np.array(action_max)

        self.gamma = gamma
        self.ou_noise = OUNoise(action_dim)

        self.actor = Actor(state_dim, action_dim)
        self.critic = Critic(state_dim, action_dim)

        self.actor_target = Actor(state_dim, action_dim)
        self.critic_target = Critic(state_dim, action_dim)
        
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-4)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=1e-3)

        ################################################################
        # Task 1: Create a replay memory with maximum capacity=1000
        self.memory = ReplayMemory(10000)
        ################################################################
        self.batch_size = 50
        self.tau = 0.005
        
        self.num_fit = 0

        self.loss_ftn = nn.MSELoss()

    def get_action(self, state, t=0):
        action_before_norm = self.actor(state).detach().numpy() # ranges in [-1, 1]
        action_before_norm = self.ou_noise.get_action(action_before_norm, t)# ranges in [-1, 1]
        
        action = np.clip(action_before_norm, self.action_min, self.action_max)
        return action  #, action

    def push(self, transition):
        self.memory.push(transition)
    
    def update_target(self, source, target, tau):
        for src_param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(tau * src_param.data + (1.0 - tau) * target_param.data)

    def fit(self):
        if len(self.memory) < self.batch_size:
            return 0, 0
            
        state, action, reward, next_state, terminal = self.memory.sample(self.batch_size)

        q = self.critic(state, action)
        
        next_q_val = self.critic_target(next_state, self.actor_target(next_state))
        ################################################################
        # Task 3: Complete the critic loss calculation
        target_q = reward + self.gamma * next_q_val * (1 - terminal)
        
        # Critic loss
        critic_loss = self.loss_ftn(q.squeeze(), target_q.squeeze().detach())
        ################################################################

        # Update Critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        
        ################################################################
        # Task 4: Complete the actor loss calculation
        mu_s = self.actor(state)
        actor_loss = - self.critic(state, mu_s).mean()
#         actor_loss = - torch.sum(self.critic(state, action)) / self.batch_size
        ################################################################

        # Update Actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        
#         if self.num_fit % 100 == 0:
#             self.critic_target.load_state_dict(self.critic.state_dict())
#             self.actor_target.load_state_dict(self.actor.state_dict())
        self.update_target(self.critic, self.critic_target, tau=self.tau)
        self.update_target(self.actor, self.actor_target, tau=self.tau)

        return critic_loss.item(), actor_loss.item()

In [15]:
# initialize environment / action
env_name = "LunarLanderContinuous-v2"
env = gym.make(env_name)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

action_low = env.action_space.low
action_high = env.action_space.high

################################################################
# Task 5: Initialize the DDPGAgent on the training loop
agent = DDPGAgent(state_dim, action_dim, action_low, action_high)
################################################################

MAX_EPISODES = 500

# Performance metric
rewards = []
critic_loss_traj = []
actor_loss_traj = []
max_reward = 0.0

for e in range(MAX_EPISODES):

    state = env.reset()
    episode_reward = 0
    step = 0
    critic_loss_epi = []
    actor_loss_epi = []
    state = env.reset()
    
    while True:

        state = torch.tensor(state, dtype=torch.float32)
        action = agent.get_action(state, step)

        next_state, reward, done, info = env.step(action)

        transition = [state, action, reward, next_state, done]
        
        agent.push(transition) # Saves transition on every step

        state = next_state
        episode_reward += reward
  
        critic_loss, actor_loss = agent.fit()

        critic_loss_epi.append(critic_loss)
        actor_loss_epi.append(actor_loss)
            
        if done:
            rewards.append(episode_reward)
            critic_loss_traj += critic_loss_epi
            actor_loss_traj += actor_loss_epi
            
            critic_mean = sum(critic_loss_epi) / len(critic_loss_epi)
            actor_mean = sum(actor_loss_epi) / len(actor_loss_epi)
            
            print("episode : %4d, reward : % 4.3f, critic loss : %4.3f, actor loss : %4.3f" % (e, rewards[-1], critic_mean, actor_mean))
            break

        step += 1
    
    avg5 = np.mean(rewards[-5:])
    if avg5 > max_reward:
        torch.save(agent.actor.state_dict(), "actor.pth")
        torch.save(agent.critic.state_dict(), "critic.pth")
        max_reward = avg5

env.close()

with open('rewards.pkl', 'wb') as f:
    pickle.dump(rewards, f, pickle.HIGHEST_PROTOCOL)
    
with open('actor_loss.pkl', 'wb') as f:
    pickle.dump(critic_loss_traj, f, pickle.HIGHEST_PROTOCOL)
    
with open('critic_loss.pkl', 'wb') as f:
    pickle.dump(actor_loss_traj, f, pickle.HIGHEST_PROTOCOL)

episode :    0, reward : -341.622, critic loss : 1.619, actor loss : 0.787
episode :    1, reward : -411.746, critic loss : 67.707, actor loss : 3.536
episode :    2, reward : -13.459, critic loss : 84.022, actor loss : 4.081
episode :    3, reward : -721.306, critic loss : 74.299, actor loss : 3.389
episode :    4, reward : -440.273, critic loss : 110.380, actor loss : 4.462
episode :    5, reward : -539.688, critic loss : 79.051, actor loss : 6.066
episode :    6, reward : -391.826, critic loss : 74.172, actor loss : 6.780
episode :    7, reward : -137.406, critic loss : 69.020, actor loss : 7.345
episode :    8, reward : -122.095, critic loss : 81.592, actor loss : 7.447
episode :    9, reward : -187.629, critic loss : 76.445, actor loss : 8.079
episode :   10, reward : -204.566, critic loss : 62.537, actor loss : 9.246
episode :   11, reward : -149.620, critic loss : 49.390, actor loss : 9.537
episode :   12, reward : -128.614, critic loss : 60.195, actor loss : 10.309
episode :   

episode :  107, reward : -6.214, critic loss : 18.774, actor loss : -44.215
episode :  108, reward : -52.417, critic loss : 13.654, actor loss : -43.490
episode :  109, reward : -95.683, critic loss : 11.701, actor loss : -42.136
episode :  110, reward : -96.242, critic loss : 10.113, actor loss : -41.112
episode :  111, reward : -84.339, critic loss : 6.645, actor loss : -40.723
episode :  112, reward : -52.508, critic loss : 4.361, actor loss : -40.564
episode :  113, reward : -96.407, critic loss : 5.301, actor loss : -40.237
episode :  114, reward :  246.987, critic loss : 4.398, actor loss : -39.253
episode :  115, reward : -64.165, critic loss : 3.859, actor loss : -38.415
episode :  116, reward :  130.720, critic loss : 4.347, actor loss : -38.071
episode :  117, reward : -136.406, critic loss : 3.671, actor loss : -38.488
episode :  118, reward : -73.516, critic loss : 2.729, actor loss : -38.054
episode :  119, reward : -162.804, critic loss : 7.401, actor loss : -38.894
episo

episode :  214, reward : -191.020, critic loss : 5.670, actor loss : -27.593
episode :  215, reward : -81.040, critic loss : 4.649, actor loss : -27.028
episode :  216, reward : -152.049, critic loss : 7.354, actor loss : -26.201
episode :  217, reward : -196.889, critic loss : 4.866, actor loss : -25.990
episode :  218, reward : -142.111, critic loss : 8.665, actor loss : -25.500
episode :  219, reward : -189.121, critic loss : 6.377, actor loss : -25.611
episode :  220, reward : -87.399, critic loss : 5.963, actor loss : -24.893
episode :  221, reward : -186.915, critic loss : 6.708, actor loss : -24.939
episode :  222, reward : -101.884, critic loss : 5.229, actor loss : -24.124
episode :  223, reward : -149.923, critic loss : 6.026, actor loss : -24.055
episode :  224, reward : -184.488, critic loss : 7.669, actor loss : -21.874
episode :  225, reward : -188.740, critic loss : 6.545, actor loss : -21.990
episode :  226, reward : -76.479, critic loss : 5.100, actor loss : -20.612
ep

episode :  322, reward :  29.629, critic loss : 11.558, actor loss : -34.232
episode :  323, reward :  205.701, critic loss : 16.757, actor loss : -34.830
episode :  324, reward :  2.752, critic loss : 12.414, actor loss : -36.286
episode :  325, reward :  245.259, critic loss : 17.742, actor loss : -37.861
episode :  326, reward :  201.113, critic loss : 14.986, actor loss : -39.396
episode :  327, reward :  224.339, critic loss : 16.153, actor loss : -39.897
episode :  328, reward :  287.667, critic loss : 18.521, actor loss : -39.920
episode :  329, reward :  266.263, critic loss : 14.534, actor loss : -40.818
episode :  330, reward :  303.717, critic loss : 15.986, actor loss : -41.948
episode :  331, reward :  294.616, critic loss : 18.320, actor loss : -43.478
episode :  332, reward :  281.605, critic loss : 16.651, actor loss : -43.642
episode :  333, reward :  286.219, critic loss : 16.406, actor loss : -44.404
episode :  334, reward :  280.722, critic loss : 13.340, actor loss

episode :  429, reward :  257.808, critic loss : 11.321, actor loss : -70.328
episode :  430, reward :  135.772, critic loss : 10.887, actor loss : -71.339
episode :  431, reward :  276.336, critic loss : 11.969, actor loss : -72.461
episode :  432, reward :  263.377, critic loss : 8.641, actor loss : -72.890
episode :  433, reward :  192.295, critic loss : 10.740, actor loss : -73.279
episode :  434, reward :  260.253, critic loss : 10.786, actor loss : -73.882
episode :  435, reward :  156.458, critic loss : 9.240, actor loss : -74.629
episode :  436, reward :  216.280, critic loss : 10.980, actor loss : -75.101
episode :  437, reward :  190.922, critic loss : 11.148, actor loss : -74.946
episode :  438, reward :  283.789, critic loss : 10.663, actor loss : -74.081
episode :  439, reward :  270.239, critic loss : 15.178, actor loss : -74.511
episode :  440, reward :  211.349, critic loss : 12.158, actor loss : -74.712
episode :  441, reward :  275.276, critic loss : 8.423, actor loss

In [None]:
# with open('rewards.pkl', 'rb') as f:
#     rewards = pickle.load(f)
    
# with open('critic_loss.pkl', 'rb') as f:
#     critic_loss_traj = pickle.load(f)

# with open('actor_loss.pkl', 'rb') as f:
#     actor_loss_traj = pickle.load(f)

plt.plot(rewards)
plt.title('reward')
plt.show()

plt.plot(critic_loss_traj)
plt.title('critic loss')
plt.show()

plt.plot(actor_loss_traj)
plt.title('actor loss')
plt.show()

In [18]:
# Evaluate
env_name = "LunarLanderContinuous-v2"
env = gym.make(env_name)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

action_low = env.action_space.low
action_high = env.action_space.high

agent = DDPGAgent(state_dim, action_dim, action_low, action_high)
agent.actor.load_state_dict(torch.load("actor.pth"))

rewards = []

for e in range(10):
    state = env.reset()
    episode_reward = 0
    step = 0

    while True:
        state = torch.tensor(state, dtype=torch.float32)
        action = agent.get_action(state)
#         env.render()
        next_state, reward, done, info = env.step(action)

        state = next_state
        episode_reward += reward

        if done:
            print("reward : % 4.3f" % (episode_reward))
            rewards.append(episode_reward)
            break

        step += 1
env.close()
print("Avg. Rewards:", np.mean(rewards))

reward :  256.484
reward :  246.416
reward :  116.758
reward :  285.494
reward :  249.361
reward :  238.628
reward :  207.064
reward :  228.758
reward :  251.011
reward :  91.926
Avg. Rewards: 217.1901281618704
