In [1]:
#%% Imports
import gym
import numpy as np
import os
import torch
from buffer import ReplayBuffer
from collections import deque
from gym import wrappers
from networks import DQN
from tqdm import tqdm
from utils import set_seed_everywhere

In [2]:
def show_replay():
    """
    Not-so-elegant way to display the MP4 file generated by the Monitor wrapper inside a notebook.
    The Monitor wrapper dumps the replay to a local file that we then display as a HTML video object.
    """
    import io
    import base64
    from IPython.display import HTML
    video = io.open('./gym-results/openaigym.video.%s.video000000.mp4' % env.file_infix, 'r+b').read()
    encoded = base64.b64encode(video)
    return HTML(data='''
        <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
    .format(encoded.decode('ascii')))

In [3]:
#%% Hyperparameters
NAME = 'dqn_32_clip_random_sgd'
PATH_TRAINED = 'trained_models_ll/'
PATH_STATS = 'statistics_ll/'

general = {
        "seed": 1,
        "checkpoint": 50,
        "agent_type": "DQN",
        "gradient_clipping": True,
        "buffer_type": "Random"
    }

params = {
        "n_episodes": 1000,
        "batch_size": 32,
        "buffer_cap": 100000,
        "gamma": 0.99,
        "tau": 0.01
    }

nn_params ={
        "optimizer": "SGD",
        "layers": [64,64,32,16,8],
        "learning_rate": 0.001,
        "weight_decay": 0
    }

In [4]:
#%% Initialize environment
env = gym.make('LunarLander-v2')

In [5]:
#%% Unpack parameters
SEED = general['seed']
AGENT_TYPE = general['agent_type']
CHECKPOINT = general['checkpoint']
CLAMP_GRAD = general['gradient_clipping']
BUFFER_TYPE = general['buffer_type']
N_EPISODES = params['n_episodes']
BATCH_SIZE = params['batch_size']
BUFFER_CAP = params['buffer_cap']
GAMMA = params['gamma']
TAU = params['tau']
n_in = env.observation_space.shape[0]
n_out = env.action_space.n

# Fix NN layers
layers = deque(nn_params['layers'])
layers.appendleft(n_in)
layers.append(n_out)
nn_params['layers'] = layers

# Instantiate networks and replay buffer
policy_net = DQN(nn_params)
target_net = DQN(nn_params)
target_net.load_state_dict(policy_net.state_dict()) # same weights as policy_net
if BUFFER_TYPE == "Random":
    buffer = ReplayBuffer(BUFFER_CAP)

# Move NN parameters to GPU (if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
policy_net.to(device)
target_net.to(device)

DQN(
  (nn): Sequential(
    (layer_0): Linear(in_features=8, out_features=64, bias=True)
    (act_0): ReLU()
    (layer_1): Linear(in_features=64, out_features=64, bias=True)
    (act_1): ReLU()
    (layer_2): Linear(in_features=64, out_features=32, bias=True)
    (act_2): ReLU()
    (layer_3): Linear(in_features=32, out_features=16, bias=True)
    (act_3): ReLU()
    (layer_4): Linear(in_features=16, out_features=8, bias=True)
    (act_4): ReLU()
    (layer_5): Linear(in_features=8, out_features=4, bias=True)
  )
)

In [6]:
#%% Fill experience replay buffer
s = env.reset()
while len(buffer) < BUFFER_CAP:
    a = env.action_space.sample()
    s1, r, done, info = env.step(a)
    buffer.push(s, a, r, s1, done)
    if not done:
        s = np.copy(s1)
    else:
        s = env.reset()

In [7]:
# TODO refractor code, no need to repeat training loop (adapt DQN to handle this)
if AGENT_TYPE == "DQN":

    epsilon = 1.0
    rewards, lengths, losses, epsilons, dones = [], [], [], [], []
    
    for i in tqdm(range(N_EPISODES)):

        length = 0
        ep_reward = 0
        ep_loss = 0
        
        s = env.reset()

        while True:
            
            # Select action with epsilon-greedy strategy
            if np.random.rand() < epsilon:
                a = env.action_space.sample()
            else:
                with torch.no_grad():
                    a = policy_net(torch.tensor(s, device=device).float()).argmax().item()

            s1, r, done, info = env.step(a) # act
            buffer.push(s, a, r, s1, done) # store transition in experience replay buffer

            # Sample from buffer
            batch = np.array(buffer.sample(BATCH_SIZE)) 
            s_batch, a_batch, r_batch, s1_batch , done_mask = batch[:,0], batch[:,1], batch[:,2], batch[:,3], batch[:, 4]

            # Convert and to Tensor (if needed)
            s_batch = torch.tensor(list(s_batch), device=device).float()
            a_batch = torch.tensor(a_batch.astype(int, copy=False), device=device)
            r_batch = torch.tensor(r_batch.astype(float, copy=False), device=device)
            s1_batch = torch.tensor(list(s1_batch), device=device).float()
            done_mask = done_mask.astype(bool, copy=False)

            policy_net.optimizer.zero_grad() # clean gradients

            # Compute policy Q-values (s, a) from observation batch
            Q = policy_net(s_batch).gather(1, a_batch.unsqueeze(1))
            
            # Compute target max a Q(s', a) from next observation batch
            Q1 = torch.zeros(BATCH_SIZE, device=device)
            Q1[~done_mask] = target_net(s1_batch[~done_mask]).max(1)[0].detach()
            
            # Compute expected target values for each sampled experience
            Q_target = r_batch + (GAMMA * Q1)

            # Update network weights
            loss = policy_net.loss(Q, Q_target.unsqueeze(1))
            loss.backward()
            if CLAMP_GRAD:
                for param in policy_net.parameters():
                    param.grad.data.clamp_(-1, 1)
            policy_net.optimizer.step()

            # Update target network parameters from policy network parameters
            target_net.update_params(policy_net.state_dict(), TAU)
            
            # Bookkeeping
            s = np.copy(s1)
            length += 1
            ep_reward += r
            ep_loss += loss.item()

            if done:
                break

        # Bookkeeping
        epsilon *= N_EPISODES/(i/(N_EPISODES/20) + N_EPISODES) # decrease epsilon
        epsilons.append(epsilon); rewards.append(ep_reward); lengths.append(length); losses.append(ep_loss)

        if (i+1) % CHECKPOINT == 0:
            print(f"Episode {i+1}:\nEpsilon: {epsilon}\nReward: {ep_reward}\nLength: {length}\nLosses: {ep_loss}\n")
           
    torch.save(policy_net.state_dict(), PATH_TRAINED + NAME + f'_policy_net_{i+1}.pth')
    torch.save(target_net.state_dict(), PATH_TRAINED + NAME + f'_target_net_{i+1}.pth')
    np.savez(PATH_STATS + NAME + f'_stats_{i+1}', epsilons=epsilons, rewards=rewards, lengths=lengths, losses=losses)

elif AGENT_TYPE == "DDQN":
    
    epsilon = 1.0
    rewards, lengths, losses, epsilons, dones = [], [], [], [], []
    
    for i in tqdm(range(N_EPISODES)):

        length = 0
        ep_reward = 0
        ep_loss = 0
        
        s = env.reset()

        while True:
            
            # Select action with epsilon-greedy strategy
            if np.random.rand() < epsilon:
                a = env.action_space.sample()
            else:
                with torch.no_grad():
                    a = policy_net(torch.tensor(s, device=device).float()).argmax().item()

            s1, r, done, info = env.step(a) # act
            buffer.push(s, a, r, s1, done) # store transition in experience replay buffer

            # Sample from buffer
            batch = np.array(buffer.sample(BATCH_SIZE)) 
            s_batch, a_batch, r_batch, s1_batch , done_mask = batch[:,0], batch[:,1], batch[:,2], batch[:,3], batch[:, 4]

            # Convert and to Tensor (if needed)
            s_batch = torch.tensor(list(s_batch), device=device).float()
            a_batch = torch.tensor(a_batch.astype(int, copy=False), device=device)
            r_batch = torch.tensor(r_batch.astype(float, copy=False), device=device)
            s1_batch = torch.tensor(list(s1_batch), device=device).float()
            done_mask = done_mask.astype(bool, copy=False)

            policy_net.optimizer.zero_grad() # clean gradients

            # Compute Q-values (s, a) and a' from policy net
            Q = policy_net(s_batch).gather(1, a_batch.unsqueeze(1))
            a_next = policy_net(s1_batch[~done_mask]).argmax(dim=1, keepdim=True)
            
            # Compute Q-values (s', a') from target net
            Q1 = torch.zeros(BATCH_SIZE, device=device)
            Q1[~done_mask] = target_net(s1_batch[~done_mask]).gather(1, a_next).squeeze(1).detach()
            
            # Compute expected target values for each sampled experience
            Q_target = r_batch + (GAMMA * Q1)

            # Update network weights
            loss = policy_net.loss(Q, Q_target.unsqueeze(1))
            loss.backward()
            if CLAMP_GRAD:
                for param in policy_net.parameters():
                    param.grad.data.clamp_(-1, 1) # clamping gradients
            policy_net.optimizer.step()

            # Update target network parameters from policy network parameters
            target_net.update_params(policy_net.state_dict(), TAU)
            
            # Bookkeeping
            s = np.copy(s1)
            length += 1
            ep_reward += r
            ep_loss += loss.item()

            if done:
                break

        # Bookkeeping
        epsilon *= N_EPISODES/(i/(N_EPISODES/20) + N_EPISODES) # decrease epsilon
        epsilons.append(epsilon); rewards.append(ep_reward); lengths.append(length); losses.append(ep_loss)

        if (i+1) % CHECKPOINT == 0:
            print(f"Episode {i+1}:\nEpsilon: {epsilon}\nReward: {ep_reward}\nLength: {length}\nLosses: {ep_loss}\n")
           
    torch.save(policy_net.state_dict(), PATH_TRAINED + NAME + f'_policy_net_{i+1}.pth')
    torch.save(target_net.state_dict(), PATH_TRAINED + NAME + f'_target_net_{i+1}.pth')
    np.savez(PATH_STATS + NAME + f'_stats_{i+1}', epsilons=epsilons, rewards=rewards, lengths=lengths, losses=losses)

  batch = np.array(buffer.sample(BATCH_SIZE))
  5%|▌         | 50/1000 [00:35<10:16,  1.54it/s]

Episode 50:
Epsilon: 0.9758055743720845
Reward: -110.61636624761134
Length: 84
Losses: 564341.8421797946



 10%|█         | 100/1000 [01:08<10:02,  1.49it/s]

Episode 100:
Epsilon: 0.9058021309857155
Reward: -99.9601010933797
Length: 149
Losses: 904940.2629440746



 15%|█▌        | 150/1000 [01:57<16:28,  1.16s/it]

Episode 150:
Epsilon: 0.7998927863012315
Reward: -89.24661562109104
Length: 190
Losses: 2857079.6650688993



 20%|██        | 200/1000 [02:48<13:29,  1.01s/it]

Episode 200:
Epsilon: 0.6720169513826998
Reward: -120.81049884301696
Length: 213
Losses: 259518515.87271726



 25%|██▌       | 250/1000 [04:10<18:18,  1.46s/it]

Episode 250:
Epsilon: 0.5371557707427748
Reward: -287.91904519316205
Length: 118
Losses: 579366330.0206673



 30%|███       | 300/1000 [05:13<11:30,  1.01it/s]

Episode 300:
Epsilon: 0.40852002917402486
Reward: -308.5363476840229
Length: 174
Losses: 2134529672.474514



 35%|███▍      | 349/1000 [07:43<14:24,  1.33s/it]  


KeyboardInterrupt: 

# Testing the agent

In [59]:
#%% Random agent
env = gym.make('LunarLander-v2')
env = wrappers.Monitor(env, "./gym-results", force=True) 
env.reset()
done = False
while True:
    env.render()
    s, r, done, info = env.step(env.action_space.sample())
    if done: break
env.close()
show_replay()

In [None]:
#%% DQN agent
s = env.reset()
done = False
rewards = []
while True:
    env.render()
    s, r, done, info = env.step(policy_net(torch.tensor(s, device=device).float()).argmax().item())
    rewards.append(r)
    if done: break
env.close()
show_replay()

In [None]:
plt.plot(rewards)
plt.title("DQN agent")
plt.xlabel("Timestep")
plt.ylabel("Reward")

In [None]:
#%% DDQN agent
s = env.reset()
done = False
while True:
    env.render()
    s, r, done, info = env.step(policy_net(torch.tensor(s, device=device).float()).argmax().item())
    if done: break
env.close()
show_replay()

In [None]:
plt.plot(rewards)
plt.title("DDQN agent")
plt.xlabel("Timestep")
plt.ylabel("Reward")

Actions:

0. Do nothing
1. Fire left engine
2. Fire down engine
3. Fire right engine