In [27]:
import time
from collections import deque, namedtuple

import gym #Testing
import numpy as np
import PIL.Image
import tensorflow as tf
import utils

from pyvirtualdisplay import Display
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.losses import MSE
from tensorflow.keras.optimizers import Adam

In [26]:
# Set up a virtual display to render the Lunar Lander environment.
Display(visible=0, size=(840, 480)).start();

# Set the random seed for TensorFlow
tf.random.set_seed(utils.SEED)

FileNotFoundError: [WinError 2] The system cannot find the file specified

In [None]:
#Hyperparameters

MEMORY_SIZE = 100_000  #Size of a memory buffer
GAMMA = 0.995  #Discount factor
ALPHA = 1e-3 #Learning rate
NUM_STEPS_FOR_UPDATE = 4 #perfrom a learning update every C time steps

Action Space

The agent has four discrete actions available:

* Do nothing.
* Fire right engine.
* Fire main engine.
* Fire left engine.

Each action has a corresponding numerical value:

```python
Do nothing = 0
Fire right engine = 1
Fire main engine = 2
Fire left engine = 3
```

### Rewards

The Lunar Lander environment has the following reward system:

* Landing on the landing pad and coming to rest is about 100-140 points.
* If the lander moves away from the landing pad, it loses reward. 
* If the lander crashes, it receives -100 points.
* If the lander comes to rest, it receives +100 points.
* Each leg with ground contact is +10 points.
* Firing the main engine is -0.3 points each frame.
* Firing the side engine is -0.03 points each frame.



In [None]:
#Load the enviroment

env = gym.make('LunarLander-v2')
env.reset()
PIL.Image.fromarray(env.render(mode='rgb_array'))

In [None]:
#Getting the size of the state vector and the number of valid actions
state_size = env.observation_space.shape
num_actions = env.action_space.n

print('State Shape: ', state_size)
print('Number of actions: ', num_actions)

### Interacting with the Gym Environment

In [None]:
#Reset the environment and get the initial state.

initial_state = env.reset()

In [None]:
#Select an action
action = 0

#Run a single time step of the environment's dynamics with the given action
next_state, reward, done, _ = env.step(action)

utils.display_table(initial_state, action, next_state, reward, done)

### Deeep Q Learning

In [None]:
q_network = Sequential([
    Input(shape=state_size),
    Dense(units=64, activation='relu'),
    Dense(units=64, activation='relu'),
    Dense(units=num_actions, activation='linear')
])

target_q_network = Sequential([
    Input(shape=state_size),
    Dense(units=64, activation='relu'),
    Dense(units=64, activation='relu'),
    Dense(units=num_actions, activation='linear')
])

optimizer = Adam(learning_rate=ALPHA)

In [None]:
#Experience Replay

#Store experiences as named tuples
experience = namedtuple("Experience", field_names = ["state", "action", "reward", "next_state", "done"])

<a name="7"></a>
## Deep Q-Learning Algorithm with Experience Replay

<br>
<br>
<figure>
  <img src = "images/deep_q_algorithm.png" width = 90% style = "border: thin silver solid; padding: 0px">
</figure>

In [None]:
#calculate_loss

def compute_loss(experiences, gamma, q_network, target_q_network):
    # Unpack the mini-batch of experience tuples
    states, actions, rewards, next_states, done_vals = experiences
    
    # Compute max Q^(s,a)
    max_qsa = tf.reduce_max(target_q_network(next_states), axis=-1)
    
    # Set y = R if episode terminates, otherwise set y = R + γ max Q^(s,a).

    y_targets = rewards + (gamma * max_qsa * (1 - done_vals))
    
    # Get the q_values
    q_values = q_network(states)
    q_values = tf.gather_nd(q_values, tf.stack([tf.range(q_values.shape[0]),
                                                tf.cast(actions, tf.int32)], axis=1))  
    # Compute the loss
    loss = MSE(y_targets, q_values) 
    
    return loss

In [None]:
# UNIT TEST    
test_compute_loss(compute_loss)

In [1]:
#Updating the neural network weights

@tf.function
def agent_learn(experiences, gamma):    
    # Calculate the loss
    with tf.GradientTape() as tape:
        loss = compute_loss(experiences, gamma, q_network, target_q_network)

    # Get the gradients of the loss with respect to the weights.
    gradients = tape.gradient(loss, q_network.trainable_variables)
    
    # Update the weights of the q_network.
    optimizer.apply_gradients(zip(gradients, q_network.trainable_variables))

    # update the weights of target q_network
    utils.update_target_network(q_network, target_q_network)

NameError: name 'tf' is not defined

In [None]:
#Train the agent
