# Deep Q-Network with Lunar Lander

This notebook shows an implementation of a DQN on the LunarLander environment.
Details on the environment can be found [here](https://gym.openai.com/envs/LunarLander-v2/).

Note: The following code is heavily inspired by [this blog post](https://www.katnoria.com/nb_dqn_lunar/).


## 1. Setup

We first need to install some dependencies for using the environment:

In [None]:
!pip3 install box2d pygame moviepy

**Make sure to restart your kernel now!**

In [None]:
import random
import sys
from time import time
from collections import deque, defaultdict, namedtuple
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
# Create the LunarLander environment and perform an initial reset with a fixed
# seed (0); later resets in this notebook do not pass a seed.
env = gym.make('LunarLander-v2')
env.reset(seed=0)

In [None]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to
# the CPU. The bare expression on the last line displays the choice in the
# notebook output.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device

## 2. Define the neural network, the replay buffer and the agent

First, we define the neural network that predicts the Q-values for all actions, given a state as input.
This is a fully-connected neural net with two hidden layers using ReLU activations.
The last layer does not have any activation and outputs a Q-value for every action.

In [None]:
class QNetwork(nn.Module):
    """Fully-connected network that maps a state to one Q-value per action.

    The network has two hidden layers with ReLU activations followed by a
    linear output layer (no activation) of width ``action_size``.

    Args:
        state_size: Number of input features describing a state.
        action_size: Number of discrete actions, i.e. the output width.
        seed: Value passed to ``torch.manual_seed`` before the layers are
            created, so that two networks built with the same seed start
            from identical weights.
        fc1_units: Width of the first hidden layer (default 32).
        fc2_units: Width of the second hidden layer (default 64).
    """

    def __init__(self, state_size, action_size, seed, fc1_units=32, fc2_units=64):
        super().__init__()
        # torch.manual_seed seeds the global RNG and returns the generator,
        # which is kept on the instance for reference.
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    def forward(self, x):
        """Return the Q-values for every action given the state batch ``x``."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # No activation on the output: Q-values are unbounded real numbers.
        return self.fc3(x)

Next, we define a replay buffer that saves previous transitions (so-called `experiences`) and provides a `sample` function to randomly extract a batch of experiences from the buffer.

Note that experiences are internally saved as `numpy`-arrays. They are converted back to PyTorch tensors before being returned by the `sample`-method.

In [None]:
class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random sampling.

    Args:
        buffer_size: Maximum number of experiences kept; once full, adding a
            new experience discards the oldest one.
        batch_size: Number of experiences returned by each ``sample`` call.
        seed: Seed applied to Python's global ``random`` module, which is
            the generator ``sample`` draws from.
    """

    def __init__(self, buffer_size, batch_size, seed):
        self.batch_size = batch_size
        # random.seed() returns None, so seed first and keep the value itself.
        random.seed(seed)
        self.seed = seed
        self.memory = deque(maxlen=buffer_size)  # maximum size of buffer
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"],
        )

    def add(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw ``batch_size`` experiences at random and return them as tensors.

        Returns:
            A tuple ``(states, actions, rewards, next_states, dones)`` of
            tensors placed on the module-level ``device``. Each field is
            stacked row-wise with ``np.vstack``. ``actions`` is cast to
            ``long``; the other tensors are ``float``; ``dones`` holds
            0.0/1.0 flags.

        Raises:
            ValueError: If the buffer holds fewer than ``batch_size``
                experiences (raised by ``random.sample``).
        """
        # Filter once instead of once per field.
        batch = [
            experience
            for experience in random.sample(self.memory, self.batch_size)
            if experience is not None
        ]

        def stack(field):
            # One row per sampled experience.
            return np.vstack([getattr(experience, field) for experience in batch])

        states_tensor = torch.from_numpy(stack("state")).float().to(device)
        actions_tensor = torch.from_numpy(stack("action")).long().to(device)
        rewards_tensor = torch.from_numpy(stack("reward")).float().to(device)
        next_states_tensor = torch.from_numpy(stack("next_state")).float().to(device)

        # Convert done flag from boolean to int before casting to float.
        dones = stack("done").astype(np.uint8)
        dones_tensor = torch.from_numpy(dones).float().to(device)

        return (states_tensor, actions_tensor, rewards_tensor, next_states_tensor, dones_tensor)

    def __len__(self):
        """Return the number of experiences currently stored."""
        return len(self.memory)
    

In [None]:
BUFFER_SIZE = int(1e5)  # Replay memory size
BATCH_SIZE = 64         # Number of experiences to sample from memory
GAMMA = 0.99            # Discount factor
TAU = 1e-3              # Soft update parameter for updating fixed q network
LR = 1e-4               # Q Network learning rate
UPDATE_EVERY = 4        # How often (in agent steps) to update the Q network


class DQNAgent:
    """DQN agent with a replay buffer and a softly-updated target network.

    Args:
        state_size: Number of features in a state vector.
        action_size: Number of discrete actions.
        seed: Seed for Python's ``random`` module; it is also forwarded to
            both Q-networks and to the replay buffer.
    """

    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed first and keep the value itself.
        random.seed(seed)
        self.seed = seed

        # Online network (trained) and fixed/target network (bootstrapping).
        # Both are constructed with the same seed.
        self.q_network = QNetwork(state_size, action_size, seed).to(device)
        self.fixed_network = QNetwork(state_size, action_size, seed).to(device)
        # Use the configured learning rate rather than Adam's default.
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=LR)
        # Initialise memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed)
        self.timestep = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, every ``UPDATE_EVERY`` calls, learn from a batch."""
        self.memory.add(state, action, reward, next_state, done)
        self.timestep += 1

        if self.timestep % UPDATE_EVERY != 0:
            return
        # Train only once the buffer holds more than one batch of experiences.
        if len(self.memory) > BATCH_SIZE:
            self.learn(self.memory.sample())

    def learn(self, experiences):
        """Run one gradient step on a batch, then soft-update the fixed network.

        Args:
            experiences: Tuple ``(states, actions, rewards, next_states,
                dones)`` of tensors as returned by ``ReplayBuffer.sample``.
        """
        states, actions, rewards, next_states, dones = experiences

        # The target is treated as a constant: no gradient flows into the
        # fixed network.
        with torch.no_grad():
            max_action_values = self.fixed_network(next_states).max(dim=1, keepdim=True)[0]
            # If "done" just use reward, else add the discounted best next value.
            Q_target = rewards + (GAMMA * max_action_values * (1 - dones))

        # Q-value the online network assigns to the action actually taken.
        Q_expected = self.q_network(states).gather(1, actions)

        # Calculate loss and update weights
        loss = F.mse_loss(Q_expected, Q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update fixed weights
        self.update_fixed_network(self.q_network, self.fixed_network)

    def update_fixed_network(self, q_network, fixed_network):
        """Blend ``q_network`` into ``fixed_network`` in place.

        Each target parameter becomes ``TAU * source + (1 - TAU) * target``.
        """
        with torch.no_grad():
            for source_parameters, target_parameters in zip(
                q_network.parameters(), fixed_network.parameters()
            ):
                target_parameters.copy_(
                    TAU * source_parameters + (1.0 - TAU) * target_parameters
                )

    def act(self, state, eps=0.0):
        """Choose an action epsilon-greedily.

        Args:
            state: Current state as a NumPy array (converted with
                ``torch.from_numpy``); not used when exploring.
            eps: Probability of picking a uniformly random action.

        Returns:
            The chosen action index as a Python ``int``.
        """
        if random.random() < eps:
            # Draw from the seeded ``random`` module so exploration is
            # reproducible for a given seed.
            return random.randrange(self.action_size)

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            action_values = self.q_network(state)
        return int(action_values.argmax(dim=1).item())

## 3. Execute episodes and train the model

We first define some parameters that guide the training process:

In [None]:
MAX_EPISODES = 2000  # Upper bound on the number of training episodes
MAX_STEPS = 1000     # Upper bound on the number of steps within one episode

# Epsilon-greedy exploration schedule
EPS_START = 1.0      # Initial epsilon (probability of a random action)
EPS_DECAY = 0.999    # Multiplicative decay; the training loop applies it after each non-final step
EPS_MIN = 0.01       # Lower bound that epsilon never drops below

Then we start executing episodes and track the mean score over the most recent episodes (a sliding window of at most 100).
The environment is considered solved once this mean score reaches 200 or more.

In [None]:
# Local import: keeps the timing below working even if the name `time` has
# been rebound elsewhere in the notebook session.
from time import perf_counter

# Get state and action sizes
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

print('State size: {}, action size: {}'.format(state_size, action_size))
dqn_agent = DQNAgent(state_size, action_size, seed=0)
start = perf_counter()

# Sliding window holding the scores of the last 100 episodes
scores_window = deque(maxlen=100)
eps = EPS_START
for episode in range(1, MAX_EPISODES + 1):
    state, _ = env.reset()
    score = 0
    for t in range(MAX_STEPS):
        action = dqn_agent.act(state, eps)
        next_state, reward, terminated, truncated, _ = env.step(action)
        # Store only `terminated` as the done flag: a time-limit truncation
        # is not a terminal state, so the agent should still bootstrap from
        # next_state in that case.
        dqn_agent.step(state, action, reward, next_state, terminated)
        state = next_state
        score += reward
        # Either flag ends the episode.
        if terminated or truncated:
            break

        # Epsilon decays after every non-final step (not once per episode).
        eps = max(eps * EPS_DECAY, EPS_MIN)

    scores_window.append(score)
    mean_score = np.mean(scores_window)

    if episode % 100 == 0:
        print('Progress {}/{}, average score:{:.2f}'.format(episode, MAX_EPISODES, mean_score))

    # Require a full window so that a few lucky early episodes cannot end
    # training prematurely.
    if len(scores_window) == scores_window.maxlen and mean_score >= 200:
        print('\rEnvironment solved in {} episodes, average score: {:.2f}'.format(episode, mean_score))
        sys.stdout.flush()
        break

end = perf_counter()
print('Took {} seconds'.format(end - start))

## 4. Play an episode and record it

Use the trained model to play and record one episode. The recorded video is saved as `video_lunarlander.mp4` in the current working directory.

In [None]:
# Import the needed functions by name: a plain `import time` would rebind the
# name `time`, which the notebook's first import cell binds to the function
# `time.time`, and re-running the training cell would then fail.
from time import monotonic, sleep

FPS = 25  # Upper bound on loop iterations per second while recording

env = gym.make('LunarLander-v2', render_mode="rgb_array")
video = gym.wrappers.monitoring.video_recorder.VideoRecorder(env, "video_lunarlander.mp4")

total_reward = 0.0
try:
    state, _ = env.reset()
    while True:
        start_ts = monotonic()
        # capture_frame() renders the environment itself, so no separate
        # env.render() call is needed.
        video.capture_frame()

        # Greedy action (eps defaults to 0.0) from the trained agent.
        action = dqn_agent.act(state)

        state, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        # Stop on a real terminal state and also on truncation; otherwise a
        # lander that never terminates would keep this loop running.
        if terminated or truncated:
            break

        # Sleep for the remainder of this frame's time budget, if any.
        delta = 1 / FPS - (monotonic() - start_ts)
        if delta > 0:
            sleep(delta)
finally:
    # Always finalise the video file and release the environment, even if
    # the episode is interrupted.
    video.close()
    env.close()

print("Total reward: %.2f" % total_reward)