<a href="https://colab.research.google.com/github/johannesmichael/AMLD/blob/main/02_Intro_to_Q_learning_and_DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setup
---

Make sure to select `GPU` under Runtime > Change runtime type > Hardware accelerator!

In [None]:
import sys

# Checks that the Runtime is correct. The check only runs when the
# `google.colab` module is loaded, i.e. when executing inside Colab.
if 'google.colab' in sys.modules:
    # Shell pipeline: `grep -q` silently searches the `nvidia-smi` output for
    # the word 'failed'; the warning is echoed only when that word is found,
    # so nothing is printed when the check passes.
    !nvidia-smi | grep -q 'failed' && echo "STOP! You are using a runtime without a GPU. Change the runtime type before going further!"

In [None]:
import sys

# Setup for use in Colab (skipped entirely when `google.colab` is not loaded)
if 'google.colab' in sys.modules:
    # Clone GitHub repository
    !git clone https://github.com/AIcrowd/droneRL-workshop

    # Install packages via pip, using the requirements file from the clone
    !pip install -r "droneRL-workshop/colab-requirements.txt"

    # Restart Runtime so everything takes effect: signal 9 is sent to this
    # very process, so no statement after `os.kill` runs in this cell
    import os
    os.kill(os.getpid(), 9)

    # Your Runtime will crash after this - this is normal!
    # Resume from next cell after it restarted

In [None]:
# Change the working directory to the cloned repository.
# NOTE(review): presumably required so that the project packages imported
# below (`env`, `agents`, `helpers`) and the relative `output/` paths resolve
# - confirm. Re-running this cell looks for a nested `droneRL-workshop` folder.
%cd droneRL-workshop

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from PIL import Image
import os

Intro to Q-Learning (compass Q-table)
---

You can find a Q-learning implementation in `agents/`

```
agents/
├── curiosity.py
├── dqn.py
├── logging.py
├── qlearning.py    <-- Q-learning agent
└── random.py
```

In [None]:
from env.env import DeliveryDrones
from env.wrappers import CompassQTable, CompassChargeQTable, LidarCompassQTable, LidarCompassChargeQTable

# Environment without skyscrapers and without discharge (both set to 0)
env = CompassQTable(DeliveryDrones())
env.env_params.update({
    'n_drones': 3,
    'skyscrapers_factor': 0,
    'stations_factor': 0,
    'discharge': 0,
})
states = env.reset()

# Show the observation space and the formatted initial state of every drone
print('Observation space:', env.observation_space)
print(
    'Initial state:',
    {idx: env.format_state(obs) for idx, obs in states.items()},
)

# Last expression of the cell: render the environment as an image
Image.fromarray(env.render(mode='rgb_array'))

In [None]:
from agents.random import RandomAgent
from agents.qlearning import QLearningAgent

# One random agent per drone, keyed by the drone's index
agents = {d.index: RandomAgent(env) for d in env.drones}

# Replace the agent at index 0 with a Q-learning agent
agents[0] = QLearningAgent(
    env,
    gamma=0.95,  # Discount factor
    alpha=0.1,   # Learning rate
    # Exploration rate
    epsilon_start=1,
    epsilon_decay=0.99,
    epsilon_end=0.01,
)

agents

In [None]:
from helpers.rl_helpers import MultiAgentTrainer, plot_rolling_rewards, test_agents, plot_cumulative_rewards

# Build the trainer (fixed seed) and train for 5000 steps
trainer = MultiAgentTrainer(
    env,
    agents,
    reset_agents=True,
    seed=0,
)
trainer.train(5000)

In [None]:
# Rolling rewards logged by the trainer; drone 0 is labelled 'Q-learning'
plot_rolling_rewards(
    trainer.rewards_log,
    drones_labels={0: 'Q-learning'},
)

In [None]:
# Display the Q-table of the Q-learning agent (index 0 in `agents`).
# No import is needed here: the cell only uses the `agents` dict.
agents[0].get_qtable()

In [None]:
# Plot gamma**k for k = 0..99, using the gamma of the Q-learning agent
plt.plot(np.power(agents[0].gamma, np.arange(100)))
plt.xlabel('Number of steps')
plt.ylabel('Discount')
plt.title('Discount factor: {}'.format(agents[0].gamma))
plt.show()

In [None]:
# Test the agents for 1000 steps with seed 0, then plot cumulative rewards
rewards_log = test_agents(
    env,
    agents,
    n_steps=1000,
    seed=0,
)
plot_cumulative_rewards(
    rewards_log,
    drones_labels={0: 'Q-learning'},
)

In [None]:
from helpers.rl_helpers import render_video, ColabVideo

# Render 120 steps to a video file, then display it
path = os.path.join('output', 'videos', 'ql-compass.mp4')
render_video(
    env,
    agents,
    video_path=path,
    n_steps=120,
    fps=1,
    seed=0,
)
ColabVideo(path)

Scaling Q-learning (compass + lidar Q-table)
---

Let's see how Q-learning scales to larger observation spaces

In [None]:
# Environment with skyscrapers (factor 3) but without discharge
env = LidarCompassQTable(DeliveryDrones())
env.env_params.update({
    'n_drones': 3,
    'skyscrapers_factor': 3,
    'stations_factor': 0,
    'discharge': 0,
})
states = env.reset()

# Show the observation space and the formatted state of every drone
print('Observation space:', env.observation_space)
print(
    'Sample state:',
    {idx: env.format_state(obs) for idx, obs in states.items()},
)

# Last expression of the cell: render the environment as an image
Image.fromarray(env.render(mode='rgb_array'))

In [None]:
# One random agent per drone, keyed by the drone's index
agents = {d.index: RandomAgent(env) for d in env.drones}

# Replace the agent at index 0 with a Q-learning agent
agents[0] = QLearningAgent(
    env,
    gamma=0.95,  # Discount factor
    alpha=0.1,   # Learning rate
    # Exploration rate
    epsilon_start=1,
    epsilon_decay=0.99,
    epsilon_end=0.01,
)
agents

In [None]:
# Build a new trainer for the lidar environment and train for 5000 steps
trainer = MultiAgentTrainer(
    env,
    agents,
    reset_agents=True,
    seed=0,
)
trainer.train(5000)

# Rolling rewards logged during training
plot_rolling_rewards(
    trainer.rewards_log,
    drones_labels={0: 'Q-learning'},
)

In [None]:
# Test the agents for 1000 steps with seed 0, then plot cumulative rewards
rewards_log = test_agents(
    env,
    agents,
    n_steps=1000,
    seed=0,
)
plot_cumulative_rewards(
    rewards_log,
    drones_labels={0: 'Q-learning'},
)

In [None]:
# Render 120 steps to a video file, then display it
path = os.path.join('output', 'videos', 'ql-compass-lidar-1st-try.mp4')
render_video(
    env,
    agents,
    video_path=path,
    n_steps=120,
    fps=1,
    seed=0,
)
ColabVideo(path)

Issues with Q-learning
---

Two issues here:

* Sparse reward: pickup rate is around 1%
* No generalization: need to explore entire space!

In [None]:
# Fetch the Q-table of the Q-learning agent (index 0) and print its shape
q_table = agents[0].get_qtable()
print('Q-table:', q_table.shape)
# Last expression of the cell: display the table itself
q_table

In [None]:
# Plot the epsilon values recorded by the Q-learning agent
plt.plot(agents[0].epsilons)
plt.ylabel('Exploration rate (epsilon)')
plt.xlabel('Number of episodes')
plt.show()

Possible solutions
---

In [None]:
from helpers.rl_helpers import set_seed

# (1/2) Sparse rewards: Create an intermediate "pickup" reward to help.
# The parameters are updated on the existing `env`, which is then reset.
env.env_params.update({
    'n_drones': 3, 'pickup_reward': 0.99, 'delivery_reward': 1.0,
    'skyscrapers_factor': 3, 'stations_factor': 0, 'discharge': 0})
states = env.reset()

# (2/2) Train longer...
# The existing agent and trainer from the previous cells are reused: epsilon
# is set back to 1.0 and the decay is changed from 0.99 to 0.999.
# NOTE(review): `trainer` was created with `reset_agents=True` - confirm that
# `train()` does not override these manual epsilon settings.
agents[0].epsilon = 1.0
agents[0].epsilon_decay = 0.999

set_seed(env, seed=0) # Make things deterministic
trainer.train(30000)

# The event values for 'pickup' and 'delivery' match the rewards configured
# above; -1 for 'crash' is presumably the environment default - TODO confirm.
plot_rolling_rewards(
    trainer.rewards_log,
    events={'pickup': [0.99], 'delivery': [1], 'crash': [-1]},
    drones_labels={0: 'Q-learning'})

In [None]:
# Plot the epsilon values recorded by the Q-learning agent
plt.plot(agents[0].epsilons)
plt.ylabel('Exploration rate (epsilon)')
plt.xlabel('Number of episodes')
plt.show()

In [None]:
# Test the agents for 1000 steps with seed 0
rewards_log = test_agents(
    env,
    agents,
    n_steps=1000,
    seed=0,
)

# Cumulative rewards, with reward values mapped to named events
plot_cumulative_rewards(
    rewards_log,
    drones_labels={0: 'Q-learning'},
    events={
        'pickup': [0.99],
        'delivery': [1],
        'crash': [-1],
    },
)

Overfitting issues: try with different seeds
---

In [None]:
# Same test as before, but with seed 1 instead of seed 0
rewards_log = test_agents(
    env,
    agents,
    n_steps=1000,
    seed=1,
)

# Cumulative rewards, with reward values mapped to named events
plot_cumulative_rewards(
    rewards_log,
    drones_labels={0: 'Q-learning'},
    events={
        'pickup': [0.99],
        'delivery': [1],
        'crash': [-1],
    },
)

The agent only learned to act in a specific environment!

Q-learning limitations: discrete Q-table!
---

Let's try Q-learning with the full environment: skyscrapers + charge

In [None]:
# New environment using the LidarCompassChargeQTable wrapper, with pickup,
# delivery and charge rewards plus discharge/charge settings
env = LidarCompassChargeQTable(DeliveryDrones())
env.env_params.update({
    'n_drones': 3,
    'pickup_reward': 0.99,
    'delivery_reward': 1,
    'discharge': 10,
    'charge': 20,
    'charge_reward': -0.1,
})
states = env.reset()

# Show the observation space and the formatted state of drone 0 only
print('Observation space:', env.observation_space)
print(
    'Sample state:',
    env.format_state(states[0]),
)

# Last expression of the cell: render the environment as an image
Image.fromarray(env.render(mode='rgb_array'))

In [None]:
# Create the agents: one random agent per drone, then replace the agent at
# index 0 with a Q-learning agent
agents = {drone.index: RandomAgent(env) for drone in env.drones}
agents[0] = QLearningAgent(
    env, gamma=0.95, alpha=0.1,
    epsilon_start=1, epsilon_decay=0.999, epsilon_end=0.01
)

# Train for 35000 steps with a fixed seed
trainer = MultiAgentTrainer(env, agents, reset_agents=True, seed=0)
trainer.train(35000)

# Label the plot like the other cells of this notebook: drone 0 is the
# Q-learning agent, and the event values are the rewards configured on this
# environment (same mapping as the test cells that follow).
plot_rolling_rewards(
    trainer.rewards_log,
    events={'pickup': [0.99], 'delivery': [1], 'crash': [-1], 'charging': [-0.1]},
    drones_labels={0: 'Q-learning'})

In [None]:
# Fetch the Q-table of the Q-learning agent (index 0) and print its shape
q_table = agents[0].get_qtable()
print('Q-table:', q_table.shape)
# Display 10 sampled entries rather than the whole table
q_table.sample(10)

Don't forget to test with different seeds

In [None]:
# Test the agents for 1000 steps with seed 0
rewards_log = test_agents(
    env,
    agents,
    n_steps=1000,
    seed=0,
)

# Cumulative rewards, with reward values mapped to named events
plot_cumulative_rewards(
    rewards_log,
    drones_labels={0: 'Q-learning'},
    events={
        'pickup': [0.99],
        'delivery': [1],
        'crash': [-1],
        'charging': [-0.1],
    },
)

Note that, for now, we are only training our agent in a single environment: the charging points, skyscrapers, etc. are always at the same positions.

But during evaluation, the environment won't be the same!

Resetting the environment every X steps would help, but it won't solve the important limitations of Q-learning.

In [None]:
# Same test as before, but with seed 1 instead of seed 0
rewards_log = test_agents(
    env,
    agents,
    n_steps=1000,
    seed=1,
)

# Cumulative rewards, with reward values mapped to named events
plot_cumulative_rewards(
    rewards_log,
    drones_labels={0: 'Q-learning'},
    events={
        'pickup': [0.99],
        'delivery': [1],
        'crash': [-1],
        'charging': [-0.1],
    },
)

First tests with deep Q-learning (DQN)
---

In [None]:
from agents.dqn import DQNAgent, DenseQNetworkFactory

# Create environment
env = LidarCompassChargeQTable(DeliveryDrones())
env.env_params.update({
    'n_drones': 3,
    'pickup_reward': 0.99,
    'delivery_reward': 1,
})
states = env.reset()

# Create the agents: one random agent per drone, then replace the agent at
# index 0 with a DQN agent using a dense Q-network (two hidden layers of 256)
agents = {d.index: RandomAgent(env) for d in env.drones}
agents[0] = DQNAgent(
    env,
    DenseQNetworkFactory(env, hidden_layers=[256, 256]),
    gamma=0.95,
    epsilon_start=1,
    epsilon_decay=0.999,
    epsilon_end=0.01,
    memory_size=10000,
    batch_size=64,
    target_update_interval=5,
)
trainer = MultiAgentTrainer(
    env,
    agents,
    reset_agents=True,
    seed=0,
)

# Last expression of the cell: display the agent's Q-network
agents[0].qnetwork

In [None]:
# Train the agents for 25000 steps, then plot the rolling rewards
trainer.train(25000)
plot_rolling_rewards(
    trainer.rewards_log,
    drones_labels={0: 'DQN'},
    events={
        'pickup': [0.99],
        'delivery': [1],
        'crash': [-1],
        'charging': [-0.1],
    },
)

In [None]:
# Plot the epsilon values recorded by the DQN agent
plt.plot(agents[0].epsilons)
plt.ylabel('Exploration rate (epsilon)')
plt.xlabel('Number of episodes')
plt.show()

Try with different seeds

In [None]:
# Test the agents for 1000 steps with seed 0
rewards_log = test_agents(
    env,
    agents,
    n_steps=1000,
    seed=0,
)

# Cumulative rewards, with reward values mapped to named events
plot_cumulative_rewards(
    rewards_log,
    drones_labels={0: 'DQN'},
    events={
        'pickup': [0.99],
        'delivery': [1],
        'crash': [-1],
        'charging': [-0.1],
    },
)

In [None]:
# Inspect replay memory buffer of the DQN agent (index 0).
# NOTE(review): `top_n` and `max_col` presumably limit the number of entries
# and the column width shown - confirm against `DQNAgent.inspect_memory`.
agents[0].inspect_memory(top_n=10, max_col=80)

Take a moment to play with the different parameters: `memory_size`, `batch_size`, `target_update_interval`

In [None]:
# Render 120 steps to a video file, then display it
path = os.path.join('output', 'videos', 'dqn-compass-lidar-charge.mp4')
render_video(
    env,
    agents,
    video_path=path,
    n_steps=120,
    fps=1,
    seed=0,
)
ColabVideo(path)

DQN and WindowedGridView
---

In [None]:
from env.wrappers import WindowedGridView
from agents.dqn import ConvQNetworkFactory

# Create environment
env = WindowedGridView(DeliveryDrones(), radius=3)
env.env_params.update({
    'n_drones': 3,
    'pickup_reward': 0.99,
    'delivery_reward': 1,
})
states = env.reset()

# Create the agents: one random agent per drone
agents = {d.index: RandomAgent(env) for d in env.drones}

# Convolutional Q-network: three 32-channel layers followed by four
# 64-channel layers (all 3x3, stride 1, padding 1), then two dense layers.
# Each layer gets its own dict.
my_agent = DQNAgent(
    env,
    ConvQNetworkFactory(
        env,
        conv_layers=[
            {'out_channels': channels, 'kernel_size': 3, 'stride': 1, 'padding': 1}
            for channels in (32, 32, 32, 64, 64, 64, 64)
        ],
        dense_layers=[1024, 256],
    ),
    gamma=0.95,
    epsilon_start=1,
    epsilon_decay=0.99,
    epsilon_end=0.01,
    memory_size=10000,
    batch_size=64,
    target_update_interval=5,
)
# The DQN agent replaces the random agent at index 0
agents[0] = my_agent

trainer = MultiAgentTrainer(
    env,
    agents,
    reset_agents=True,
    seed=0,
)

# Last expression of the cell: display the agent's Q-network
agents[0].qnetwork

In [None]:
# Train the agents in 10 rounds of 2500 steps each, re-plotting the rolling
# rewards after every round
for run in range(10):
    trainer.train(2500)
    plot_rolling_rewards(
        trainer.rewards_log,
        drones_labels={0: 'DQN'},
        events={
            'pickup': [0.99],
            'delivery': [1],
            'crash': [-1],
            'charging': [-0.1],
        },
    )

In [None]:
# Test the agents for 1000 steps with seed 0 and plot cumulative rewards
rewards_log = test_agents(env, agents, n_steps=1000, seed=0)
plot_cumulative_rewards(
    rewards_log, drones_labels={0: 'DQN'},
    events={'pickup': [0.99], 'delivery': [1], 'crash': [-1], 'charging': [-0.1]})

# Print final evaluation scores.
# Each score is labelled with the key it is stored under in `rewards_log`
# (rather than its position in the mapping), and every reward sequence is
# summed on its own so the sequences need not have equal length.
print('Final scores:')
for agent_key, agent_rewards in rewards_log.items():
    print("Agent {}: {}".format(agent_key, np.sum(agent_rewards)))

In [None]:
# Render 120 steps to a video file, then display it
path = os.path.join('output', 'videos', 'dqn-windowed.mp4')
render_video(
    env,
    agents,
    video_path=path,
    n_steps=120,
    fps=1,
    seed=0,
)
ColabVideo(path)

## Submit to AIcrowd! 🚀

> https://www.aicrowd.com/challenges/dronerl

In [None]:
# Save the DQN agent (index 0) for submission
path = os.path.join('output', 'agents', 'dqn-agent.pt')
# Make sure the target directory exists; this is a no-op when it already does
os.makedirs(os.path.dirname(path), exist_ok=True)
agents[0].save(path)
# agents[0].load(path) # Later, load the qnetwork!