# In this project we will solve two simple environments using a Q-table and a Neural Network (Deep Q-learning).

# Subproject 1

Solve [`FrozenLake8x8-v0`](https://gym.openai.com/envs/FrozenLake8x8-v0/) using a Q-table.


1. Import Necessary Packages:

In [1]:
import numpy as np
import random
import gym


2. Instantiate the Environment and Agent

In [2]:
env = gym.make("FrozenLake8x8-v0")
env.render()


[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG


3. Set up the QTable:

In [3]:
action_size = env.action_space.n
print("Actions: ", action_size)
state_size = env.observation_space.n
print("States: ", state_size)
obs = env.reset()
print(env.action_space)

Actions:  4
States:  64
Discrete(4)


4. The Q-Learning algorithm training

In [4]:
# Hyperparameters
tot_eps = 100000
tot_test_eps = 20

lr = 0.01
discount = 0.97

epsilon = 1
epsilon_max = 0.9
epsilon_min = 0.01
decay = 0.01
qtable = np.zeros((state_size, action_size))
print(qtable.size)

256


In [6]:
import progressbar
for ep in progressbar.progressbar(range(tot_eps)):
    # reset variables at start of new episode
    state = env.reset()
    step = 0
    done = False
    reward = 0
    while not done:
        action = env.action_space.sample()
        state_new, reward, done, _ = env.step(action)
        qtable[state, action] = qtable[state, action] + lr * (reward + discount * np.max(qtable[state_new, :]) - qtable[state, action])
        state = state_new
print("Done!")

100% (100000 of 100000) |################| Elapsed Time: 0:02:38 Time:  0:02:38


Done!


5. Evaluate how well your agent performs
* Render output of one episode
* Give an average episode return

In [56]:
rewards = []
hist = [[] for _ in range(tot_test_eps)]
for ep in progressbar.progressbar(range(tot_test_eps)):        
    state = env.reset()
    step = 0
    done = False
    tot_rewards = 0
    hist[ep].append(env.render(mode='ansi'))
    while not done:
        action = np.argmax(qtable[state, :])
        state_new, reward, done, info = env.step(action)
        tot_rewards += reward
        state = state_new
        hist[ep].append(env.render(mode='ansi'))
    rewards.append(tot_rewards)
env.close()
success = np.argwhere(rewards == np.amax(rewards)).flatten().tolist()
print(f"Successful episodes: {success}")
print("/////////////////////////////////////////////////////")
all_steps = list(map(lambda x : len(x), hist))
for s in hist[np.argmin(all_steps)]:
    print(s)
print("/////////////////////////////////////////////////////")
print ("Score over time: " +  str(sum(rewards)/tot_test_eps))
print(rewards)

100% (20 of 20) |########################| Elapsed Time: 0:00:00 Time:  0:00:00


Successful episodes: [0, 4, 5, 6, 7, 8, 11, 12, 14, 15, 16, 17, 18, 19]
/////////////////////////////////////////////////////

[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

  (Down)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

  (Right)
SFFFFFFF
F[41mF[0mFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

  (Up)
SFFFFFFF
FF[41mF[0mFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

  (Up)
SF[41mF[0mFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

  (Right)
SFF[41mF[0mFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

  (Right)
SFF[41mF[0mFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

  (Right)
SFF[41mF[0mFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

  (Right)
SFFF[41mF[0mFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

  (Right)
SFFFF[41mF[0mFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFF

# Subproject 2

Solve [MoonLander-v2](https://gym.openai.com/envs/LunarLander-v2/) using DQN.

**1. Import Necessary Packages:**


In [3]:
!pip install box2d-py
#Imports
import gym
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
import random
from gym import wrappers



**2. Instantiate the Environment**

In [4]:
env = gym.make('LunarLander-v2')
env.seed(0)
print('State shape: ', env.observation_space.shape)
print('Number of Actions: ', env.action_space.n)

State shape:  (8,)
Number of Actions:  4


**3. Implement and instantiate the agent**



In [22]:
class DQN():
    def __init__(
        self, states, actions, lr, batch_size, # neural network stuff
        gamma, epsilon, epsilon_min, epsilon_decay # coefficients
    ):
        self.states = states
        self.actions = actions
        self.batch_size = batch_size
        self.memory = deque([], maxlen=100000)
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

        self.model = self.build_model()
        self.loss = []
        
    def build_model(self, units_fc1 = 128, units_fc2 = 128):
        model = keras.Sequential()
        model.add(keras.layers.Dense(units_fc1, input_dim = self.states, activation='relu')) 
        model.add(keras.layers.Dense(units_fc2, activation='relu')) 
        model.add(keras.layers.Dense(self.actions, activation='linear'))
        model.compile(loss='mean_squared_error', optimizer=keras.optimizers.Adam(learning_rate = self.lr)) 
        return model

    def action(self, state):
        if np.random.rand() <= self.epsilon: # should we explore?
            return random.randrange(self.actions) # why not
        return self.exploit(state) # nah, let's exploit

    def exploit(self, state):
        vals = self.model.predict(state)
        return np.argmax(vals[0])

    def store(self, state, action, reward, nstate, done):
        self.memory.append((state, action, reward, nstate, done))

    def experience_replay(self):
        mini_batch = random.sample(self.memory, self.batch_size)
        x = []
        y = []
        n = np.array(mini_batch, dtype='object')
        st = np.zeros((0, self.states))
        nst = np.zeros((0, self.states))
        for i in range(len(n)):
            st = np.append( st, n[i, 0], axis=0)
            nst = np.append( nst, n[i, 3], axis=0)
        st_predict = self.model.predict(st)
        nst_predict = self.model.predict(nst)
        index = 0
        for state, action, reward, next_state, done in mini_batch:
            x.append(state)
            predict_actions = nst_predict[index]
            if done == True:
                target = reward
            else:
                target = reward + self.gamma * np.amax(predict_actions)
            target_f = st_predict[index]
            target_f[action] = target
            y.append(target_f)
            index += 1
        x_reshape = np.array(x).reshape(self.batch_size, self.states)
        y_reshape = np.array(y)
        epoch_count = 1
        hist = self.model.fit(x_reshape, y_reshape, epochs = epoch_count, verbose=0)
        for i in range(epoch_count):
            self.loss.append( hist.history['loss'][i] )
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

**4. Train the agent with DQN**

4.1 Show the episode return plot
  
  - Is the agent learning to solve the task?

4.2 Save the best model

In [24]:
import progressbar
episodes = 1000
dqn = DQN(env.observation_space.shape[0], env.action_space.n, 0.0001, 64, 0.99, 1, 0.001, 0.995)
rewards = [] #Store rewards for graphing
epsilons = [] # Store the Explore/Exploit
test_episodes = 0
frames = 900
for e in range(episodes):
    state = env.reset()
    state = np.reshape(state, [1, dqn.states])
    tot_rewards = 0
    for frame in range(frames):
        action = dqn.action(state)
        nstate, reward, done, _ = env.step(action)
        nstate = np.reshape(nstate, [1, dqn.states])
        tot_rewards += reward
        dqn.store(state, action, reward, nstate, done)
        state = nstate
        if done or frame == frames - 1:
            rewards.append(tot_rewards)
            epsilons.append(dqn.epsilon)
            print(f"episode: {e}/{episodes}, score: {tot_rewards}, e: {dqn.epsilon}, terminal: {done}")
            break
        if len(dqn.memory) > dqn.batch_size:
            dqn.experience_replay()
    if len(rewards) > 5 and np.average(rewards[-5:]) > 200:
        test_episodes = episodes - e
        train_end = e
        break
dqn.model.save(dqn.model.name)

episode: 0/1000, score: -109.43353595470799, e: 0.8690529955452602, terminal: True
episode: 1/1000, score: -484.4234492529217, e: 0.4222502236424958, terminal: True
episode: 2/1000, score: -688.1426636461046, e: 0.20826882814336947, terminal: True
episode: 3/1000, score: -318.624147090209, e: 0.12743425563174798, terminal: True
episode: 4/1000, score: -213.1794386533012, e: 0.09480864735409487, terminal: True
episode: 5/1000, score: -137.50795396944375, e: 0.056575091797066025, terminal: True
episode: 6/1000, score: -425.9270786199924, e: 0.03750748018035199, terminal: True
episode: 7/1000, score: -395.48484209311874, e: 0.02260731802731653, terminal: True
episode: 8/1000, score: -388.22018790318026, e: 0.01335588042198471, terminal: True
episode: 9/1000, score: -1111.4637589305362, e: 0.00196829127312784, terminal: True
episode: 10/1000, score: -426.1804389137089, e: 0.0009954703940636294, terminal: False
episode: 11/1000, score: -110.85764260527773, e: 0.0009954703940636294, terminal

KeyboardInterrupt: 

In [None]:
for e_test in range(test_episodes):
    state = env.reset()
    state = np.reshape(state, [1, nS])
    tot_rewards = 0
    frames = 900
    for frame in range(frames):
        action = dqn.exploit(state)
        nstate, reward, done, _ = env.step(action)
        nstate = np.reshape( nstate, [1, nS])
        tot_rewards += reward
        state = nstate
        if done or frame == frame - 1: 
            rewards.append(tot_rewards)
            epsilons.append(0)
            print("episode: {}/{}, score: {}, e: {}"
                  .format(e_test, TEST_Episodes, tot_rewards, 0))
            break;

In [None]:
rolling_average = np.convolve(rewards, np.ones(100)/100)
plt.plot(rewards)
plt.plot(rolling_average, color='black')
plt.axhline(y=200, color='r', linestyle='-')
eps_graph = [200*x for x in epsilons]
plt.plot(eps_graph, color='g', linestyle='-')
plt.axvline(x = train_end, color='y', linestyle='-')
plt.xlim( (0, episodes) )
plt.show()

**5. Load the model from the disk and run it in a loop**
- Hint: if you want to see the agent laning the Moon Lander, type `env.render()` after the `env.step()`.
- Do to Colab not cooperating with the Gym rendering, you might want to download the trained model and run this loop on you computer to visualise the behavior.

In [None]:
model = keras.models.load_model(dqn.model.name)
while True:
    state = env.reset()
    vals = model.predict(state)
    action = np.argmax(vals[0])
    state, reward, done, _ = env.step(action)
    env.render()

**Helper functions**

Save rendered images:

In [None]:
import imageio
import numpy as np

images = []
images.append(img)
img = model.env.render(mode='rgb_array')

imageio.mimwrite('./moonlander.gif',
                [np.array(img) for i, img in enumerate(images) if i%2 == 0],
                fps=29)

Display saved .gif

In [None]:
from pathlib import Path
gifPath = Path("./moonlander.gif")
# Display GIF in Jupyter, CoLab, IPython
with open(gifPath,'rb') as f:
    display.Image(data=f.read(), format='png')