# DQN

Originally from https://skettee.github.io/post/q_network/ (in Korean)

## Load Libraries and Extensions

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import random
from collections import deque
from IPython.display import display, clear_output, Pretty
from keras import Sequential
from keras.layers import Dense
from keras.models import load_model
import numpy as np
import os
from pprint import pprint
from time import sleep
from tqdm import tqdm_notebook as tqdm

import gym

Using TensorFlow backend.


## Cart Pole Environment

In [3]:
ENV_NAME = 'CartPole-v1'
N_STEP = 100
N_EPISODE = 10

In [4]:
env = gym.make(ENV_NAME)
state = env.reset()
action = env.action_space.sample()

for i_episode in range(N_EPISODE):
    observation = env.reset()
    for t in range(N_STEP):
        env.render()

        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        if done:
            print('Episode finished after {} steps'.format(t + 1))
            sleep(1)
            break

env.close()

print('State space: ', env.observation_space)
print('Initial state: ', state)
print('\nAction space: ', env.action_space)
print('Random action: ', action)

Episode finished after 14 steps
Episode finished after 9 steps
Episode finished after 13 steps
Episode finished after 10 steps
Episode finished after 16 steps
Episode finished after 15 steps
Episode finished after 20 steps
Episode finished after 21 steps
Episode finished after 38 steps
Episode finished after 34 steps
State space:  Box(4,)
Initial state:  [-0.03022219 -0.01513947  0.01063969  0.02018719]

Action space:  Discrete(2)
Random action:  1


### Action

$A = \{0, 1\}$   

Num	| Action
----|----
0 |	Push cart to the left
1 |	Push cart to the right


### State

$S = \begin{bmatrix}
\{ s_{00}, s_{01}, \cdots\}, \\
\{ s_{10}, s_{11}, \cdots \}, \\
\{ s_{20}, s_{21}, \cdots \}, \\
\{ s_{30}, s_{31}, \cdots \}
\end{bmatrix}$  

Index | State                   | Min      | Max
-----|-----------------|--------|------
0      | Cart Position       | -4.8    | 4.8  
1       | Cart Velocity       | -Inf     | Inf
2      | Pole Angle           | -24 deg | 24 deg
3      | Pole Velocity At Tip | -Inf     | Inf


### Reward

+1 for each step including the terminal step.

## DQN Model Training

$X = \{ x_1^{(i)}, x_2^{(i)}, x_3^{(i)}, x_4^{(i)} \}$

$Y = \{ q_1^{(i)}, q_2^{(i)} \}$

$\begin{align}
J(w) & = \mathbb E_{\pi} [(q - \hat q)^2] \\
& =  \mathbb E_{\pi} \left [\left (R_{t+1} + \gamma \max_{a'} Q(S_{t+1}, a') - Q(S_t, A_t, w) \right)^2 \right ]
\end{align}$

$ w \leftarrow w-\alpha \dfrac{\partial J(w,b)}{\partial w}$  

$\hat A_t = \text{argmax}_{a'}Q(S_t, a', w)$  

In [5]:
# Q-Network Modeling
num_state = env.observation_space.shape[0]
num_action = env.action_space.n

model = Sequential()
model.add(Dense(32, input_dim= num_state, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(num_action, activation=None))
model.compile(loss='mse', optimizer="adam")

In [6]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 32)                160       
_________________________________________________________________
dense_2 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 66        
Total params: 1,282
Trainable params: 1,282
Non-trainable params: 0
_________________________________________________________________


In [7]:
num_iteration = 500
min_timesteps_per_batch = 2500

# Hyper parameter
epsilon = 0.3
gamma = 0.95
batch_size = 32

# Q-Network Learning
for i in tqdm(range(num_iteration)):
    timesteps_this_batch = 0
    memory = []
    while True:
        state = env.reset()
        done = False
        while not done:
            if np.random.uniform() < epsilon:
                action = env.action_space.sample()
            else:
                q_value = model.predict(state.reshape(1, num_state))
                action = np.argmax(q_value[0])
            next_state, reward, done, info = env.step(action)
            # Memory
            memory.append((state, action, reward, next_state, done))

            state = next_state

        timesteps_this_batch += len(memory)
        if timesteps_this_batch > min_timesteps_per_batch:
            break

    # Replay   
    for state, action, reward, next_state, done in memory:
        if done:
            target = reward
        else:
            target = reward + gamma * (np.max(model.predict(next_state.reshape(1, num_state))[0]))
        q_value = model.predict(state.reshape(1, num_state))
        q_value[0][action] = target
        model.fit(state.reshape(1, num_state), q_value, epochs=1, batch_size=batch_size, verbose=0)

env.close()

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

W1030 00:13:10.926017 4639802816 deprecation_wrapper.py:119] From /Users/jeong/.conda/envs/py36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.






In [8]:
save_dir = os.getcwd()
model_name = 'keras_dqn_trained_model.h5'

# Save model and weights
model_path = os.path.join(save_dir, model_name)
model.save(model_path)

## Solution

In [9]:
model = load_model(model_path)

env = gym.make(ENV_NAME)
num_state = env.observation_space.shape[0]

for i_episode in range(N_EPISODE):
    state = env.reset()
    done = False
    t = 0

    while not done:
        env.render()

        state = np.array(state).reshape(1, num_state)
        q_value = model.predict(state)
        action = np.argmax(q_value[0])
        state, reward, done, info = env.step(action)

        if done:
            print('Episode finished after {} steps'.format(t + 1))
            sleep(1)
            break
        t += 1      
        
env.close()

Episode finished after 279 steps
Episode finished after 269 steps
Episode finished after 277 steps
Episode finished after 311 steps
Episode finished after 287 steps
Episode finished after 283 steps
Episode finished after 263 steps
Episode finished after 375 steps
Episode finished after 285 steps
Episode finished after 305 steps
