<img src="images/cartpole.gif" width="500">

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports</a></span><ul class="toc-item"><li><span><a href="#About-the-environment" data-toc-modified-id="About-the-environment-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>About the environment</a></span></li><li><span><a href="#Action-space-(Discrete)" data-toc-modified-id="Action-space-(Discrete)-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Action space (Discrete)</a></span></li><li><span><a href="#State-space-(Continuous)" data-toc-modified-id="State-space-(Continuous)-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>State space (Continuous)</a></span></li></ul></li><li><span><a href="#Generate-Neural-Network" data-toc-modified-id="Generate-Neural-Network-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Generate Neural Network</a></span></li><li><span><a href="#DQN-Agent-Class" data-toc-modified-id="DQN-Agent-Class-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>DQN Agent Class</a></span></li><li><span><a href="#Create-DQN-Agent" data-toc-modified-id="Create-DQN-Agent-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Create DQN Agent</a></span></li><li><span><a href="#Train-Deep-Q-Learning-model" data-toc-modified-id="Train-Deep-Q-Learning-model-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Train Deep Q Learning model</a></span></li><li><span><a href="#Test-the-trained-model" data-toc-modified-id="Test-the-trained-model-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Test the trained model</a></span></li></ul></div>

# Imports

In [7]:
import os
import random
import gym
import numpy as np
from collections import deque
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.optimizers import Adam, RMSprop

## About the environment
A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The system is controlled by applying a force of +1 or -1 to the cart. The pendulum starts upright, and the goal is to prevent it from falling over. A reward of +1 is provided for every timestep that the pole remains upright. The episode ends when the pole is more than 12 degrees from vertical, when the cart moves more than 2.4 units from the centre, or (for `CartPole-v1`) when the episode reaches 500 timesteps.


## Action space (Discrete)

* 0 - Apply 1 unit of force to the cart in the left direction
* 1 - Apply 1 unit of force to the cart in the right direction

## State space (Continuous)

* 0 - Cart Position: from -4.8 to 4.8
* 1 - Cart Velocity: from -Inf to Inf
* 2 - Pole Angle: from -24° to 24°
* 3 - Pole Velocity At Tip: from -Inf to Inf

In this environment, we have a discrete action space and a continuous state space. Because the agent receives a reward of +1 for each timestep, it maximizes its total reward by keeping the pole balanced for as long as possible.

In [8]:
# Create the CartPole-v1 environment and report the sizes of its spaces.
env = gym.make('CartPole-v1')
# Number of discrete actions available to the agent
print("Action space: ",env.action_space.n)
# Length of the observation (state) vector
print("State space: ",env.observation_space.shape[0])

Action space:  2
State space:  4


# Generate Neural Network

In [9]:
def OurModel(input_shape, action_space):
    """Build and compile the fully connected Q-network used by the agent.

    The network maps a state vector to one linear output per action through
    three ReLU hidden layers (512 -> 256 -> 64 units, He-uniform initialisation).

    Args:
        input_shape: Shape tuple of a single state, e.g. ``(4,)``.
        action_space: Number of discrete actions (width of the output layer).

    Returns:
        The compiled Keras ``Model``. Its summary is printed as a side effect.
    """
    X_input = Input(input_shape)

    # 'Dense' is the basic form of a neural network layer.
    # The functional API already wires the input tensor in, so no
    # `input_shape` argument is needed on the first hidden layer.
    X = X_input
    for units in (512, 256, 64):
        X = Dense(units, activation="relu", kernel_initializer='he_uniform')(X)

    # Output layer: one linear Q-value per action
    X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X)

    model = Model(inputs=X_input, outputs=X, name='CartPole DQN model')
    # Q-values are regression targets, so only the MSE loss is tracked;
    # a classification "accuracy" metric carries no meaning here.
    model.compile(loss="mse", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01))

    model.summary()
    return model

# DQN Agent Class

In [14]:
class DQNAgent:
    """Deep Q-learning agent for gym's ``CartPole-v1`` environment.

    The agent keeps a replay memory of transitions, chooses actions
    epsilon-greedily and fits a single Q-network on random minibatches.
    The code is written against the classic gym API, where ``reset()``
    returns the observation and ``step()`` returns a 4-tuple.
    """

    def __init__(self):
        """Create the environment, set the hyper-parameters and build the Q-network."""
        self.env = gym.make('CartPole-v1')
        # by default, CartPole-v1 has max episode steps = 500
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n
        self.EPISODES = 1000
        self.memory = deque(maxlen=2000)

        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.001
        self.epsilon_decay = 0.999
        self.batch_size = 128
        self.train_start = 1000  # minimum number of stored transitions before training

        # File used by run() to save the model and by test() to load it
        self.Model_name="cartpole-dqn.h5"

        # create main model
        self.model = OurModel(input_shape=(self.state_size,), action_space = self.action_size)

    def remember(self, state, action, reward, next_state, done):
        """Store one transition and decay epsilon once training has started.

        Epsilon is multiplied by ``epsilon_decay`` on every call made while
        the memory holds more than ``train_start`` transitions and epsilon
        is still above ``epsilon_min``.
        """
        self.memory.append((state, action, reward, next_state, done))
        if len(self.memory) > self.train_start and self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def act(self, state):
        """Return an action for ``state`` using an epsilon-greedy policy.

        With probability ``epsilon`` a random action index is returned,
        otherwise the index of the largest predicted Q-value.
        """
        if np.random.random() <= self.epsilon:
            return random.randrange(self.action_size)
        return np.argmax(self.model.predict(state))

    def replay(self):
        """Fit the Q-network on one random minibatch drawn from the memory.

        Does nothing until the memory holds at least ``train_start``
        transitions. The minibatch holds at most ``batch_size`` transitions
        and may be smaller when the memory contains fewer than that.
        """
        if len(self.memory) < self.train_start:
            return
        # Randomly sample minibatch from the memory
        minibatch = random.sample(self.memory, min(len(self.memory), self.batch_size))
        # Size everything by the actual sample so a memory smaller than
        # batch_size cannot cause an out-of-range index below.
        batch_len = len(minibatch)

        state = np.zeros((batch_len, self.state_size))
        next_state = np.zeros((batch_len, self.state_size))
        action, reward, done = [], [], []

        # Unpack the transitions before prediction.
        # For speedup this could be done on the tensor level,
        # but a loop is easier to understand.
        for i, (s, a, r, s_next, d) in enumerate(minibatch):
            state[i] = s
            action.append(a)
            reward.append(r)
            next_state[i] = s_next
            done.append(d)

        # do batch prediction to save speed
        target = self.model.predict(state)
        target_next = self.model.predict(next_state)

        for i in range(batch_len):
            # correction on the Q value for the action used
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                # Standard DQN update: bootstrap from the largest Q-value of
                # the next state. The same network supplies both the current
                # and the next-state estimates (there is no separate target
                # network): Q_max = max_a' Q(s', a')
                target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i]))

        # Train the Neural Network with batches
        self.model.fit(state, target, batch_size=self.batch_size, verbose=0)

    def load(self, name):
        """Replace the current model with the one stored in file ``name``."""
        self.model = load_model(name)

    def save(self, name):
        """Save the current model to file ``name``."""
        self.model.save(name)

    def run(self):
        """Train the agent for ``EPISODES`` episodes, rendering every step.

        A transition that ends the episode before the environment's step
        limit gets a reward of -100. When an episode lasts for the full
        step limit, the model is saved to ``Model_name``.
        """
        max_steps = self.env._max_episode_steps
        for e in range(self.EPISODES):
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            done = False
            i = 0
            while not done:
                self.env.render()
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action)
                next_state = np.reshape(next_state, [1, self.state_size])
                # Penalise a genuine failure; an episode that ends on its
                # final allowed step keeps the environment's reward.
                if done and i != max_steps - 1:
                    reward = -100
                # NOTE(review): a time-limit ending is still stored with
                # done=True, so replay() does not bootstrap from it.
                self.remember(state, action, reward, next_state, done)
                state = next_state
                i += 1
                if done:
                    print("episode: {}/{}, score: {}, e: {:.2}".format(e, self.EPISODES, i, self.epsilon))
                    if i == max_steps:
                        # Save under the same name that test() loads from
                        print("Saving trained model as {}".format(self.Model_name))
                        self.save(self.Model_name)
                    break
                self.replay()

    def test(self):
        """Load the model from ``Model_name`` and play ``EPISODES`` greedy episodes.

        Every step is rendered and the episode length is printed as the score.
        """
        self.load(self.Model_name)
        for e in range(self.EPISODES):
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            done = False
            i = 0
            while not done:
                self.env.render()
                action = np.argmax(self.model.predict(state))
                next_state, reward, done, _ = self.env.step(action)
                state = np.reshape(next_state, [1, self.state_size])
                i += 1
                if done:
                    print("episode: {}/{}, score: {}".format(e, self.EPISODES, i))
                    break

# Create DQN Agent

In [15]:
# Instantiate the agent: creates the environment, builds the Q-network and prints its summary
agent = DQNAgent()

Model: "CartPole DQN model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 4)                 0         
_________________________________________________________________
dense_5 (Dense)              (None, 512)               2560      
_________________________________________________________________
dense_6 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_7 (Dense)              (None, 64)                16448     
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 130       
Total params: 150,466
Trainable params: 150,466
Non-trainable params: 0
_________________________________________________________________


# Train Deep Q Learning model

In [16]:
# Training is disabled here; uncomment the call below to train the agent.
# run() saves the model to cartpole-dqn.h5 whenever an episode lasts the full 500 steps.
#agent.run()

# Test the trained model

In [None]:
# Load the saved model (cartpole-dqn.h5, produced by a previous training run) and play greedy episodes
agent.test()