<a href="https://colab.research.google.com/github/isaranja/RL-CartPole-V1/blob/master/CartPole_V1_DQN_Keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deep Q-Learning with Keras for CartPole-V1

This notebook tries to solve the CartPole-v1 problem with deep Q-learning, using only the observation space as the agent's input.

[Reference](https://towardsdatascience.com/cartpole-introduction-to-reinforcement-learning-ed0eb5b58288)

In [1]:
# importing required libraries

# -*- coding: utf-8 -*-
import random
import numpy as np

from collections import deque

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

Using TensorFlow backend.


Rendering the environment is a tricky task in Colab, so the following libraries must be installed and imported.

[Reference](https://star-ai.github.io/Rendering-OpenAi-Gym-in-Colaboratory/)

In [0]:
# installing required libraries
# Both commands redirect stdout and stderr to /dev/null, so a failed install is silent;
# drop the redirection when debugging a broken environment.
# NOTE(review): gym is installed unpinned, while this notebook relies on
# `gym.wrappers.Monitor` and on `env.step` returning a 4-tuple — confirm the
# installed gym version still provides both, or pin a known-good version.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
# System packages; presumably needed for headless rendering and video encoding — confirm.
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

In [0]:
# importing gym and the libraries needed for rendering
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only

import os

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML

from IPython import display as ipythondisplay

In [0]:
# starting the virtual display
from pyvirtualdisplay import Display
# visible=0 with a 1400x900 screen; presumably an off-screen display so that
# env.render() works without a physical monitor — confirm against pyvirtualdisplay docs.
# The module-level name `display` is taken by this object; IPython's display module
# is imported under the alias `ipythondisplay`, so the two do not clash.
display = Display(visible=0, size=(1400, 900))
display.start()

In [0]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)""
"""
def show_video():
  """Embed the largest recorded episode video in the notebook output.

  Looks for ``video/*.mp4`` relative to the current working directory, prints
  how many recordings were found and, if there is at least one, displays the
  largest file (by size on disk) as an inline base64-encoded HTML ``<video>``.
  Prints "Could not find video" when no recording exists.
  """
  mp4list = glob.glob('video/*.mp4')
  print('no of vedios saved : ', len(mp4list))
  if not mp4list:
    print("Could not find video")
    return

  # max() returns the first largest file and, unlike a manual "size > best"
  # scan starting at 0, still yields a file when every recording is 0 bytes.
  largest = max(mp4list, key=os.path.getsize)

  # Read-only binary mode is sufficient, and the context manager closes the
  # handle instead of leaking it.
  with open(largest, 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(HTML(data='''<video alt="Cartpole-v1" controls 
                style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env):
  """Return ``env`` wrapped in a ``Monitor`` that records into ``./video``.

  The wrapper is created with ``force=True``.
  """
  return Monitor(env, './video', force=True)

## Creating the AI agent


*   **brain :** This part represents the brain of the AI agent. A simple Keras model is implemented here.
*   **remember :** The memory of the agent. This helps to train the agent using experience replay.
*   **act :** The action-selection mechanism of the agent. An epsilon-greedy policy is used.
*   **get_batch :** Selects a mini-batch from memory for training.
*   **learn :** Learns from past experience.




In [0]:
class DQNAgent():
  """Deep Q-learning agent with an experience-replay memory.

  The Q-function is a small fully connected Keras network mapping an
  observation vector to one Q-value per action. Actions are chosen
  epsilon-greedily, and epsilon is decayed after every learning step.
  """

  def __init__(self, observation_space, action_size, max_memory=2000, gamma = 0.95, epsilon = 0.9, epsilon_min = 0.01,epsilon_decay=0.99,learning_rate=0.0001):
    """Create the agent and build its Q-network.

    Args:
      observation_space: number of features in one observation.
      action_size: number of discrete actions.
      max_memory: maximum number of transitions kept for replay.
      gamma: discount rate applied to the next state's best Q-value.
      epsilon: initial exploration rate.
      epsilon_min: lower bound for the exploration rate.
      epsilon_decay: multiplicative decay applied to epsilon per learn() call.
      learning_rate: learning rate passed to the Adam optimizer.
    """
    self.observation_space = observation_space
    self.action_size = action_size
    self.memory = deque(maxlen=max_memory)
    self.gamma = gamma    # discount rate
    self.epsilon = epsilon # exploration rate
    self.epsilon_min = epsilon_min
    self.epsilon_decay = epsilon_decay
    # Honour the caller's value (it used to be overwritten with a hard-coded 0.001).
    self.learning_rate = learning_rate
    self.model = self.brain()

  def brain(self):
    """Build and compile the Q-network (two hidden ReLU layers of 24 units, linear output)."""
    model = Sequential()
    model.add(Dense(24, input_dim=self.observation_space, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(self.action_size, activation='linear'))

    model.compile(loss='mse',optimizer=Adam(lr=self.learning_rate))
    return model

  def remember(self, current_state, action, reward, next_state, done):
    """Append one transition to the replay memory (oldest entries drop out when full)."""
    self.memory.append((current_state, action, reward, next_state, done))

  def act(self, state):
    """Pick an action epsilon-greedily.

    With probability ``epsilon`` a uniformly random action is returned;
    otherwise the action with the highest predicted Q-value for ``state``.
    ``state`` is passed straight to ``model.predict``.
    """
    if np.random.rand() <= self.epsilon:
        return random.randrange(self.action_size)
    act_values = self.model.predict(state)
    return np.argmax(act_values[0])  # returns action

  def get_batch(self, batch_size = 32):
    """Sample a mini-batch from memory and build its Q-learning targets.

    At most ``batch_size`` transitions are drawn; if the memory holds fewer,
    all of them are used instead of raising.

    Returns:
      (inputs, targets): arrays of shape ``(n, observation_space)`` and
      ``(n, action_size)`` where ``n = min(len(memory), batch_size)``.
      Each target row is the network's current prediction with the entry of
      the action actually taken replaced by ``reward`` (terminal transition)
      or ``reward + gamma * max_a Q(next_state, a)``.
    """
    sample_size = min(len(self.memory), batch_size)
    inputs  = np.zeros((sample_size, self.observation_space))
    targets = np.zeros((sample_size, self.action_size))
    if sample_size == 0:
      return inputs, targets

    minibatch = random.sample(self.memory, sample_size)

    next_states = np.zeros((sample_size, self.observation_space))
    for i, (current_state, _, _, next_state, _) in enumerate(minibatch):
      # Flatten so both (1, n) and (n,) stored states are accepted.
      inputs[i] = np.reshape(current_state, self.observation_space)
      next_states[i] = np.reshape(next_state, self.observation_space)

    # Two batched forward passes instead of two predict() calls per transition.
    targets[:] = self.model.predict(inputs)
    best_next_q = np.amax(self.model.predict(next_states), axis=1)

    for i, (_, action, reward, _, game_over) in enumerate(minibatch):
      if game_over:
        targets[i, action] = reward
      else:
        targets[i, action] = reward + self.gamma * best_next_q[i]
    return inputs, targets

  def learn(self, batch_size):
    """Run one training step on a replay mini-batch, then decay epsilon.

    Does nothing when the memory is empty. Epsilon is only decayed while it is
    above ``epsilon_min`` and is clamped so it never drops below that bound.
    """
    inputs, targets = self.get_batch(batch_size)
    if len(inputs) == 0:
      return
    self.model.train_on_batch(inputs, targets)
    if self.epsilon > self.epsilon_min:
      self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

  def load(self, name):
    """Load network weights from the file ``name``."""
    self.model.load_weights(name)

  def save(self, name):
    """Save network weights to the file ``name``."""
    self.model.save_weights(name)


## Training

In [0]:
# --- hyper-parameters ---
epochs = 100
batch_size = 32
max_memory = 10000
epsilon_decay = 0.995
learning_rate = 0.0005
max_time_step =1000
weights_path = "cartpole-v1-dqn.h5"  # checkpoint that is resumed from and written to


env = wrap_env(gym.make('CartPole-v1'))
observation_space = env.observation_space.shape[0]
action_size = env.action_space.n

agent = DQNAgent(observation_space, action_size,max_memory=max_memory,epsilon_decay=epsilon_decay,learning_rate=learning_rate)
last_time_step = 0
# Resume from an earlier checkpoint only if one exists, so the cell also runs
# on a fresh kernel / fresh Colab VM.
if os.path.exists(weights_path):
  agent.load(weights_path)

try:
  for epoch in range(epochs):
    current_state = env.reset()
    current_state = np.reshape(current_state, [1, observation_space])
    game_over = False
    time_step = 0

    # Strict '<' so an episode runs for at most max_time_step steps.
    while ((not game_over) and (time_step < max_time_step)):

      env.render()
      action = agent.act(current_state)
      next_state, reward, game_over, _ = env.step(action)
      # Flip the sign of the reward on the step that ends the episode.
      # NOTE(review): game_over is whatever the environment reports as done, so
      # an episode ended for a reason other than failure (e.g. a step limit)
      # would be penalised too — confirm that is intended.
      reward = reward if not game_over else -reward
      next_state = np.reshape(next_state, [1, observation_space])
      agent.remember(current_state, action, reward, next_state, game_over)
      current_state = next_state
      time_step += 1

      if game_over:
          print("episode: {}/{}, score: {}, e: {:.2}"
                .format(epoch, epochs, time_step, agent.epsilon))
          break

      # Train once per step as soon as the memory holds more than one batch.
      if len(agent.memory) > batch_size:
          agent.learn(batch_size)

    # Checkpoint whenever an episode lasts longer than any previous one.
    if last_time_step < time_step :
      last_time_step = time_step
      agent.save(weights_path)
finally:
  # Always close the wrapped environment, even if training is interrupted.
  env.close()

episode: 0/100, score: 29, e: 0.9
episode: 1/100, score: 18, e: 0.84
episode: 2/100, score: 24, e: 0.75
episode: 3/100, score: 9, e: 0.72
episode: 4/100, score: 30, e: 0.62
episode: 5/100, score: 159, e: 0.28
episode: 6/100, score: 179, e: 0.12
episode: 7/100, score: 178, e: 0.047
episode: 8/100, score: 173, e: 0.02
episode: 9/100, score: 255, e: 0.01
episode: 10/100, score: 195, e: 0.01
episode: 11/100, score: 174, e: 0.01
episode: 12/100, score: 190, e: 0.01
episode: 13/100, score: 220, e: 0.01
episode: 14/100, score: 210, e: 0.01
episode: 15/100, score: 158, e: 0.01
episode: 16/100, score: 152, e: 0.01
episode: 17/100, score: 219, e: 0.01
episode: 18/100, score: 177, e: 0.01
episode: 19/100, score: 178, e: 0.01
episode: 20/100, score: 175, e: 0.01
episode: 21/100, score: 148, e: 0.01
episode: 22/100, score: 164, e: 0.01
episode: 23/100, score: 246, e: 0.01
episode: 24/100, score: 268, e: 0.01
episode: 25/100, score: 179, e: 0.01
episode: 26/100, score: 159, e: 0.01
episode: 27/100, 

In [19]:
# Embed the recorded episode video (the largest .mp4 under ./video) in the notebook output.
show_video()

no of vedios saved :  5
