<a href="https://colab.research.google.com/github/isaranja/RL-MountainCar-v0/blob/master/MountainCar_v0_DQN_Keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deep Q-Learning with Keras for MountainCar-v0

This notebook tries to solve the MountainCar-v0 problem with deep Q-learning, using only the environment's observation space as input

[Reference](https://towardsdatascience.com/cartpole-introduction-to-reinforcement-learning-ed0eb5b58288)

In [1]:
# importing required libraries

# -*- coding: utf-8 -*-
import random
import numpy as np

from collections import deque

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

Using TensorFlow backend.


Rendering the environment is a tricky task in Colab, so the following libraries need to be installed and imported

[Reference](https://star-ai.github.io/Rendering-OpenAi-Gym-in-Colaboratory/)

In [0]:
# Installing required libraries:
#   - pip: gym and pyvirtualdisplay
#   - apt: xvfb, python-opengl and ffmpeg
# All installer output (stdout and stderr) is redirected to /dev/null, so a
# failed installation is silent here and only surfaces in later cells.
# NOTE(review): versions are unpinned, so a fresh run may pull a newer gym than
# the one this notebook was written against -- consider pinning; confirm that
# `gym.wrappers.Monitor` (imported below) exists in the installed release.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

In [0]:
# importing gym and the libraries needed for rendering
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io

import os

import base64
from IPython.display import HTML

from IPython import display as ipythondisplay

In [4]:
# Starting the virtual display so the environment can be rendered without a
# physical screen. `visible=0` and `size=(1400, 900)` are passed straight to
# pyvirtualdisplay's Display.
# NOTE: this binds the global name `display` to the pyvirtualdisplay object;
# presumably that is why IPython's display module is imported under the alias
# `ipythondisplay` in the imports cell.
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

W0621 08:09:41.833832 140343124035456 abstractdisplay.py:144] xdpyinfo was not found, X start can not be checked! Please install xdpyinfo!


<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1001'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1001'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

In [0]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)"
"""
def show_video(id):
  """Embed one of the recorded episode videos in the notebook output.

  Looks for ``video/*.mp4`` relative to the current working directory, prints
  how many recordings were found and, if there is at least one, displays the
  selected file as a base64-encoded HTML ``<video>`` element.

  Args:
    id: Index into the list of recordings, sorted by file name. Normal Python
      indexing applies, so negative values count from the end.

  Raises:
    IndexError: If recordings exist but ``id`` is outside the list.
  """
  # glob makes no promise about ordering; sorting makes a given index refer to
  # the same recording on every run.
  mp4list = sorted(glob.glob('video/*.mp4'))
  print('no of vedios saved : ', len(mp4list))
  if len(mp4list) > 0:
    mp4 = mp4list[id]
    # Read-only binary mode inside a context manager: no write permission is
    # needed and the file handle is closed as soon as the bytes are read.
    with open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="Cartpole-v1" controls 
                style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")
    

def wrap_env(env):
  """Wrap `env` in a gym `Monitor` that records into the ``./video`` directory.

  `force=True` is forwarded to the Monitor (presumably so that recordings left
  by an earlier run may be overwritten -- confirm against the gym docs).

  Returns:
    The Monitor-wrapped environment.
  """
  return Monitor(env, './video', force=True)

## Random play

In [0]:
def play_a_random_game_first(env_name,steps):
  """Play up to `steps` random actions in a monitored environment and log them.

  Creates the environment named `env_name`, wraps it with `wrap_env`, then
  repeatedly samples a random action, applies it and prints the resulting
  action / observation / reward / done / info values. Stops early when the
  environment reports `done`.

  Args:
    env_name: Environment id passed to `gym.make`.
    steps: Maximum number of steps to take.
  """
  env = wrap_env(gym.make(env_name))
  # Close the environment even if rendering or stepping raises, so the
  # wrapped env is always released.
  try:
    env.reset()
    for step_index in range(steps):
      env.render()
      action = env.action_space.sample()
      observation, reward, done, info = env.step(action)
      print("Step {}:".format(step_index))
      print("action: {}".format(action))
      print("observation: {}".format(observation))
      print("reward: {}".format(reward))
      print("done: {}".format(done))
      print("info: {}".format(info))
      if done:
        break
  finally:
    env.close()

In [7]:
# Smoke test: take 2 random steps in MountainCar-v0 and print each transition.
play_a_random_game_first('MountainCar-v0',2)

Step 0:
action: 0
observation: [-0.42266793 -0.00175804]
reward: -1.0
done: False
info: {}
Step 1:
action: 1
observation: [-0.42517144 -0.00250351]
reward: -1.0
done: False
info: {}


## Creating the AI agent


*   **brain :** This part represents the brain of the AI agent. A simple Keras model is implemented here
*   **remember :** The memory of the agent. This helps to train the agent using experience replay
*   **act :** The action selection mechanism of the agent. An epsilon-greedy search method is used
*   **get_batch :** Selects a mini-batch for training 
*   **learn :** Learning from past experience




In [0]:
class DQNAgent():
  """Deep Q-learning agent with experience replay and epsilon-greedy actions.

  The agent keeps a bounded replay memory of transitions, a Keras model that
  maps a state to one Q-value per action, and an exploration rate `epsilon`
  that is decayed after every learning step.
  """

  def __init__(self, observation_space, action_size, max_memory=2000, gamma = 0.95, epsilon = 0.9, epsilon_min = 0.01,epsilon_decay=0.99,learning_rate=0.0001):
    """Store the hyper-parameters and build the Q-network.

    Args:
      observation_space: Number of features in one state (model input size).
      action_size: Number of discrete actions (model output size).
      max_memory: Maximum number of transitions kept in the replay memory;
        the oldest ones are dropped first.
      gamma: Discount rate applied to the best next-state Q-value.
      epsilon: Initial probability of choosing a random action.
      epsilon_min: Floor for `epsilon`.
      epsilon_decay: Factor `epsilon` is multiplied by after each learn step.
      learning_rate: Learning rate handed to the Adam optimizer.
    """
    self.observation_space = observation_space
    self.action_size = action_size
    self.memory = deque(maxlen=max_memory)
    self.gamma = gamma    # discount rate
    self.epsilon = epsilon # exploration rate
    self.epsilon_min = epsilon_min
    self.epsilon_decay = epsilon_decay
    # Use the caller's value; a hard-coded constant here would silently
    # override the `learning_rate` argument.
    self.learning_rate = learning_rate
    self.model = self.brain()

  def brain(self):
    """Build and compile the Q-network.

    Two ReLU hidden layers (128 and 64 units) followed by a linear layer with
    one output per action, trained with MSE loss and the Adam optimizer.
    """
    model = Sequential()
    model.add(Dense(128, input_dim=self.observation_space, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(self.action_size, activation='linear'))

    model.compile(loss='mse',optimizer=Adam(lr=self.learning_rate))
    return model

  def remember(self, current_state, action, reward, next_state, done):
    """Append one transition to the replay memory."""
    self.memory.append((current_state, action, reward, next_state, done))

  def act(self, state):
    """Pick an action epsilon-greedily.

    With probability `epsilon` a uniformly random action is returned;
    otherwise the action with the highest predicted Q-value for `state`
    (the first row of the model's prediction is used).
    """
    if np.random.rand() <= self.epsilon:
      return random.randrange(self.action_size)
    act_values = self.model.predict(state)
    return np.argmax(act_values[0])  # returns action

  def get_batch(self, batch_size = 32):
    """Sample a mini-batch from memory and build its training targets.

    At most `batch_size` transitions are drawn; if the memory holds fewer,
    all of them are used. For every sampled transition the target row is the
    model's current prediction for the state, with the entry of the taken
    action replaced by `reward` (finished game) or
    `reward + gamma * max_a Q(next_state, a)` (otherwise).

    Returns:
      Tuple `(inputs, targets)` with shapes `(n, observation_space)` and
      `(n, action_size)`, where `n` is the number of sampled transitions
      (both arrays are empty when the memory is empty).
    """
    sample_size = min(len(self.memory), batch_size)
    if sample_size <= 0:
      # Nothing to sample: return empty arrays without touching the model.
      return (np.zeros((0, self.observation_space)),
              np.zeros((0, self.action_size)))

    # Clamping the sample size avoids the ValueError random.sample raises
    # when asked for more items than the population holds.
    minibatch = random.sample(list(self.memory), sample_size)

    batch_shape = (sample_size, self.observation_space)
    inputs = np.array([t[0] for t in minibatch], dtype=float).reshape(batch_shape)
    next_states = np.array([t[3] for t in minibatch], dtype=float).reshape(batch_shape)
    actions = np.array([t[1] for t in minibatch], dtype=int)
    rewards = np.array([t[2] for t in minibatch], dtype=float)
    game_over = np.array([t[4] for t in minibatch], dtype=bool)

    # Two batched forward passes instead of two predict() calls per sample.
    targets = np.array(self.model.predict(inputs), dtype=float)
    best_next_q = np.amax(self.model.predict(next_states), axis=1)

    # np.where (rather than multiplying by a mask) keeps finished transitions
    # independent of whatever the model predicts for their next state.
    targets[np.arange(sample_size), actions] = np.where(
        game_over, rewards, rewards + self.gamma * best_next_q)
    return inputs, targets

  def learn(self, batch_size):
    """Train on one sampled mini-batch and decay the exploration rate.

    Does nothing when the replay memory is empty.
    """
    inputs, targets = self.get_batch(batch_size)
    if len(inputs) == 0:
      return
    self.model.train_on_batch(inputs, targets)
    if self.epsilon > self.epsilon_min:
      # Clamp so a decay step can never push epsilon below its floor.
      self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

  def load(self, name):
    """Load model weights from the file `name`."""
    self.model.load_weights(name)

  def save(self, name):
    """Save model weights to the file `name`."""
    self.model.save_weights(name)


## Training

In [21]:
#env.close()

# Settings for this training run.
epochs = 100            # number of games to play
batch_size = 32         # transitions sampled for each learning step
max_memory = 3000       # capacity of the agent's replay memory
epsilon_decay = 0.9995  # factor applied to epsilon after each learning step
learning_rate = 0.001
max_position = -1.2     # start value of the per-game running maximum (reset below)

env = wrap_env(gym.make('MountainCar-v0'))
observation_space = env.observation_space.shape[0]
action_size = env.action_space.n

agent = DQNAgent(observation_space, action_size,max_memory=max_memory,epsilon_decay=epsilon_decay,learning_rate=learning_rate)
last_time_step = 0
#agent.load("MountainCar-v0.h5")

# try/finally: the monitored environment is closed even when training is
# interrupted (e.g. the cell is stopped) or a step raises.
try:
  for epoch in range(epochs):
    current_state = env.reset()
    # States are kept as (1, observation_space) arrays, the shape handed to
    # the agent's model.
    current_state = np.reshape(current_state, [1, observation_space])
    game_over = False
    episode = 0          # counts the steps taken in the current game
    max_position = -1.2

    total_reward = 0

    while (not game_over):

      env.render()

      action = agent.act(current_state)

      next_state, reward, game_over, _ = env.step(action)

      next_state = np.reshape(next_state, [1, observation_space])

      # Reward shaping: the environment's own reward is discarded and
      # replaced by the absolute change of the first state component
      # (the position) minus a constant 0.1 per step.
      reward = np.abs(next_state[0][0] - current_state[0][0]) - 0.1

      # Bonus once the position reaches 0.5 (presumably the goal of
      # MountainCar-v0 -- TODO confirm).
      if next_state[0][0] >= 0.5:
        reward += 20

      total_reward += reward

      if (max_position < next_state[0][0]):
        max_position = next_state[0][0]

      agent.remember(current_state, action, reward, next_state, game_over)

      current_state = next_state

      episode += 1

      if game_over:
        print("epoch: {}/{}, episodes: {}, tatal_reward: {:.2f}, max_position: {:.2}, e: {:.2}"
              .format(epoch, epochs, episode, total_reward, max_position, agent.epsilon))
        break

      # Start learning only once the memory holds more than one batch.
      if len(agent.memory) > batch_size:
        agent.learn(batch_size)
finally:
  env.close()

epoch: 0/100, episodes: 200, tatal_reward: -18.21, max_position: -0.3, e: 0.83
epoch: 1/100, episodes: 200, tatal_reward: -18.88, max_position: -0.34, e: 0.75
epoch: 2/100, episodes: 200, tatal_reward: -19.49, max_position: -0.43, e: 0.68
epoch: 3/100, episodes: 200, tatal_reward: -18.69, max_position: -0.29, e: 0.61
epoch: 4/100, episodes: 200, tatal_reward: -19.17, max_position: -0.38, e: 0.56
epoch: 5/100, episodes: 200, tatal_reward: -18.32, max_position: -0.24, e: 0.5
epoch: 6/100, episodes: 200, tatal_reward: -18.67, max_position: -0.32, e: 0.46
epoch: 7/100, episodes: 200, tatal_reward: -19.17, max_position: -0.37, e: 0.41
epoch: 8/100, episodes: 200, tatal_reward: -18.48, max_position: -0.33, e: 0.37
epoch: 9/100, episodes: 200, tatal_reward: -18.48, max_position: -0.3, e: 0.34
epoch: 10/100, episodes: 200, tatal_reward: -19.01, max_position: -0.41, e: 0.31
epoch: 11/100, episodes: 200, tatal_reward: -18.38, max_position: -0.32, e: 0.28
epoch: 12/100, episodes: 200, tatal_rewar

In [27]:
# Display the recording at index 3 of the mp4 files found under video/.
show_video(3)

no of vedios saved :  5
