<a href="https://colab.research.google.com/github/kapilnchauhan77/CartPole_DQN/blob/master/CartPole_v0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Trainging a deep Q neural network to balance

##Checking Tensorflow Version

In [4]:
import tensorflow as tf
tf.__version__

'1.15.0'

##Mish Activation function, courtesy of https://arxiv.org/ftp/arxiv/papers/1908/1908.08681.pdf

In [0]:
from keras.engine.base_layer import Layer
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.keras.layers import Activation
from keras.utils.generic_utils import get_custom_objects
import tensorflow.keras.backend as K

class Mish(Activation):
    '''
    Mish Activation Function.
    .. math::
        mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + e^{x}))
    Shape:
        - Input: Arbitrary. Use the keyword argument `input_shape`
        (tuple of integers, does not include the samples axis)
        when using this layer as the first layer in a model.
        - Output: Same shape as the input.
    Examples:
        >>> X = Activation('Mish', name="conv1_act")(X_input)
    '''

    def __init__(self, activation, **kwargs):
        super(Mish, self).__init__(activation, **kwargs)
        self.__name__ = 'Mish'


def mish(x):
    return x*K.tanh(K.softplus(x))

get_custom_objects().update({'Mish': Mish(mish)})

##Importing dependencies, making environment and defining variables

In [0]:
import gym
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras.optimizers import Adam
from collections import deque
import random
import matplotlib.pyplot as plt

In [0]:
env = gym.make('CartPole-v0')

seed = 2
env.seed(seed)
random.seed(seed)
np.random.seed(seed)

EPISODES = 1000
DIM = len(env.observation_space.high)
DISCOUNT = 0.99
SHOW_AT = 100
show = False
BATCH_SIZE = 32

epsilon = 0.99
EPSILON_DECAY = 0.99
MIN_EPSILON = 0.01

memory = deque(maxlen=10000)
batch = deque(maxlen=BATCH_SIZE)
scores = deque(maxlen=100)
episode_rewards = []
avg_scores = []

task_complete = False

##Defining model class, helper functions and building the models

In [0]:
class Model:
  def __init__(self):
    self.model = self.create_model()

  def create_model(self):
    model = Sequential()

    model.add(Dense(128, input_dim=len(env.observation_space.high), kernel_initializer='he_normal'))
    model.add(Mish(mish))
    # model.add(Activation('softplus'))

    model.add(Dense(64, kernel_initializer='he_normal'))
    model.add(Mish(mish))
    # model.add(Activation('tanh'))

    model.add(Dense(64, kernel_initializer='he_normal'))
    model.add(Mish(mish))
    # model.add(Activation('tanh'))

    model.add(Dense(env.action_space.n, kernel_initializer='he_normal'))
    # model.add(Activation('softmax'))

    model.compile(loss='mse', optimizer=Adam())
    return model
  
  def get_action(self, state):
    return np.argmax(self.predict(state)) if np.random.random() > epsilon else np.random.randint(env.action_space.n)

  def train(self, x_train, y_train):
    return self.model.train_on_batch(x_train, y_train)
  
  def predict(self, x):
    return self.model.predict(x)
  
  def get_weights(self):
    return self.model.get_weights()

  def set_weights(self, other):
    return self.model.set_weights(other.model.get_weights())
  
  def summary(self):
    return self.model.summary()

In [0]:
model = Model()
target_model = Model()
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 128)               640       
_________________________________________________________________
mish_8 (Mish)                (None, 128)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 64)                8256      
_________________________________________________________________
mish_9 (Mish)                (None, 64)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 64)                4160      
_________________________________________________________________
mish_10 (Mish)               (None, 64)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 2)                

In [0]:
def get_state(st):
  return np.reshape(st, (1, len(env.observation_space.high)))

In [0]:
def train_the_model():
  if len(memory) < 1000:
    return

  batch = random.sample(memory,min(len(memory), BATCH_SIZE))

  x=[]
  y=[]
  for state, action, reward, next_state, done in batch:
    max_future_q = np.max(target_model.predict(next_state))
    new_q = reward + DISCOUNT * max_future_q * np.invert(done)

    current_q = model.predict(get_state(state))
    current_q[0][action] = new_q

    x.append(state)
    # print(f"x:{len(x)}")
    y.append(current_q)
    # print(f"y:{len(y)}")

  x = np.reshape(np.array(x), (-1, len(env.observation_space.high)))
  y = np.reshape(np.array(y), (-1, 2))
  model.train(x, y)

##Training, accomplished the goal at 4th Episode!!! 

In [0]:
for episode in range(EPISODES):

  if not (episode % SHOW_AT):
    show = True
  else:
    show = False

  episode_reward = 0
  state = get_state(env.reset())
  
  done = False
  while not done:

      action = model.get_action(state)
      next_st, reward, done, _ = env.step(action)
      next_state = get_state(next_st)
      episode_reward += reward

      memory.append((state, action, reward if not done else -100, next_state, done))
      train_the_model()

      if epsilon > MIN_EPSILON:
        epsilon *= EPSILON_DECAY
      
      state = next_state
      
      if show:
        env.render()

      if done:
        env.reset()
        target_model.set_weights(model)
        episode_rewards.append(episode_reward)
        scores.append(episode_reward)
        avg_score = np.mean(scores)
        avg_scores.append(avg_scores)

        if not task_complete and avg_score >= 195 and episode >= 100:
          solution_episode = next(x[0] for x in enumerate(episode_rewards) if x[1] >= 195) 
          print(f"Solved at episode no. {solution_episode}")
          task_complete = True
        
        if not (episode % 10):
          print(f"Episode: {episode}:- average reward = {avg_score}, max reward till now = {np.max(episode_rewards)}")

Episode: 0:- average reward = 10.0, max reward till now = 10.0
Episode: 10:- average reward = 109.27272727272727, max reward till now = 353.0
Episode: 20:- average reward = 96.42857142857143, max reward till now = 362.0
Episode: 30:- average reward = 137.51612903225808, max reward till now = 460.0
Episode: 40:- average reward = 150.8048780487805, max reward till now = 460.0
Episode: 50:- average reward = 157.2549019607843, max reward till now = 460.0
Episode: 60:- average reward = 173.11475409836066, max reward till now = 460.0
Episode: 70:- average reward = 183.2394366197183, max reward till now = 500.0
Episode: 80:- average reward = 182.19753086419752, max reward till now = 500.0
Episode: 90:- average reward = 202.56043956043956, max reward till now = 500.0
Solved at episode no. 4
Episode: 100:- average reward = 205.31, max reward till now = 500.0
Episode: 110:- average reward = 207.35, max reward till now = 500.0
Episode: 120:- average reward = 213.15, max reward till now = 500.0
Ep