<a href="https://colab.research.google.com/github/lbenbaccar/Deep-Reinforcement-Learning-with-Double-Q-Learning/blob/main/RL_BenBaccar_Lauzeral.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deep Reinforcement Learning with Double Q-learning

In [None]:
import numpy as np
import gym
from gym.envs.registration import register
import random

## Q-learning
### Frozen lake problem [click here](https://gym.openai.com/envs/FrozenLake-v0/)

We build the environment with the Gym library. Instead of using the original FrozenLake environment, we register a non-slippery variant so that movement in the environment is deterministic.

In [None]:
class Environment():
    """Factory for the Gym environments used in this notebook."""

    def __init__(self):
        # Nothing to set up: environments are built on demand by the
        # factory methods below.
        pass

    def FrozenLakeNoSlippery(self):
        """Return a deterministic (non-slippery) 4x4 FrozenLake environment.

        The environment id 'FrozenLakeNoSlippery-v0' is registered on first
        use. Calling this method again (e.g. when re-running a notebook
        cell) reuses the existing registration instead of failing.

        Returns:
            The environment created by ``gym.make``.
        """
        try:
            register(
                id='FrozenLakeNoSlippery-v0',
                entry_point='gym.envs.toy_text:FrozenLakeEnv',
                kwargs={'map_name': '4x4', 'is_slippery': False},
                max_episode_steps=100,
                reward_threshold=0.82,
            )
        except gym.error.Error:
            # Older Gym releases refuse to register the same id twice; the
            # id is already available in that case, so there is nothing
            # left to do. A genuinely missing id still fails in gym.make().
            pass
        return gym.make('FrozenLakeNoSlippery-v0')

#### Hyper-parameters and Q-table initialization 
- The discount factor measures the importance of future rewards. Its value lies between 0 and 1; the closer it is to 1, the more important future rewards are.
- The parameters exploration_rate and exploration_decay are used during training and determine whether the Agent takes a random action or the best known action.
- We create the simplest form of Q(s, a) — a matrix with states as rows and actions as columns, called the Q-table/Q-matrix. Since the Agent knows nothing at the beginning, all Q-values are initialized to zero.

#### Policy that agent follows
During training, the agent can take either a random or an optimal action. The former is Exploration, which aims at discovering better future rewards; the latter is Exploitation, which always chooses the best known action in a given state.

During the testing, the agent always takes the optimal action.
The agent's behaviour is governed by the following:
- Optimal Policy (Exploitation) — choose the most valuable action in any state.
- Random Policy (Exploration) — take a random action in given state.
- Control the Exploration Rate — Start with 100% exploration, move slowly towards roughly 0% since we want less and less exploration and focus on the optimal policy.

In [None]:
class QAgent():
    """Tabular Q-learning agent for an environment with discrete spaces.

    The environment must expose ``observation_space.n``, ``action_space.n``,
    ``action_space.sample()``, ``reset()`` returning a state index and
    ``step(action)`` returning ``(state, reward, done, info)``.
    """

    def __init__(self, env):
        """Store the hyper-parameters and create a zero-filled Q-table."""
        self.max_episodes = 20000   # number of training episodes
        self.max_actions = 99       # step cap per episode
        self.learning_rate = 0.83   # step size of the Q-update
        self.discount = 0.93        # weight of future rewards
        self.exploration_rate = 1.0 # initial probability of a random action
        # amount subtracted from the exploration rate after each episode
        self.exploration_decay = 1.0/self.max_episodes

        self.env = env

        # Q(s, a): one row per state, one column per action.
        row = env.observation_space.n
        col = env.action_space.n
        self.Q = np.zeros((row, col))

    def _policy(self, mode, state, e_rate):
        """Return the action to take in ``state``.

        Args:
            mode: 'train' for epsilon-greedy selection, 'test' for greedy.
            state: current state index.
            e_rate: probability of a random action (only used in 'train').

        Raises:
            ValueError: if ``mode`` is neither 'train' nor 'test'.
        """
        if mode == 'train':
            if random.random() > e_rate:
                return np.argmax(self.Q[state, :])    # exploitation
            return self.env.action_space.sample()     # exploration
        if mode == 'test':
            return np.argmax(self.Q[state, :])        # greedy policy
        raise ValueError("unknown policy mode: %r" % (mode,))

    def train(self):
        """Run Q-learning for ``max_episodes`` episodes, updating ``self.Q``."""
        learning_rate = self.learning_rate
        discount = self.discount
        exploration_rate = self.exploration_rate
        # Use the configured hyper-parameter rather than recomputing it, so
        # that changing self.exploration_decay actually takes effect.
        exploration_decay = self.exploration_decay

        for _ in range(self.max_episodes):
            state = self.env.reset()
            for _ in range(self.max_actions):
                action = self._policy('train', state, exploration_rate)
                new_state, reward, done, info = self.env.step(action)
                # Q(s,a) <- Q(s,a) + lr * (r + discount * max_a' Q(s',a') - Q(s,a))
                td_target = reward + discount*np.max(self.Q[new_state, :])
                self.Q[state, action] = self.Q[state, action] + learning_rate*(td_target - self.Q[state, action])
                state = new_state
                if done:  # the environment reported the end of the episode
                    break
            # Linear decay with a floor, applied once per episode.
            if exploration_rate > 0.001:
                exploration_rate -= exploration_decay

    def test(self):
        """Render one episode played with the greedy policy."""
        state = self.env.reset()
        for _ in range(self.max_actions):
            self.env.render()
            action = self._policy('test', state, 0.0)
            new_state, reward, done, info = self.env.step(action)
            state = new_state
            if done:
                print("======")
                self.env.render()
                break
            print("======")
        self.env.close()

    def displayQ(self):
        """Print the current Q-table."""
        print("Q\n", self.Q)

In [None]:
# Build the deterministic FrozenLake environment, train the tabular
# Q-learning agent on it, then replay one greedy episode.
# `env` stays bound at module level and is reused to build the DeepQAgent
# in a later cell.
env = Environment().FrozenLakeNoSlippery() # construct the environment
agent = QAgent(env) # tabular agent bound to that environment
agent.train()
print("Testing Model")
agent.test()

Testing Model

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m


## Deep Q-Learning


In [None]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior() 

Instructions for updating:
non-resource variables are not supported in the long term


In [None]:
class DeepQAgent():
    """Q-learning agent whose Q-function is a one-hidden-layer network.

    States are fed to the network as one-hot rows of width
    ``env.observation_space.n``; the network outputs one Q-value per action.
    The model is built with the TF1 graph API (``tf``) in the default graph.
    """

    # Checkpoint prefix written by train() and read back by test()/displayQ().
    CHECKPOINT_PATH = "./nn_model.ckpt"

    def __init__(self, env):
        """Store hyper-parameters, build the network and create its Saver."""
        self.max_episodes = 20000
        self.max_actions = 99
        self.discount = 0.93
        self.exploration_rate = 1.0
        self.exploration_decay = 1.0/self.max_episodes

        self.env = env

        # network dimensions
        self.in_units = env.observation_space.n
        self.out_units = env.action_space.n
        self.hidden_units = 10

        self._nn_model()

        # Created after the model so that it covers the model's variables.
        self.saver = tf.train.Saver()

    def _nn_model(self):
        """Build placeholders, the two-layer network, the loss and the optimizer."""
        self.a0 = tf.placeholder(tf.float32, shape=[1, self.in_units])  # one-hot state
        self.y = tf.placeholder(tf.float32, shape=[1, self.out_units])  # target Q-values

        # input layer -> hidden layer
        self.w1 = tf.Variable(tf.zeros([self.in_units, self.hidden_units], dtype=tf.float32))
        self.b1 = tf.Variable(tf.random_uniform([self.hidden_units], 0, 0.01, dtype=tf.float32))
        self.a1 = tf.nn.relu(tf.matmul(self.a0, self.w1) + self.b1)

        # hidden layer -> output layer
        self.w2 = tf.Variable(tf.zeros([self.hidden_units, self.out_units], dtype=tf.float32))
        self.b2 = tf.Variable(tf.random_uniform([self.out_units], 0, 0.01, dtype=tf.float32))

        # predicted Q-value of every action, and the greedy action
        self.a2 = tf.matmul(self.a1, self.w2) + self.b2
        self.action = tf.argmax(self.a2, 1)

        # squared error between predicted and target Q-values
        self.loss = tf.reduce_sum(tf.square(self.a2-self.y))

        self.update_model = tf.train.GradientDescentOptimizer(learning_rate=0.05).minimize(self.loss)

    def _one_hot(self, state):
        """Return ``state`` as a (1, in_units) one-hot row for ``self.a0``."""
        return np.eye(self.in_units)[state:state+1]

    def train(self):
        """Train the network with epsilon-greedy Q-learning and save a checkpoint."""
        discount = self.discount
        exploration_rate = self.exploration_rate
        exploration_decay = self.exploration_decay

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for _ in range(self.max_episodes):
                # Use the agent's own environment, not a module-level global.
                state = self.env.reset()
                for _ in range(self.max_actions):
                    # greedy action and the Q-values of all actions
                    action, pred_Q = sess.run([self.action, self.a2],
                                              feed_dict={self.a0: self._one_hot(state)})

                    # when exploring, take a random action instead
                    if np.random.rand() < exploration_rate:
                        action[0] = self.env.action_space.sample()

                    next_state, reward, done, info = self.env.step(action[0])

                    if done:
                        # No future return after a terminal transition, so
                        # do not bootstrap from the network's estimate there.
                        target = reward
                    else:
                        next_Q = sess.run(self.a2, feed_dict={self.a0: self._one_hot(next_state)})
                        target = reward + discount*np.max(next_Q)

                    # Only the entry of the taken action differs from the
                    # prediction, so only that output contributes to the loss.
                    update_Q = pred_Q
                    update_Q[0, action[0]] = target

                    sess.run([self.update_model],
                             feed_dict={self.a0: self._one_hot(state), self.y: update_Q})
                    state = next_state

                    if done:
                        if exploration_rate > 0.001:
                            exploration_rate -= exploration_decay
                        break
            self.saver.save(sess, self.CHECKPOINT_PATH)

    def test(self):
        """Restore the checkpoint written by train() and render one greedy episode."""
        with tf.Session() as sess:
            # Restore through the agent's own Saver: this loads the values
            # straight into this model's variables and avoids importing a
            # duplicate copy of the graph from the .meta file on every call.
            self.saver.restore(sess, self.CHECKPOINT_PATH)

            state = self.env.reset()
            for _ in range(self.max_actions):
                self.env.render()
                # always take the greedy action
                action, pred_Q = sess.run([self.action, self.a2],
                                          feed_dict={self.a0: self._one_hot(state)})
                next_state, reward, done, info = self.env.step(action[0])
                state = next_state
                if done:
                    self.env.render()
                    break

    def displayQ(self):
        """Print the Q-values the saved network predicts for every state.

        Requires the checkpoint written by train().
        """
        with tf.Session() as sess:
            self.saver.restore(sess, self.CHECKPOINT_PATH)
            # The input placeholder holds a single row, so evaluate state by state.
            q_rows = [sess.run(self.a2, feed_dict={self.a0: self._one_hot(s)})
                      for s in range(self.in_units)]
        print("Q\n", np.vstack(q_rows))

In [None]:
# Train the neural-network Q-learning agent on the module-level `env`
# created in the Q-learning section, then replay one greedy episode.
# `agent` is rebound here, replacing the tabular QAgent instance.
agent = DeepQAgent(env) # build the network-based agent
print("START TRAINING...")
agent.train()
print("\n\nTEST\n\n")
agent.test()

START TRAINING...


TEST


INFO:tensorflow:Restoring parameters from ./nn_model.ckpt

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Right)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Down)
SFFF
FH[41mF[0mH
FFFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m


## Double Deep Q Network


### Cart Pole problem : [click here](https://gym.openai.com/envs/CartPole-v1/)

In [None]:
class ExpReplay():
    """Fixed-capacity FIFO buffer of (state, action, reward, next_state, done).

    Experiences are stored column-wise in ``self.exp``: five parallel lists
    in which index ``i`` of every list belongs to the same transition.
    """

    def __init__(self, e_max=15000, e_min=100):
        self._max = e_max # maximum number of experiences kept
        self._min = e_min # minimum number of experiences before training
        self.exp = {'state':[], 'action':[], 'reward':[], 'next_state':[], 'done':[]}

    def get_max(self):
        """Return the maximum number of experiences."""
        return self._max

    def get_min(self):
        """Return the minimum number of experiences required for training."""
        return self._min

    def get_num(self):
        """Return the current number of stored experiences."""
        return len(self.exp['state'])

    def get_batch(self, batch_size=64):
        """Return ``batch_size`` distinct experiences chosen uniformly at random.

        Returns:
            ``(state, action, reward, next_state, done)`` where ``state`` and
            ``next_state`` are numpy arrays and the others are lists.

        Raises:
            ValueError: (from ``np.random.choice``) if fewer than
                ``batch_size`` experiences are stored.
        """
        idx = np.random.choice(self.get_num(), size=batch_size, replace=False)
        state = np.array([self.exp['state'][i] for i in idx])
        action = [self.exp['action'][i] for i in idx]
        reward = [self.exp['reward'][i] for i in idx]
        next_state = np.array([self.exp['next_state'][i] for i in idx])
        done = [self.exp['done'][i] for i in idx]
        return state, action, reward, next_state, done

    def add(self, state, action, reward, next_state, done):
        """Append one experience, evicting the oldest one when the buffer is full."""
        size = self.get_num()
        # `>=` rather than `>`: the buffer is full once it holds `_max`
        # items, so the oldest must go before appending or the buffer
        # would grow to `_max + 1`.
        if size and size >= self.get_max():
            for column in self.exp.values():
                del column[0]
        self.exp['state'].append(state)
        self.exp['action'].append(action)
        self.exp['reward'].append(reward)
        self.exp['next_state'].append(next_state)
        self.exp['done'].append(done)

In [None]:
class TNET():
    """
    Target network: a second copy of the Q-network architecture, built under
    the 'tnet' variable scope.

    It exposes an input placeholder ``x``, the Q-value output ``q`` and the
    list of its trainable variables ``params``. No optimizer is attached
    here; QNET reads ``q`` when computing training targets and overwrites
    ``params`` with its own parameters (see QNET._tnet_update).
    """
    def __init__(self, in_units, out_units, hidden_units=250):
        """Store the layer sizes and build the graph."""
        self.in_units = in_units
        self.out_units = out_units
        self.hidden_units = hidden_units
        self._model()
        
    def _model(self):
        """Create the placeholder, weights and forward pass of the network.

        The variables are created in the order W1, W2, W3, b1, b2.
        QNET._tnet_update pairs ``self.params`` with the Q-network's
        parameters by position, so this order must match QNET._model.
        """
        with tf.variable_scope('tnet'):
            # input layer: a batch of states, one row per state
            self.x = tf.placeholder(tf.float32, shape=(None, self.in_units))
            
            # from input layer to hidden layer1
            W1=tf.get_variable('W1', shape=(self.in_units, self.hidden_units), initializer=tf.random_normal_initializer())
            # from hidden layer1 to hidden layer2
            W2=tf.get_variable('W2', shape=(self.hidden_units, self.hidden_units), initializer=tf.random_normal_initializer())
            # from hidden layer2 to output layer
            W3=tf.get_variable('W3', shape=(self.hidden_units, self.out_units), initializer=tf.random_normal_initializer())
            
            # the bias of hidden layer1
            # NOTE(review): `(self.hidden_units)` is a plain int, not a 1-tuple.
            b1=tf.get_variable('b1', shape=(self.hidden_units), initializer=tf.zeros_initializer())
            # the bias of hidden layer2
            b2=tf.get_variable('b2', shape=(self.hidden_units), initializer=tf.zeros_initializer())
 
            # the output of hidden layer1
            h1=tf.nn.tanh(tf.matmul(self.x, W1)+b1)
            # the output of hidden layer2
            h2=tf.nn.tanh(tf.matmul(h1, W2)+b2)
            
            # the output of the output layer, that is, the Q-values
            # (linear, and without a bias term)
            self.q=tf.matmul(h2, W3)
        
            # trainable variables under the 'tnet' scope
            self.params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='tnet')

In [None]:
class QNET():
    """Online Q-network together with the Double-DQN training step.

    Owns a TNET target network, an Adam training op for the online network
    and an op list that copies the online parameters into the target
    network. A TensorFlow session must be supplied with ``set_session``
    before any method that runs the graph is called.
    """

    def __init__(self, in_units, out_units, exp, hidden_units=250, discount=0.95):
        """Build both networks and the training/update ops.

        Args:
            in_units: size of a state vector.
            out_units: number of actions.
            exp: experience-replay buffer providing ``get_num``,
                ``get_min`` and ``get_batch``.
            hidden_units: width of the two hidden layers.
            discount: discount factor used in the TD target.
        """
        # The target network must have the same layer sizes as the online
        # network, otherwise the parameter copy in _tnet_update() would try
        # to assign tensors of mismatching shapes.
        self.tnet = TNET(in_units, out_units, hidden_units)

        # Q network architecture
        self.in_units = in_units
        self.out_units = out_units
        self.hidden_units = hidden_units
        self.discount = discount
        self._model()
        self._batch_learning_model()
        self._tnet_update()

        # experience replay
        self.exp = exp

        # set later through set_session()
        self.session = None

    def _model(self):
        """Build the online network under the 'qnet' variable scope.

        Variables are created in the order W1, W2, W3, b1, b2, the same
        order as in TNET._model, because _tnet_update() pairs the two
        parameter lists by position.
        """
        with tf.variable_scope('qnet'):
            self.x = tf.placeholder(tf.float32, shape=(None, self.in_units))

            W1 = tf.get_variable('W1', shape=(self.in_units, self.hidden_units), initializer=tf.random_normal_initializer())
            W2 = tf.get_variable('W2', shape=(self.hidden_units, self.hidden_units), initializer=tf.random_normal_initializer())
            W3 = tf.get_variable('W3', shape=(self.hidden_units, self.out_units), initializer=tf.random_normal_initializer())

            b1 = tf.get_variable('b1', shape=(self.hidden_units,), initializer=tf.zeros_initializer())
            b2 = tf.get_variable('b2', shape=(self.hidden_units,), initializer=tf.zeros_initializer())

            h1 = tf.nn.tanh(tf.matmul(self.x, W1)+b1)
            h2 = tf.nn.tanh(tf.matmul(h1, W2)+b2)
            self.q = tf.matmul(h2, W3)

    def _batch_learning_model(self):
        """Build the loss and the Adam training op for batch learning."""
        with tf.variable_scope('qnet'):
            # TD-target, one value per sampled experience
            self.target = tf.placeholder(tf.float32, shape=(None, ))
            # (row, action) index pairs selecting Q(s, a) of the taken actions
            self.selected_idx = tf.placeholder(tf.int32, shape=(None, 2))
            self.selected_q = tf.gather_nd(self.q, self.selected_idx)

            self.params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='qnet')

            loss = tf.losses.mean_squared_error(self.target, self.selected_q)
            gradients = tf.gradients(loss, self.params)
            self.train_opt = tf.train.AdamOptimizer(3e-4).apply_gradients(zip(gradients, self.params))

    def _tnet_update(self):
        """Build the ops that copy the online parameters into the target network."""
        with tf.variable_scope('qnet'):
            self.update_opt = [t.assign(q) for t, q in zip(self.tnet.params, self.params)]

    def _require_session(self):
        """Return the session, failing with a clear message if none was set."""
        if self.session is None:
            raise RuntimeError("QNET has no session; call set_session() first")
        return self.session

    def batch_train(self, batch_size=64):
        """Run one Double-DQN update on a random batch of experiences.

        Does nothing while the replay buffer holds fewer experiences than
        its configured minimum, or fewer than ``batch_size`` (a batch is
        sampled without replacement, so it cannot be larger than the buffer).
        """
        available = self.exp.get_num()
        if available < self.exp.get_min() or available < batch_size:
            return
        session = self._require_session()

        state, action, reward, next_state, done = self.exp.get_batch(batch_size)
        state = state.reshape(batch_size, self.in_units)
        next_state = next_state.reshape(batch_size, self.in_units)

        # Double DQN: the online network chooses the next action ...
        qnet_q_values = session.run(self.q, feed_dict={self.x: next_state})
        qnet_actions = np.argmax(qnet_q_values, axis=1)

        # ... and the target network evaluates that action.
        tnet_q_values = session.run(self.tnet.q, feed_dict={self.tnet.x: next_state})
        tnet_q = tnet_q_values[np.arange(batch_size), qnet_actions]

        # TD target: r for terminal transitions, r + discount * Q_target otherwise.
        reward = np.asarray(reward, dtype=np.float64)
        terminal = np.asarray(done, dtype=bool)
        qnet_update_q = np.where(terminal, reward, reward + self.discount*tnet_q)

        indices = [[i, a] for i, a in enumerate(action)]
        feed_dict = {self.x: state, self.target: qnet_update_q, self.selected_idx: indices}
        session.run(self.train_opt, feed_dict)

    def update(self):
        """Copy the online network's parameters into the target network."""
        self._require_session().run(self.update_opt)

    def set_session(self, sess):
        """Set the TensorFlow session used to run the graph."""
        self.session = sess

    def get_action(self, state, e_rate):
        """Return an epsilon-greedy action for ``state``.

        With probability ``e_rate`` a uniformly random action is returned,
        otherwise the action with the highest predicted Q-value.
        """
        if np.random.random() < e_rate:
            return np.random.choice(self.out_units)
        return np.argmax(self._require_session().run(self.q, feed_dict={self.x: state}))

In [None]:
class Agent():
    """Double-DQN agent: wires an environment, a replay buffer and a QNET."""

    def __init__(self, env):
        """Build the replay buffer and the networks, and open a TF session.

        Args:
            env: environment whose ``observation_space.shape[0]`` is the
                state size and whose ``action_space.n`` is the action count.
        """
        # hyper-parameters
        self.max_episodes = 1000
        self.max_actions = 10000
        self.exploration_rate = 1.0
        self.exploration_decay = 0.0001

        # environment
        self.env = env
        self.states = env.observation_space.shape[0]
        self.actions = env.action_space.n

        # experience replay for batch learning
        self.exp = ExpReplay()
        # number of experiences per training batch
        self.batch_size = 64

        # Deep Q Network
        self.qnet = QNET(self.states, self.actions, self.exp)
        # Keep a handle on the session so that it can be closed again.
        self.session = tf.InteractiveSession()
        self.session.run(tf.global_variables_initializer())
        self.qnet.set_session(self.session)

    def close(self):
        """Release the TensorFlow session opened by the constructor."""
        self.session.close()

    def train(self):
        """Train for ``max_episodes`` episodes, printing progress every 100.

        Returns:
            A list with the total (unshaped) reward of every episode.
        """
        max_episodes = self.max_episodes
        max_actions = self.max_actions
        exploration_rate = self.exploration_rate
        exploration_decay = self.exploration_decay
        batch_size = self.batch_size

        all_rewards = []     # one entry per episode, returned to the caller
        record_rewards = []  # rewards of the current reporting window
        for i in range(max_episodes):
            total_rewards = 0
            state = self.env.reset()
            state = state.reshape(1, self.states)
            for j in range(max_actions):
                #self.env.render() # Uncomment this line to render the environment
                action = self.qnet.get_action(state, exploration_rate)
                next_state, reward, done, info = self.env.step(action)
                next_state = next_state.reshape(1, self.states)
                total_rewards += reward

                # The final transition of an episode is stored with a large
                # penalty. NOTE(review): this also applies when the episode
                # ends for a reason other than failure (e.g. a step limit
                # imposed by the environment) - confirm that is intended.
                stored_reward = (reward-100) if done else reward
                self.exp.add(state, action, stored_reward, next_state, done)
                self.qnet.batch_train(batch_size)
                if done:
                    break

                # Sync the target network every 25 steps within an episode.
                if (j%25) == 0 and j > 0:
                    self.qnet.update()
                state = next_state

            all_rewards.append(total_rewards)
            record_rewards.append(total_rewards)
            exploration_rate = 0.01 + (exploration_rate-0.01)*np.exp(-exploration_decay*(i+1))
            # Report after every full block of 100 episodes, so that each
            # average covers exactly 100 episodes and the final block is
            # reported as well.
            if (i+1) % 100 == 0:
                average_rewards = np.mean(np.array(record_rewards))
                record_rewards = []
                print("episodes: %i to %i, average_reward: %.3f, exploration: %.3f" %(i-99, i, average_rewards, exploration_rate))
        return all_rewards


In [None]:
# Train the Double-DQN agent on CartPole-v1.
# This rebinds the module-level names `env` and `agent` used by the
# FrozenLake sections above.
env = gym.make('CartPole-v1')
agent = Agent(env)
agent.train()

episodes: 0 to 100, average_reward: 31.782, exploration: 0.601
episodes: 100 to 200, average_reward: 159.830, exploration: 0.140
episodes: 200 to 300, average_reward: 263.630, exploration: 0.021
episodes: 300 to 400, average_reward: 260.130, exploration: 0.010
episodes: 400 to 500, average_reward: 273.200, exploration: 0.010
episodes: 500 to 600, average_reward: 332.550, exploration: 0.010
episodes: 600 to 700, average_reward: 318.390, exploration: 0.010
episodes: 700 to 800, average_reward: 367.220, exploration: 0.010
episodes: 800 to 900, average_reward: 310.750, exploration: 0.010
