<h1>Table of Contents<span class="tocSkip"></span></h1>


# Introduction


**What?** Deep Q-learning Network - Reinforcement Learning



# Import modules

In [1]:
import torch
import torch.nn as nn
import random
from collections import deque
import numpy as np
import torch.optim as optim
import gym

# Introduction to the gym environment

In [None]:
"""
We started by making an environment using CartPole-v0, which has a maximum achievable score of 200, after 
which the environment terminates. We brought the environment to an initial state where the cartpole is in
the upright position using the env.reset() command. 

Then, we started a loop for 1,000 steps, wherein we rendered the current environment's state using render()
and we chose a random action for the current state using env.action_space.sample(). Then, we passed the
selected action into the step method of the environment. The step method tells us what happened to the 
environment when we performed the current action on the current state of the environment.
"""

In [2]:
# Build the CartPole-v0 environment used for the random-action demo below.
env = gym.make('CartPole-v0')

In [3]:
# Reset the environment; the returned initial observation is shown as the cell output.
env.reset()

array([ 0.04867627,  0.01992289, -0.03577999,  0.0427075 ])

In [4]:
# Random-policy rollout: render the environment and apply a uniformly sampled
# action for 1000 steps, restarting the episode whenever it terminates.
# The try/finally guarantees the render window is closed even if the loop is
# interrupted or an environment call raises.
try:
    for _ in range(1000):
        env.render()
        action = env.action_space.sample()
        # step() reports what happened after applying `action` to the current state.
        observation, reward, done, info = env.step(action)
        if done:
            # Episode finished: start a new one so later step() calls are valid.
            env.reset()
finally:
    env.close()











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































In [5]:
# Number of discrete actions available in this environment.
env.action_space.n

2

In [6]:
# Size of the first dimension of the observation space (length of the observation vector).
env.observation_space.shape[0]

4

# Introducing DQNs

In [None]:
"""
A DQN is an RL technique that aims at picking the best possible action for a given observation. There is a 
q-value, which is the quality of a given move that's associated with each possible action for each possible 
observation. 
In the traditional RL algorithm, this q-value comes from a q-table: a lookup table that holds one q-value 
for every (observation, action) pair. This lookup table is updated iteratively by playing the game over and 
over and using the reward to update the table. 
The q-learning algorithm learns the optimum values to be populated in this table. We can simply look at the table
for a given state and select the action with the maximum q-value in order to maximize the chance of winning the 
game.

With Deep Q-learning, instead of using a Q table to look up the action with a maximum possible q-value for a 
given state, we use a deep neural network to predict the Q-values for the actions and pick the action with the 
maximum q-value for a given state.
"""

In [7]:
def cartpole_model(observation_space, action_space):
    """
    Build the q-value network: a small fully connected model that maps an
    observation to one q-value per action.

    Args:
        observation_space: number of input features (observation length).
        action_space: number of outputs (one q-value per action).

    Returns:
        An ``nn.Sequential`` with two hidden layers of 24 units, each
        followed by a ReLU, and a linear output layer. Other
        architectures are possible.
    """
    hidden_units = 24
    # Layers are created in input -> output order.
    layers = [
        nn.Linear(observation_space, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, action_space),
    ]
    return nn.Sequential(*layers)

In [8]:
class DQN:
    """
    Deep Q-Network agent with a replay memory and a separate target network.

    Two copies of ``cartpole_model`` are kept: ``policy_net`` is the network
    that the optimizer updates, and ``target_net`` is a frozen copy (synced by
    the caller via ``load_state_dict``) that supplies the regression targets.

    Reads the module-level hyperparameters MAX_EXPLORE, MIN_EXPLORE,
    EXPLORE_DECAY, MEMORY_LEN, BATCH_SIZE, GAMMA and LEARNING_RATE.
    """

    def __init__(self, observation_space, action_space):
        """
        Args:
            observation_space: length of the observation vector (network input size).
            action_space: number of discrete actions (network output size).
        """
        self.exploration_rate = MAX_EXPLORE
        self.action_space = action_space
        self.observation_space = observation_space
        # Bounded FIFO replay memory: the oldest transitions are dropped first.
        self.memory = deque(maxlen=MEMORY_LEN)

        # Two instances of the same cartpole model
        self.target_net = cartpole_model(self.observation_space, self.action_space)
        self.policy_net = cartpole_model(self.observation_space, self.action_space)

        # Start both networks from identical weights.
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.criterion = nn.MSELoss()
        # Use the notebook's LEARNING_RATE constant instead of silently
        # relying on Adam's default step size.
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=LEARNING_RATE)

        # Becomes True once exploration_rate has been clamped to MIN_EXPLORE.
        self.explore_limit = False

    def load_memory(self, state, action, reward, next_state, terminal):
        """Append one (state, action, reward, next_state, terminal) transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, terminal))

    def predict_action(self, state):
        """
        Epsilon-greedy action selection.

        With probability ``exploration_rate`` a uniformly random action is
        returned; otherwise the action with the highest predicted q-value.

        Args:
            state: tensor whose first row is the observation to act on.

        Returns:
            The index of the chosen action.
        """
        random_number = np.random.rand()

        if random_number < self.exploration_rate:
            return random.randrange(self.action_space)

        # NOTE(review): the greedy action is taken from target_net, not
        # policy_net; standard DQN acts with the online network. Kept as-is
        # to preserve behavior - confirm this is intended.
        with torch.no_grad():  # inference only, no autograd graph needed
            q_values = self.target_net(state).numpy()
        return np.argmax(q_values[0])

    def experience_replay(self):
        """
        Train ``policy_net`` on a random mini-batch drawn from the replay memory.

        Does nothing until at least BATCH_SIZE transitions are stored. Each
        sampled transition gets its own optimizer step. Afterwards the
        exploration rate is decayed by EXPLORE_DECAY, never going below
        MIN_EXPLORE.
        """
        if len(self.memory) < BATCH_SIZE:
            return

        batch = random.sample(self.memory, BATCH_SIZE)

        for state, action, reward, next_state, terminal in batch:
            # Targets are constants for the regression. Computing them under
            # no_grad stops loss.backward() from propagating into target_net
            # and accumulating gradients there that nothing ever clears.
            with torch.no_grad():
                q_update = reward

                if not terminal:
                    # Bootstrapped target: reward plus discounted best next q-value.
                    q_update = reward + GAMMA * self.target_net(next_state).max(dim=1)[0]

                # NOTE(review): the targets for the non-selected actions come
                # from target_net(state), so they also pull policy_net toward
                # target_net. Kept as-is to preserve behavior.
                q_values = self.target_net(state)
                q_values[0][action] = q_update

            loss = self.criterion(self.policy_net(state), q_values)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        if not self.explore_limit:
            self.exploration_rate *= EXPLORE_DECAY
            if self.exploration_rate < MIN_EXPLORE:
                self.exploration_rate = MIN_EXPLORE
                self.explore_limit = True

In [9]:
# Hyperparameters and configuration for DQN training.

# Gym environment id passed to gym.make() for training and evaluation.
ENV_NAME = "CartPole-v1"
# Number of stored transitions sampled per experience_replay() call; also the
# minimum memory size before any training happens.
BATCH_SIZE = 20
# Discount factor applied to the best next-state q-value in the target.
GAMMA = 0.95
# Learning rate intended for the optimizer.
LEARNING_RATE = 0.001
# Initial exploration rate (probability of taking a random action).
MAX_EXPLORE = 1.0
# Lower bound that the exploration rate is clamped to.
MIN_EXPLORE = 0.01
# Multiplicative decay applied to the exploration rate after each training call.
EXPLORE_DECAY = 0.995
# Maximum number of transitions kept in the replay memory (deque maxlen).
MEMORY_LEN = 1_000_000
# The target network is re-synced from the policy network whenever the global
# step counter is a multiple of this value.
UPDATE_FREQ = 10

In [10]:
# Create the training environment (rebinding `env`) and size the agent's
# networks from the environment's observation and action spaces.
env = gym.make(ENV_NAME)
observation_space = env.observation_space.shape[0]
action_space = env.action_space.n
dqn = DQN(observation_space, action_space)

In [11]:
# Train for 100 episodes and print one table row per finished episode.
print('| Run | Exploration Rate | Score |')
steps = 0  # environment steps taken across all episodes
for i in range(100):
    # Initial observation as a (1, observation_space) float tensor.
    state = torch.from_numpy(np.reshape(env.reset(), [1, observation_space])).float()
    score = 0

    terminal = False
    while not terminal:
        steps += 1
        score += 1

        action = dqn.predict_action(state)
        next_state, reward, terminal, info = env.step(action)
        next_state = np.reshape(next_state, [1, observation_space])
        next_state = torch.from_numpy(next_state).float()

        # Store the transition, then advance to the new state.
        dqn.load_memory(state, action, reward, next_state, terminal)
        state = next_state

        if terminal:
            print(f'| {i+1:03} |       {dqn.exploration_rate:.4f}     |  {score:03}  |')
        else:
            # Learn only on non-terminal steps; periodically sync the target net.
            dqn.experience_replay()
            if steps % UPDATE_FREQ == 0:
                dqn.target_net.load_state_dict(dqn.policy_net.state_dict())

| Run | Exploration Rate | Score |
| 001 |       0.9950     |  021  |
| 002 |       0.9276     |  015  |
| 003 |       0.8433     |  020  |
| 004 |       0.7822     |  016  |
| 005 |       0.7329     |  014  |
| 006 |       0.6798     |  016  |
| 007 |       0.6274     |  017  |
| 008 |       0.5820     |  016  |
| 009 |       0.5291     |  020  |
| 010 |       0.4762     |  022  |
| 011 |       0.4417     |  016  |
| 012 |       0.3126     |  070  |
| 013 |       0.2914     |  015  |
| 014 |       0.2799     |  009  |
| 015 |       0.2676     |  010  |
| 016 |       0.2545     |  011  |
| 017 |       0.2421     |  011  |
| 018 |       0.2257     |  015  |
| 019 |       0.2157     |  010  |
| 020 |       0.1459     |  079  |
| 021 |       0.1048     |  067  |
| 022 |       0.0616     |  107  |
| 023 |       0.0522     |  034  |
| 024 |       0.0452     |  030  |
| 025 |       0.0387     |  032  |
| 026 |       0.0338     |  028  |
| 027 |       0.0243     |  067  |
| 028 |       0.0217

In [12]:
def play_agent(dqn, env, max_steps=500):
    """
    Run one rendered episode with the agent acting greedily, then print the
    total reward collected.

    Args:
        dqn: agent exposing ``target_net``, a callable that maps a (1, n)
            float tensor to a (1, n_actions) tensor of q-values.
        env: environment with ``reset()``, ``render()``, ``step(action)``
            returning ``(observation, reward, done, info)``, and ``close()``.
        max_steps: upper bound on the number of steps played (default 500,
            the value that was previously hard-coded).

    The environment is always closed before returning, even if rendering or
    stepping raises.
    """
    total_reward = 0
    try:
        observation = env.reset()
        for _ in range(max_steps):
            env.render()
            # Shape the observation into a single-row float batch for the network.
            observation = torch.as_tensor(observation, dtype=torch.float32).view(1, -1)
            with torch.no_grad():  # inference only
                q_values = dqn.target_net(observation).detach().numpy()
            action = np.argmax(q_values[0])
            observation, reward, done, _info = env.step(action)
            total_reward += reward

            if done:
                break
    finally:
        # Release the render window / env resources on every exit path.
        env.close()
    print("Rewards: ", total_reward)

In [13]:
# Roll out the trained agent with greedy actions and print its total reward.
play_agent(dqn, env)

Rewards:  315.0


# References


- Jibin Mathew, PyTorch Artificial Intelligence Fundamentals
- https://github.com/gsurma/cartpole/blob/master/cartpole.py

