# Deep Qlearning Algorithm for OpenAI Gym Cartpole

## Imports

In [1]:
from torch import nn
import torch
import gym
from collections import deque
import itertools
import numpy as np 
import random

## Hyperparameters

In [2]:
DISCOUNT_FACTOR = 0.99
BATCH_SIZE = 32
BUFFER_SIZE = 50000
MIN_REPLAY_SIZE = 1000
EXPLORE_RATE_START = 1.0
EXPLORE_RATE_END = 0.02
EXPLORE_RATE_DECAY = 10000
TARGET_UPDATE_FREQ = 1000

## Environment

In [3]:
#env = gym.make("CartPole-v1") 
env = gym.make("CartPole-v1", render_mode= "human") # Use this one to visualize
print("Action space:", env.action_space)
print("State space:", env.observation_space)
print("Lower bounds of state space:", env.observation_space.low)
print("Upper bounds of state space:", env.observation_space.high)

Action space: Discrete(2)
State space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
Lower bounds of state space: [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
Upper bounds of state space: [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]


## Training

In [4]:
replay_buffer = deque(maxlen=BUFFER_SIZE)
rew_buffer = deque([0.0], maxlen=100)

episode_reward = 0.0

### Neural Network Using Pytorch

In [5]:
class Network(nn.Module):
    def __init__(self, env):
        super().__init__()

        in_features = int(np.prod(env.observation_space.shape))

        self.net = nn.Sequential(
            nn.Linear(in_features, 64),
            nn.Tanh(),
            nn.Linear(64, env.action_space.n)
        )
    
    def forward(self, x):
        return self.net(x)
    
    def act(self, obs):
        obs_t = torch.as_tensor(obs, dtype=torch.float32)
        q_values = self(obs_t.unsqueeze(0))

        max_q_index = torch.argmax(q_values, dim=1)[0]
        action = max_q_index.detach().item()

        return action

online_net = Network(env)
target_net = Network(env)

target_net.load_state_dict(online_net.state_dict())

optimizer = torch.optim.Adam(online_net.parameters(), lr=5e4)

### Initialize Replay Buffer

In [6]:
obs = env.reset()
obs = obs[0]
for _ in range(MIN_REPLAY_SIZE):
    action = env.action_space.sample()

    new_obs, rew, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    transition = (obs, action, rew, done, new_obs)
    replay_buffer.append(transition)
    obs = new_obs

    if done:
        obs = env.reset()
        obs = obs[0]

  if not isinstance(terminated, (bool, np.bool8)):


### Main Training Loop

In [7]:
obs = env.reset()
obs = obs[0]

for step in itertools.count():
    EXPLORE_RATE = np.interp(step, [0, EXPLORE_RATE_DECAY], [EXPLORE_RATE_START, EXPLORE_RATE_END])

    rnd_sample = random.random()
    
    if rnd_sample <= EXPLORE_RATE:
        action = env.action_space.sample()
    else:
        action = online_net.act(obs)
    
    new_obs, rew, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    transition = (obs, action, rew, done, new_obs)
    replay_buffer.append(transition)
    obs = new_obs

    episode_reward += rew

    if done:
        obs = env.reset()
        obs = obs[0]

        rew_buffer.append(episode_reward)
        episode_reward = 0.0
    
    # After solved, watch it play
    if len(rew_buffer) >= 100:
        if np.mean(rew_buffer) >= 195:
            while True:
                action = online_net.act(obs)
                
                obs, _, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                env.render()
                if done:
                    env.reset()
    
    # Start Gradient Descent Step
    transitions = random.sample(replay_buffer, BATCH_SIZE)

    obses = np.asarray([t[0] for t in transitions])
    actions = np.asarray([t[1] for t in transitions])
    rews = np.asarray([t[2] for t in transitions])
    dones = np.asarray([t[3] for t in transitions])
    new_obses = np.asarray([t[4] for t in transitions])

    obses_t = torch.as_tensor(obses, dtype=torch.float32)
    actions_t = torch.as_tensor(actions, dtype=torch.int64).unsqueeze(-1)
    rews_t = torch.as_tensor(rews, dtype=torch.float32).unsqueeze(-1)
    dones_t = torch.as_tensor(dones, dtype=torch.float32).unsqueeze(-1)
    new_obses_t = torch.as_tensor(new_obses, dtype=torch.float32)

    # Compute Targets
    target_q_values = target_net(new_obses_t)
    max_target_q_values = target_q_values.max(dim=1, keepdim=True)[0]
    
    targets = rews_t + DISCOUNT_FACTOR *  (1 - dones_t) * max_target_q_values

    # Compute Loss
    q_values = online_net(obses_t)

    action_q_values = torch.gather(input=q_values, dim=1, index=actions_t)

    loss = nn.functional.smooth_l1_loss(action_q_values, targets)

    # Gradient Descent
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Update Target Network
    if step % TARGET_UPDATE_FREQ == 0:
        target_net.load_state_dict(online_net.state_dict())
    
    # Logging
    if step % 1000 == 0:
        print()
        print("Step", step)
        print("Avg Rew", np.mean(rew_buffer))


Step 0
Avg Rew 0.0

Step 1000
Avg Rew 22.177777777777777

Step 2000
Avg Rew 22.211111111111112

Step 3000
Avg Rew 21.07

Step 4000
Avg Rew 19.56

Step 5000
Avg Rew 19.84

Step 6000
Avg Rew 22.31

Step 7000
Avg Rew 23.72

Step 8000
Avg Rew 24.09

Step 9000
Avg Rew 22.53

Step 10000
Avg Rew 24.6

Step 11000
Avg Rew 29.93

Step 12000
Avg Rew 35.65

Step 13000
Avg Rew 39.95

Step 14000
Avg Rew 44.26

Step 15000
Avg Rew 40.3

Step 16000
Avg Rew 42.36

Step 17000
Avg Rew 39.42

Step 18000
Avg Rew 33.09

Step 19000
Avg Rew 20.45

Step 20000
Avg Rew 15.34

Step 21000
Avg Rew 17.78


KeyboardInterrupt: 