# Modules

In [1]:
from lib import wrappers
from lib import dqn_model
import time
import numpy as np
import collections
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from tensorboardX import SummaryWriter

  ROMS = resolve_roms()


# Constants

In [2]:
# --- Environment and stopping criterion ---
DEFAULT_ENV_NAME = "PongNoFrameskip-v4"
MEAN_REWARD_BOUND = 19.5            # training stops once the recent mean reward exceeds this

# --- Epsilon-greedy schedule: linear decay from START down to FINAL ---
EPSILON_START = 1.0
EPSILON_FINAL = 0.02
EPSILON_DECAY_LAST_FRAME = 100_000  # epsilon drops by 1/this per frame, floored at EPSILON_FINAL

# --- Optimisation ---
GAMMA = 0.99                        # discount applied to the next-state value in the loss
LEARNING_RATE = 1e-4                # Adam learning rate
BATCH_SIZE = 32                     # transitions sampled per optimisation step
SYNC_TARGET_FRAMES = 1000           # target network is refreshed on frames divisible by this

# --- Replay memory ---
REPLAY_SIZE = 10_000                # capacity of the experience buffer
REPLAY_START_SIZE = 10_000          # no optimisation until the buffer holds this many transitions

# Classes

Experience Buffer

In [3]:
# One transition as stored in the replay memory.
Experience = collections.namedtuple(
    'Experience',
    field_names=['state', 'action', 'reward', 'done', 'new_state'],
)


class ExperienceBuffer:
    """Bounded FIFO store of transitions with uniform random sampling.

    The storage is a ``collections.deque`` created with ``maxlen=capacity``,
    so appending to a full buffer drops the oldest entry.
    """

    def __init__(self, capacity):
        """Create an empty buffer that keeps at most ``capacity`` items."""
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        """Return how many transitions are currently held."""
        return len(self.buffer)

    def append(self, experience):
        """Add one transition (a 5-tuple laid out like ``Experience``)."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns a 5-tuple of NumPy arrays
        ``(states, actions, rewards, dones, next_states)``; rewards are cast
        to ``float32`` and dones to ``uint8``.

        ``np.random.choice(..., replace=False)`` raises ``ValueError`` when
        ``batch_size`` exceeds the number of stored transitions.
        """
        picked = np.random.choice(len(self.buffer), batch_size, replace=False)
        transitions = [self.buffer[position] for position in picked]

        # Transpose the list of 5-tuples into five per-field columns.
        states, actions, rewards, dones, next_states = zip(*transitions)

        return (
            np.array(states),
            np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(dones, dtype=np.uint8),
            np.array(next_states),
        )



Agent

In [4]:
class Agent:
    """Plays an environment one step at a time and records every transition.

    Parameters
    ----------
    env : object
        Environment exposing ``reset()``, ``step(action)`` returning a
        4-tuple ``(new_state, reward, is_done, info)``, and
        ``action_space.sample()``.
    exp_buffer : object
        Any container with an ``append`` method; each step's ``Experience``
        is appended to it.
    """

    def __init__(self, env, exp_buffer):
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()

    def _reset(self):
        """Start a new episode and zero the running episode reward."""
        # Use the agent's own environment, not a notebook-level global.
        self.state = self.env.reset()
        self.total_reward = 0.0

    def play_step(self, net, epsilon=0.0, device="cpu"):
        """Take one epsilon-greedy step in the environment.

        With probability ``epsilon`` a random action is sampled from the
        environment's action space; otherwise the action with the largest
        value in ``net``'s output for the current state is chosen.

        Parameters
        ----------
        net : callable
            Maps a batched state tensor to per-action values; it is only
            called on the greedy branch.
        epsilon : float
            Probability of taking a random action.
        device : str or torch.device
            Device the state tensor is moved to before calling ``net``.

        Returns
        -------
        float or None
            The episode's total reward if this step ended the episode
            (the environment is then reset), otherwise ``None``.
        """
        done_reward = None

        if np.random.random() < epsilon:
            action = self.env.action_space.sample()
        else:
            # np.asarray copies only when required; np.array(..., copy=False)
            # raises under NumPy 2 whenever a copy cannot be avoided.
            state_a = np.asarray([self.state])
            state_v = torch.tensor(state_a).to(device)
            # Action selection needs no gradients, so skip graph construction.
            with torch.no_grad():
                q_vals_v = net(state_v)
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        # do step in the environment
        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward

        exp = Experience(self.state, action, reward, is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward

# Loss Calculations

In [5]:
def calc_loss(batch, net, tgt_net, device="cpu"):
    """Compute the one-step TD mean-squared-error loss for a sampled batch.

    Parameters
    ----------
    batch : tuple
        ``(states, actions, rewards, dones, next_states)`` as array-likes,
        in the layout returned by ``ExperienceBuffer.sample``.
    net : callable
        Network being trained; maps a state batch to per-action values.
    tgt_net : callable
        Target network used to evaluate the next states. No gradient is
        propagated through it.
    device : str or torch.device
        Device all tensors are moved to.

    Returns
    -------
    torch.Tensor
        Scalar ``MSELoss`` between ``Q(s, a)`` from ``net`` and
        ``reward + GAMMA * max_a' Q_target(s', a')``, where the next-state
        value is forced to zero for transitions flagged as done.
    """
    states, actions, rewards, dones, next_states = batch

    states_v = torch.tensor(states).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    # Boolean mask: indexing with a uint8 (Byte) tensor is deprecated in PyTorch.
    done_mask = torch.as_tensor(dones, dtype=torch.bool).to(device)

    # Pick Q(s, a) for the action that was actually taken in each transition.
    action_index = actions_v.unsqueeze(-1).long()
    state_action_values = net(states_v).gather(1, action_index).squeeze(-1)

    # The bootstrap target is treated as a constant: evaluate it without
    # building an autograd graph instead of detaching afterwards.
    with torch.no_grad():
        next_state_values = tgt_net(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0

    expected_state_action_values = next_state_values * GAMMA + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)

# Main

In [6]:

# Training device. Only the CPU is used here; before switching to a GPU
# (torch.device("cuda")) make sure every tensor created during training is
# placed on this same `device`.
device = torch.device("cpu")

# Environment built by the project's wrapper helper.
env = wrappers.make_env(DEFAULT_ENV_NAME)

# Two networks of identical architecture: `net` is optimised, `tgt_net`
# periodically receives a copy of its weights.
net = dqn_model.DQN(
    env.observation_space.shape,
    env.action_space.n,
).to(device)
tgt_net = dqn_model.DQN(
    env.observation_space.shape,
    env.action_space.n,
).to(device)
writer = SummaryWriter(comment="-" + DEFAULT_ENV_NAME)
print(net)

# Replay memory, the agent that fills it, and the optimiser for `net`.
buffer = ExperienceBuffer(REPLAY_SIZE)
agent = Agent(env, buffer)
epsilon = EPSILON_START
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

# Training stops after this many finished games (or earlier, once solved).
NUM_GAMES = 350

# Per-run bookkeeping.
total_rewards = []          # total reward of every finished game
best_mean_reward = None     # best mean reward seen so far
frame_idx = ts_frame = 0    # current frame / frame at the last speed measurement
ts = time.time()            # wall-clock time of the last speed measurement

# Series collected for the plots drawn after training.
mean_rewards, num_games, epsilons, frames = [], [], [], []


# Main training loop. Each iteration plays one environment frame and, once the
# replay buffer holds REPLAY_START_SIZE transitions, performs one optimisation
# step on a sampled batch. The loop ends after NUM_GAMES finished games or as
# soon as the mean reward over the last 100 games exceeds MEAN_REWARD_BOUND.
# The try/finally guarantees the TensorBoard writer is closed even when the
# run is interrupted (e.g. KeyboardInterrupt from stopping the cell).
try:
    while len(total_rewards) < NUM_GAMES:
        frame_idx += 1
        # Linear epsilon decay, floored at EPSILON_FINAL.
        epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)
        if frame_idx % 100 == 0:
            # Sample the schedule every 100 frames for the epsilon plot.
            epsilons.append(epsilon)
            frames.append(frame_idx)

        # `reward` is None unless this step finished a game.
        reward = agent.play_step(net, epsilon, device=device)
        if reward is not None:
            total_rewards.append(reward)
            now = time.time()
            # Guard against a zero elapsed time on coarse clocks.
            speed = (frame_idx - ts_frame) / max(now - ts, 1e-9)
            ts_frame = frame_idx
            ts = now
            mean_reward = np.mean(total_rewards[-100:])
            mean_rewards.append(mean_reward)
            num_games.append(len(total_rewards))
            print("%d frames over %d games, mean reward %.3f, eps %.2f, speed %.2f f/s" % (
                frame_idx, len(total_rewards), mean_reward, epsilon,
                speed
            ))
            writer.add_scalar("epsilon", epsilon, frame_idx)
            writer.add_scalar("speed", speed, frame_idx)
            writer.add_scalar("reward_100", mean_reward, frame_idx)
            writer.add_scalar("reward", reward, frame_idx)
            # Checkpoint the weights whenever the 100-game mean improves.
            if best_mean_reward is None or best_mean_reward < mean_reward:
                torch.save(net.state_dict(), DEFAULT_ENV_NAME + "-best.dat")
                if best_mean_reward is not None:
                    print("Best mean reward updated %.3f -> %.3f, model saved" % (best_mean_reward, mean_reward))
                best_mean_reward = mean_reward
            if mean_reward > MEAN_REWARD_BOUND:
                print("Solved in %d frames!" % frame_idx)
                break

        # Keep collecting experience until the buffer is warm.
        if len(buffer) < REPLAY_START_SIZE:
            continue

        # Periodically copy the online weights into the target network.
        if frame_idx % SYNC_TARGET_FRAMES == 0:
            tgt_net.load_state_dict(net.state_dict())

        optimizer.zero_grad()
        batch = buffer.sample(BATCH_SIZE)
        # Use the configured device rather than a hard-coded 'cpu' so the
        # batch tensors always live where the networks do.
        loss_t = calc_loss(batch, net, tgt_net, device=device)
        loss_t.backward()
        optimizer.step()
finally:
    writer.close()

def _label_and_show(x_label, y_label):
    """Label the axes of the current pyplot figure and display it."""
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()


# Mean reward over the last (up to) 100 games, after each finished game.
plt.plot(num_games, mean_rewards)
_label_and_show('Number of games played', 'Mean reward')

# Raw total reward of each individual game.
game_numbers = range(1, len(total_rewards) + 1)
plt.plot(game_numbers, total_rewards)
_label_and_show('Number of games played', 'Total reward')

# Distribution of per-game total rewards.
plt.hist(total_rewards, bins=30)
_label_and_show('Reward', 'Count')

# Exploration rate, recorded every 100 frames during training.
plt.plot(frames, epsilons)
_label_and_show('Number of frames played', 'Epsilon')


DQN(
  (conv): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
  )
  (fc): Sequential(
    (0): Linear(in_features=3136, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=6, bias=True)
  )
)
1109 frames over 1 games, mean reward -19.000, eps 0.99, speed 709.92 f/s
2087 frames over 2 games, mean reward -20.000, eps 0.98, speed 662.61 f/s
3125 frames over 3 games, mean reward -20.000, eps 0.97, speed 594.22 f/s
4212 frames over 4 games, mean reward -19.750, eps 0.96, speed 627.58 f/s
5132 frames over 5 games, mean reward -19.800, eps 0.95, speed 567.53 f/s
6015 frames over 6 games, mean reward -20.000, eps 0.94, speed 660.61 f/s
6971 frames over 7 games, mean reward -20.000, eps 0.93, speed 535.78 f/s
7762 frames over 8 games, mean reward -20.125, eps 0.92, speed

  next_state_values[done_mask] = 0.0


10647 frames over 11 games, mean reward -19.909, eps 0.89, speed 30.84 f/s
11428 frames over 12 games, mean reward -20.000, eps 0.89, speed 23.27 f/s
12268 frames over 13 games, mean reward -20.000, eps 0.88, speed 21.69 f/s
13031 frames over 14 games, mean reward -20.071, eps 0.87, speed 21.84 f/s
13914 frames over 15 games, mean reward -20.133, eps 0.86, speed 21.70 f/s
14677 frames over 16 games, mean reward -20.188, eps 0.85, speed 21.24 f/s
15562 frames over 17 games, mean reward -20.235, eps 0.84, speed 21.41 f/s
16398 frames over 18 games, mean reward -20.278, eps 0.84, speed 21.74 f/s
17221 frames over 19 games, mean reward -20.316, eps 0.83, speed 21.75 f/s
18159 frames over 20 games, mean reward -20.300, eps 0.82, speed 20.61 f/s
19087 frames over 21 games, mean reward -20.286, eps 0.81, speed 20.46 f/s
19906 frames over 22 games, mean reward -20.318, eps 0.80, speed 20.10 f/s
20777 frames over 23 games, mean reward -20.304, eps 0.79, speed 20.33 f/s
21755 frames over 24 game

KeyboardInterrupt: 

In [4]:
import tensorflow as tf

In [6]:
# Report the installed TensorFlow / Keras versions.
print(f"Tensorflow version: {tf.__version__}")
print(f"Keras Version: {tf.keras.__version__}")

# An empty device list is falsy, meaning TensorFlow found no GPU.
if tf.config.list_physical_devices('GPU'):
    gpu_status = "available"
else:
    gpu_status = "NOT AVAILABLE"
print("GPU is", gpu_status)

Tensorflow version: 2.11.0
Keras Version: 2.11.0
GPU is NOT AVAILABLE
