In [50]:
# Use %pip rather than !pip so the packages are installed into the environment
# of the running kernel and not whichever `pip` happens to be first on PATH.
%pip install tensorflow
# tensorflow_probability is imported by the next cell but is not a dependency
# of the `tensorflow` package, so it has to be installed explicitly.
%pip install tensorflow-probability
# Pinned to the version this notebook was last run against (see the cell output).
%pip install trfl==1.1.0
%pip install gym

Collecting trfl
  Using cached trfl-1.1.0-py3-none-any.whl (99 kB)
Installing collected packages: trfl
Successfully installed trfl-1.1.0


In [173]:
import tensorflow as tf
import tensorflow_probability as tfp
import random
import copy
import numpy as np
import trfl

class DQNAgent():
    """Deep Q-Network agent with epsilon-greedy exploration.

    The agent owns an online Q-network, a periodically synchronised target
    Q-network, a fixed-capacity replay buffer and a linearly decaying
    exploration rate.
    """

    def __init__(self, act_dim, hidden_size=256, learning_rate=1e-3, gamma=0.99,
        max_replay_size=10000, batch_size=256, epsilon_min=0.05, epsilon_dec=5e-6,
        target_update=1000):
        """Create the networks, optimizer, replay buffer and counters.

        Args:
            act_dim (int): Number of discrete actions; width of the output layer.
            hidden_size (int): Width of the single hidden Dense layer.
            learning_rate (float): Learning rate passed to the Adam optimizer.
            gamma (float): Discount factor used in the Bellman target.
            max_replay_size (int): Capacity of the replay buffer.
            batch_size (int): Transitions sampled per update. No update is
                performed until the buffer holds at least this many.
            epsilon_min (float): Lower bound for the exploration rate.
            epsilon_dec (float): Amount subtracted from epsilon on every
                call to `learn`.
            target_update (int): Number of updates between target-network syncs.
        """
        self.num_actions = act_dim
        self.q_network = tf.keras.Sequential(
            [
                tf.keras.layers.Dense(hidden_size),
                tf.keras.layers.ReLU(),
                tf.keras.layers.Dense(self.num_actions),
            ]
        )
        # No input shape is given, so the layers create their variables on the
        # first forward pass. This copy therefore shares the architecture but
        # not the weights; `learn` synchronises the two before the first update.
        self.target_q_network = copy.deepcopy(self.q_network)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        self.gamma = gamma
        self.max_replay_size = max_replay_size
        self.batch_size = batch_size
        self.replay_buffer = []
        # Total number of transitions ever stored (not the current buffer length).
        self.replay_ctr = 0

        # Start fully random and decay towards `epsilon_min`.
        self.epsilon = 1.0
        self.epsilon_min = epsilon_min
        self.epsilon_dec = epsilon_dec

        # Number of gradient updates performed so far.
        self.learn_ctr = 0
        self.target_update = target_update

    def decrement_epsilon(self):
        """Subtract `epsilon_dec` from epsilon, clamping at `epsilon_min`."""
        self.epsilon -= self.epsilon_dec

        if self.epsilon < self.epsilon_min:
            self.epsilon = self.epsilon_min

    def choose_action(self, observation):
        """Pick an action for a single observation using epsilon-greedy.

        Args:
            observation: One unbatched observation; a batch dimension is added
                before it is passed to the Q-network.

        Returns:
            int: Index of the chosen action in `[0, num_actions)`.
        """
        # Epsilon greedy: Choose random action.
        if np.random.random() < self.epsilon:
            action = np.random.choice(self.num_actions)

        # Epsilon greedy: Choose greedy action.
        else:
            # Use the same float32 dtype that `sample_replay` feeds the network.
            observation = tf.convert_to_tensor(
                np.asarray(observation, dtype=np.float32))

            # Add batch dimension
            observation = tf.expand_dims(observation, axis=0)

            # Compute q-values, shape (1, num_actions)
            q_values = self.q_network(observation)

            # Choose the action with the highest q-value
            action = tf.argmax(q_values, axis=1)

            # Drop the batch dimension
            action = action.numpy()[0]

        # Return a plain Python int from both branches.
        return int(action)

    def update_target_network(self):
        """Copy the online weights into the target network when due.

        The copy happens only when `learn_ctr` is a multiple of `target_update`.
        """
        if self.learn_ctr % self.target_update == 0:
            self.target_q_network.set_weights(self.q_network.get_weights())

    def store(self, observation, action, reward, next_observation, done):
        """Add one transition to the replay buffer.

        The buffer grows until it reaches `max_replay_size`; after that the
        oldest entries are overwritten in ring-buffer order.
        """
        experience_tuple = (observation, action, reward, next_observation, done)

        if self.replay_ctr < self.max_replay_size:
            self.replay_buffer.append(experience_tuple)
        else:
            idx = self.replay_ctr % self.max_replay_size
            self.replay_buffer[idx] = experience_tuple

        self.replay_ctr += 1

    def sample_replay(self):
        """Sample `batch_size` transitions uniformly without replacement.

        Returns:
            tuple: `(observations, actions, rewards, next_observations, dones)`
            as tensors whose first dimension is the batch. `actions` is int32,
            everything else is float32.

        Raises:
            ValueError: If the buffer holds fewer than `batch_size` transitions.
        """
        batch = random.sample(self.replay_buffer, self.batch_size)
        observations, actions, rewards, next_observations, dones = zip(*batch)

        # Stack with numpy first: handing TensorFlow a tuple of separate arrays
        # makes it convert element by element, which is far slower than
        # converting one contiguous array.
        observations = tf.convert_to_tensor(np.asarray(observations, dtype=np.float32))
        actions = tf.convert_to_tensor(np.asarray(actions, dtype=np.int32))
        rewards = tf.convert_to_tensor(np.asarray(rewards, dtype=np.float32))
        next_observations = tf.convert_to_tensor(
            np.asarray(next_observations, dtype=np.float32))
        dones = tf.convert_to_tensor(np.asarray(dones, dtype=np.float32))

        return observations, actions, rewards, next_observations, dones

    def learn(self):
        """Decay epsilon and perform one Q-learning update if enough data exists.

        Epsilon is decayed on every call, including calls that return early
        because the replay buffer is still smaller than `batch_size`.
        """
        self.decrement_epsilon()

        # If too little data in replay, do nothing.
        if len(self.replay_buffer) < self.batch_size:
            return

        observations, actions, rewards, next_observations, dones = self.sample_replay()

        if self.learn_ctr == 0:
            # Both networks create their variables on first use, each with its
            # own random initialisation. Build them and copy the online weights
            # across so the very first target is not computed by an unrelated
            # network.
            self.q_network(observations)
            self.target_q_network(next_observations)
            self.target_q_network.set_weights(self.q_network.get_weights())

        # The target does not depend on the online network's variables, so it
        # is computed outside the tape to avoid recording it.
        # Max over actions of the target network's q-values, shape (batch,).
        next_q_value = tf.reduce_max(self.target_q_network(next_observations), axis=1)

        # Bellman target; (1 - dones) removes the bootstrap term on terminal steps.
        target = rewards + (1 - dones) * self.gamma * next_q_value

        with tf.GradientTape() as tape:
            # q-values of every action, shape (batch, num_actions).
            q_values = self.q_network(observations)

            # Select, for each row i, the q-value of the action that was
            # actually taken: q_values[i, actions[i]]. Result has shape (batch,).
            q_value = trfl.indexing_ops.batched_index(q_values, actions)

            # Loss is just the mean square error
            loss = tf.losses.MSE(target, q_value)

        # Only the online network is trained.
        variables = self.q_network.trainable_variables

        # Compute gradients with respect to trainable variables
        gradients = tape.gradient(loss, variables)

        # Apply gradients
        self.optimizer.apply_gradients(zip(gradients, variables))

        # Maybe update the target network weights
        self.update_target_network()

        # Increment the learn counter
        self.learn_ctr += 1

In [174]:
def run_episode(environment, agent):
    """Run one episode, letting the agent act, store and learn at every step.

    Both gym calling conventions are supported:

    * older gym: `reset()` returns the observation and `step()` returns
      `(observation, reward, done, info)`;
    * gym >= 0.26 / gymnasium: `reset()` returns `(observation, info)` and
      `step()` returns `(observation, reward, terminated, truncated, info)`.

    Args:
        environment: Object exposing `reset()` and `step(action)`.
        agent: Object exposing `choose_action`, `store` and `learn`.

    Returns:
        Sum of the rewards received during the episode.
    """
    episode_return = 0
    done = False

    reset_result = environment.reset()
    # The newer API returns an (observation, info-dict) pair from reset().
    if (isinstance(reset_result, tuple) and len(reset_result) == 2
            and isinstance(reset_result[1], dict)):
        observation = reset_result[0]
    else:
        observation = reset_result

    while not done:
        action = agent.choose_action(observation)

        step_result = environment.step(action)
        if len(step_result) == 5:
            next_observation, reward, terminated, truncated, info = step_result
            # The episode ends on either flag, but only a real termination is
            # stored as `done`, so a time-limit cut-off does not zero out the
            # bootstrap term of the Bellman target.
            done = bool(terminated or truncated)
        else:
            next_observation, reward, done, info = step_result
            terminated = done

        agent.store(observation, action, reward, next_observation, terminated)

        observation = next_observation

        episode_return += reward

        # Agent learns at every step.
        # Don't need to wait for end of episode.
        agent.learn()

    return episode_return

In [175]:
import datetime

class Logger():
    """Writes scalar logs to a timestamped TensorBoard run directory."""

    def __init__(self, logdir="./logs/"):
        """Create a summary writer in a new sub-directory of `logdir`.

        Args:
            logdir (str): Parent directory for runs. The run itself is written
                to a sub-directory named after the current time
                (`YYYYmmdd-HHMMSS`).
        """
        import os

        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        # Join instead of concatenating so that a `logdir` without a trailing
        # separator (e.g. "./logs") still gets a proper sub-directory rather
        # than a sibling directory such as "./logs20240101-120000".
        logdir = os.path.join(logdir, current_time)

        self.summary_writer = tf.summary.create_file_writer(logdir)

    def write(self, step, logs):
        """Write logs to tensorboard.

        Args:
            step (Int): Training step of the logs.
            logs (Dict[str, float]): Dictionary of logs to be written to tensorboard.
        """
        with self.summary_writer.as_default():
            for key, value in logs.items():
                tf.summary.scalar(key, value, step=step)

In [176]:
def train(environment, agent, logger, num_iter=5000):
    """Run `num_iter` episodes and log each one.

    After every episode the episode return and the agent's current epsilon are
    written to `logger`, using the episode index as the step.

    Args:
        environment: Environment passed through to `run_episode`.
        agent: Agent passed through to `run_episode`; must expose `epsilon`.
        logger: Object exposing `write(step, logs)`.
        num_iter (int): Number of episodes to run.
    """
    # The per-episode list that used to be built here was never read or
    # returned, so it has been dropped; the returns are kept by the logger.
    for i in range(num_iter):
        score = run_episode(environment, agent)

        logs = {
            "return": score,
            "epsilon": agent.epsilon # Lets log epsilon too!
        }

        logger.write(step=i, logs=logs)

In [177]:
import gym

# Create the environment. `environment` and `agent` are module-level names that
# the training cell further down reuses.
environment = gym.make("LunarLander-v2")

# Number of discrete actions; DQNAgent uses it as the width of its output layer.
act_dim = environment.action_space.n

# Agent with all hyper-parameters left at their defaults.
agent = DQNAgent(act_dim)

# Smoke test: run a single episode. Note that run_episode also stores every
# transition and calls agent.learn() at each step, so this already adds to the
# replay buffer and decays epsilon before the real training starts.
episode_return = run_episode(environment, agent)
print("Episode Return:", episode_return)

Episode Return: -121.91977270430608


In [178]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [180]:
%tensorboard --logdir logs/

Reusing TensorBoard on port 6006 (pid 45472), started 0:00:02 ago. (Use '!kill 45472' to kill it.)

In [181]:
# Each Logger() creates a new timestamped run directory under ./logs/.
logger = Logger()

# Train with the default number of episodes (num_iter=5000). This is a long
# run; interrupting the kernel stops it early.
train(environment, agent, logger)

KeyboardInterrupt: 