**Chapter 18 – Reinforcement Learning – Example 2 HEM**

# Setup

This project requires Python 3.7 or above:

In [1]:
import sys

# Stop immediately on interpreters older than Python 3.7 (major/minor only).
assert sys.version_info[:2] >= (3, 7)

And TensorFlow ≥ 2.8:

In [2]:
from packaging import version
import tensorflow as tf

# Refuse to continue when the installed TensorFlow is older than 2.8.0.
assert version.parse("2.8.0") <= version.parse(tf.__version__)

In [3]:
import matplotlib.animation
import matplotlib.pyplot as plt


In [4]:
import numpy as np
# Fix NumPy's global random state for the cells that follow.
np.random.seed(seed=42)

# Deep Q-Network (Sentdex)

In [5]:

#import keras.backend.tensorflow_backend as backend
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Activation, Flatten
from keras.optimizers import Adam
from keras.callbacks import TensorBoard



In [6]:
from collections import deque
import time
import random
from tqdm import tqdm
import os
from PIL import Image
import cv2


In [7]:

# --- Learning settings ---
# Discount factor applied to the estimated future Q value.
DISCOUNT = 0.99
# Capacity of the replay memory (only the most recent transitions are kept).
REPLAY_MEMORY_SIZE = 50_000
# Training is skipped until the replay memory holds at least this many transitions.
MIN_REPLAY_MEMORY_SIZE = 1_000
# Number of transitions sampled from the replay memory for each training call.
MINIBATCH_SIZE = 64
# Threshold, counted in finished episodes (terminal states), for syncing the target network.
UPDATE_TARGET_EVERY = 5
# Prefix used in the file names of saved models.
MODEL_NAME = '2x256'
# A model is saved only when the windowed minimum reward is at least this value.
MIN_REWARD = -200
# Referenced only by the commented-out GPU memory configuration in this section.
MEMORY_FRACTION = 0.20

# --- Environment settings ---
# Total number of episodes to run.
EPISODES = 20_000

# --- Exploration settings ---
# Current exploration rate; not a constant, it is decayed after every episode.
epsilon = 1
# Multiplicative decay applied to epsilon per episode.
EPSILON_DECAY = 0.99975
# Lower bound that epsilon is never decayed below.
MIN_EPSILON = 0.001

# --- Stats settings ---
# Window size (in episodes) for reward aggregation and preview rendering.
AGGREGATE_STATS_EVERY = 50
# When True, the environment is rendered on aggregation episodes.
SHOW_PREVIEW = False


class Blob:
    """A single cell-sized entity on a square ``size`` x ``size`` grid.

    The blob is placed at a random cell on construction and can be moved
    either by one of nine discrete actions or by explicit offsets. Its
    position is always clamped to the grid.
    """

    # Action index -> (dx, dy) offset. Index 8 is the "stay in place" action.
    _MOVES = {
        0: (1, 1),
        1: (-1, -1),
        2: (-1, 1),
        3: (1, -1),
        4: (1, 0),
        5: (-1, 0),
        6: (0, 1),
        7: (0, -1),
        8: (0, 0),
    }

    def __init__(self, size):
        """Place the blob on a random cell of a ``size`` x ``size`` grid."""
        self.size = size
        self.x = np.random.randint(0, size)
        self.y = np.random.randint(0, size)

    def __str__(self):
        return f"Blob ({self.x}, {self.y})"

    def __sub__(self, other):
        """Return the ``(dx, dy)`` offset from ``other`` to this blob."""
        return (self.x - other.x, self.y - other.y)

    def __eq__(self, other):
        """Two blobs are equal when they occupy the same cell."""
        return self.x == other.x and self.y == other.y

    def action(self, choice):
        """Apply one of the 9 movement options (0,1,2,3,4,5,6,7,8).

        Choices 0-3 are diagonal moves, 4-5 move along x, 6-7 move along y
        and 8 stays in place. Any other value is ignored.
        """
        offset = self._MOVES.get(choice)
        if offset is not None:
            dx, dy = offset
            self.move(x=dx, y=dy)

    def move(self, x=False, y=False):
        """Shift the blob by ``(x, y)`` and clamp it to the grid.

        An axis left at its default (``False``, or ``None``) is moved by a
        random step drawn from {-1, 0, 1}. An explicit ``0`` means "do not
        move on this axis"; the sentinel is therefore tested by identity
        rather than truthiness, otherwise a zero offset would be mistaken
        for a missing value and randomised.
        """
        if x is False or x is None:
            self.x += np.random.randint(-1, 2)
        else:
            self.x += x

        if y is False or y is None:
            self.y += np.random.randint(-1, 2)
        else:
            self.y += y

        # Keep the blob inside the grid.
        upper = self.size - 1
        self.x = min(max(self.x, 0), upper)
        self.y = min(max(self.y, 0), upper)


class BlobEnv:
    """Grid world with one player, one food item and one enemy.

    Each step costs ``MOVE_PENALTY``. An episode ends when the player lands
    on the food (reward ``FOOD_REWARD``), lands on the enemy (reward
    ``-ENEMY_PENALTY``), or after ``MAX_EPISODE_STEPS`` steps.
    """

    SIZE = 10
    RETURN_IMAGES = True
    MOVE_PENALTY = 1
    ENEMY_PENALTY = 300
    FOOD_REWARD = 25
    MAX_EPISODE_STEPS = 200  # an episode is cut off after this many steps
    OBSERVATION_SPACE_VALUES = (SIZE, SIZE, 3)  # 4 (presumably the length of the non-image observation)
    ACTION_SPACE_SIZE = 9
    PLAYER_N = 1  # player key in dict
    FOOD_N = 2  # food key in dict
    ENEMY_N = 3  # enemy key in dict
    # Colour per blob key. The original author's notes describe these tuples as
    # BGR (player blue, food green, enemy red) -- confirm if exact colours matter.
    d = {1: (255, 175, 0),
         2: (0, 255, 0),
         3: (0, 0, 255)}

    def reset(self):
        """Start a new episode and return the initial observation.

        The player, food and enemy are re-created at random cells; food and
        enemy are re-drawn until no two blobs share a cell.
        """
        self.player = Blob(self.SIZE)
        self.food = Blob(self.SIZE)
        while self.food == self.player:
            self.food = Blob(self.SIZE)
        self.enemy = Blob(self.SIZE)
        while self.enemy == self.player or self.enemy == self.food:
            self.enemy = Blob(self.SIZE)

        self.episode_step = 0
        return self._observation()

    def step(self, action):
        """Move the player by ``action`` and return ``(observation, reward, done)``."""
        self.episode_step += 1
        self.player.action(action)

        #### MAYBE ###
        #enemy.move()
        #food.move()
        ##############

        new_observation = self._observation()

        # Decide the outcome from the collisions themselves rather than by
        # comparing reward values, so it stays correct if the reward
        # constants are changed.
        hit_enemy = self.player == self.enemy
        ate_food = (not hit_enemy) and self.player == self.food

        if hit_enemy:
            reward = -self.ENEMY_PENALTY
        elif ate_food:
            reward = self.FOOD_REWARD
        else:
            reward = -self.MOVE_PENALTY

        done = bool(hit_enemy or ate_food or self.episode_step >= self.MAX_EPISODE_STEPS)

        return new_observation, reward, done

    def render(self):
        """Show the current grid in an OpenCV window, enlarged to 300x300."""
        img = self.get_image()
        img = img.resize((300, 300))  # enlarge so the single-pixel blobs are visible
        cv2.imshow("image", np.array(img))
        cv2.waitKey(1)

    def _observation(self):
        """Build the observation for the current blob positions.

        Returns the rendered grid as an array when ``RETURN_IMAGES`` is set,
        otherwise a 4-tuple of the player's offsets to the food and enemy.
        """
        if self.RETURN_IMAGES:
            return np.array(self.get_image())
        return (self.player - self.food) + (self.player - self.enemy)

    # FOR CNN #
    def get_image(self):
        """Return a ``SIZE`` x ``SIZE`` PIL image with one coloured pixel per blob.

        Blobs are drawn in the order food, enemy, player, so a later blob
        overwrites an earlier one that occupies the same cell.
        """
        grid = np.zeros((self.SIZE, self.SIZE, 3), dtype=np.uint8)
        grid[self.food.x][self.food.y] = self.d[self.FOOD_N]
        grid[self.enemy.x][self.enemy.y] = self.d[self.ENEMY_N]
        grid[self.player.x][self.player.y] = self.d[self.PLAYER_N]
        # The array is wrapped as an 'RGB' image; the colour tuples are stored as written in ``d``.
        img = Image.fromarray(grid, 'RGB')
        return img


env = BlobEnv()

# For stats: per-episode rewards, starting with a single placeholder entry
ep_rewards = [-200]

# For more reproducible results: seed every random number generator in use
random.seed(1)
np.random.seed(1)
tf.random.set_seed(1)

# Memory fraction, used mostly when training multiple agents
# (the two lines below use TF1-only APIs and are kept for reference)
#gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=MEMORY_FRACTION)
#backend.set_session(tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)))

# Create models folder (no error if it already exists)
os.makedirs('models', exist_ok=True)


# Own Tensorboard class
class ModifiedTensorBoard(TensorBoard):
    """TensorBoard callback that writes every ``.fit()`` call to one log file.

    The stock callback starts a fresh log (and restarts its step counter)
    for each ``.fit()`` call. This variant owns a single summary writer and
    logs against ``self.step``, which the caller is expected to advance.
    """

    # Overriding init to set initial step and writer (we want one log file for all .fit() calls)
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.step = 1
        # TF2 API; tf.summary.FileWriter only exists in TF1.
        self.writer = tf.summary.create_file_writer(self.log_dir)

    # Overriding this method to stop creating default log writer
    def set_model(self, model):
        pass

    # NOTE(review): the stock TensorBoard hooks below appear to rely on state
    # that the base set_model() would have prepared. Because set_model() is
    # skipped, they are turned into no-ops -- confirm against the installed
    # Keras version.
    def on_train_begin(self, logs=None):
        pass

    def on_test_begin(self, logs=None):
        pass

    def on_test_end(self, logs=None):
        pass

    def on_epoch_begin(self, epoch, logs=None):
        pass

    def on_train_batch_begin(self, batch, logs=None):
        pass

    def on_train_batch_end(self, batch, logs=None):
        pass

    # Overridden, saves logs with our step number
    # (otherwise every .fit() will start writing from 0th step)
    def on_epoch_end(self, epoch, logs=None):
        # logs may be None; treat that as "nothing to record".
        self.update_stats(**(logs or {}))

    # Overridden
    # We train for one batch only, no need to save anything at batch end
    def on_batch_end(self, batch, logs=None):
        pass

    # Overridden, so won't close writer
    def on_train_end(self, _):
        pass

    # Custom method for saving own metrics
    def update_stats(self, **stats):
        """Write each keyword argument as a scalar summary at ``self.step``."""
        with self.writer.as_default():
            for name, value in stats.items():
                tf.summary.scalar(name, value, step=self.step)
        self.writer.flush()


# Agent class
class DQNAgent:
    """Deep Q-Network agent with a replay memory and a target network.

    ``model`` is trained on every call to :meth:`train`; ``target_model``
    supplies the future Q values and is synced with ``model`` once
    ``UPDATE_TARGET_EVERY`` terminal states have been seen.
    """

    def __init__(self):

        # Main model
        self.model = self.create_model()

        # Target network, starts as an exact copy of the main model
        self.target_model = self.create_model()
        self.target_model.set_weights(self.model.get_weights())

        # The last REPLAY_MEMORY_SIZE transitions, used for training
        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

        # Custom tensorboard object
        #self.tensorboard = ModifiedTensorBoard(log_dir="logs/{}-{}".format(MODEL_NAME, int(time.time())))

        # Counts terminal states since the target network was last synced
        self.target_update_counter = 0

    def create_model(self):
        """Build and compile the convolutional Q-network.

        The input shape and the number of outputs are taken from the
        module-level ``env`` (``OBSERVATION_SPACE_VALUES`` and
        ``ACTION_SPACE_SIZE``).
        """
        model = Sequential()

        model.add(Conv2D(256, (3, 3), input_shape=env.OBSERVATION_SPACE_VALUES))  # OBSERVATION_SPACE_VALUES = (10, 10, 3) a 10x10 RGB image.
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.2))

        model.add(Conv2D(256, (3, 3)))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.2))

        model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
        model.add(Dense(64))

        model.add(Dense(env.ACTION_SPACE_SIZE, activation='linear'))  # ACTION_SPACE_SIZE = how many choices (9)
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        model.compile(loss="mse", optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
        return model

    # Adds step's data to a memory replay array
    # (observation space, action, reward, new observation space, done)
    def update_replay_memory(self, transition):
        """Append one ``(state, action, reward, new_state, done)`` transition."""
        self.replay_memory.append(transition)

    # Trains main network every step during episode
    def train(self, terminal_state, step):
        """Fit the main model on one random minibatch from the replay memory.

        Args:
            terminal_state: True when the step that was just taken ended the
                episode; used to count towards the next target-network sync.
            step: Step number within the episode. Currently unused; kept so
                existing callers keep working.
        """

        # Start training only if certain number of samples is already saved
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return

        # Get a minibatch of random samples from memory replay table
        minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)

        # Current states (scaled by 1/255) and the main model's Q values for them.
        # verbose=0 keeps predict() from printing a progress bar on every call.
        current_states = np.array([transition[0] for transition in minibatch])/255
        target_qs = self.model.predict(current_states, verbose=0)

        # Next states and the target network's Q values for them
        new_current_states = np.array([transition[3] for transition in minibatch])/255
        future_qs = self.target_model.predict(new_current_states, verbose=0)

        actions = np.array([transition[1] for transition in minibatch])
        rewards = np.array([transition[2] for transition in minibatch], dtype=np.float64)
        dones = np.array([transition[4] for transition in minibatch], dtype=bool)

        # Terminal transitions have no future, so their target is the plain
        # reward; all others add the discounted best future Q value.
        new_qs = np.where(dones, rewards, rewards + DISCOUNT * future_qs.max(axis=1))

        # Only the Q value of the action actually taken is replaced
        target_qs[np.arange(len(minibatch)), actions] = new_qs

        # Fit on all samples as one batch, log only on terminal state
        #self.model.fit(np.array(X)/255, np.array(y), batch_size=MINIBATCH_SIZE, verbose=0, shuffle=False, callbacks=[self.tensorboard] if terminal_state else None)
        self.model.fit(current_states, target_qs, batch_size=MINIBATCH_SIZE, verbose=0, shuffle=False, callbacks=None)

        # Update target network counter every episode
        if terminal_state:
            self.target_update_counter += 1

        # Sync the target network once the counter reaches UPDATE_TARGET_EVERY
        # (`>=`, so the sync really happens every UPDATE_TARGET_EVERY episodes)
        if self.target_update_counter >= UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0

    # Queries main network for Q values given current observation space (environment state)
    def get_qs(self, state):
        """Return the main model's Q values for a single ``state`` array."""
        batch = np.array(state).reshape(-1, *state.shape)/255
        return self.model.predict(batch, verbose=0)[0]




In [None]:
agent = DQNAgent()

# Run the configured number of training episodes
for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):

    # Update tensorboard step every episode
    #agent.tensorboard.step = episode

    # New episode: zero the reward total, restart the step counter at 1
    # and fetch the environment's initial observation
    episode_reward, step = 0, 1
    current_state = env.reset()

    # Keep stepping until the environment reports the episode is over
    done = False
    while not done:

        # Epsilon-greedy choice: explore with a random action, otherwise
        # take the action with the highest predicted Q value
        if np.random.random() <= epsilon:
            action = np.random.randint(0, env.ACTION_SPACE_SIZE)
        else:
            action = np.argmax(agent.get_qs(current_state))

        new_state, reward, done = env.step(action)

        # Accumulate this episode's total reward
        episode_reward += reward

        if SHOW_PREVIEW and episode % AGGREGATE_STATS_EVERY == 0:
            env.render()

        # Store the transition, then train the main network on every step
        agent.update_replay_memory((current_state, action, reward, new_state, done))
        agent.train(done, step)

        current_state = new_state
        step += 1

    # Record the episode reward; aggregate stats on the first episode and
    # then once every AGGREGATE_STATS_EVERY episodes
    ep_rewards.append(episode_reward)
    if episode == 1 or episode % AGGREGATE_STATS_EVERY == 0:
        recent_rewards = ep_rewards[-AGGREGATE_STATS_EVERY:]
        average_reward = sum(recent_rewards)/len(recent_rewards)
        min_reward = min(recent_rewards)
        max_reward = max(recent_rewards)
        #agent.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward, epsilon=epsilon)

        # Save the model only when the worst recent episode reached MIN_REWARD
        if min_reward >= MIN_REWARD:
            agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')

    # Decay epsilon, never letting it drop below MIN_EPSILON
    if epsilon > MIN_EPSILON:
        epsilon = max(MIN_EPSILON, epsilon * EPSILON_DECAY)

  0%|                                     | 0/20000 [00:00<?, ?episodes/s]



  0%|                           | 13/20000 [00:00<02:57, 112.66episodes/s]



  0%|                           | 13/20000 [00:20<02:57, 112.66episodes/s]



  0%|                          | 23/20000 [00:32<9:29:29,  1.71s/episodes]



  0%|                         | 24/20000 [00:41<12:04:19,  2.18s/episodes]



  0%|                         | 24/20000 [01:00<12:04:19,  2.18s/episodes]



  0%|                         | 28/20000 [01:14<21:39:15,  3.90s/episodes]



  0%|                         | 29/20000 [01:16<20:56:57,  3.78s/episodes]



  0%|                         | 29/20000 [01:30<20:56:57,  3.78s/episodes]



  0%|                         | 32/20000 [01:43<29:19:25,  5.29s/episodes]



  0%|                         | 33/20000 [02:03<38:45:17,  6.99s/episodes]



  0%|                         | 34/20000 [02:11<39:33:21,  7.13s/episodes]



  0%|                         | 36/20000 [02:21<36:05:24,  6.51s/episodes]



  0%|                         | 37/20000 [02:26<34:17:56,  6.19s/episodes]



  0%|                         | 38/20000 [02:34<36:48:01,  6.64s/episodes]



  0%|                         | 39/20000 [02:37<32:24:24,  5.84s/episodes]



  0%|                         | 40/20000 [02:39<27:30:39,  4.96s/episodes]



  0%|                         | 41/20000 [02:40<22:07:14,  3.99s/episodes]



  0%|                         | 42/20000 [02:43<19:33:53,  3.53s/episodes]



  0%|                         | 43/20000 [02:48<21:48:22,  3.93s/episodes]



  0%|                         | 44/20000 [02:50<18:50:40,  3.40s/episodes]



  0%|                         | 45/20000 [02:52<17:43:39,  3.20s/episodes]



  0%|                         | 46/20000 [02:55<17:10:45,  3.10s/episodes]



  0%|                         | 47/20000 [02:57<15:28:16,  2.79s/episodes]



  0%|                         | 48/20000 [03:15<40:19:23,  7.28s/episodes]



  0%|                         | 49/20000 [03:24<43:07:07,  7.78s/episodes]



  0%|                         | 50/20000 [03:25<31:42:04,  5.72s/episodes]



  0%|                         | 51/20000 [03:26<23:37:23,  4.26s/episodes]



  0%|                         | 52/20000 [03:27<18:29:59,  3.34s/episodes]



  0%|                         | 53/20000 [03:32<20:04:58,  3.62s/episodes]



  0%|                         | 54/20000 [03:32<14:45:52,  2.66s/episodes]



  0%|                         | 55/20000 [03:37<18:24:55,  3.32s/episodes]



  0%|                         | 56/20000 [03:41<20:26:06,  3.69s/episodes]



  0%|                         | 57/20000 [03:58<42:40:16,  7.70s/episodes]



  0%|                         | 58/20000 [04:15<57:43:49, 10.42s/episodes]



  0%|                         | 59/20000 [04:16<41:33:43,  7.50s/episodes]



  0%|                         | 60/20000 [04:18<32:40:29,  5.90s/episodes]



  0%|                         | 61/20000 [04:26<36:08:58,  6.53s/episodes]



  0%|                        | 62/20000 [07:15<305:33:28, 55.17s/episodes]



  0%|                        | 63/20000 [07:34<246:28:50, 44.51s/episodes]



  0%|                      | 64/20000 [22:19<1641:27:15, 296.41s/episodes]



  0%|                      | 65/20000 [22:25<1160:25:38, 209.56s/episodes]



  0%|                      | 66/20000 [34:06<1976:11:09, 356.89s/episodes]



  0%|                      | 67/20000 [34:26<1416:12:15, 255.77s/episodes]



  0%|                      | 68/20000 [34:35<1006:46:22, 181.84s/episodes]



  0%|                       | 69/20000 [34:51<730:37:30, 131.97s/episodes]



  0%|                        | 70/20000 [34:53<514:32:47, 92.94s/episodes]



  0%|                        | 71/20000 [34:53<360:50:34, 65.18s/episodes]



  0%|                        | 72/20000 [34:57<258:32:24, 46.71s/episodes]



  0%|                        | 73/20000 [35:13<208:43:46, 37.71s/episodes]



  0%|                        | 74/20000 [35:20<156:17:30, 28.24s/episodes]



  0%|                        | 75/20000 [35:23<115:09:07, 20.81s/episodes]



  0%|                         | 76/20000 [35:31<93:31:14, 16.90s/episodes]



  0%|                         | 77/20000 [35:47<92:53:40, 16.79s/episodes]



  0%|                         | 78/20000 [36:04<92:10:04, 16.66s/episodes]



  0%|                         | 79/20000 [36:07<69:40:23, 12.59s/episodes]



  0%|1                        | 80/20000 [36:19<69:41:13, 12.59s/episodes]



  0%|1                        | 81/20000 [36:25<57:50:39, 10.45s/episodes]



  0%|                        | 82/20000 [37:08<111:13:52, 20.10s/episodes]



  0%|                      | 83/20000 [52:42<1629:11:37, 294.48s/episodes]



  0%|                      | 84/20000 [52:49<1152:05:34, 208.25s/episodes]



  0%|                       | 85/20000 [53:07<835:48:14, 151.09s/episodes]



  0%|                       | 86/20000 [53:17<601:18:10, 108.70s/episodes]



  0%|1                       | 87/20000 [53:35<451:18:35, 81.59s/episodes]



  0%|1                       | 88/20000 [53:37<319:04:00, 57.69s/episodes]



  0%|1                       | 89/20000 [53:46<237:29:04, 42.94s/episodes]



  0%|1                       | 90/20000 [53:47<168:10:43, 30.41s/episodes]



  0%|1                       | 91/20000 [53:47<118:07:53, 21.36s/episodes]

