In [1]:
import gym_super_mario_bros
import gym
import numpy as np
import cv2
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
from collections import deque

from gym.spaces import Box
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import RIGHT_ONLY
from gym.wrappers import FrameStack

from tensorflow.keras.layers import Input, Dense, Conv2D, Flatten
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import Huber

In [2]:
class SkipWrapper(gym.Wrapper):
    def __init__(self, env, skip):
        super().__init__(env)
        self.skip = skip

    def step(self, action):
        total_reward = 0.0
        done = False
        for i in range(self.skip):
            state, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return state, reward, done, info

In [3]:
class GrayScaleWrapper(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)

    def observation(self, observation):
        observation = cv2.cvtColor(observation, cv2.COLOR_RGB2GRAY)
        return observation

In [4]:
class ResizeWrapper(gym.ObservationWrapper):
    def __init__(self, env, width, height, color_depth=1):
        super().__init__(env)
        self.width = width
        self.height = height
        self.color_depth = color_depth
        self.observation_space = Box(low=0, high=255, shape=(self.width, self.height, color_depth), dtype=np.uint8)

    def observation(self, observation):
        observation = cv2.resize(observation, (self.width, self.height), interpolation=cv2.INTER_AREA)
        observation = np.expand_dims(observation, -1)
        return observation

In [5]:
env = gym_super_mario_bros.make("SuperMarioBros-v0")
env = JoypadSpace(env, RIGHT_ONLY)
env = FrameStack(ResizeWrapper(GrayScaleWrapper(SkipWrapper(env, skip=4)), width=84, height=84), num_stack=4)
env.reset()

  logger.warn(


<gym.wrappers.frame_stack.LazyFrames at 0x7fe6f87fca40>

In [6]:
states = env.observation_space.shape
actions = env.action_space.n

In [7]:
states, actions

((4, 84, 84, 1), 5)

In [8]:
def build_model(states, actions):
    # 84x84x1
    inputs = Input(shape=states)
    fe1 = Conv2D(filters=32, kernel_size=8, strides=4, activation='relu')(inputs)
    fe2 = Conv2D(filters=32, kernel_size=4, strides=2,  activation='relu')(fe1)
    fe3 = Conv2D(filters=64, kernel_size=3, strides=1,  activation='relu')(fe2)
    fe4 = Flatten()(fe3)
    outputs = Dense(actions, activation="linear")(fe4)
    return Model(inputs=inputs, outputs=outputs)

In [9]:
model = build_model(states, actions)

In [10]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 4, 84, 84, 1)]    0         
                                                                 
 conv2d (Conv2D)             (None, 4, 20, 20, 32)     2080      
                                                                 
 conv2d_1 (Conv2D)           (None, 4, 9, 9, 32)       16416     
                                                                 
 conv2d_2 (Conv2D)           (None, 4, 7, 7, 64)       18496     
                                                                 
 flatten (Flatten)           (None, 12544)             0         
                                                                 
 dense (Dense)               (None, 5)                 62725     
                                                                 
Total params: 99,717
Trainable params: 99,717
Non-trainable p

In [11]:
target_model = build_model(states, actions)

In [12]:
class Agent:
    def __init__(self, actions, model, target_model, epsilon=1.0, epsilon_min=0.1, epsilon_random_frames=50000,epsilon_greedy_frames=1000000, batch_size=32, gamma=0.99, update_target_network=10000,update_after_actions=4, max_memory_length=10000):
        self.actions = actions
        self.model = model
        self.target_model = target_model
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_max = epsilon
        self.epsilon_interval = epsilon - epsilon_min
        self.epsilon_random_frames = epsilon_random_frames
        self.epsilon_greedy_frames = epsilon_greedy_frames
        self.memory = []
        self.batch_size = batch_size
        self.gamma = gamma
        self.update_after_actions = update_after_actions
        self.update_target_network=update_target_network
        self.max_memory_length = max_memory_length
        self.optimizer = Adam(learning_rate=0.00025, clipnorm=1.0)
        self.loss_function = Huber()


    def act(self,state, frame_count):
        # Explore
        if frame_count < self.epsilon_random_frames or self.epsilon > np.random.rand(1)[0]:
            action = np.random.choice(self.actions)
        # Exploit
        else:
            # convert state to a tensorflow tensor/
            state_tensor = tf.convert_to_tensor(state)
            state_tensor = tf.expand_dims(state_tensor, 0)
            action_probs = model(state_tensor, training=False)
            action = tf.argmax(action_probs[0]).numpy()

        # calculate new epsilon
        self.epsilon -= self.epsilon_interval / self.epsilon_greedy_frames
        # get the maximum between epsilon and epsilon_minimum to prevent epsilon to go below the minimum
        self.epsilon = max(self.epsilon, self.epsilon_min)
        return action

    def cache(self, action, state, state_next, reward, done):
        self.memory.append((action, state, state_next, reward, done))

    def recall(self):
        #print(self.batch_size)
        # batch = np.random.sample(self.memory, self.batch_size)
        indexes = np.random.choice(len(self.memory), self.batch_size)
        batch = np.array([self.memory[i] for i in indexes])
        action, state, next_state, reward, done = map(np.array, zip(*batch))

        if len(self.memory) > self.max_memory_length:
            del self.memory[:1]

        return action, state, next_state, reward, done

    def learn(self, frame_count):
        action_sample, state_sample, state_next_sample, rewards_sample, done_sample = self.recall()

        future_rewards = self.target_model.predict(state_next_sample)
        done_sample = tf.convert_to_tensor([float(sample) for sample in done_sample])
        updated_q_values = rewards_sample + self.gamma * tf.reduce_max(
            future_rewards, axis=1
        )
        # If final frame set the last value to -1
        updated_q_values = updated_q_values * (1 - done_sample) - done_sample
        # Create a mask so we only calculate loss on the updated Q-values. One hot the actions.
        masks = tf.one_hot(action_sample, actions)

        with tf.GradientTape() as tape:
            # Train the model on the states and updated Q-values
            q_values = model(state_sample)
            # Apply the masks to the Q-values to get the Q-value for action taken
            q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
            # Calculate loss between new Q-value and old Q-value
            loss = self.loss_function(updated_q_values, q_action)

        # Backpropagation
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

        if frame_count % self.update_target_network == 0:
            # update the the target network with new weights
            self.target_model.set_weights(model.get_weights())


In [13]:
agent = Agent(actions, model, target_model)

In [14]:
episode_reward_history = []
running_reward = 0
episode_count = 0
frame_count = 0
max_steps_per_episode = 10000

while episode_count < 5000:
    state = np.array(env.reset())
    episode_reward = 0
    for timestep in range(1, max_steps_per_episode):
        frame_count += 1
        env.render()

        action = agent.act(state, frame_count)
        state_next, reward, done, info = env.step(action)
        state_next = np.array(state_next)

        agent.cache(action, state, state_next, reward, done)

        episode_reward += reward
        state = state_next

        if frame_count % agent.update_after_actions == 0 and len(agent.memory) > agent.batch_size:
            agent.learn(frame_count)

        if frame_count % agent.update_target_network == 0:
            template = "running reward: {:.2f} at episode {}, frame count {}, epsilon: {}"
            print(template.format(running_reward, episode_count, frame_count, agent.epsilon))

        if done:
            break

    episode_reward_history.append(episode_reward)
    if len(episode_reward_history) > 100:
        del episode_reward_history[:1]
    running_reward = np.mean(episode_reward_history)
    episode_count += 1

  batch = np.array([self.memory[i] for i in indexes])


KeyboardInterrupt: 

In [None]:
state, next_state, action, reward, done = map(list, zip(*agent.memory))

In [None]:
full_history = {
    "episode_rewards": episode_reward_history
}
pd.DataFrame(full_history).plot(figsize=(12, 8))
plt.grid(True)
plt.show()