In [7]:
# 8 worlds and 4 stages = 32 levels
WORLD = 1
STAGE = 1
LEVEL = f"{WORLD}-{STAGE}"
# 0 high, 1 low, 2 down, 3 lowest
QUALITY = 0
DEFAULT_GAME = f"SuperMarioBros-{LEVEL}-v{QUALITY}"
# Simplest list of actions
MY_ACTIONS = [["right"], ["right", "A"]]


# Important to set the 2 params below to affect time, each 50k steps takes
# Literature suggests  batches*each_batch_steps to exceed 10M to get reliably good Mario AI
# change to higher number for real run, 2 for test
batches = 20
# Change to 500k or more to expect see real impact
each_batch_steps = 500_000

In [8]:
import gym
from gym.spaces import Box
import gym_super_mario_bros
from gym.wrappers import FrameStack
from nes_py.wrappers import JoypadSpace
from gym.wrappers import FrameStack
from torchvision import transforms
from stable_baselines3 import PPO
import tensorflow_docs.vis.embed as embed
import numpy as np
import torch
import time
import os
import base64
from PIL import Image

In [9]:
class SkipFrame(gym.Wrapper):
    def __init__(self, env, skip):
        """Return only every `skip`-th frame"""
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        """Repeat action, and sum reward"""
        total_reward = 0.0
        done = False
        for i in range(self._skip):
            # Accumulate reward and repeat the same action
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info


class GrayScaleObservation(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        obs_shape = self.observation_space.shape[:2]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

    def permute_orientation(self, observation):
        # permute [H, W, C] array to [C, H, W] tensor
        observation = np.transpose(observation, (2, 0, 1))
        observation = torch.tensor(observation.copy(), dtype=torch.float)
        return observation

    def observation(self, observation):
        observation = self.permute_orientation(observation)
        transform = transforms.Grayscale()
        observation = transform(observation)
        return observation


class ResizeObservation(gym.ObservationWrapper):
    def __init__(self, env, shape):
        super().__init__(env)
        if isinstance(shape, int):
            self.shape = (shape, shape)
        else:
            self.shape = tuple(shape)

        obs_shape = self.shape + self.observation_space.shape[2:]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

    def observation(self, observation):
        my_transforms = transforms.Compose(
            [transforms.Resize(self.shape), transforms.Normalize(0, 255)]
        )
        observation = my_transforms(observation).squeeze(0)
        return observation

def build_env():
  env = gym_super_mario_bros.make(DEFAULT_GAME)
  env = SkipFrame(env, skip=4)
  env = GrayScaleObservation(env)
  env = ResizeObservation(env, shape=84)
  env = FrameStack(env, num_stack=4)
  env = JoypadSpace(env, MY_ACTIONS)
  return env

In [18]:
def save_gif(model, image_file, max_steps=2000):
  best_img = []
  all_rewards = []
  best_reward = 0
  for i in range(40): # choose 1 best out of 10
    env = build_env()
    obs = env.reset()
    im = Image.fromarray(env.render(mode = 'rgb_array'))
    images = [im]
    obs = env.reset()
    cur_best_reward = 0
    for i in range(1, max_steps + 1):
      # Reformat lazyframe to numpy for predict method
      b = torch.Tensor(4, 84, 84)
      torch.stack(obs._frames, out=b)
      action, _ = model.predict(b.numpy())
      # print("action", action)
      # As of 09/2022, step func seems complain action as numpy scalar, so convert to int
      obs, reward, done, _ = env.step(action.tolist())
      cur_best_reward += reward
      # Render screen every 8/4 = 2 steps
      if i % 2 == 0:
        images.append(Image.fromarray(env.render(mode = 'rgb_array')))
      if done:
        break
    all_rewards.append(cur_best_reward)
    if cur_best_reward > best_reward or (cur_best_reward == best_reward and len(images) > len(best_img)):
      best_reward = cur_best_reward
      best_img = images
  best_img[0].save(image_file, save_all=True, append_images=best_img[1:], loop=0, duration=1)
  print("mean reward of 20 episodes", sum(all_rewards) / len(all_rewards), "\tlength", len(best_img))
  print("saved to", image_file)

In [19]:
VID_DIR= "./marioRL/docs/videos"
model = PPO('CnnPolicy', build_env(), verbose=0)

In [21]:
total_steps = each_batch_steps
for i in range(1,batches+1):
    model.load(f"./marioRL/models/model_{total_steps}.zip")
    save_gif(model, os.path.join(VID_DIR, f"model_{total_steps}"+".gif"))
    total_steps = total_steps + each_batch_steps

mean reward of 20 episodes 671.725 	length 365
saved to ./marioRL/docs/videos\model_500000.gif
mean reward of 20 episodes 748.55 	length 451
saved to ./marioRL/docs/videos\model_1000000.gif
mean reward of 20 episodes 587.375 	length 188
saved to ./marioRL/docs/videos\model_1500000.gif
mean reward of 20 episodes 663.5 	length 137
saved to ./marioRL/docs/videos\model_2000000.gif
mean reward of 20 episodes 653.75 	length 211
saved to ./marioRL/docs/videos\model_2500000.gif
mean reward of 20 episodes 756.6 	length 286
saved to ./marioRL/docs/videos\model_3000000.gif
mean reward of 20 episodes 603.325 	length 498
saved to ./marioRL/docs/videos\model_3500000.gif
mean reward of 20 episodes 612.5 	length 122
saved to ./marioRL/docs/videos\model_4000000.gif
mean reward of 20 episodes 613.525 	length 187
saved to ./marioRL/docs/videos\model_4500000.gif
mean reward of 20 episodes 730.375 	length 143
saved to ./marioRL/docs/videos\model_5000000.gif
mean reward of 20 episodes 575.225 	length 158
sa

In [22]:
while True:
    env = build_env()
    obs = env.reset()
    model.load("./marioRL/models/model_3000000")
    for i in range(1, 5000 + 1):
        # Reformat lazyframe to numpy fiior predict method
        b = torch.Tensor(4, 84, 84)
        torch.stack(obs._frames, out=b)
        action, _ = model.predict(b.numpy())
        obs, reward, done, _ = env.step(action.tolist())
        env.render()
        time.sleep(0.0016)
        if done:
            env.close()
            break

ContextException: Unable to share contexts.

In [None]:
env.close()