In [2]:
from gymnasium import Env
from gymnasium.wrappers import GrayScaleObservation, ResizeObservation, TimeLimit

from pokerl.env.pokemonblue import PokemonBlueEnv

from pokerl.env.wrappers import (
    ObservationAddPokemonLevel,
    ObservationAddPosition,
    ObservationDict,
    RewardDecreasingNoChange,
    RewardDecreasingSteps,
    RewardHistoryToInfo,
    RewardIncreasingBadges,
    RewardIncreasingCapturePokemon,
    RewardIncreasingPokemonLevel,
    RewardIncreasingPositionExploration,
    RemoveSelectStartAction,
    ppFlattenInfo,
)

from pokerl.env.wrappers.rewards import RewardIncreasingLandedAttack,RewardDecreasingLostBattle



In [3]:
# Hyperparameter constants for the notebook.
# NOTE(review): none of these names is referenced by any cell visible in this
# notebook; the PPO constructor call further down passes its own literals
# (learning_rate=0.001, gamma=0.95, batch_size=512). The EPS_*/TAU names look
# like epsilon-greedy / soft-update settings, presumably left over from a
# different agent — TODO confirm whether they are still needed.
BATCH_SIZE = 2048
GAMMA = 0.99
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 1000
TAU = 0.005
LR = 1e-4
STEP_LIMIT = 10000

In [4]:
# Load IPython's autoreload extension and switch it to mode 2, so edits to
# imported modules (e.g. the local `pokerl` package) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [11]:

def create_env(interactive=False) -> Env:
    """Build a Pokemon Blue environment wrapped for training.

    The base ``PokemonBlueEnv`` is wrapped, innermost first, by the
    observation/action wrappers and then by the reward-shaping wrappers listed
    below. ``RewardHistoryToInfo`` is applied last and is therefore the object
    returned to the caller.

    ``RewardIncreasingPositionExploration``, ``TimeLimit`` and
    ``ppFlattenInfo`` are imported by the notebook but are currently not part
    of the pipeline.

    Args:
        interactive: Forwarded unchanged to ``PokemonBlueEnv(interactive=...)``.

    Returns:
        Env: The fully wrapped environment.
    """
    wrapped = PokemonBlueEnv(interactive=interactive)

    # Each entry is (wrapper class, extra positional arguments); order matters
    # because every wrapper receives the result of the previous one.
    observation_pipeline = (
        (ResizeObservation, (64,)),
        (GrayScaleObservation, ()),
        (ObservationDict, ()),
        (ObservationAddPosition, ()),
        (ObservationAddPokemonLevel, ()),
        (RemoveSelectStartAction, ()),
    )
    reward_pipeline = (
        (RewardDecreasingNoChange, (0.01,)),
        (RewardDecreasingSteps, (0.01,)),
        (RewardIncreasingBadges, (100,)),
        (RewardIncreasingCapturePokemon, (10,)),
        (RewardIncreasingPokemonLevel, (10,)),
        (RewardIncreasingLandedAttack, (0.05,)),
        (RewardDecreasingLostBattle, (0.1,)),
        (RewardHistoryToInfo, ()),
    )

    for wrapper_cls, extra_args in observation_pipeline + reward_pipeline:
        wrapped = wrapper_cls(wrapped, *extra_args)
    return wrapped

In [12]:
# Build one wrapped environment with the default (interactive=False) setting.
env = create_env()

In [13]:
# Display the action space exposed by the fully wrapped environment.
env.action_space

Discrete(7)

In [14]:
import torch
from stable_baselines3 import ppo
from stable_baselines3.common.env_util import make_vec_env


from pokerl.agent.tools import get_device

# env = make_vec_env(create_env, n_envs=8)

# model = ppo.PPO(
#     "MultiInputPolicy",
#     env,
#     device=get_device(),
#     verbose=1
#     )


In [15]:
from stable_baselines3.common.vec_env import SubprocVecEnv

def make_env(rank, seed=0):
    """Return a zero-argument factory for one seeded environment.

    The returned callable is what ``SubprocVecEnv`` expects: calling it builds
    an environment with ``create_env()`` and resets it once with the seed
    ``seed + rank``, so every rank gets a distinct seed.

    :param rank: (int) index of the subprocess, added to ``seed``
    :param seed: (int) the base seed shared by all subprocesses
    :return: (callable) function taking no arguments and returning the
        freshly reset environment
    """
    env_seed = seed + rank

    def _build():
        seeded_env = create_env()
        seeded_env.reset(seed=env_seed)
        return seeded_env

    return _build

# Number of environment subprocesses run in parallel by SubprocVecEnv.
nb_cpus = 8
# Kept as an int (was the float 1e4): it feeds `n_steps`, which Stable-Baselines3
# uses as a rollout-buffer array dimension and therefore must be an integer.
ep_length = 10000
subproc = SubprocVecEnv([make_env(i) for i in range(nb_cpus)])

model = ppo.PPO(
    "MultiInputPolicy",
    subproc,
    learning_rate=0.001,
    # NOTE(review): in Stable-Baselines3 `n_steps` is the number of steps
    # collected *per environment* per update, so the rollout buffer holds
    # n_steps * nb_cpus transitions. Multiplying by nb_cpus here may be
    # unintended, and the recorded output below (1024 timesteps per iteration)
    # does not match this value — confirm the intended rollout length.
    n_steps=ep_length * nb_cpus,
    batch_size=512,
    n_epochs=10,
    gamma=0.95,
    gae_lambda=0.95,
    clip_range=0.2,
    verbose=2,
)
# Callbacks (e.g. WandbCallback()) are an argument of `learn`, not of the PPO
# constructor: model.learn(..., callback=WandbCallback()).
model.learn(total_timesteps=2048, progress_bar=True)


Output()

Resetting game
Using cuda device
Resetting game


Resetting game


Resetting game


Resetting game


-----------------------------
| time/              |      |
|    fps             | 105  |
|    iterations      | 1    |
|    time_elapsed    | 9    |
|    total_timesteps | 1024 |
-----------------------------


Resetting game


Resetting game


Resetting game


----------------------------------------
| time/                   |            |
|    fps                  | 104        |
|    iterations           | 2          |
|    time_elapsed         | 19         |
|    total_timesteps      | 2048       |
| train/                  |            |
|    approx_kl            | 0.01322393 |
|    clip_fraction        | 0.15       |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.94      |
|    explained_variance   | 3.11e-05   |
|    learning_rate        | 0.001      |
|    loss                 | -0.00793   |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.00688   |
|    value_loss           | 0.0618     |
----------------------------------------


<stable_baselines3.ppo.ppo.PPO at 0x7f7b4a83bf70>

In [90]:
# Close the SubprocVecEnv created for training. Any cell that still needs
# `model` to interact with `subproc` has to run before this one.
subproc.close()

In [81]:
from wandb.integration.sb3 import WandbCallback
import wandb

# config = {
#     "policy_type": "MultiInputPolicy",
#     "total_timesteps": 5000,
#     "env_name": "PokemonBlueEnv-v1",
# }

# run = wandb.init(
#     project="sb3",
#     config=config,
#     sync_tensorboard=True,  # auto-upload sb3's tensorboard metrics
#     monitor_gym=True,  # auto-upload the videos of agents playing the game
#     save_code=True,  # optional
# )

# Continue training the PPO model built in the training cell. The original
# call was made on `ppo`, which in this notebook is the `stable_baselines3.ppo`
# module (it has no `learn`); the trained agent is bound to `model`.
# This must run while the training vectorized env (`subproc`) is still open.
model.learn(total_timesteps=500,
            progress_bar=True,
            # callback=WandbCallback(),
            )

Output()

------------------------------
| time/              |       |
|    fps             | 319   |
|    iterations      | 1     |
|    time_elapsed    | 51    |
|    total_timesteps | 16384 |
------------------------------


<stable_baselines3.ppo.ppo.PPO at 0x15a6baf80>

In [107]:
from tqdm import tqdm

# Roll out a saved PPO checkpoint for 1000 steps in an interactive environment.
test_env = create_env(interactive=True)
obs, _ = test_env.reset()

# NOTE(review): checkpoint path is relative to the notebook's working directory.
model = ppo.PPO.load("../ppo_pokemon_blue_v1_1500000.zip")
for _ in range(1000):
    action, _ = model.predict(obs)
    obs, reward, terminated, truncated, _ = test_env.step(action)
    # Gymnasium requires a reset before stepping again once an episode has
    # ended; previously both flags were discarded.
    if terminated or truncated:
        obs, _ = test_env.reset()


1
5
5
0
1
5
5
5
5
5
1
5
5
1
1
1
5
5
1
1
5
5
5
1
5
0
1
5
5
1
5
1
5
2
1
1
5
1
5
1
5
1
5
1
1
1
1
1
5
5
1
5
5
1
5
5
5
5
5
5
1
1
5
1
5
1
1
5
1
1
1
1
1
0
5
1
5
5
1
4
5
1
1
1
5
2
5
1
5
1
1
1
5
5
5
5
5
5
5
1
5
0
1
0
1
5
1
5
5
1
5
1
5
5
5
1
5
5
5
5
1
5
1
2
5
1
5
5
1
1
5
1
1
1
1
1
1
5
5
0
5
5
1
1
0
1
1
5
0
0
5
5
1
5
1
1
1
1
1
1
5
5
5
1
5
5
1
5
1
1
5
1
1
5
1
1
1
1
1
5
1
1
1
1
1
1
5
0
1
5
5
0
1
5
1
5
5
5
5
5
1
1
5
5
5
1
5
5
1
1
1
1
1
1
1
5
3
1
1
1
5
1
1
5
1
5
1
1
1
0
5
1
5
1
1
5
5
1
5
1
5
1
5
1
1
5
1
0
1
0
1
5
1
5
5
3
5
1
5
0
5
5
1
5
5
5
0
0
5
5
5
5
5
5
1
5
5
5
1
5
1
5
1
1
0
1
1
5
1
1
1
5
5
5
1
5
1
1
1
5
1
1
1
1
1
1
0
0
1
5
1
1
0
1
5
5
1
5
3
1
1
5
5
5
1
0
4
5
5
1
5
1
5
0
5
5
1
1
1
5
5
1
1
5
5
1
5
5
1
1
5
1
5
1
5
3
1
1
5
1
1
5
1
5
1
1
2
1
5
5
2
5
1
0
5
2
5
1
1
1
1
4
1
5
1
2
5
1
5
5
1
2
0
5
0
1
5
1
4
5
1
5
1
1
5
5
1
1
1
1
1
0
5
1
1
1
5
1
5
1
1
1
1
5
1
1
5
1
1
1
0
5
1
0
1
5
5
1
5
5
5
0
5
1
1
1
5
1
1
5
5
0
1
5
1
1
1
0
1
1
1
1
2
5
5
5
1
1
1
1
5
0
5
5
1
1
1
1
1
5
5
3
1
5
1
1
0
5
1
1
0
5
1
5
1
0
5
1
1
1


In [140]:
# Rebuild a non-interactive environment and reset it; the cell displays the
# (observation, info) pair returned by reset().
env = create_env(interactive=False)
env.reset()

({'screen': array([[142, 142, 130, ...,   0,   0,   0],
         [232, 232, 112, ...,   0,   0,   0],
         [ 62,  90,  71, ...,   0,   0,   0],
         ...,
         [138, 138, 138, ...,   0,   0,   0],
         [138, 138, 138, ...,   0,   0,   0],
         [123, 123, 123, ...,   0,   0,   0]], dtype=uint8),
  'position': array([0., 0.], dtype=float16),
  'pokemon_level': array([0, 0, 0, 0, 0, 0], dtype=uint8)},
 {'tick': 0,
  'pokemon_level': array([5, 0, 0, 0, 0, 0]),
  'badges': array(0),
  'position': array([ 5,  5, 40]),
  'absolute_position': array([26, -1]),
  'owned_pokemon': array([8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  'rewardHistory': deque([], maxlen=10000)})

In [141]:
# Query the environment's max-HP helper for index 0
# (presumably the first party slot — TODO confirm against the env implementation).
env.get_max_hp_pokemon(0)

  logger.warn(


19

In [143]:
# Read the raw memory value at address 0xD19C through `env.pyboy`
# (presumably the emulator handle; the meaning of this address is not documented here — TODO confirm).
env.pyboy.get_memory_value(0xD19c)

0

In [124]:
# Show two hex constants in decimal (0xD16C -> 53612, 0xD16D -> 53613);
# presumably memory addresses being investigated — TODO confirm.
(0xD16C, 0xD16D)

(53612, 53613)

In [52]:
from tqdm import tqdm

# Throughput check: repeat action 1 for 2000 steps in a fresh non-interactive
# environment and let tqdm report the achieved iterations per second.
env = create_env(interactive=False)
env.reset()
step_progress = tqdm(range(2000))
for _ in step_progress:
    obs, reward, _terminated, _truncated, _info = env.step(1)

100%|██████████| 2000/2000 [00:05<00:00, 382.01it/s]


In [111]:
# Step the interactive environment once with action 8 and display the full
# (obs, reward, terminated, truncated, info) result.
# NOTE(review): `env.action_space` is shown earlier as Discrete(7), i.e. valid
# actions 0..6, so 8 is out of range for the current wrapper stack — confirm.
test_env.step(8)

({'screen': array([[  0,   0,   0, ...,   0,   0,   0],
         [  0,   0,   0, ...,   0,   0,   0],
         [  0,   0,   0, ...,   0,   0,   0],
         ...,
         [ 65, 153, 122, ...,   0,   0,   0],
         [ 43, 102,  82, ...,   0,   0,   0],
         [ 56,  68,  68, ...,   0,   0,   0]], dtype=uint8),
  'position': array([ 0.09411765, -0.00392157]),
  'pokemon_level': array([5, 0, 0, 0, 0, 0])},
 -10.01,
 False,
 False,
 {'tick': 22066,
  'pokemon_level': array([5, 0, 0, 0, 0, 0]),
  'badges': array(0),
  'position': array([ 3,  5, 40]),
  'absolute_position': array([24, -1]),
  'owned_pokemon': array([8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  'rewardHistory': deque([-0.01,
         -10.01,
         -10.01,
         -10.01,
         -0.01,
         -10.01,
         -10.01,
         -10.01,
         -10.01,
         -10.01,
         -10.01,
         -10.01,
         -10.01,
         -10.01,
         -10.01,
         -10.01,
         -10.01,
         -10.01

In [12]:
# Manual probe of the battle-related rewards: drive the game until the value at
# 0xCCD5 reads 0 (which this cell treats as the start of combat), then print
# the step reward each time the enemy HP reported by the environment changes.
env = create_env(interactive=True)
try:
    env.reset()

    hp_value = env.get_ennemy_hp()
    print(hp_value)
    print(env.pyboy.get_memory_value(0xCCD5))
    # Five steps of action 2, then repeat action 5 until 0xCCD5 becomes 0.
    # NOTE(review): this wait has no upper bound; interrupt the kernel if the
    # value never reaches 0 (the `finally` below still closes the env).
    for _ in range(5):
        env.step(2)
    while env.pyboy.get_memory_value(0xCCD5) != 0:
        env.step(5)
    print('started combat')

    hp_value = env.get_ennemy_hp()
    print(hp_value)
    for _ in range(300):
        obs, reward, _, _, info = env.step(5)
        # Read the HP once per step and reuse it for both the comparison and
        # the bookkeeping.
        current_hp = env.get_ennemy_hp()
        if hp_value != current_hp:
            print(reward)
        hp_value = current_hp

    print(env.pyboy.get_memory_value(0xCCD5))
finally:
    # Always release the interactive environment, even when a step raises or
    # the cell is interrupted.
    env.close()


  logger.warn(
  logger.warn(


0
64
started combat
0
-10.01
-5.01
-5.01
-7.01
-7.01
-7.01
-9.01
5


In [146]:
# Display the enemy HP currently reported by the environment.
env.get_ennemy_hp()

  logger.warn(


0

In [14]:
# Display the most recent observation dict ('screen', 'position', 'pokemon_level').
obs

{'screen': array([[ 38,  38,  38, ...,   0,   0,   0],
        [ 38,  38,  38, ...,   0,   0,   0],
        [ 89,  89,  89, ...,   0,   0,   0],
        ...,
        [138, 138, 138, ...,   0,   0,   0],
        [138, 138, 138, ...,   0,   0,   0],
        [123, 123, 123, ...,   0,   0,   0]], dtype=uint8),
 'position': array([ 0.10588235, -0.00392157]),
 'pokemon_level': array([6, 0, 0, 0, 0, 0])}