In [1]:
!pip install keras-gym -U

Collecting keras-gym
  Downloading keras_gym-0.2.17-py3-none-any.whl (82 kB)
[K     |████████████████████████████████| 82 kB 218 kB/s eta 0:00:011
[?25hCollecting numpy<1.17,>=1.16
  Downloading numpy-1.16.6.zip (5.1 MB)
[K     |████████████████████████████████| 5.1 MB 4.1 MB/s eta 0:00:01     |████████████████████            | 3.2 MB 4.1 MB/s eta 0:00:01
Collecting absl-py>=0.8.1
  Downloading absl_py-0.13.0-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 8.6 MB/s eta 0:00:01
[?25hCollecting gym>=0.12.1
  Downloading gym-0.18.3.tar.gz (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 8.5 MB/s eta 0:00:01
[?25hCollecting tensorflow>=1.14
  Downloading tensorflow-2.5.0-cp38-cp38-manylinux2010_x86_64.whl (454.4 MB)
[K     |████████████████████████████████| 454.4 MB 27 kB/s  eta 0:00:012   |██▎                             | 31.6 MB 4.6 MB/s eta 0:01:32     |██▎                             | 32.2 MB 4.6 MB/s eta 0:01:32     |██▌                   

In [2]:
# %load ../../scripts/frozen_lake/actor_critic.py
import numpy as np
import keras_gym as km
from tensorflow import keras
from tensorflow.keras import backend as K
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv, UP, DOWN, LEFT, RIGHT


# the MDP: non-slippery FrozenLake wrapped in a TrainMonitor
actions = dict(zip((LEFT, RIGHT, UP, DOWN), 'LRUD'))  # action id -> display letter
env = km.wrappers.TrainMonitor(FrozenLakeEnv(is_slippery=False))

# turn on logging so that the TrainMonitor logs are shown
km.enable_logging()


class LinearFunc(km.FunctionApproximator):
    """ Linear function approximator: the body is nothing but a one-hot encoding. """
    def body(self, S):
        """ Pass ``S`` through a Keras Lambda layer that applies ``K.one_hot`` with 16 classes. """
        num_classes = 16

        def encode(x):
            return K.one_hot(x, num_classes)

        return keras.layers.Lambda(encode)(S)


# static parameters
num_episodes, num_steps = 250, 30   # outer-loop count, per-episode step cap
target_model_sync_period = 20       # compared against env.T in the training loop


# function approximators: the policy and the value function share one LinearFunc
func = LinearFunc(env, lr=0.01)
pi = km.SoftmaxPolicy(func, update_strategy='ppo')
v = km.V(func, gamma=0.9, bootstrap_n=1)


# bundle policy and value function into a single actor-critic
actor_critic = km.ActorCritic(pi, v)


# train: up to `num_steps` environment steps in each of `num_episodes` episodes
for ep in range(num_episodes):
    s = env.reset()

    for t in range(num_steps):
        # pick an action using the target model, then step the environment
        a = pi(s, use_target_model=True)
        s_next, r, done, info = env.step(a)

        # small incentive to keep moving: override the reward when the state is unchanged
        r = -0.1 if np.array_equal(s_next, s) else r

        actor_critic.update(s, a, r, done)

        # sync the policy's target model every `target_model_sync_period` ticks of env.T
        sync_due = (env.T % target_model_sync_period == 0)
        if sync_due:
            pi.sync_target_model(tau=1.0)

        # stop the episode once the environment reports it is done
        if done:
            break

        s = s_next


# one more pass through the env, rendering every state and acting greedily
s = env.reset()
env.render()

for t in range(num_steps):

    # report the state value followed by each action's probability
    state_value = v(s)
    print("  v(s) = {:.3f}".format(state_value))

    action_probs = pi.dist_params(s)
    for i, p in enumerate(action_probs):
        print("  π({:s}|s) = {:.3f}".format(actions[i], p))

    # take the greedy action and show the resulting state
    a = pi.greedy(s)
    s, r, done, info = env.step(a)
    env.render()

    if done:
        break

ModuleNotFoundError: No module named 'keras_gym'