In [None]:
import tensorflow as tf
import numpy as np
import gym
import time

from tensorflow.keras import layers

In [None]:
# Scratch check that TensorFlow math ops run: prints the result of exp(log(2.0)).
print(tf.math.exp(tf.math.log(2.0)))

In [None]:
# Show the installed TensorFlow version (the author's note records 2.3.0).
tf.__version__

In [None]:
# Show the installed Gym version (the author's note records 0.17.3).
gym.__version__

In [None]:
# Gym environment id passed to gym.make().
ENV_ID = 'CartPole-v0'
# When True, the notebook uses env.unwrapped instead of the env returned by gym.make().
ENV_UNWRAPPED = False
# The training loop stops once the running average episode return reaches this value.
ENV_REWARD_THRESHOLD = 195.0

# Discount factor used for discounted returns and TD targets.
GAMMA = 0.99

# Weight of the mean-entropy term in the policy loss.
ENTROPY_LOSS_COEF = 0.01

# Size of the buffer of recent episode returns that the stopping check averages.
CONSECUTIVE_TRIALS = 100

# Number of rendered episodes played by the evaluation cell.
TEST_TIMES = 10

In [None]:
# Create the environment named by ENV_ID; switch to its `.unwrapped`
# attribute when ENV_UNWRAPPED is set.
env = gym.make(ENV_ID)
env = env.unwrapped if ENV_UNWRAPPED else env

# Network input/output sizes are read from the environment's spaces.
action_num = env.action_space.n
obs_dim = env.observation_space.shape[0]

print('obs_dim', obs_dim)
print('action_num', action_num)

In [None]:
# Policy network: two 16-unit tanh hidden layers followed by a linear layer
# with one output per action. Downstream cells treat the outputs as logits.
policy = tf.keras.Sequential()
policy.add(layers.Dense(16, activation='tanh'))
policy.add(layers.Dense(16, activation='tanh'))
policy.add(layers.Dense(action_num))

# Dedicated optimizer for the policy parameters.
policy_optimizer = tf.optimizers.Adam(learning_rate=1e-3)

In [None]:
# Value network: two 16-unit ReLU hidden layers followed by a single linear
# output used as the state-value estimate.
value_network = tf.keras.Sequential()
value_network.add(layers.Dense(16, activation='relu'))
value_network.add(layers.Dense(16, activation='relu'))
value_network.add(layers.Dense(1))

# Dedicated optimizer for the value-network parameters.
value_network_optimizer = tf.optimizers.Adam(learning_rate=1e-3)

In [None]:
def reward_to_go(rewards, gamma=None):
    """Compute the discounted return from every step to the end of the episode.

    returns[i] = rewards[i] + gamma * returns[i + 1], with the return after the
    final step taken as 0.

    Args:
        rewards: Sequence of per-step rewards for one episode.
        gamma: Discount factor. Defaults to the notebook-level GAMMA when None.

    Returns:
        float32 NumPy array with the same shape as `rewards`.
    """
    if gamma is None:
        gamma = GAMMA

    returns = np.zeros_like(rewards, dtype=np.float32)
    tail = 0.0  # discounted return that follows the step being processed
    for i in reversed(range(len(rewards))):
        returns[i] = rewards[i] + gamma * tail
        # Read the stored (float32) value back so every step accumulates from
        # exactly what was written into the array.
        tail = returns[i]
    return returns

In [None]:
def estimate_advantage(observations, rewards):
    """Compute one-step TD targets and advantages for a single episode.

    Args:
        observations: Sequence of per-step observations. Each one is given a
            leading batch axis and passed to `value_network`.
        rewards: Sequence of per-step rewards, same length as `observations`.

    Returns:
        Tuple `(advantages, values, td_targets)`:
        - advantages: float32 NumPy array, `td_targets[i] - V(s_i)`.
        - values: list with the raw `value_network` output for every step
          (kept as tensors so the caller can differentiate through them).
        - td_targets: float32 NumPy array, `rewards[i] + GAMMA * V(s_{i+1})`,
          where the value after the final step is taken as 0.

    Raises:
        AssertionError: If `observations` and `rewards` differ in length.

    NOTE(review): the last step is always treated as terminal. If the episode
    was cut short by a step limit rather than a real failure, the final target
    ignores the remaining value -- confirm whether that matters for this env.
    """
    assert len(observations) == len(rewards)

    n = len(observations)

    # One forward pass per step; the raw outputs are returned to the caller.
    values = [value_network(tf.expand_dims(obs, 0)) for obs in observations]

    # Extract every estimate as a plain scalar exactly once. Going through
    # NumPy detaches the numbers from the gradient tape (targets and advantages
    # are constants for the optimizers) and avoids assigning size-1 arrays
    # into scalar array slots, which newer NumPy releases deprecate.
    value_scalars = np.array([v.numpy().item() for v in values], dtype=np.float32)

    # V(s_{i+1}) for every step; the slot after the final step stays 0.
    next_values = np.zeros(n, dtype=np.float32)
    next_values[:-1] = value_scalars[1:]

    rewards_f32 = np.asarray(rewards, dtype=np.float32)
    td_targets = rewards_f32 + np.float32(GAMMA) * next_values
    advantages = td_targets - value_scalars

    return advantages, values, td_targets

In [None]:
def train_one_episode():
    """Play one episode with the current policy, then update both networks once.

    Actions are sampled from the policy logits. After the episode ends, the
    policy is updated with an advantage-weighted log-probability loss plus an
    entropy bonus, and the value network is regressed onto the TD targets
    returned by `estimate_advantage`.

    Returns:
        Tuple of (sum of rewards, number of steps, policy loss,
        advantage term of the policy loss, entropy term of the policy loss,
        value-network loss).
    """
    observations, actions, rewards = [], [], []
    step_log_probs, step_entropies = [], []

    # Persistent because two separate gradient() calls are made on this tape.
    with tf.GradientTape(persistent=True) as tape:
        obs = env.reset()
        done = False

        while not done:
            observations.append(obs.copy())

            logits = policy(tf.expand_dims(obs, 0))
            log_prob = tf.nn.log_softmax(logits)
            probs = tf.math.exp(log_prob)
            entropy = -tf.reduce_sum(probs * log_prob)
            sampled = tf.random.categorical(logits, 1, seed=None)
            action = tf.squeeze(sampled, axis=1)[0]

            obs, reward, done, _ = env.step(action.numpy())

            actions.append(action)
            rewards.append(reward)
            step_log_probs.append(log_prob)
            step_entropies.append(entropy)

        # Each of the three results has one entry per step.
        advantages, value_estimates, td_targets = estimate_advantage(observations, rewards)

        # Keep only the log-probability of the action that was actually taken.
        taken_mask = tf.one_hot(actions, action_num)  # [batch_size, action_num]
        taken_log_probs = tf.reduce_sum(
            taken_mask * tf.concat(step_log_probs, axis=0), axis=1
        )  # [batch_size]

        mean_weighted_log_prob = tf.reduce_mean(advantages * taken_log_probs)
        mean_entropy = tf.reduce_mean(step_entropies)

        policy_loss = -(mean_weighted_log_prob + ENTROPY_LOSS_COEF * mean_entropy)
        # The two terms are also reported separately for logging.
        policy_loss_0 = -mean_weighted_log_prob
        policy_loss_1 = -ENTROPY_LOSS_COEF * mean_entropy

        # TD regression target. For a Monte-Carlo target, replace `td_targets`
        # with `reward_to_go(rewards)`.
        td_errors = tf.expand_dims(td_targets, 1) - tf.concat(value_estimates, axis=0)
        value_network_loss = tf.reduce_mean(tf.math.square(td_errors))

    # Policy first, then value network.
    updates = (
        (policy_loss, policy, policy_optimizer),
        (value_network_loss, value_network, value_network_optimizer),
    )
    for loss, network, optimizer in updates:
        grads = tape.gradient(loss, network.trainable_variables)
        optimizer.apply_gradients(zip(grads, network.trainable_variables))

    # A persistent tape holds its resources until it is dropped.
    del tape

    return sum(rewards), len(rewards), policy_loss, policy_loss_0, policy_loss_1, value_network_loss

In [None]:
# Train until the mean return over the last CONSECUTIVE_TRIALS episodes reaches
# ENV_REWARD_THRESHOLD.
results = [0.0] * CONSECUTIVE_TRIALS  # ring buffer of the most recent episode returns
i = 0
while True:
    total_rewards, episode_len, policy_loss, policy_loss_0, policy_loss_1, value_network_loss = train_one_episode()
    results[i % CONSECUTIVE_TRIALS] = total_rewards

    # Until the buffer has been filled once, only the first i+1 slots hold real
    # episodes, so the average is taken over those slots alone.
    window_full = CONSECUTIVE_TRIALS <= i + 1
    avg_results = np.average(results) if window_full else np.average(results[0:i+1])
    print('i={}, total_rewards={}, episode_len={}, p_loss={:.4f}, p_loss[0]={:.4f}, p_loss[1]={:.4f}, v_loss={:.4f}, avg_results={}'.format(
        i, total_rewards, episode_len, policy_loss, policy_loss_0, policy_loss_1, value_network_loss, avg_results))

    # Declare success only on a full window: an average over a handful of early
    # episodes is not evidence of CONSECUTIVE_TRIALS consecutive good trials.
    if window_full and ENV_REWARD_THRESHOLD <= avg_results:
        print('congratulations!')
        break
    i += 1

In [None]:
# Play TEST_TIMES rendered episodes with the trained policy, sampling actions
# from the policy logits, and print each episode's total reward.
for i in range(TEST_TIMES):
    obs = env.reset()
    total_rewards = 0
    done = False
    while not done:
        env.render()
        policy_logits = policy(tf.expand_dims(obs, 0))
        sampled = tf.random.categorical(policy_logits, 1, seed=None)
        action = tf.squeeze(sampled, axis=1)[0]
        obs, reward, done, _ = env.step(action.numpy())
        total_rewards += reward
    print('i:{}, total_rewards:{}'.format(i, total_rewards))
    # Pause between episodes.
    time.sleep(3)

In [None]:
# Close the environment once training and evaluation are finished.
env.close()