In [None]:
import tensorflow as tf
import numpy as np
import gym
import time

from tensorflow.keras import layers

In [None]:
# Show the installed TensorFlow version.
# The original note here read "2.3.0" -- presumably the version this notebook was written against.
tf.__version__

In [None]:
# Show the installed gym version.
# The original note here read "0.17.3" -- presumably the version this notebook was written against.
gym.__version__

In [None]:
# Create the CartPole-v0 environment that every later cell interacts with.
env = gym.make('CartPole-v0')
# First dimension of the observation shape; in the code visible here it is only printed.
obs_dim = env.observation_space.shape[0]
# Size of the action space; used for the policy's output layer and the one-hot action mask.
action_num = env.action_space.n

print('obs_dim', obs_dim)
print('action_num', action_num)

In [None]:
# When True, train_one_episode() calls env.render() on every step of the rollout.
RENDER = False

In [None]:
# Policy network: a 32-unit tanh hidden layer followed by a linear layer
# that emits one unnormalised logit per action.
policy = tf.keras.Sequential()
policy.add(layers.Dense(32, activation='tanh'))
policy.add(layers.Dense(action_num))

# Optimizer that applies the policy-gradient updates.
optimizer = tf.optimizers.Adam(learning_rate=1e-2)

In [None]:
def reward_to_go(rewards, gamma=1.0):
    """Return the (optionally discounted) reward-to-go for every step.

    Element ``t`` of the result is ``sum_k gamma**(k - t) * rewards[k]`` over
    all ``k >= t``. With the default ``gamma=1.0`` this is the plain sum of
    the rewards from step ``t`` to the end of the sequence.

    Args:
        rewards: 1-D sequence of per-step rewards (list or array).
        gamma: Discount factor applied to future rewards. Defaults to 1.0
            (no discounting).

    Returns:
        A float64 NumPy array with one entry per input reward. An empty
        input yields an empty array.
    """
    # Force a float buffer: zeros_like on an integer reward list would yield
    # an integer array and silently truncate fractional (discounted) returns.
    rewards = np.asarray(rewards, dtype=np.float64)
    returns = np.zeros_like(rewards)
    running = 0.0
    # Walk backwards so each step reuses the return already accumulated
    # for the step that follows it.
    for i in reversed(range(len(rewards))):
        running = rewards[i] + gamma * running
        returns[i] = running
    return returns

In [None]:
def train_one_episode():
    """Play one episode with the current policy and apply one gradient update.

    Actions are sampled from the categorical distribution defined by the
    policy's logits. After the episode ends, the loss
    ``-mean(reward_to_go * log_prob(chosen action))`` is minimised with a
    single optimizer step.

    Reads the notebook-level ``env``, ``policy``, ``optimizer``, ``RENDER``
    and ``action_num``.

    Returns:
        A ``(total_reward, episode_length)`` tuple: the sum of the rewards
        collected in the episode and the number of steps taken.
    """
    ep_action = []
    ep_reward = []
    ep_log_prob = []

    # The rollout runs inside the tape so the stored log-probabilities stay
    # connected to the policy variables when the loss is differentiated.
    with tf.GradientTape() as tape:
        obs = env.reset()

        while True:
            if RENDER:
                env.render()

            logits = policy(tf.expand_dims(obs, 0))  # [1, action_num]
            log_prob = tf.nn.log_softmax(logits)  # [1, action_num]
            # Draw one action index from the policy distribution (scalar tensor).
            action = tf.squeeze(tf.random.categorical(logits, 1, seed=None), axis=1)[0]

            obs, reward, done, _ = env.step(action.numpy())

            ep_action.append(action)
            ep_reward.append(reward)
            ep_log_prob.append(log_prob)

            if done:
                break

        ep_return = reward_to_go(ep_reward)  # [steps]
        action_mask = tf.one_hot(ep_action, action_num)  # [steps, action_num]
        # The one-hot mask keeps only the log-probability of the action taken at each step.
        log_probs = tf.reduce_sum(action_mask * tf.concat(ep_log_prob, axis=0), axis=1)  # [steps]
        loss = -tf.reduce_mean(ep_return * log_probs)

    grads = tape.gradient(loss, policy.trainable_variables)
    optimizer.apply_gradients(zip(grads, policy.trainable_variables))

    return sum(ep_reward), len(ep_reward)

In [None]:
# Ring buffer with the total reward of the most recent 100 episodes.
results = [0] * 100
i = 0
# Train until the average total reward over a full 100-episode window reaches 195.
while True:
    total_rewards, episode_len = train_one_episode()
    results[i % 100] = total_rewards
    # Average only the slots that already hold a real episode; the untouched
    # zero-initialised slots would otherwise drag the reported mean down
    # during the first 100 episodes.
    filled = min(i + 1, len(results))
    avg_results = np.average(results[:filled])
    print('i:{}, total_rewards:{}, episode_len:{}, avg_results:{}'.format(i, total_rewards, episode_len, avg_results))
    i += 1
    # Require a full window so a few early lucky episodes cannot end training.
    if filled == len(results) and 195.0 <= avg_results:
        print('congratulations!')
        break

In [None]:
# Play 10 rendered episodes with the policy as it stands after training,
# printing the total reward of each one.
for i in range(10):
    obs = env.reset()
    total_rewards = 0
    done = False
    while not done:
        env.render()
        logits = policy(tf.expand_dims(obs, 0))
        # Sample one action index from the policy's categorical distribution.
        action = tf.squeeze(tf.random.categorical(logits, 1, seed=None), axis=1)[0]
        obs, reward, done, _ = env.step(action.numpy())
        total_rewards += reward
    print('i:{}, total_rewards:{}'.format(i, total_rewards))
    # Pause between episodes.
    time.sleep(3)

In [None]:
# Shut the environment down now that training and evaluation are finished.
env.close()