In [None]:
import tensorflow as tf
import numpy as np
import gym
from time import time
from math import floor

In [None]:
# Parameters
# Width passed to Agent when the policy network is constructed.
num_hidden_units = 32
# Number of episodes collected between successive calls to train_step.
num_episodes_per_training = 200
# Total number of episodes generated by the training loop.
num_episodes_to_train_on = 10000
# A progress line (mean episode reward over the window) is printed every `log_freq` episodes.
log_freq=200

# Per-step discount factor used when accumulating an episode's rewards backwards in time.
discount_gamma = 0.99

In [None]:
# Global RNG, can replace later
# Seeded NumPy generator; generate_episode draws from it when sampling actions.
# NOTE(review): only this NumPy stream is seeded in the visible cells -- no TensorFlow or
# environment seed is set, so runs are presumably not reproducible end to end; confirm.
rng = np.random.RandomState(123)

In [None]:
from tensorflow.keras.layers import Dense
from tensorflow.keras import Model

In [None]:
class Agent(Model):
    """Small feed-forward policy network that outputs a single raw logit.

    The forward pass is ``d1`` (tanh) followed by ``d3`` (1 unit, no
    activation). A second hidden layer ``d2`` (relu) is constructed as well,
    but ``call`` does not apply it.
    """

    def __init__(self, num_hidden_units=8):
        """Create the layers; both hidden layers get ``num_hidden_units`` units."""
        super().__init__()
        self.d1 = Dense(num_hidden_units, activation='tanh')
        self.d2 = Dense(num_hidden_units, activation='relu')
        # No activation on the output layer: the result is used as a logit.
        self.d3 = Dense(1)

    def call(self, x):
        """Run ``x`` through the tanh hidden layer and the 1-unit output layer."""
        hidden = self.d1(x)
        logit = self.d3(hidden)
        return logit

In [None]:
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The naive formula evaluates ``exp(-x)``, which overflows for large
    negative ``x``. Here ``exp`` is only ever applied to non-positive values,
    so no overflow can occur for any finite input.

    Args:
        x: A scalar or array-like of real values (anything ``np.asarray``
            accepts).

    Returns:
        The element-wise sigmoid of ``x``: a NumPy scalar for scalar input,
        otherwise an array with the same shape as ``x``.
    """
    x = np.asarray(x)
    # -|x| <= 0, so this exponential lies in (0, 1] and cannot overflow.
    decay = np.exp(-np.abs(x))
    # For x >= 0: 1 / (1 + e^-x).  For x < 0 the equivalent form e^x / (1 + e^x).
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar; no-op for real arrays.
    return result[()]

In [None]:
def generate_episode(agent, env, max_steps=100):
    """Roll out one episode with the agent's current stochastic policy.

    At every step the agent's logit for the current observation is squashed
    with ``sigmoid`` and action 1 is chosen with that probability (action 0
    otherwise), using the module-level ``rng``.

    Both gym calling conventions are accepted: the legacy one
    (``reset() -> obs``, ``step() -> (obs, reward, done, info)``) and the newer
    one (``reset() -> (obs, info)``,
    ``step() -> (obs, reward, terminated, truncated, info)``).

    Args:
        agent: Callable mapping a ``(1, n)`` observation batch to a batch of
            logits; element ``[0]`` of its output is used.
        env: Environment exposing ``reset()`` and ``step(action)``. Its
            observations must support ``.reshape``.
        max_steps: Upper bound on the number of environment steps taken.

    Returns:
        A tuple ``(states, actions, rewards)`` of equal-length lists, where
        ``states[t]`` is the observation that ``actions[t]`` was chosen from
        and ``rewards[t]`` is the reward returned by that step.
    """
    reset_result = env.reset()
    # Newer gym/gymnasium releases return (observation, info) from reset().
    if (isinstance(reset_result, tuple) and len(reset_result) == 2
            and isinstance(reset_result[1], dict)):
        s = reset_result[0]
    else:
        s = reset_result

    ep_states = []
    ep_actions = []
    ep_rewards = []

    for _ in range(max_steps):
        # Generate an action using the current policy
        a_logit = agent(s.reshape((1, -1)))[0]
        # Turn the distribution into an action stochastically
        a = 1 if sigmoid(a_logit) > rng.uniform(low=0.0, high=1.0) else 0

        step_result = env.step(a)
        if len(step_result) == 5:
            # Newer API splits the end-of-episode flag in two.
            s1, r, terminated, truncated, _info = step_result
            done = terminated or truncated
        else:
            s1, r, done, _info = step_result

        # Record the transition (the state is the one the action was taken in)
        ep_states.append(s)
        ep_actions.append(a)
        ep_rewards.append(r)

        if done:
            break

        s = s1

    return ep_states, ep_actions, ep_rewards

In [None]:
# Environment the episodes are generated in.
env = gym.make('CartPole-v0')
# Policy network; generate_episode passes its single output through sigmoid to sample actions.
agent = Agent(num_hidden_units=num_hidden_units)
# Binary cross-entropy on raw logits; train_step weights it per sample with the discounted rewards.
loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [None]:
# Adam optimizer that train_step uses to apply gradients to the agent's variables.
optimizer = tf.optimizers.Adam(learning_rate=1e-2)

@tf.function
def train_step(states, actions, rewards):
    """Apply one policy-gradient update to the module-level ``agent``.

    The loss is the binary cross-entropy between the actions that were taken
    and the agent's logits for ``states``, with ``rewards`` supplied as the
    per-sample weight. Gradients of that loss are applied with the
    module-level ``optimizer``.

    Args:
        states: Batch of observations fed to ``agent``.
        actions: Actions taken (0/1), one per row of ``states``; reshaped to a
            column internally.
        rewards: Per-sample weights (the caller passes discounted rewards);
            reshaped to a column internally.

    Returns:
        The loss value computed by ``loss_object`` for this batch, so callers
        can log it. Callers that ignore the return value are unaffected.
    """
    # NOTE(review): the caller passes NumPy arrays whose length changes between
    # calls, which presumably makes tf.function retrace for each new shape;
    # consider an input_signature if tracing shows up as a cost.
    actions = tf.reshape(actions, shape=(-1, 1))
    rewards = tf.reshape(rewards, shape=(-1, 1))
    # Trainable variables are watched by the tape automatically, so no
    # explicit tape.watch(...) call is needed.
    with tf.GradientTape() as tape:
        a_logits = agent(states)
        loss = loss_object(y_true=actions, y_pred=a_logits, sample_weight=rewards)
    gradients = tape.gradient(loss, agent.trainable_variables)
    optimizer.apply_gradients(zip(gradients, agent.trainable_variables))
    return loss

In [None]:
# Per-batch accumulators: one entry per episode until the next training step
buffer_states, buffer_actions, buffer_discounted_rewards = [], [], []

# Sum of undiscounted episode rewards since the last progress line
summed_ep_rewards = 0.0

start_time_s = time()

for episode in range(num_episodes_to_train_on):
    episodes_done = episode + 1

    # Roll out one episode with the current policy
    ep_states, ep_actions, ep_rewards = generate_episode(agent, env)

    # Track the undiscounted total for reporting
    total_reward = sum(ep_rewards)
    summed_ep_rewards += total_reward

    # Discounted reward-to-go, filled in from the final step backwards
    ep_discounted_rewards = np.zeros(len(ep_rewards))
    running_reward = 0.0
    for i in reversed(range(len(ep_rewards))):
        running_reward = running_reward * discount_gamma + ep_rewards[i]
        ep_discounted_rewards[i] = running_reward

    # Stash this episode until the next training step
    buffer_states.append(ep_states)
    buffer_actions.append(ep_actions)
    buffer_discounted_rewards.append(ep_discounted_rewards)

    if episodes_done % num_episodes_per_training == 0:
        # Flatten the per-episode entries into one float32 array per quantity.
        # The whole buffer is used as a single batch, so no shuffling is done.
        states_batch = np.concatenate(buffer_states).astype(np.float32)
        actions_batch = np.concatenate(buffer_actions).astype(np.float32)
        rewards_batch = np.concatenate(buffer_discounted_rewards).astype(np.float32)

        train_step(states_batch, actions_batch, rewards_batch)

        # Start the next batch from empty buffers
        buffer_states, buffer_actions, buffer_discounted_rewards = [], [], []

    if episodes_done % log_freq == 0:
        # Mean undiscounted reward per episode over the last log window
        per_ep_reward = summed_ep_rewards / log_freq
        summed_ep_rewards = 0.0

        # Split the wall-clock time since the start into whole minutes + seconds
        elapsed_time_s = time() - start_time_s
        elapsed_part_m = floor(elapsed_time_s / 60)
        elapsed_part_s = elapsed_time_s - 60 * elapsed_part_m

        log_template = "[{}m {:0.2f}s] episodes: {}\tMean reward: {}"
        print(log_template.format(elapsed_part_m, elapsed_part_s, episodes_done, per_ep_reward))
