In [6]:
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
import gym

In [2]:
from tensorflow.keras import layers
from tensorflow import keras

In [3]:
class PolicyNet(keras.Model):
    def __init__(self, action_dim=1):
        super(PolicyNet, self).__init__()
        self.fc1 = layers.Dense(24, activation="relu")
        self.fc2 = layers.Dense(36, activation="relu")
        self.fc3 = layers.Dense(action_dim, activation="softmax")
        
    def call(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x
    
    def process(self, observations):
        # Process batch observations using `call(x)` behind-the-scenes
        action_probabilities = self.predict_on_batch(observations)
        return action_probabilities
        

In [9]:
class Agent(object):
    def __init__(self, action_dim=1):
        """Agent with a neural-network policy
        
        Args: 
            action_dim (int): Action dimension
        """
        self.policy_net = PolicyNet(action_dim=action_dim)
        self.optimizer = keras.optimizers.Adam(learning_rate=1e-3)
        self.gamma = 0.99
        
    def policy(self, observation):
        observation = observation.reshape(1, -1)
        observation = tf.convert_to_tensor(observation, dtype=tf.float32)
        action_logits = self.policy_net(observation)
        action = tf.random.categorical(tf.math.log(action_logits), num_samples=1)
        return action
    
    def get_action(self, observation):
        action = self.policy(observation).numpy()
        return action.squeeze()
    
    def learn(self, states, rewards, actions):
        discounted_reward = 0
        discounted_rewards = []
        rewards.reverse()
        for r in rewards:
            discounted_reward = r + self.gamma * discounted_reward
            discounted_rewards.append(discounted_reward)
        discounted_rewards.reverse() 
        
        for state, reward, action in zip(states, discounted_rewards, actions):
            with tf.GradientTape() as tape:
                action_probabilities = self.policy_net(np.array([state]), training=True)
                loss = self.loss(action_probabilities, action, reward)
            grads = tape.gradient(loss, self.policy_net.trainable_variables)
            self.optimizer.apply_gradients(
                zip(grads, self.policy_net.trainable_variables)
            )
            
    def loss(self, action_probabilities, action, reward):
        dist = tfp.distributions.Categorical(
            probs = action_probabilities, dtype=tf.float32
        )
        log_prob = dist.log_prob(action)
        loss = -log_prob * reward
        return loss 

In [7]:
def train(agent: Agent, env: gym.Env, episodes: int, render=True):
    """
    Train `agent` in `env` for `episodes`
    
    Args:
        agent(Agent) : Agent to train
        env (gym.Env): Environment to train the agent 
        episodes (int): Number of episodes to train
        render (bool) : True=Enable/ False=Disable rendering
    """
    
    for episode in range(episodes):
        done = False 
        state = env.reset()
        total_reward = 0
        rewards = []
        states = []
        actions = []
        while not done: 
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            rewards.append(reward)
            states.append(state)
            actions.append(action)
            state = next_state
            total_reward += reward 
            
            if render:
                env.render()
            if done:
                agent.learn(states, rewards, actions)
                print("\n")
            print(f"Episodes #{episode} ep_reward: {total_reward}", end="\r")
                

In [11]:
agent = Agent()
episodes = 200
env = gym.make("MountainCar-v0")
train(agent, env, episodes, render=False)
env.close()

Episodes #0 ep_reward: -199.0

Episodes #1 ep_reward: -199.0

Episodes #2 ep_reward: -199.0

Episodes #3 ep_reward: -199.0

Episodes #4 ep_reward: -199.0

Episodes #5 ep_reward: -199.0

Episodes #6 ep_reward: -199.0

Episodes #7 ep_reward: -199.0

Episodes #8 ep_reward: -199.0

Episodes #9 ep_reward: -199.0

Episodes #10 ep_reward: -199.0

Episodes #11 ep_reward: -199.0

Episodes #12 ep_reward: -199.0

Episodes #13 ep_reward: -199.0

Episodes #14 ep_reward: -199.0

Episodes #15 ep_reward: -199.0

Episodes #16 ep_reward: -199.0

Episodes #17 ep_reward: -199.0

Episodes #18 ep_reward: -199.0

Episodes #19 ep_reward: -199.0

Episodes #20 ep_reward: -199.0

Episodes #21 ep_reward: -199.0

Episodes #22 ep_reward: -199.0

Episodes #23 ep_reward: -199.0

Episodes #24 ep_reward: -199.0

Episodes #25 ep_reward: -199.0

Episodes #26 ep_reward: -199.0

Episodes #27 ep_reward: -199.0

Episodes #28 ep_reward: -199.0

Episodes #29 ep_reward: -199.0

Episodes #30 ep_reward: -199.0

Episodes #31 ep_re