In [None]:
# The Policy gradient model

In [4]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense 
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
from tensorflow.keras.initializers import HeNormal, GlorotNormal
import tensorflow_probability as tfp

# Code for this section used the following source for reference: 
# https://github.com/abhisheksuran/Reinforcement_Learning/blob/master/Reinforce_(PG).ipynb 

class Model(Model):
    """Three-layer dense policy network ending in a 2-way softmax.

    NOTE: this class deliberately keeps the name ``Model`` (shadowing the
    imported Keras base class it derives from) because other code in this
    file instantiates it as ``Model()``.

    Layers, in the order they are applied:
      * ``d1`` -- 100 ReLU units, He-normal initialised (seed 2)
      * ``d2`` -- 25 ReLU units, He-normal initialised (seed 2)
      * ``d3`` -- 2 softmax units, Glorot-normal initialised (seed 2)
    """

    def __init__(self):
        super().__init__()
        # Fixed seeds keep the initial weights reproducible across runs.
        self.d1 = Dense(
            100,
            activation='relu',
            kernel_initializer=HeNormal(seed=2),
        )
        self.d2 = Dense(
            25,
            activation='relu',
            kernel_initializer=HeNormal(seed=2),
        )
        self.d3 = Dense(
            2,
            activation='softmax',
            kernel_initializer=GlorotNormal(seed=2),
        )

    def call(self, x):
        """Run ``x`` through d1 -> d2 -> d3 and return the softmax output."""
        hidden = self.d2(self.d1(x))
        return self.d3(hidden)

    
class agent():
    """REINFORCE-style policy-gradient agent built around the policy ``Model``.

    Args:
        learning_rate: Step size handed to the Adam optimizer.
        gamma: Discount factor used when turning per-step rewards into
            returns. The default of 1 means no discounting.
    """

    def __init__(self, learning_rate=0.001, gamma=1):
        self.model = Model()
        self.opt = Adam(learning_rate=learning_rate)
        self.gamma = gamma

    def act(self, state):
        """Sample an action from the policy for ``state``.

        Args:
            state: Input passed straight to the model. It is presumably
                already batched (e.g. shape ``(1, n_features)``) -- TODO
                confirm against the caller.

        Returns:
            The first sampled action as a Python ``int``.
        """
        prob = self.model(state)
        dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
        action = dist.sample()

        # The sample is produced as float32; take the first entry and
        # convert it to a plain int.
        return int(action.numpy()[0])

    def a_loss(self, prob, action, reward):
        """Return the policy-gradient loss ``-log pi(action) * reward``.

        Args:
            prob: Action probabilities produced by the model.
            action: The action that was taken.
            reward: The return used to weight the log-probability.
        """
        dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
        log_prob = dist.log_prob(action)
        loss = -log_prob * reward

        return loss

    def _discounted_returns(self, rewards):
        """Compute the discounted reward-to-go for every step of an episode.

        ``rewards`` is read back-to-front without being modified, so the
        caller's sequence keeps its original order.

        Returns:
            A new list ``G`` of the same length as ``rewards`` where
            ``G[t] = rewards[t] + gamma * G[t + 1]`` (and ``G[T] = 0``).
        """
        running = 0
        returns = []
        for r in reversed(rewards):
            running = r + self.gamma * running
            returns.append(running)
        # Accumulated last-step-first; flip once to restore episode order.
        returns.reverse()
        return returns

    def train(self, states, rewards, actions):
        """Apply one gradient step per recorded step of a finished episode.

        The discounted returns are computed for the whole episode first and
        only then used for the updates, so each ``(state, action)`` pair is
        weighted by its own reward-to-go and visited exactly once.

        Args:
            states: Per-step model inputs, in episode order. Each is passed
                to the model as-is, so it is presumably already batched --
                TODO confirm against the caller.
            rewards: Per-step rewards, in episode order. Not mutated.
            actions: Per-step actions taken, in episode order.
        """
        discount_rewards = self._discounted_returns(rewards)

        for state, reward, action in zip(states, discount_rewards, actions):
            with tf.GradientTape() as tape:
                p = self.model(state, training=True)
                loss = self.a_loss(p, action, reward)

            grads = tape.gradient(loss, self.model.trainable_variables)
            self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
    