In [None]:
# The Policy Gradient Model

In [1]:
import import_ipynb
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import tensorflow_probability as tfp
import os

from Utils import *

# Code for this section used the following source for reference: 
# https://github.com/abhisheksuran/Reinforcement_Learning/blob/master/Reinforce_(PG).ipynb 

###########
#Notes 


class Model(tf.keras.Model):
    """Small policy network: Dense(8, relu) followed by Dense(2, softmax).

    The softmax output layer has two units, so each forward pass yields a
    probability for each of two actions.
    """

    def __init__(self):
        super().__init__()
        # Attribute names and creation order are kept stable so previously
        # saved weights continue to map onto the same layers.
        self.d1 = Dense(8, activation='relu')
        self.out = Dense(2, activation='softmax')

    def call(self, x):
        """Run a forward pass and return the softmax action probabilities.

        ``x`` is converted to a tensor first, so array-likes are accepted.
        """
        hidden = self.d1(tf.convert_to_tensor(x))
        return self.out(hidden)
    
class agent():
    """Policy-gradient agent that owns a ``Model`` policy and an Adam optimizer.

    Actions are sampled from the categorical distribution produced by the
    policy network; training minimizes ``-log_prob(action) * return`` one
    step at a time using normalized discounted returns.
    """

    def __init__(self, lr, gamma):
        """Create the policy network and optimizer.

        Args:
            lr: Learning rate passed to ``Adam``.
            gamma: Discount factor applied when accumulating rewards.
        """
        self.model = Model()
        self.opt = Adam(learning_rate=lr)
        self.gamma = gamma

    def act(self, state):
        """Sample an action for ``state`` and return it as a Python ``int``.

        NOTE(review): indexing the sample with ``[0]`` presumes ``state``
        carries a leading batch dimension of size one -- confirm with callers.
        """
        prob = self.model(state, training=False)
        dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
        action = dist.sample()

        # The distribution is built with dtype float32, hence the int() cast.
        return int(action.numpy()[0])

    def a_loss(self, prob, action, reward):
        """Return the policy-gradient loss ``-log_prob(action) * reward``.

        Args:
            prob: Action probabilities produced by the policy network.
            action: The action that was taken.
            reward: The weight for this step (``train`` passes the normalized
                discounted return).
        """
        dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
        log_prob = dist.log_prob(action)
        return -log_prob * reward

    def train(self, states, rewards, actions):
        """Update the policy from one episode of experience.

        Discounted returns are accumulated from the last reward backwards,
        standardized (zero mean, unit standard deviation), and then used to
        weight a separate gradient step for every (state, action) pair.

        Args:
            states: Sequence of states, one per step.
            rewards: Sequence of per-step rewards. It is NOT modified; the
                previous implementation reversed the caller's list in place.
            actions: Sequence of actions taken, one per step.
        """
        # Nothing to learn from an empty episode; also avoids NumPy's
        # mean/std warnings on an empty array.
        if len(rewards) == 0:
            return

        # Walk the rewards back-to-front without touching the caller's list.
        running_return = 0
        returns = []
        for r in reversed(rewards):
            running_return = r + self.gamma * running_return
            returns.append(running_return)
        returns.reverse()

        # Standardize the returns; the epsilon guards against a zero std
        # (e.g. a single-step episode).
        returns = np.asarray(returns) - np.mean(returns)
        returns /= (np.std(returns) + 1e-7)

        total_steps = len(states)
        for step, (state, ret, action) in enumerate(zip(states, returns, actions)):
            progress_bar(step, total_steps)

            with tf.GradientTape() as tape:
                p = self.model(state, training=True)
                loss = self.a_loss(p, action, ret)

            grads = tape.gradient(loss, self.model.trainable_variables)
            self.opt.apply_gradients(zip(grads, self.model.trainable_variables))

    def save_model(self, name):
        """Save the policy network's weights to ``name``."""
        self.model.save_weights(name)

    def load_model(self, name):
        """Load weights from ``name`` if that path is an existing file.

        When the file is absent this is a silent no-op, so a fresh agent
        keeps its initial weights.

        NOTE(review): ``os.path.isfile(name)`` only matches single-file
        weight formats; confirm the ``name`` used with ``save_model``
        produces exactly that file.
        """
        if os.path.isfile(name):
            self.model.load_weights(name)
            # Report success only once the weights were actually loaded.
            print('Model Loaded')
    

importing Jupyter notebook from Utils.ipynb
