In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras import optimizers, losses
from tensorflow.keras import Model
import numpy as np
import gym

In [2]:
env=gym.make('CartPole-v0')
num_action=env.action_space.n
print(num_action)

2




In [3]:
class ActorModel(Model):
    def __init__(self):
        super(ActorModel,self).__init__()
        self.layer_a1=Dense(64,activation='relu')
        self.layer_a2=Dense(64,activation='relu')
        self.logits=Dense(num_action,activation='softmax')
    def call(self,state):
        layer_a1=self.layer_a1(state)
        layer_a2=self.layer_a2(layer_a1)
        logits=self.logits(layer_a2)
        return logits
    

In [4]:
class CriticModel(Model):
    def __init__(self):
        super(CriticModel,self).__init__()
        self.layer_c1=Dense(64,activation='relu')
        self.layer_c2=Dense(64,activation='relu')
        self.value=Dense(1)
        
    def call(self,state):
        layer_c1=self.layer_c1(state)
        layer_c2=self.layer_c2(layer_c1)
        value=self.value(layer_c2)
        return value

In [5]:
class ActorCriticTrain:
    def __init__(self):
        self.lr=0.001
        self.lr2=0.001
        self.df=0.99
        self.en=0.001
        
        self.actor_model=ActorModel()
        self.actor_opt=optimizers.Adam(lr=self.lr, )
        self.critic_model=CriticModel()
        self.critic_opt=optimizers.Adam(lr=self.lr2, )
        
    def actor_loss(self,states,actions,advantages):
        policy=self.actor_model(tf.convert_to_tensor(np.vstack(states),
                                                     dtype=tf.float32))
        entropy=losses.categorical_crossentropy(policy,policy,
                                               from_logits=False)
        ce_loss=losses.SparseCategoricalCrossentropy(from_logits=False)
        log_pi=ce_loss(actions,policy)
        policy_loss=log_pi*np.array(advantages)
        policy_loss=tf.reduce_mean(policy_loss)
        
        return policy_loss-self.en*entropy
    
    def critic_loss(self,states,rewards,dones):
        last_state=states[-1]
        if dones[-1]==True:
            reward_sum=0
        else:
            reward_sum=self.critic_model(tf.convert_to_tensor(last_state[None,:],dtype=tf.float32))
        discounted_rewards=[]
        for reward in rewards[::-1]:
            reward_sum=reward+self.df*reward_sum
            discounted_rewards.append(reward_sum)
        discounted_rewards.reverse()
        discounted_rewards=tf.convert_to_tensor(np.array(discounted_rewards)[:,None],dtype=tf.float32)
        values=self.critic_model(tf.convert_to_tensor(np.vstack(states),dtype=tf.float32))
        error=tf.square(values-discounted_rewards)*0.5
        error=tf.reduce_mean(error)
        return error
    
    def train(self,states,actions,rewards,next_states,dones):
        critic_variable=self.critic_model.trainable_variables
        with tf.GradientTape() as tape_critic:
            tape_critic.watch(critic_variable)
            critic_loss=self.critic_loss(states,rewards,dones)
        critic_grads=tape_critic.gradient(critic_loss,critic_variable)
        self.critic_opt.apply_gradients(zip(critic_grads,critic_variable))
        
        advantages=self.compute_advantages(states,rewards,dones)
        actor_variable=self.actor_model.trainable_variables
        with tf.GradientTape() as tape:
            tape.watch(actor_variable)
            actor_loss=self.actor_loss(states,actions,advantages)
        actor_grads=tape.gradient(actor_loss,actor_variable)
        self.actor_opt.apply_gradients(zip(actor_grads,actor_variable))
        
    def compute_advantages(self,states,rewards,dones):
        last_state=states[-1]
        if dones[-1]==True:
            reward_sum=0
        else:
            reward_sum=self.critic_model(tf.convert_to_tensor(last_state[None, :],dtype=tf.float32))
        discounted_rewards=[]
        for reward in rewards[::-1]:
            reward_sum=reward+self.df*reward_sum
            discounted_rewards.append(reward_sum)
        discounted_rewards.reverse()
        values=self.critic_model(tf.convert_to_tensor(np.vstack(states),dtype=tf.float32))
        advantages=discounted_rewards-values
        return advantages
    def run(self):
        t_end=500
        epi=1000
        train_size=20
        states=[]
        actions=[]
        rewards=[]
        next_states=[]
        dones=[]
        state=env.reset()
        for e in range(epi):
            total_reward=0
            for t in range(t_end):
                policy = self.actor_model(tf.convert_to_tensor(state[None, :], dtype=tf.float32))
                action = tf.squeeze(tf.random.categorical(policy, 1), axis=-1)
                action = np.array(action)[0]
                next_state, reward, done, _ = env.step(action) 
                env.render()
                if t==t_end:
                    done=True
                    reward+=10
                if t<t_end and done:
                    reward=-1
                total_reward+=reward
                states.append(state)
                actions.append(action)
                rewards.append(reward)
                next_states.append(next_state)
                dones.append(done)
            
                state=next_state
            
                if len(states)==train_size or done:
                    self.train(states,actions,rewards,next_states,dones)
                    states = []
                    actions = []
                    rewards = []
                    next_states = []
                    dones = []
                if done:
                    print("e : ", e, " reward : ", total_reward, " step : ", t)
                    env.reset()
                    break

In [6]:
ActorCritic = ActorCriticTrain()
ActorCritic.run()

e :  0  reward :  13.0  step :  14
e :  1  reward :  17.0  step :  18
e :  2  reward :  40.0  step :  41
e :  3  reward :  18.0  step :  19
e :  4  reward :  18.0  step :  19
e :  5  reward :  60.0  step :  61
e :  6  reward :  11.0  step :  12
e :  7  reward :  14.0  step :  15
e :  8  reward :  14.0  step :  15
e :  9  reward :  14.0  step :  15
e :  10  reward :  73.0  step :  74
e :  11  reward :  16.0  step :  17
e :  12  reward :  30.0  step :  31
e :  13  reward :  14.0  step :  15
e :  14  reward :  12.0  step :  13
e :  15  reward :  18.0  step :  19
e :  16  reward :  14.0  step :  15
e :  17  reward :  15.0  step :  16
e :  18  reward :  17.0  step :  18
e :  19  reward :  24.0  step :  25
e :  20  reward :  14.0  step :  15
e :  21  reward :  17.0  step :  18
e :  22  reward :  15.0  step :  16
e :  23  reward :  15.0  step :  16
e :  24  reward :  19.0  step :  20
e :  25  reward :  24.0  step :  25
e :  26  reward :  22.0  step :  23
e :  27  reward :  31.0  step :  32
e 

e :  224  reward :  42.0  step :  43
e :  225  reward :  25.0  step :  26
e :  226  reward :  85.0  step :  86
e :  227  reward :  26.0  step :  27
e :  228  reward :  9.0  step :  10
e :  229  reward :  32.0  step :  33
e :  230  reward :  94.0  step :  95
e :  231  reward :  139.0  step :  140
e :  232  reward :  158.0  step :  159
e :  233  reward :  35.0  step :  36
e :  234  reward :  69.0  step :  70
e :  235  reward :  93.0  step :  94
e :  236  reward :  111.0  step :  112
e :  237  reward :  62.0  step :  63
e :  238  reward :  26.0  step :  27
e :  239  reward :  54.0  step :  55
e :  240  reward :  63.0  step :  64
e :  241  reward :  146.0  step :  147
e :  242  reward :  123.0  step :  124
e :  243  reward :  62.0  step :  63
e :  244  reward :  64.0  step :  65
e :  245  reward :  49.0  step :  50
e :  246  reward :  17.0  step :  18
e :  247  reward :  108.0  step :  109
e :  248  reward :  198.0  step :  199
e :  249  reward :  17.0  step :  18
e :  250  reward :  198.0

KeyboardInterrupt: 