In [28]:
import random
import time
import numpy as np
import gymnasium as gym


from collections import namedtuple, deque
from collections import defaultdict 
import tensorflow as tf
from  tensorflow.keras.layers import Dense
from  tensorflow.keras import Sequential,Input
import tensorflow_probability as tfp
import random

In [29]:
# set up environment
env_kwargs = {
    "id": "LunarLander-v2",
    "continuous": False,
    "gravity" : -8.0,
    "enable_wind": False,
}
env = gym.make(**env_kwargs)

In [30]:
# get some information about the env dimensions
print(f'observation space: {env.observation_space}, high: {env.observation_space.high}, low: {env.observation_space.low}')
print(f'action space: {env.action_space}')

observation space: Box([-1.5       -1.5       -5.        -5.        -3.1415927 -5.
 -0.        -0.       ], [1.5       1.5       5.        5.        3.1415927 5.        1.
 1.       ], (8,), float32), high: [1.5       1.5       5.        5.        3.1415927 5.        1.
 1.       ], low: [-1.5       -1.5       -5.        -5.        -3.1415927 -5.
 -0.        -0.       ]
action space: Discrete(4)


In [31]:
class PolicyNetwork():
    """Deep FFNetwork for Gym classical envs
    consisting of 3 linear layers with ReLU activation

    Args:
        input_dim (int): dimension of input, shape of observation space
        output_dim (int): dimension of output, number of possible actions
        hidden_dim (int): number of units in hidden layer

    """

    def __init__(self, input_dim, output_dim, hidden_dim=128):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        
    def create_model(self):
        self.model = Sequential([
            Input(shape=(self.input_dim,)),
            Dense(self.hidden_dim, activation="sigmoid"),
            Dense(self.output_dim )]
        )
        return self.model

In [32]:
class Trainer:
    """Trainer class for REINFORCE training of Policy NN on classical Gym environments

    Args:
        env_kwargs:  params for env
        model (nn.Module):  deep Q-Network model
        gamma (float):  discount factor gamma for MDP
        lr (float): learning rate for optimizer
        entropy_coeff (float): coefficient for entropy regularization
        clip_grad (int or float):   value for gradient clipping, no clipping if 0
        report_iters (int): report mean results every report_iters iterations
        seed (int): seed for RNG
    """
    def __init__(self,
                 env_kwargs,
                 model,
                 gamma=0.99,
                 lr=0.001,
                 entropy_coeff=0.0,
                 clip_grad=1.0,
                 report_iters=50,
                 seed: int = 1
                 ):
        # env params
        self.env_kwargs = env_kwargs
        self.env = gym.make(**self.env_kwargs)
        self.n_actions = self.env.action_space.n

        # model params
        self.model = model

        self.lr = lr
        self.entropy_coeff = entropy_coeff
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.lr)

        # meta params
        self.gamma = gamma
        self.clip_grad = clip_grad
        self.report_iters = report_iters

        self.rollout = None
        self.seed = seed
        self._rnds = np.random.RandomState(seed)
        self._step = 0
        
        
        
    def train(self, num_episodes):
        """Training of network model"""
        print(f'Start training model for {num_episodes} episodes:')
        reward_report = deque(maxlen=self.report_iters)
        loss_report = deque(maxlen=self.report_iters)
        for i in range(num_episodes):

            reward_report.append(self.do_episode())
            tot_loss, loss, entropy= self.update_model()
            loss_report.append(tot_loss)

            # report
            if i % self.report_iters == 0:
                mean_r = np.mean(reward_report)
                try:
                    mean_l = np.mean(loss_report)
                except TypeError:
                    mean_l = 0
                print(f'eps: {i:04} - mean reward/loss over last {self.report_iters} episodes: '
                      f'{mean_r:.1f}/{mean_l:.4f}, entropy: {entropy:.4f}')

        print('training finished.')
        
    def get_action(self, state):
        """Do forward pass and sample an action from model"""
        logprob = self.model.predict(state)
        
        # We need to sample an action from logprob
        # Doing this gives the balance between exploitation and exploration
        distribution = tfp.distributions.Categorical(logits=logprob)
        action = distribution.sample().numpy().item()
        entropy = distribution.entropy().numpy().item()
        return action, logprob, entropy        
        
       
    def do_episode(self, render=False):
        """Do one episode in environment"""
        if render:
            self.env = gym.make(**self.env_kwargs, render_mode="human")
        stop = False
        total_reward = 0
        state, _ = self.env.reset(seed=self.seed)
        state = tf.convert_to_tensor(state.reshape(-1,8), dtype=tf.float32)

        self.rollout = defaultdict(list)

        while not stop:
            self.rollout['state'].append(state)
            action, logprob, entropy = self.get_action(state)
            next_state, reward, term, trunc, info = self.env.step(action)
            stop = term or trunc

            # render env
            if render:
                self.env.render()
                time.sleep(0.05)

            total_reward += reward

            next_state = tf.convert_to_tensor(next_state.reshape(-1,8), dtype=tf.float32)
            reward = reward

            # save transition to rollout (reward, logprob, entropy, ...)
            self.rollout['reward'].append(reward)
            self.rollout['logprob'].append(logprob)
            self.rollout['entropy'].append(entropy)
            state = next_state

        return total_reward
    
    

    def update_model(self):
        """Do batch update of model parameters"""

        # get transitions from rollout
        rewards = self.rollout['reward']
        logprob = self.rollout['logprob']
        entropy = self.rollout['entropy']
        state = tf.concat(self.rollout['state'], axis=0)
        

        # calculate discounted cumulative reward
        discounted_rewards = np.power(self.gamma, np.arange(len(rewards))) * rewards
        cumulative_rewards = np.cumsum(discounted_rewards[::-1])[::-1]
        
        # score rewards by logprobs
        with tf.GradientTape() as tape:
            # We need to maximize
            # Recompute log probability, initial logprob is disconnected
            score = - self.model(state) * np.mean(cumulative_rewards)  
        gradients = tape.gradient(score, self.model.trainable_weights)
        # update params
        #ToDo: maximize not minimize
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_weights))
        total_loss = None
        loss= None  
         

        return total_loss, loss, np.array(entropy).sum()
    
    
    
    
    
    
    
SEED = 123
N = 1500

# initialize model
model = PolicyNetwork(8,4,128).create_model()

# initialize trainer
trainer = Trainer(
    env_kwargs,
    model,
    seed=SEED,
    lr=0.0001,
    clip_grad=0.08
)

# train...

trainer.train(N)    


Start training model for 1500 episodes:
eps: 0000 - mean reward/loss over last 50 episodes: -91.2/0.0000, entropy: 79.3777


In [None]:
SEED = 123
N = 1500

# initialize model
model = PolicyNetwork(8,4,128).create_model()

# initialize trainer
trainer = Trainer(
    env_kwargs,
    model,
    seed=SEED,
    lr=0.0001,
    clip_grad=0.08
)

# train...
trainer.train(N)

In [None]:
trainer.do_episode(render=True)


-0.8480336584644306
-0.9689037396067306
-0.9690818968023791
-0.9674164884776815
-0.963405975250339
-0.9570794142265697
-0.9484766066740065
-0.9377631421919546
-0.9250610403839801
-0.9104710706787387
-0.8941562395535243
-0.8762142538319893
-0.8567419612757305
-0.8358999594615568
-0.8137722013105986
-0.790458801527052
-0.7660229514430625
-0.740673974222716
-0.71437350526773
-0.6872765209890588
-0.6594836856118889
-0.631065088244668
-0.6020874508883196
-0.5726617210344216
-0.6104168734147208
-0.5103605658588037
-0.48013676560245244
-0.44982304279614027
-0.41944935721903676
-0.38920622851441067
-0.35911648930456863
-0.32932675946287304
-0.299989090678622
-0.27123445629993626
-0.24317616610056803
-0.2160419745934803
-0.19000736958440712
-0.16531723047705782
-0.1422364112531227
-0.12108484372873818
-0.10226846288318825
-0.08619888917263552
-0.07338688648525249
-0.06454717631595486
-0.06039984194336512
-0.061916929857460445
-0.07023547589247414
-0.08678827484317253
-0.11331085934756402
-0.151

-101.5486312802604

In [None]:
# 1. Using Graph.as_default():
g = tf.Graph()
with g.as_default():
  c = tf.constant(5.0)
  assert c.graph is g



In [None]:
# 2. Constructing and making default:
with tf.Graph().as_default() as g:
  c = tf.constant(5.0)
  assert c.graph is g