# Steps in the REINFORCE Algorithm
1. Collect trajectories by interacting with the environment.
2. Compute returns G_t
3. Compute policy gradient updates using ∇𝜃𝐽(𝜃)
4. Update the policy network using gradient ascent.

In [2]:
%pip install gym

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple, https://pypi.garenanow.com/
Collecting gym
  Using cached https://pypi.garenanow.com/mirror/packages/ab/b1/eb05a423eb801ab7d0715d6a3b28d92589e30b437052553df19ca2087240/gym-0.26.2.tar.gz (721 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting cloudpickle>=1.2.0
  Using cached https://pypi.garenanow.com/mirror/packages/7e/e8/64c37fadfc2816a7701fa8a6ed8d87327c7d54eacfbfb6edab14a2f2be75/cloudpickle-3.1.1-py3-none-any.whl (20 kB)
Collecting gym_notices>=0.0.4
  Using cached https://pypi.garenanow.com/mirror/packages/25/26/d786c6bec30fe6110fd3d22c9a273a2a0e56c0b73b93e25ea1af5a53243b/gym_notices-0.0.8-py3-none-any.whl (3.0 kB)
Building wheels for collected packages: gym
  Building wheel for gym (pyproject.toml) ... [?25ldone
[?25h  Created wheel for gym: filename=gym-0.26.2-py3-none-any.whl size=827728 

In [7]:
%pip install tensorflow

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple, https://pypi.garenanow.com/
Collecting tensorflow
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/05/c7/6a1be731753934a1965fa7d751dab30d5cdea1800ca34e0fe57c1d40ac35/tensorflow-2.16.2-cp39-cp39-macosx_10_15_x86_64.whl (259.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.5/259.5 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25hCollecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3
  Downloading https://pypi.garenanow.com/mirror/packages/48/d5/cccc7e82bbda9909ced3e7a441a24205ea07fea4ce23a772743c0c7611fa/protobuf-4.25.6.tar.gz (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.6/380.6 KB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting wrapt>=1.11.0
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/a2/a9/712a53f8f4f4545768ac53

In [None]:
import numpy as np
import gym
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Hyperparameters
learning_rate = 0.01
gamma = 0.99
episodes = 1000

# Create Policy Network (Actor)
class PolicyNetwork(keras.Model):
    def __init__(self, state_dim, action_dim, hidden_units=128):
        super(PolicyNetwork, self).__init__()
        self.fc1 = layers.Dense(hidden_units, activation="relu")
        self.fc2 = layers.Dense(action_dim, activation="softmax")

    def call(self, state):
        x = self.fc1(state)
        return self.fc2(x)  # Output action probabilities

# Select action based on policy
def select_action(model, state):    
    # print(f"state={state}")
    state = tf.convert_to_tensor(state, dtype=tf.float32)
    state = tf.expand_dims(state, axis=0)  # Add batch dimension
    action_probs = model(state)
    action = np.random.choice(len(action_probs.numpy()[0]), p=action_probs.numpy()[0])
    return action, tf.math.log(action_probs[0, action])

# Compute discounted returns
def compute_returns(rewards, gamma=0.99):
    returns = []
    G = 0
    for r in reversed(rewards):
        G = r + gamma * G
        returns.insert(0, G)
    return np.array(returns)


In [None]:

# Train Policy Gradient (REINFORCE)
def train_policy_gradient(env, model, optimizer, episodes=1000, gamma=0.99):
    for episode in range(episodes):
        state, _ = env.reset()
        done = False
        log_probs = []
        rewards = []
        with tf.GradientTape() as tape:
            while not done:
                # action, log_prob = select_action(model, state)
                state = tf.convert_to_tensor(state, dtype=tf.float32)
                state = tf.expand_dims(state, axis=0)  # Add batch dimension
                action_probs = model(state)
                action = np.random.choice(len(action_probs.numpy()[0]), p=action_probs.numpy()[0])
                log_prob = tf.math.log(action_probs[0, action])

                next_state, reward, done, truncated, _ = env.step(action)            

                log_probs.append(log_prob)
                rewards.append(reward)
                state = next_state

            # Compute discounted returns
            returns = compute_returns(rewards, gamma)
            returns = tf.convert_to_tensor(returns, dtype=tf.float32)

            # Compute policy loss
            loss = -tf.reduce_sum(tf.stack(log_probs) * returns)

        print(f"loss = {loss}")

        # Update policy network
        grads = tape.gradient(loss, model.trainable_variables)
        # Debugging: Check if any gradient is None
        # for i, g in enumerate(grads):
        #     if g is None:
        #         print(f"Gradient for variable {i} is None!")
                        
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        if episode % 2 == 0:
            print(f"Episode {episode}: Total Reward = {sum(rewards)}")



In [33]:

# Main function
env = gym.make("CartPole-v1")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

policy_net = PolicyNetwork(state_dim, action_dim)
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)

In [None]:
train_policy_gradient(env, policy_net, optimizer, episodes=10)

loss = 2736.755859375
Episode 0: Total Reward = 155.0
loss = 3608.968017578125
loss = 3960.79345703125
loss = 3742.524658203125
loss = 3691.3115234375
loss = 3220.919189453125
loss = 3519.19873046875
loss = 3837.348388671875
loss = 3537.975341796875
loss = 4933.89208984375
loss = 4287.88037109375
loss = 3931.04052734375
loss = 3988.744873046875
loss = 4337.841796875
loss = 4570.18017578125
loss = 5460.9638671875
loss = 6987.01416015625
loss = 7196.244140625
loss = 8701.853515625
loss = 11747.6865234375
loss = 18236.8828125
loss = 30380.0078125
loss = 224208.34375


KeyboardInterrupt: 

In [16]:
state, _ = env.reset()
tf.convert_to_tensor(state)

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([ 0.00751302, -0.02676319, -0.01451282, -0.04946909], dtype=float32)>