In [20]:
import numpy as np
import gym
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

class PolicyGradientAgent:
    def __init__(self, state_size, action_size, learning_rate=0.01, gamma=0.99):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.build_model()

    def build_model(self):
        input_layer = Input(shape=(self.state_size,))
        dense1 = Dense(24, activation='relu')(input_layer)
        dense2 = Dense(24, activation='relu')(dense1)
        output_probs = Dense(self.action_size, activation='softmax')(dense2)

        # Create the model
        self.model = Model(inputs=input_layer, outputs=output_probs)

        # Define optimizer and compile model
        optimizer = Adam(lr=self.learning_rate)
        self.model.compile(optimizer=optimizer, loss='categorical_crossentropy')

    def select_action(self, state):
        state = np.reshape(state, [1, self.state_size])
        probabilities = self.model.predict(state)[0]
        action = np.random.choice(self.action_size, p=probabilities)
        return action

    def train(self, states, actions, discounted_rewards):
        actions = tf.keras.utils.to_categorical(actions, num_classes=self.action_size)
        self.model.train_on_batch(states, actions, sample_weight=discounted_rewards)



env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Initialize Policy Gradient agent
pg_agent = PolicyGradientAgent(state_size, action_size)

num_episodes = 1000
episode_rewards_pg = []

for episode in range(num_episodes):
    state = env.reset()
    states, actions, rewards = [], [], []
    total_reward = 0
    done = False
    while not done:
        action = pg_agent.select_action(state)
        next_state, reward, done, _ = env.step(action)
        
        Record states, actions and rewards
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        
        total_reward += reward
        state = next_state
    discounted_rewards = pg_agent.discount_rewards(rewards)
    pg_agent.train(np.array(states), np.array(actions), discounted_rewards)
    episode_rewards_pg.append(total_reward)
    if (episode + 1) % 100 == 0:
        print(f"Episode: {episode + 1}, Total Reward: {total_reward}")
avg_reward_pg = np.mean(episode_rewards_pg)
print(f"Policy Gradient Agent Average Reward: {avg_reward_pg}")
env.close()


IndentationError: expected an indented block after 'while' statement on line 59 (2730791037.py, line 76)