In [None]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


In [None]:
# Instantiate the CartPole-v1 task through gym's registry.
env = gym.make("CartPole-v1")

# Report the action and observation spaces exposed by the environment.
for label, space in (
    ("Action Space:", env.action_space),
    ("Observation Space:", env.observation_space),
):
    print(label, space)


In [None]:
def create_policy_model(input_shape, action_space):
    """Build and compile the softmax policy network.

    The network stacks two 24-unit ReLU Dense layers followed by a softmax
    Dense layer, and is compiled with Adam (learning rate 0.01) and the
    categorical cross-entropy loss.

    Args:
        input_shape: Value passed as ``input_shape`` to the first Dense layer.
        action_space: Number of units in the softmax output layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    policy_net = Sequential()
    policy_net.add(Dense(24, activation='relu', input_shape=input_shape))
    policy_net.add(Dense(24, activation='relu'))
    # Softmax head: one output probability per action.
    policy_net.add(Dense(action_space, activation='softmax'))

    optimizer = Adam(learning_rate=0.01)
    policy_net.compile(optimizer=optimizer, loss='categorical_crossentropy')
    return policy_net

# Build the policy network: input width is the first dimension of the
# observation space, output width is the number of discrete actions.
policy_model = create_policy_model(
    (env.observation_space.shape[0],),
    env.action_space.n,
)
print("Policy Model Summary:")
policy_model.summary()


In [None]:
def sample_action(policy, state):
    """Draw a random action index from the policy's predicted distribution.

    Args:
        policy: Object exposing ``predict(state, verbose=0)``; only the first
            row of its output is used as the action probabilities.
        state: Input forwarded unchanged to ``policy.predict``.

    Returns:
        An index in ``range(len(probabilities))`` sampled with
        ``np.random.choice`` using those probabilities.
    """
    probs = policy.predict(state, verbose=0)[0]
    n_actions = len(probs)
    return np.random.choice(n_actions, p=probs)

def compute_discounted_rewards(rewards, gamma=0.99):
    """Compute the discounted return-to-go for every step of a reward sequence.

    Walks the sequence from the last step to the first, applying
    ``G[t] = rewards[t] + gamma * G[t + 1]`` with the return after the final
    step taken as 0.

    Args:
        rewards: Indexable sequence of per-step rewards.
        gamma: Discount factor applied per step.

    Returns:
        A float32 NumPy array with the same shape as ``rewards``.
    """
    returns = np.zeros(np.shape(rewards), dtype=np.float32)
    running_return = 0
    # Iterate backwards so each step can reuse the return of its successor.
    for step in range(len(rewards) - 1, -1, -1):
        running_return = rewards[step] + gamma * running_return
        returns[step] = running_return
    return returns


In [None]:
# Training parameters
episodes = 500
gamma = 0.99  # Discount factor
obs_dim = env.observation_space.shape[0]  # Width of one observation vector

# REINFORCE-style loop: roll out one full episode with the current policy,
# then fit the policy on the taken actions weighted by normalized returns.
for episode in range(episodes):
    # gym >= 0.26 / gymnasium return (observation, info) from reset();
    # older gym returns the bare observation. Accept both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    state = np.reshape(state, [1, obs_dim])

    episode_states, episode_actions, episode_rewards = [], [], []
    done = False
    total_reward = 0

    # Generate an episode
    while not done:
        # Sample an action
        action = sample_action(policy_model, state)

        # Take the action. Newer APIs return
        # (obs, reward, terminated, truncated, info); older ones return
        # (obs, reward, done, info).
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result
        next_state = np.reshape(next_state, [1, obs_dim])

        # Store episode data
        episode_states.append(state)
        episode_actions.append(action)
        episode_rewards.append(reward)

        state = next_state
        total_reward += reward

    # Compute discounted rewards and normalize them to zero mean / unit std.
    # The small epsilon keeps the division finite when every return in the
    # episode is identical (e.g. a single-step episode), which would
    # otherwise produce NaN sample weights.
    discounted_rewards = compute_discounted_rewards(episode_rewards, gamma)
    discounted_rewards = (discounted_rewards - np.mean(discounted_rewards)) / (
        np.std(discounted_rewards) + 1e-8
    )

    # Convert data to arrays (states are stacked into one row per step)
    episode_states = np.vstack(episode_states)
    episode_actions = np.array(episode_actions)

    # Train the policy model: cross-entropy on the taken actions, weighted
    # per step by the normalized return.
    actions_one_hot = tf.keras.utils.to_categorical(episode_actions, num_classes=env.action_space.n)
    policy_model.fit(episode_states, actions_one_hot, sample_weight=discounted_rewards, verbose=0)

    # Log progress
    print(f"Episode {episode + 1}/{episodes}, Total Reward: {total_reward}")

env.close()


In [None]:
# Roll out one episode with the trained policy, always taking the most
# probable action, and report the undiscounted total reward.

# gym >= 0.26 / gymnasium return (observation, info) from reset();
# older gym returns the bare observation. Accept both.
reset_result = env.reset()
state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
state = np.reshape(state, [1, env.observation_space.shape[0]])
done = False
total_reward = 0

while not done:
    # Pick the action with the highest predicted probability
    action = np.argmax(policy_model.predict(state, verbose=0)[0])

    # Newer APIs return (obs, reward, terminated, truncated, info);
    # older ones return (obs, reward, done, info).
    step_result = env.step(action)
    if len(step_result) == 5:
        state, reward, terminated, truncated, _ = step_result
        done = terminated or truncated
    else:
        state, reward, done, _ = step_result
    state = np.reshape(state, [1, env.observation_space.shape[0]])
    total_reward += reward
    env.render()

print(f"Total Reward from Trained Policy: {total_reward}")
env.close()


In [None]:
import matplotlib.pyplot as plt

# rewards_log is meant to hold one total reward per training episode.
# NOTE(review): as written it wraps the current value of total_reward in a
# one-element list, so the figure shows a single point — replace it with a
# per-episode list gathered during training.
rewards_log = [total_reward]  # Replace with actual list during training

# Draw on the current axes and label the figure in a single call.
ax = plt.gca()
ax.plot(rewards_log)
ax.set(title="Training Progress", xlabel="Episode", ylabel="Total Reward")
plt.show()
