In [None]:
# Use OpenAI Gym to create an environment for the DQN agent.

import gym

# Instantiate the CartPole-v1 task that the agent will interact with
env = gym.make('CartPole-v1')

# Report the observation and action spaces exposed by the environment
for label, space in (
    ("State Space:", env.observation_space),
    ("Action Space:", env.action_space),
):
    print(label, space)

In [None]:
# Define a neural network to approximate the Q-values for state-action pairs.

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Build the Q-Network
def build_q_network(input_shape, action_space):
    """Create and compile the feed-forward network that estimates Q-values.

    Args:
        input_shape: Shape tuple forwarded as ``input_shape`` to the first
            Dense layer.
        action_space: Number of units in the linear output layer (one output
            per action).

    Returns:
        A compiled Keras ``Sequential`` model with two 24-unit ReLU hidden
        layers, trained with the Adam optimizer and mean-squared-error loss.
    """
    network = Sequential()
    # Two small ReLU hidden layers; the first one also declares the input shape.
    network.add(Dense(24, activation='relu', input_shape=input_shape))
    network.add(Dense(24, activation='relu'))
    # Linear head: raw (unbounded) Q-value estimate for each action.
    network.add(Dense(action_space, activation='linear'))
    network.compile(optimizer='adam', loss='mse')
    return network

# Read the network dimensions from the environment, then build the Q-Network
state_shape, action_space = env.observation_space.shape, env.action_space.n
q_network = build_q_network(input_shape=state_shape, action_space=action_space)

# Display the layer-by-layer architecture and parameter counts
q_network.summary()

In [None]:
# Implement a replay buffer to store experience tuples (state, action, reward, next_state, done).

import random
import numpy as np

class ReplayBuffer:
    """Fixed-capacity FIFO store of experience tuples for experience replay.

    Once ``max_size`` experiences are stored, adding another one discards the
    oldest entry.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, max_size):
        """Create an empty buffer.

        Args:
            max_size: Maximum number of experiences to retain (positive int).

        Raises:
            ValueError: If ``max_size`` is not positive.
        """
        # Local import so this class does not depend on edits to the import cell.
        from collections import deque

        if max_size <= 0:
            # Previously a non-positive capacity only surfaced later as an
            # IndexError from popping an empty list on the first add().
            raise ValueError("max_size must be a positive integer")

        # Capacity of the buffer.
        self.max_size = max_size
        # deque(maxlen=...) evicts the oldest item in O(1) on append, replacing
        # the O(n) list.pop(0) shuffle. It still supports len(), iteration and
        # integer indexing, but not slicing.
        self.buffer = deque(maxlen=max_size)

    def __len__(self):
        """Return the number of experiences currently stored."""
        return len(self.buffer)

    def add(self, experience):
        """Append ``experience``, dropping the oldest entry if the buffer is full."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct experiences chosen uniformly at random.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                experiences (raised by ``random.sample``).
        """
        return random.sample(self.buffer, batch_size)

# Initialize the replay buffer
replay_buffer = ReplayBuffer(max_size=2000)

In [None]:
# Train the DQN agent using experience replay and the epsilon-greedy strategy for action selection.

# Hyperparameters
episodes = 500
batch_size = 64
gamma = 0.99  # Discount factor
epsilon = 1.0  # Initial exploration rate
epsilon_min = 0.01  # Minimum exploration rate
epsilon_decay = 0.995  # Decay rate for exploration (applied once per episode)
max_steps = 500  # Maximum steps per episode

# Training loop
for episode in range(episodes):
    # Gym >= 0.26 returns (observation, info) from reset(); older releases
    # return the bare observation. Accept either form.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    state = np.reshape(state, [1, state_shape[0]])
    total_reward = 0

    for step in range(max_steps):
        # Epsilon-greedy action selection
        if np.random.rand() <= epsilon:
            action = env.action_space.sample()  # Explore
        else:
            # verbose=0 suppresses the per-call progress bar that newer
            # TensorFlow releases print from predict().
            action = int(np.argmax(q_network.predict(state, verbose=0)[0]))  # Exploit

        # Take action and observe result. Gym >= 0.26 returns a 5-tuple with
        # separate terminated/truncated flags; older releases return a 4-tuple.
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result
        next_state = np.reshape(next_state, [1, state_shape[0]])
        total_reward += reward

        # Add experience to replay buffer
        replay_buffer.add((state, action, reward, next_state, done))
        state = next_state

        # Train the Q-network with samples from the replay buffer
        if len(replay_buffer.buffer) > batch_size:
            minibatch = replay_buffer.sample(batch_size)
            states, actions, rewards, next_states, dones = zip(*minibatch)

            # Each stored state is (1, state_dim); stack them into (batch, state_dim).
            states = np.vstack(states)
            next_states = np.vstack(next_states)
            actions = np.asarray(actions, dtype=np.int64)
            rewards = np.asarray(rewards, dtype=np.float32)
            dones = np.asarray(dones, dtype=bool)

            # Two batched forward passes replace 2 * batch_size single-row
            # predict() calls.
            next_q = q_network.predict(next_states, verbose=0)
            q_targets = q_network.predict(states, verbose=0)

            # Bellman target: r for terminal transitions, otherwise
            # r + gamma * max_a' Q(s', a'). Only the taken action's entry is
            # overwritten, so the other outputs contribute zero error.
            bootstrap = np.where(dones, 0.0, np.amax(next_q, axis=1))
            q_targets[np.arange(len(minibatch)), actions] = rewards + gamma * bootstrap

            # One gradient step on the whole minibatch (previously batch_size
            # sequential single-sample fit() calls).
            q_network.fit(states, q_targets, batch_size=len(minibatch), epochs=1, verbose=0)

        # End episode if done
        if done:
            print(f"Episode: {episode + 1}/{episodes}, Score: {total_reward}, Epsilon: {epsilon:.2f}")
            break

    # Decay epsilon
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

In [None]:
# Test the trained agent by letting it play in the environment without exploration.

# try/finally guarantees the environment (and any render window) is released
# even if evaluation is interrupted or raises.
try:
    for episode in range(5):  # Test for 5 episodes
        # Gym >= 0.26 returns (observation, info) from reset(); older releases
        # return the bare observation. Accept either form.
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        state = np.reshape(state, [1, state_shape[0]])
        total_reward = 0
        done = False

        while not done:
            env.render()  # Render the environment
            # Always act greedily (no exploration); verbose=0 suppresses the
            # per-call progress bar that newer TensorFlow releases print.
            action = int(np.argmax(q_network.predict(state, verbose=0)[0]))

            # Gym >= 0.26 returns a 5-tuple with separate terminated/truncated
            # flags; older releases return a 4-tuple with a single done flag.
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result

            state = np.reshape(next_state, [1, state_shape[0]])
            total_reward += reward

        print(f"Test Episode: {episode + 1}, Score: {total_reward}")
finally:
    env.close()