# In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random
import matplotlib.pyplot as plt

# Hyperparameters

# Problem dimensions (Acrobot)
STATE_DIM = 6   # length of the state vector fed to the networks
ACTION_DIM = 3  # discrete action space: 3 actions

# Training schedule
MAX_EPISODES = 500   # number of episodes run by the training loop
MAX_TIMESTEPS = 200  # upper bound on steps taken within one episode
BATCH_SIZE = 64      # transitions drawn from the replay buffer per update
BUFFER_SIZE = 100_000  # maximum number of transitions kept in the replay buffer

# Learning parameters
GAMMA = 0.99   # discount factor applied to the bootstrapped next-state value
TAU = 0.005    # target network update rate
LR = 0.001     # learning rate shared by the actor and critic optimizers
EXPLORATION_NOISE = 0.1  # scale of the random noise added to actions for exploration

# Define Actor Network
class Actor(nn.Module):
    """Policy network: maps a state vector to an action vector.

    Three fully connected layers (hidden width 256) with ReLU activations
    and a final tanh, so every output component lies in [-1, 1].

    Args:
        state_dim: Size of the input state vector. Defaults to the
            module-level ``STATE_DIM`` when omitted.
        action_dim: Size of the output action vector. Defaults to the
            module-level ``ACTION_DIM`` when omitted.
    """

    def __init__(self, state_dim=None, action_dim=None):
        # NOTE: this must be the real dunder ``__init__``; a method named
        # ``_init_`` is never invoked on construction, leaving the module
        # without any layers.
        super().__init__()
        # Fall back to the file-level sizes only when the caller gave none.
        if state_dim is None:
            state_dim = STATE_DIM
        if action_dim is None:
            action_dim = ACTION_DIM
        self.fc1 = nn.Linear(state_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, action_dim)

    def forward(self, state):
        """Return the action for ``state``; the last axis must be the state size."""
        x = torch.relu(self.fc1(state))
        x = torch.relu(self.fc2(x))
        action = torch.tanh(self.fc3(x))  # Tanh to keep actions within [-1, 1]
        return action

# Define Critic Network
class Critic(nn.Module):
    """Q-value network: scores a (state, action) pair with a single scalar.

    The state and action are concatenated along the last axis and passed
    through three fully connected layers (hidden width 256, ReLU), ending
    in one unbounded output unit.

    Args:
        state_dim: Size of the state vector. Defaults to the module-level
            ``STATE_DIM`` when omitted.
        action_dim: Size of the action vector. Defaults to the module-level
            ``ACTION_DIM`` when omitted.
    """

    def __init__(self, state_dim=None, action_dim=None):
        # NOTE: this must be the real dunder ``__init__``; a method named
        # ``_init_`` is never invoked on construction, leaving the module
        # without any layers.
        super().__init__()
        # Fall back to the file-level sizes only when the caller gave none.
        if state_dim is None:
            state_dim = STATE_DIM
        if action_dim is None:
            action_dim = ACTION_DIM
        self.fc1 = nn.Linear(state_dim + action_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 1)

    def forward(self, state, action):
        """Return Q(state, action); the output keeps a trailing axis of size 1."""
        x = torch.relu(self.fc1(torch.cat([state, action], dim=-1)))
        x = torch.relu(self.fc2(x))
        q_value = self.fc3(x)
        return q_value

# Define the DDPG Agent
class DDPGAgent:
    """Actor-critic agent with target networks and a replay buffer.

    Holds an online actor/critic pair, a target copy of each, one Adam
    optimizer per online network, and a bounded deque of transitions.

    Args:
        env: Environment handle; stored on the agent as ``self.env``.
        device: Torch device for the networks and sampled batches. When
            omitted, CUDA is used if ``torch.cuda.is_available()`` reports
            it, otherwise the CPU.
    """

    def __init__(self, env, device=None):
        self.env = env
        # Resolve the device here so the agent does not rely on a global.
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = device

        self.actor = Actor().to(self.device)
        self.target_actor = Actor().to(self.device)
        self.critic = Critic().to(self.device)
        self.target_critic = Critic().to(self.device)

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=LR)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=LR)

        self.replay_buffer = deque(maxlen=BUFFER_SIZE)

        # Default tau=1.0: targets start as exact copies of the online nets.
        self.update_target_networks()

    def update_target_networks(self, tau=1.0):
        """Blend the online weights into the target networks.

        Each target parameter becomes ``tau * online + (1 - tau) * target``.

        Args:
            tau: Interpolation factor. ``1.0`` (the default) is a hard copy;
                a small value gives a slowly tracking soft update.
        """
        network_pairs = (
            (self.target_actor, self.actor),
            (self.target_critic, self.critic),
        )
        for target_net, online_net in network_pairs:
            for target_param, param in zip(target_net.parameters(), online_net.parameters()):
                target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)

    def select_action(self, state, noise=0.0):
        """Return the actor's action for one state, optionally perturbed.

        Args:
            state: A single (unbatched) state as an array-like of floats.
            noise: Scale of the standard-normal noise added to the action.

        Returns:
            A NumPy array clipped to [-1, 1].
        """
        state = torch.as_tensor(np.asarray(state, dtype=np.float32)).unsqueeze(0).to(self.device)
        # Inference only: no autograd graph is needed for action selection.
        with torch.no_grad():
            action = self.actor(state).cpu().numpy()[0]
        return np.clip(action + noise * np.random.randn(*action.shape), -1.0, 1.0)

    def store_experience(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay buffer."""
        self.replay_buffer.append((state, action, reward, next_state, done))

    def sample_batch(self):
        """Draw ``BATCH_SIZE`` random transitions and return them as float tensors.

        Returns:
            ``(state, action, reward, next_state, done)`` tensors on the
            agent's device. ``reward`` and ``done`` are one-dimensional.
        """
        batch = random.sample(self.replay_buffer, BATCH_SIZE)
        state, action, reward, next_state, done = zip(*batch)

        def to_tensor(values):
            # Go through one contiguous NumPy array rather than a tuple of
            # arrays; this also turns boolean done flags into 0.0 / 1.0.
            return torch.as_tensor(np.asarray(values, dtype=np.float32)).to(self.device)

        return (
            to_tensor(state),
            to_tensor(action),
            to_tensor(reward),
            to_tensor(next_state),
            to_tensor(done),
        )

    def train(self):
        """Run one critic update, one actor update and a soft target update.

        Does nothing until the replay buffer holds at least ``BATCH_SIZE``
        transitions.
        """
        if len(self.replay_buffer) < BATCH_SIZE:
            return

        state, action, reward, next_state, done = self.sample_batch()
        # The critic returns shape (batch, 1). Give reward/done the same
        # trailing axis; otherwise the target silently broadcasts to
        # (batch, batch) and the loss is computed against the wrong values.
        reward = reward.view(-1, 1)
        done = done.view(-1, 1)

        # Train Critic
        with torch.no_grad():
            next_action = self.target_actor(next_state)
            target_q = reward + GAMMA * (1 - done) * self.target_critic(next_state, next_action)

        current_q = self.critic(state, action)
        critic_loss = nn.MSELoss()(current_q, target_q)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Train Actor: ascend the critic's estimate of the actor's own actions.
        actor_loss = -self.critic(state, self.actor(state)).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Soft update target networks
        self.update_target_networks(tau=TAU)

# Training Loop
# Runs MAX_EPISODES episodes on Acrobot, training the agent after every
# environment step and plotting the reward history every 50 episodes.
if __name__ == "__main__":
    env = gym.make("Acrobot-v1")
    agent = DDPGAgent(env)

    all_rewards = []
    try:
        for episode in range(MAX_EPISODES):
            # Newer Gym releases return (observation, info) from reset();
            # older ones return the observation alone. Accept both.
            reset_result = env.reset()
            state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

            total_reward = 0
            for t in range(MAX_TIMESTEPS):
                action = agent.select_action(state, noise=EXPLORATION_NOISE)

                # The environment expects a discrete action index, while the
                # actor emits one score per action: act on the best-scoring one.
                step_result = env.step(int(np.argmax(action)))
                if len(step_result) == 5:
                    # Newer API: termination and time-limit truncation are separate.
                    next_state, reward, terminated, truncated, _ = step_result
                    done = terminated or truncated
                else:
                    next_state, reward, done, _ = step_result
                    terminated = done

                # Store the raw action vector (the critic's input) and only
                # the true-termination flag, so that a time-limit cut-off
                # does not suppress bootstrapping where the two are distinct.
                agent.store_experience(state, action, reward, next_state, terminated)
                agent.train()

                state = next_state
                total_reward += reward
                if done:
                    break

            all_rewards.append(total_reward)
            print(f"Episode {episode+1}/{MAX_EPISODES}, Reward: {total_reward}")

            # Plot rewards periodically
            if episode % 50 == 0:
                plt.plot(all_rewards)
                plt.xlabel("Episode")
                plt.ylabel("Reward")
                plt.show()
    finally:
        # Close environment even if training is interrupted or fails.
        env.close()