In [1]:
!pip install matplotlib



In [None]:
import gym
import random
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from collections import deque

# Define the Q-network
class QNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_size)

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

# Define the replay buffer
class ReplayBuffer:
    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        states, actions, rewards, next_states, dones = zip(*experiences)
        return (
            torch.tensor(states, dtype=torch.float32),
            torch.tensor(actions, dtype=torch.int64),
            torch.tensor(rewards, dtype=torch.float32),
            torch.tensor(next_states, dtype=torch.float32),
            torch.tensor(dones, dtype=torch.uint8)
        )

    def __len__(self):
        return len(self.memory)

# Define the Double DQN agent
class DoubleDQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.qnetwork_local = QNetwork(state_size, action_size)
        self.qnetwork_target = QNetwork(state_size, action_size)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=0.001)
        self.criterion = nn.MSELoss()
        self.replay_buffer = ReplayBuffer(buffer_size=10000, batch_size=64)
        self.gamma = 0.99
        self.tau = 1e-3
        self.update_every = 4
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0 and len(self.replay_buffer) > self.replay_buffer.batch_size:
            experiences = self.replay_buffer.sample()
            self.learn(experiences)

    def act(self, state, eps=0.):
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # Double DQN update
        Q_local_next = self.qnetwork_local(next_states).detach()
        best_actions = torch.argmax(Q_local_next, dim=1).unsqueeze(1)
        Q_targets_next = self.qnetwork_target(next_states).detach().gather(1, best_actions)
        Q_targets = rewards.unsqueeze(1) + (self.gamma * Q_targets_next * (1 - dones.unsqueeze(1)))

        Q_expected = self.qnetwork_local(states).gather(1, actions.unsqueeze(1))

        loss = self.criterion(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.soft_update(self.qnetwork_local, self.qnetwork_target)

    def soft_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)

# Create the environment
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DoubleDQNAgent(state_size, action_size)

# Train the agent
n_episodes = 1000
max_t = 200
eps_start = 1.0
eps_end = 0.01
eps_decay = 0.995
rewards_list = []

for i_episode in range(1, n_episodes + 1):
    state = env.reset()
    total_reward = 0
    for t in range(max_t):
        action = agent.act(state, eps_start)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward
        if done:
            break
    eps_start = max(eps_end, eps_decay * eps_start)
    rewards_list.append(total_reward)
    if i_episode % 100 == 0:
        print(f"Episode {i_episode}/{n_episodes}, Average Reward: {np.mean(rewards_list[-100:])}")

# Plot the rewards
plt.plot(rewards_list)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Training Progress')
plt.show()

# Visualize some episodes
for i in range(3):
    state = env.reset()
    total_reward = 0
    done = False
    while not done:
        action = agent.act(state)
        state, reward, done, _ = env.step(action)
        total_reward += reward
        env.render()
    print(f"Episode {i+1}, Total Reward: {total_reward}")

env.close()


  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):
  torch.tensor(states, dtype=torch.float32),


Episode 100/1000, Average Reward: 20.32
Episode 200/1000, Average Reward: 16.28
Episode 300/1000, Average Reward: 13.96


# Initialize replay memory D with capacity N
D = ReplayMemory(capacity=N)

# Initialize Q-network with random weights
Q = NeuralNetwork()

# Initialize target Q-network with the same weights as Q
Q_target = NeuralNetwork()

# Initialize exploration rate
epsilon = 1.0

# Main training loop
for episode in range(num_episodes):
    # Initialize state
    state = env.reset()
    
    # Initialize episode-specific variables
    total_reward = 0
    done = False
    
    # While episode is not done
    while not done:
        # Epsilon-greedy action selection
        if random() < epsilon:
            action = env.action_space.sample()  # Explore: choose random action
        else:
            # Double Q-learning action selection
            with torch.no_grad():
                best_action = argmax(Q(state))
                action = best_action
        
        # Execute action in environment
        next_state, reward, done, _ = env.step(action)
        
        # Store transition (state, action, reward, next_state, done) in replay memory
        D.push(state, action, reward, next_state, done)
        
        # Update total reward
        total_reward += reward
        
        # Sample random minibatch of transitions from replay memory
        minibatch = D.sample_batch(batch_size)
        
        # Compute Q-values using Q-network
        states, actions, rewards, next_states, dones = zip(*minibatch)
        
        # Compute Q-values for actions using Q-network
        with torch.no_grad():
            next_actions = argmax(Q(next_states), axis=1)
            Q_next = Q_target(next_states)
            Q_targets = rewards + gamma * Q_next[range(batch_size), next_actions] * (1 - dones)
        
        # Update Q-network using gradient descent
        Q.update(states, actions, Q_targets)
        
        # Every C steps, update target Q-network
        if steps % C == 0:
            Q_target.weights = Q.weights
        
        # Move to next state
        state = next_state
        
    # Decay exploration rate epsilon
    epsilon = max(epsilon * epsilon_decay, epsilon_min)
    
    # Print episode statistics
    print(f"Episode {episode}: Total Reward = {total_reward}")
