<a href="https://colab.research.google.com/github/hamagami/is2024/blob/main/12_DeepQlearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deep Q-learning
This notebook demonstrates how an agent learns to balance an inverted pendulum (the CartPole task) using Deep Q-learning. After training, an animation is generated that shows the control performance of the final trained model. Generating the animation may take some time, so please be patient.

In [3]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from IPython.display import HTML

In [4]:
%matplotlib notebook

In [5]:
# Define the DQN model
class DQN(nn.Module):
    """Fully connected network with two hidden layers (128 and 64 units, ReLU).

    Maps an input of width ``input_dim`` to an output of width ``output_dim``.
    The layers live in ``self.fc`` as an ``nn.Sequential``.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        first_hidden, second_hidden = 128, 64
        # Layers are created in input-to-output order and stored under `fc`,
        # so parameter names are fc.0.*, fc.2.* and fc.4.*.
        stack = [
            nn.Linear(input_dim, first_hidden),
            nn.ReLU(),
            nn.Linear(first_hidden, second_hidden),
            nn.ReLU(),
            nn.Linear(second_hidden, output_dim),
        ]
        self.fc = nn.Sequential(*stack)

    def forward(self, x):
        """Return the network output for input ``x``."""
        outputs = self.fc(x)
        return outputs

In [6]:
# Replay memory
class ReplayMemory:
    """Fixed-capacity store of transitions with uniform random sampling.

    Up to ``capacity`` transitions are kept. Once the store is full, each new
    transition overwrites the oldest one in place, so ``push`` stays O(1)
    instead of shifting the whole list. As a consequence ``self.memory`` is
    not in chronological order after the first overwrite.
    """

    def __init__(self, capacity):
        """Create an empty memory.

        Args:
            capacity: Maximum number of transitions to keep; must be positive.

        Raises:
            ValueError: If ``capacity`` is not positive.
        """
        if capacity <= 0:
            raise ValueError("capacity must be a positive integer")
        self.capacity = capacity
        self.memory = []
        self._write_index = 0  # slot the next push writes to

    def push(self, transition):
        """Store ``transition``, replacing the oldest entry when full."""
        if len(self.memory) < self.capacity:
            self.memory.append(transition)
        else:
            self.memory[self._write_index] = transition
        # Advance circularly so the slot written longest ago is reused next.
        self._write_index = (self._write_index + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

In [7]:
# Initialize the CartPole environment
env = gym.make("CartPole-v1")
input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n

# Seed every random number generator the notebook draws from so that
# "Restart & Run All" repeats the same run. This must happen before the
# networks are created, because their initial weights are random.
seed = 42
random.seed(seed)        # exploration draws and replay-memory sampling
np.random.seed(seed)
torch.manual_seed(seed)  # network weight initialization
try:
    # Seeds the environment's own generator; later plain env.reset() calls are
    # expected to keep using it -- verify against the installed gym version.
    env.reset(seed=seed)
except TypeError:
    # Presumably an older gym release without the `seed` keyword on reset().
    env.seed(seed)

# Parameters for training
epochs = 500
learning_rate = 0.001
gamma = 0.99  # Discount factor
epsilon_start = 1.0
epsilon_end = 0.01
epsilon_decay = 0.995
batch_size = 64
memory_size = 10000

# Initialize DQN
# The target network starts as an exact copy of the policy network and is kept
# in eval mode; only the policy network's parameters are given to the optimizer.
policy_net = DQN(input_dim, output_dim)
target_net = DQN(input_dim, output_dim)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
memory = ReplayMemory(memory_size)

  deprecation(
  deprecation(


In [8]:
# Training loop
# Each epoch is one episode: act epsilon-greedily, store the transition, and
# take one optimization step per environment step once the replay memory holds
# at least one batch.
steps_done = 0  # not read or updated anywhere in this cell
progress = []
epsilon = epsilon_start
loss_fn = nn.MSELoss()  # created once instead of on every optimization step

for epoch in range(epochs):
    state = env.reset()
    total_reward = 0
    steps = 0

    while True:
        # Epsilon-greedy action selection.
        if random.random() < epsilon:
            action = random.randrange(output_dim)
        else:
            state_tensor = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
            with torch.no_grad():
                action = policy_net(state_tensor).argmax(dim=1).item()

        next_state, reward, done, info = env.step(action)

        # `done` is also set when an episode is cut off by a time limit. A
        # cut-off is not a failure, so it must neither be penalized nor stop
        # bootstrapping from the next state. The time-limit wrapper is expected
        # to report it under info["TimeLimit.truncated"] -- verify against the
        # installed gym version.
        truncated = bool(info.get("TimeLimit.truncated", False))
        terminal = bool(done) and not truncated

        if terminal:
            reward = -1

        memory.push((state, action, reward, next_state, terminal))
        state = next_state
        # Counts +1 per step, independent of the shaped reward stored above.
        total_reward += 1
        steps += 1

        if done:
            break

        if len(memory) >= batch_size:
            # Sample a batch
            transitions = memory.sample(batch_size)
            batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(*transitions)

            # Stack the per-step arrays into one ndarray first; building a
            # tensor directly from a tuple of ndarrays is very slow.
            batch_state = torch.as_tensor(np.array(batch_state), dtype=torch.float32)
            batch_action = torch.as_tensor(batch_action, dtype=torch.int64).unsqueeze(1)
            batch_reward = torch.as_tensor(batch_reward, dtype=torch.float32)
            batch_next_state = torch.as_tensor(np.array(batch_next_state), dtype=torch.float32)
            batch_done = torch.as_tensor(batch_done, dtype=torch.bool)

            # Compute Q values
            # squeeze(1) drops only the gathered column, so a batch of size one
            # keeps its batch dimension.
            q_values = policy_net(batch_state).gather(1, batch_action).squeeze(1)
            with torch.no_grad():
                next_q_values = target_net(batch_next_state).max(dim=1).values
                # Terminal transitions have no future value.
                next_q_values = next_q_values.masked_fill(batch_done, 0.0)
            target_q_values = batch_reward + gamma * next_q_values

            # Update policy network
            loss = loss_fn(q_values, target_q_values)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    progress.append((epoch + 1, steps, total_reward))
    epsilon = max(epsilon_end, epsilon * epsilon_decay)

    # Update target network
    if (epoch + 1) % 10 == 0:
        target_net.load_state_dict(policy_net.state_dict())

    if (epoch + 1) % 50 == 0:
        print(f"Epoch {epoch + 1}/{epochs}, Steps: {steps}, Total Reward: {total_reward}, Epsilon: {epsilon:.2f}")

  if not isinstance(terminated, (bool, np.bool8)):


Epoch 50/500, Steps: 48, Total Reward: 48, Epsilon: 0.78
Epoch 100/500, Steps: 41, Total Reward: 41, Epsilon: 0.61
Epoch 150/500, Steps: 131, Total Reward: 131, Epsilon: 0.47
Epoch 200/500, Steps: 20, Total Reward: 20, Epsilon: 0.37
Epoch 250/500, Steps: 116, Total Reward: 116, Epsilon: 0.29
Epoch 300/500, Steps: 143, Total Reward: 143, Epsilon: 0.22
Epoch 350/500, Steps: 154, Total Reward: 154, Epsilon: 0.17
Epoch 400/500, Steps: 128, Total Reward: 128, Epsilon: 0.13
Epoch 450/500, Steps: 172, Total Reward: 172, Epsilon: 0.10
Epoch 500/500, Steps: 131, Total Reward: 131, Epsilon: 0.08


In [9]:
# Visualize training progress
# Columns of `progress`: epoch number, steps in the epoch, total reward.
progress = np.array(progress)
epoch_numbers = progress[:, 0]
plt.figure(figsize=(12, 5))
for column, series_label in ((1, "Steps per Epoch"), (2, "Total Reward per Epoch")):
    plt.plot(epoch_numbers, progress[:, column], label=series_label)
plt.title("Training Progress")
plt.xlabel("Epoch")
plt.ylabel("Steps / Reward")
plt.legend()
plt.show()

# Test the trained agent with animation
def test_agent_animation(env, policy_net):
    state = env.reset()
    frames = []

    while True:
        frame = env.render(mode="rgb_array")
        frames.append(frame)
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            action = policy_net(state_tensor).argmax(dim=1).item()

        next_state, _, done, _ = env.step(action)
        state = next_state

        if done:
            break

    env.close()

    return frames

# Generate animation
frames = test_agent_animation(env, policy_net)

fig, ax = plt.subplots(figsize=(8, 6))
ax.axis("off")
# Create a single image artist up front. Calling plt.imshow() for every frame
# would add a new image to the axes each time, so the figure would keep growing
# and each later frame would be slower to draw.
image = ax.imshow(frames[0])

def update(frame):
    """Show ``frame`` by replacing the pixel data of the existing image."""
    image.set_data(frame)
    return (image,)

ani = FuncAnimation(fig, update, frames=frames, interval=50, blit=False)
animation_html = ani.to_jshtml()
# The player markup is already built; closing the figure keeps an extra static
# copy of it from being shown next to the player.
plt.close(fig)
HTML(animation_html)


<IPython.core.display.Javascript object>

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


<IPython.core.display.Javascript object>