In [1]:
! pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-1.0.0


In [4]:
pip uninstall torch torchvision torchaudioy

Found existing installation: torch 2.5.0+cu121
Uninstalling torch-2.5.0+cu121:
  Would remove:
    /usr/local/bin/convert-caffe2-to-onnx
    /usr/local/bin/convert-onnx-to-caffe2
    /usr/local/bin/torchfrtrace
    /usr/local/bin/torchrun
    /usr/local/lib/python3.10/dist-packages/functorch/*
    /usr/local/lib/python3.10/dist-packages/torch-2.5.0+cu121.dist-info/*
    /usr/local/lib/python3.10/dist-packages/torch/*
    /usr/local/lib/python3.10/dist-packages/torchgen/*
Proceed (Y/n)? y
  Successfully uninstalled torch-2.5.0+cu121
Found existing installation: torchvision 0.20.0+cu121
Uninstalling torchvision-0.20.0+cu121:
  Would remove:
    /usr/local/lib/python3.10/dist-packages/torchvision-0.20.0+cu121.dist-info/*
    /usr/local/lib/python3.10/dist-packages/torchvision.libs/libcudart.7ec1eba6.so.12
    /usr/local/lib/python3.10/dist-packages/torchvision.libs/libjpeg.ceea7512.so.62
    /usr/local/lib/python3.10/dist-packages/torchvision.libs/libnvjpeg.f00ca762.so.12
    /usr/local/l

In [1]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu


Cartpole Environment

In [2]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque
import imageio
import os

# Super Simple Neural Network for DQN
class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim)
        )

    def forward(self, x):
        return self.fc(x)

# Initialize environment and model
env = gym.make("CartPole-v1", render_mode='rgb_array')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

model = DQN(state_dim, action_dim)
target_model = DQN(state_dim, action_dim)
target_model.load_state_dict(model.state_dict())
optimizer = optim.Adam(model.parameters(), lr=1e-3)
replay_buffer = deque(maxlen=10000)

# Hyperparameters
gamma = 0.99
batch_size = 32
epsilon = 1.0
epsilon_min = 0.1
epsilon_decay = 0.995
update_target_every = 100

# Helper functions
def select_action(state):
    if random.random() < epsilon:
        return env.action_space.sample()
    else:
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        q_values = model(state_tensor)
        return torch.argmax(q_values).item()

def train():
    if len(replay_buffer) < batch_size:
        return
    batch = random.sample(replay_buffer, batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)

    states = torch.FloatTensor(states)
    actions = torch.LongTensor(actions)
    rewards = torch.FloatTensor(rewards)
    next_states = torch.FloatTensor(next_states)
    dones = torch.FloatTensor(dones)

    q_values = model(states).gather(1, actions.unsqueeze(1)).squeeze()
    with torch.no_grad():
        next_q_values = target_model(next_states).max(1)[0]
        targets = rewards + gamma * next_q_values * (1 - dones)

    loss = nn.MSELoss()(q_values, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Create a directory for videos if it doesn't exist
os.makedirs("videos", exist_ok=True)

# Training loop
num_episodes = 1000
rewards_list = []

for episode in range(num_episodes):
    state, info = env.reset()
    total_reward = 0

    while True:
        action = select_action(state)
        next_state, reward, done, truncated, info = env.step(action)
        terminal = done or truncated
        replay_buffer.append((state, action, reward, next_state, terminal))
        train()
        state = next_state
        total_reward += reward
        if terminal:
            break

    rewards_list.append(total_reward)
    global epsilon
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Update target model
    if episode % update_target_every == 0:
        target_model.load_state_dict(model.state_dict())

    # Generate video every 250 episodes
    if episode % 250 == 0:
        frames = []
        state, info = env.reset()
        for _ in range(500):  # Increased to 500 frames
            frames.append(env.render())
            action = select_action(state)
            next_state, _, done, truncated, _ = env.step(action)
            state = next_state
            if done or truncated:
                break
        video_path = f"videos/cartpole_episode_{episode}.mp4"
        imageio.mimsave(video_path, frames, fps=60)  # Increased to 60 fps
        print(f"Video saved: {video_path}")

    if episode % 50 == 0:
        avg_reward = np.mean(rewards_list[-50:])
        print(f"Episode {episode}, Reward: {total_reward}, Avg Reward (last 50): {avg_reward}")

  states = torch.FloatTensor(states)


Video saved: videos/cartpole_episode_0.mp4
Episode 0, Reward: 59.0, Avg Reward (last 50): 59.0
Episode 50, Reward: 11.0, Avg Reward (last 50): 17.66
Episode 100, Reward: 17.0, Avg Reward (last 50): 14.44
Episode 150, Reward: 13.0, Avg Reward (last 50): 12.48
Episode 200, Reward: 12.0, Avg Reward (last 50): 11.72




Video saved: videos/cartpole_episode_250.mp4
Episode 250, Reward: 15.0, Avg Reward (last 50): 16.54
Episode 300, Reward: 31.0, Avg Reward (last 50): 20.58
Episode 350, Reward: 11.0, Avg Reward (last 50): 41.56
Episode 400, Reward: 71.0, Avg Reward (last 50): 37.46
Episode 450, Reward: 161.0, Avg Reward (last 50): 112.08




Video saved: videos/cartpole_episode_500.mp4
Episode 500, Reward: 143.0, Avg Reward (last 50): 99.46
Episode 550, Reward: 254.0, Avg Reward (last 50): 220.38
Episode 600, Reward: 383.0, Avg Reward (last 50): 254.04
Episode 650, Reward: 195.0, Avg Reward (last 50): 238.92
Episode 700, Reward: 342.0, Avg Reward (last 50): 247.02




Video saved: videos/cartpole_episode_750.mp4
Episode 750, Reward: 244.0, Avg Reward (last 50): 244.84
Episode 800, Reward: 178.0, Avg Reward (last 50): 239.08
Episode 850, Reward: 119.0, Avg Reward (last 50): 137.24
Episode 900, Reward: 123.0, Avg Reward (last 50): 121.02
Episode 950, Reward: 128.0, Avg Reward (last 50): 114.98


In [7]:
class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.fc2 = nn.Linear(256, 512)
        self.fc3 = nn.Linear(512, output_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

# Initialize environment and model
env = gym.make("MountainCar-v0", render_mode='rgb_array')
env._max_episode_steps = 1000  # Extend episode length
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

model = DQN(state_dim, action_dim)
target_model = DQN(state_dim, action_dim)
target_model.load_state_dict(model.state_dict())
optimizer = optim.Adam(model.parameters(), lr=0.004)
replay_buffer = deque(maxlen=1000000)

# Hyperparameters
gamma = 0.99
batch_size = 64
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.995
update_target_every = 1000

def select_action(state):
    if random.random() < epsilon:
        return env.action_space.sample()
    else:
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        q_values = model(state_tensor)
        return torch.argmax(q_values).item()

def custom_reward(state, next_state, action, reward):
    # Reward shaping
    position = next_state[0]
    velocity = next_state[1]
    reward += np.exp(position * 5)  # Reward for moving right
    if position >= 0.5:
        reward += 100  # Bonus for reaching the goal
    if position > state[0] and action == 2:  # Moving right
        reward += 1
    return reward

def train():
    if len(replay_buffer) < batch_size:
        return
    batch = random.sample(replay_buffer, batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)

    states = torch.FloatTensor(states)
    actions = torch.LongTensor(actions)
    rewards = torch.FloatTensor(rewards)
    next_states = torch.FloatTensor(next_states)
    dones = torch.FloatTensor(dones)

    # Double DQN
    current_q_values = model(states).gather(1, actions.unsqueeze(1)).squeeze()
    next_actions = model(next_states).max(1)[1].unsqueeze(1)
    next_q_values = target_model(next_states).gather(1, next_actions).squeeze()
    targets = rewards + gamma * next_q_values * (1 - dones)

    loss = nn.MSELoss()(current_q_values, targets.detach())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Training loop
num_episodes = 5000
rewards_list = []

for episode in range(num_episodes):
    state, _ = env.reset()
    total_reward = 0
    step = 0

    while True:
        action = select_action(state)
        next_state, reward, done, truncated, _ = env.step(action)
        reward = custom_reward(state, next_state, action, reward)
        terminal = done or truncated
        replay_buffer.append((state, action, reward, next_state, terminal))

        if step % 4 == 0:
            train()

        state = next_state
        total_reward += reward
        step += 1

        if terminal:
            break

    rewards_list.append(total_reward)

    global epsilon
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Update target model
    if episode % update_target_every == 0:
        target_model.load_state_dict(model.state_dict())

    if episode % 50 == 0:
        avg_reward = np.mean(rewards_list[-100:])
        print(f"Episode {episode}, Avg Reward (last 100): {avg_reward}")

    # Check for solving condition
    if avg_reward > -110:
        print(f"MountainCar-v0 solved in {episode} episodes!")
        break

print("Training completed.")

Episode 0, Avg Reward (last 100): -736.4163371710349
Episode 50, Avg Reward (last 100): -577.1948536722937
Episode 100, Avg Reward (last 100): -299.55248282181856
Episode 150, Avg Reward (last 100): 14.485462199432039
MountainCar-v0 solved in 150 episodes!
Training completed.


In [4]:
# Improved Neural Network for Double DQN
class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, output_dim)
        )

    def forward(self, x):
        return self.fc(x)

# Initialize environment and model
env = gym.make("Acrobot-v1", render_mode='rgb_array')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

model = DQN(state_dim, action_dim)
target_model = DQN(state_dim, action_dim)
target_model.load_state_dict(model.state_dict())
optimizer = optim.Adam(model.parameters(), lr=1e-4)
replay_buffer = deque(maxlen=1000000)

# Hyperparameters
gamma = 0.99
batch_size = 64
epsilon = 1.0
epsilon_min = 1e-4
epsilon_decay = 0.99
update_target_every = 1000

# Helper functions
def select_action(state):
    if random.random() < epsilon:
        return env.action_space.sample()
    else:
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        q_values = model(state_tensor)
        return torch.argmax(q_values).item()

def train():
    if len(replay_buffer) < batch_size:
        return
    batch = random.sample(replay_buffer, batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)

    states = torch.FloatTensor(states)
    actions = torch.LongTensor(actions)
    rewards = torch.FloatTensor(rewards)
    next_states = torch.FloatTensor(next_states)
    dones = torch.FloatTensor(dones)

    # Double DQN
    current_q_values = model(states).gather(1, actions.unsqueeze(1)).squeeze()
    next_actions = model(next_states).max(1)[1].unsqueeze(1)
    next_q_values = target_model(next_states).gather(1, next_actions).squeeze()
    targets = rewards + gamma * next_q_values * (1 - dones)

    loss = nn.MSELoss()(current_q_values, targets.detach())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Create a directory for videos if it doesn't exist
os.makedirs("videos", exist_ok=True)

# Training loop
num_episodes = 1000
rewards_list = []
avg_rewards = []

for episode in range(num_episodes):
    state, info = env.reset()
    total_reward = 0
    step = 0

    while True:
        action = select_action(state)
        next_state, reward, done, truncated, info = env.step(action)
        terminal = done or truncated
        replay_buffer.append((state, action, reward, next_state, terminal))

        if step % 4 == 0:
            train()

        state = next_state
        total_reward += reward
        step += 1

        if terminal:
            break

    rewards_list.append(total_reward)
    avg_reward = np.mean(rewards_list[-100:])
    avg_rewards.append(avg_reward)

    global epsilon
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Update target model
    if episode % update_target_every == 0:
        target_model.load_state_dict(model.state_dict())

    # Generate video every 500 episodes
    if episode % 500 == 0:
        frames = []
        state, info = env.reset()
        for _ in range(500):
            frames.append(env.render())
            action = select_action(state)
            next_state, _, done, truncated, _ = env.step(action)
            state = next_state
            if done or truncated:
                break
        video_path = f"videos/acrobot_episode_{episode}.mp4"
        imageio.mimsave(video_path, frames, fps=60)
        print(f"Video saved: {video_path}")

    if episode % 50 == 0:
        print(f"Episode {episode}, Reward: {total_reward}, Avg Reward (last 100): {avg_reward}")

    # Check for solving condition
    if avg_reward > -92.95:
        print(f"Acrobot-v1 solved in {episode} episodes!")
        break

print("Training completed.")



Video saved: videos/acrobot_episode_0.mp4
Episode 0, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 50, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 100, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 150, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 200, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 250, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 300, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 350, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 400, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 450, Reward: -500.0, Avg Reward (last 100): -500.0




Video saved: videos/acrobot_episode_500.mp4
Episode 500, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 550, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 600, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 650, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 700, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 750, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 800, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 850, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 900, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 950, Reward: -500.0, Avg Reward (last 100): -500.0
Training completed.


In [8]:
# Improved Neural Network for Double DQN
class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, output_dim)
        )

    def forward(self, x):
        return self.fc(x)

# Initialize environment and model
env = gym.make("Acrobot-v1", render_mode='rgb_array')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

model = DQN(state_dim, action_dim)
target_model = DQN(state_dim, action_dim)
target_model.load_state_dict(model.state_dict())
optimizer = optim.Adam(model.parameters(), lr=1e-4)
replay_buffer = deque(maxlen=1000000)

# Hyperparameters
gamma = 0.99
batch_size = 64
epsilon = 1.0
epsilon_min = 1e-4
epsilon_decay = 0.99
update_target_every = 1000

# Helper functions
def select_action(state):
    if random.random() < epsilon:
        return env.action_space.sample()
    else:
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        q_values = model(state_tensor)
        return torch.argmax(q_values).item()

def train():
    if len(replay_buffer) < batch_size:
        return
    batch = random.sample(replay_buffer, batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)

    states = torch.FloatTensor(states)
    actions = torch.LongTensor(actions)
    rewards = torch.FloatTensor(rewards)
    next_states = torch.FloatTensor(next_states)
    dones = torch.FloatTensor(dones)

    # Double DQN
    current_q_values = model(states).gather(1, actions.unsqueeze(1)).squeeze()
    next_actions = model(next_states).max(1)[1].unsqueeze(1)
    next_q_values = target_model(next_states).gather(1, next_actions).squeeze()
    targets = rewards + gamma * next_q_values * (1 - dones)

    loss = nn.MSELoss()(current_q_values, targets.detach())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Create a directory for videos if it doesn't exist
os.makedirs("videos", exist_ok=True)

# Training loop
num_episodes = 5000
rewards_list = []
avg_rewards = []

for episode in range(num_episodes):
    state, info = env.reset()
    total_reward = 0
    step = 0

    while True:
        action = select_action(state)
        next_state, reward, done, truncated, info = env.step(action)
        terminal = done or truncated
        replay_buffer.append((state, action, reward, next_state, terminal))

        if step % 4 == 0:
            train()

        state = next_state
        total_reward += reward
        step += 1

        if terminal:
            break

    rewards_list.append(total_reward)
    avg_reward = np.mean(rewards_list[-100:])
    avg_rewards.append(avg_reward)

    global epsilon
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Update target model
    if episode % update_target_every == 0:
        target_model.load_state_dict(model.state_dict())

    # Generate video every 500 episodes
    if episode % 500 == 0:
        frames = []
        state, info = env.reset()
        for _ in range(500):
            frames.append(env.render())
            action = select_action(state)
            next_state, _, done, truncated, _ = env.step(action)
            state = next_state
            if done or truncated:
                break
        video_path = f"videos/acrobot_episode_{episode}.mp4"
        imageio.mimsave(video_path, frames, fps=60)
        print(f"Video saved: {video_path}")

    if episode % 50 == 0:
        print(f"Episode {episode}, Reward: {total_reward}, Avg Reward (last 100): {avg_reward}")

    # Check for solving condition
    if avg_reward > -92.95:
        print(f"Acrobot-v1 solved in {episode} episodes!")
        break

print("Training completed.")



Video saved: videos/acrobot_episode_0.mp4
Episode 0, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 50, Reward: -500.0, Avg Reward (last 100): -499.7450980392157
Episode 100, Reward: -500.0, Avg Reward (last 100): -499.87
Episode 150, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 200, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 250, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 300, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 350, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 400, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 450, Reward: -500.0, Avg Reward (last 100): -500.0




Video saved: videos/acrobot_episode_500.mp4
Episode 500, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 550, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 600, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 650, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 700, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 750, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 800, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 850, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 900, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 950, Reward: -500.0, Avg Reward (last 100): -500.0




Video saved: videos/acrobot_episode_1000.mp4
Episode 1000, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 1050, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 1100, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 1150, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 1200, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 1250, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 1300, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 1350, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 1400, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 1450, Reward: -500.0, Avg Reward (last 100): -500.0




Video saved: videos/acrobot_episode_1500.mp4
Episode 1500, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 1550, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 1600, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 1650, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 1700, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 1750, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 1800, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 1850, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 1900, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 1950, Reward: -500.0, Avg Reward (last 100): -500.0




Video saved: videos/acrobot_episode_2000.mp4
Episode 2000, Reward: -500.0, Avg Reward (last 100): -500.0
Episode 2050, Reward: -500.0, Avg Reward (last 100): -471.58
Episode 2100, Reward: -500.0, Avg Reward (last 100): -404.67
Episode 2150, Reward: -500.0, Avg Reward (last 100): -407.64
Episode 2200, Reward: -264.0, Avg Reward (last 100): -428.31
Episode 2250, Reward: -500.0, Avg Reward (last 100): -403.26
Episode 2300, Reward: -347.0, Avg Reward (last 100): -418.78
Episode 2350, Reward: -500.0, Avg Reward (last 100): -457.26
Episode 2400, Reward: -500.0, Avg Reward (last 100): -449.47
Episode 2450, Reward: -367.0, Avg Reward (last 100): -432.67




Video saved: videos/acrobot_episode_2500.mp4
Episode 2500, Reward: -293.0, Avg Reward (last 100): -436.38
Episode 2550, Reward: -500.0, Avg Reward (last 100): -439.48
Episode 2600, Reward: -500.0, Avg Reward (last 100): -454.45
Episode 2650, Reward: -500.0, Avg Reward (last 100): -460.4
Episode 2700, Reward: -265.0, Avg Reward (last 100): -458.65
Episode 2750, Reward: -500.0, Avg Reward (last 100): -466.25
Episode 2800, Reward: -394.0, Avg Reward (last 100): -466.32
Episode 2850, Reward: -500.0, Avg Reward (last 100): -460.1
Episode 2900, Reward: -265.0, Avg Reward (last 100): -465.41
Episode 2950, Reward: -500.0, Avg Reward (last 100): -458.84




Video saved: videos/acrobot_episode_3000.mp4
Episode 3000, Reward: -500.0, Avg Reward (last 100): -460.6
Episode 3050, Reward: -201.0, Avg Reward (last 100): -372.71
Episode 3100, Reward: -160.0, Avg Reward (last 100): -243.36
Episode 3150, Reward: -166.0, Avg Reward (last 100): -205.71
Episode 3200, Reward: -146.0, Avg Reward (last 100): -197.76
Episode 3250, Reward: -117.0, Avg Reward (last 100): -211.66
Episode 3300, Reward: -152.0, Avg Reward (last 100): -204.43
Episode 3350, Reward: -303.0, Avg Reward (last 100): -197.78
Episode 3400, Reward: -113.0, Avg Reward (last 100): -200.04
Episode 3450, Reward: -500.0, Avg Reward (last 100): -189.5




Video saved: videos/acrobot_episode_3500.mp4
Episode 3500, Reward: -170.0, Avg Reward (last 100): -194.19
Episode 3550, Reward: -181.0, Avg Reward (last 100): -201.97
Episode 3600, Reward: -116.0, Avg Reward (last 100): -197.02
Episode 3650, Reward: -141.0, Avg Reward (last 100): -190.71
Episode 3700, Reward: -164.0, Avg Reward (last 100): -204.74
Episode 3750, Reward: -142.0, Avg Reward (last 100): -223.21
Episode 3800, Reward: -237.0, Avg Reward (last 100): -209.84
Episode 3850, Reward: -209.0, Avg Reward (last 100): -209.68
Episode 3900, Reward: -90.0, Avg Reward (last 100): -208.1
Episode 3950, Reward: -131.0, Avg Reward (last 100): -223.96




Video saved: videos/acrobot_episode_4000.mp4
Episode 4000, Reward: -109.0, Avg Reward (last 100): -238.77
Episode 4050, Reward: -314.0, Avg Reward (last 100): -221.21
Episode 4100, Reward: -470.0, Avg Reward (last 100): -220.55
Episode 4150, Reward: -142.0, Avg Reward (last 100): -195.46
Episode 4200, Reward: -163.0, Avg Reward (last 100): -179.88
Episode 4250, Reward: -113.0, Avg Reward (last 100): -181.06
Episode 4300, Reward: -128.0, Avg Reward (last 100): -178.59
Episode 4350, Reward: -177.0, Avg Reward (last 100): -189.99
Episode 4400, Reward: -186.0, Avg Reward (last 100): -253.61
Episode 4450, Reward: -500.0, Avg Reward (last 100): -271.19




Video saved: videos/acrobot_episode_4500.mp4
Episode 4500, Reward: -332.0, Avg Reward (last 100): -251.65
Episode 4550, Reward: -145.0, Avg Reward (last 100): -302.68
Episode 4600, Reward: -500.0, Avg Reward (last 100): -310.35
Episode 4650, Reward: -500.0, Avg Reward (last 100): -298.53
Episode 4700, Reward: -500.0, Avg Reward (last 100): -349.81
Episode 4750, Reward: -500.0, Avg Reward (last 100): -423.24
Episode 4800, Reward: -148.0, Avg Reward (last 100): -410.43
Episode 4850, Reward: -254.0, Avg Reward (last 100): -365.1
Episode 4900, Reward: -458.0, Avg Reward (last 100): -363.7
Episode 4950, Reward: -500.0, Avg Reward (last 100): -349.37
Training completed.
