In [15]:
import gym
import math
import random
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from collections import namedtuple, deque
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
from gym.wrappers import RecordVideo

# Check if running in IPython for inline plots
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

# Device configuration: GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [16]:
# Named tuple for storing transitions
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

# Replay Memory for experience replay
class ReplayMemory:
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [17]:
# Q-Network
class DQN(nn.Module):
    def __init__(self, n_observations, n_actions):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(n_observations, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, n_actions)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

In [18]:
# Initialise MountainCar environment
env = gym.make("MountainCar-v0")
env = RecordVideo(env, './video', episode_trigger=lambda e_idx: e_idx % 50 == 0)

# Hyperparameters
BATCH_SIZE = 64
GAMMA = 0.99
EPS_START = 1.0     # Start with a higher epsilon
EPS_END = 0.01      # Lower final epsilon
EPS_DECAY = 500     # More gradual decay
TAU = 0.01          # Soft update rate
LR = 1e-3
MEMORY_SIZE = 20000
NUM_EPISODES = 800  # Increased episodes for better exploration

# Get number of actions and state space size
n_actions = env.action_space.n
state, info = env.reset()
n_observations = len(state)

# Initialise Policy, Target for Double DQN, and Optimiser
policy_net = DQN(n_observations, n_actions).to(device)
target_net = DQN(n_observations, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters(), lr=LR)
memory = ReplayMemory(MEMORY_SIZE)
scheduler = StepLR(optimizer, step_size=50, gamma=0.95)

steps_done = 0

def select_action(state):
    """Epsilon-greedy policy"""
    global steps_done
    sample = random.random()
    # Compute decayed epsilon
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            return policy_net(state).argmax(dim=1).view(1, 1)
    else:
        return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)

def optimize_model():
    if len(memory) < BATCH_SIZE:
        return

    transitions = memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))

    # Create batch tensors
    non_final_mask = torch.tensor(
        tuple(s is not None for s in batch.next_state),
        device=device, dtype=torch.bool
    )
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s, a) from policy_net
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Double DQN: 
    # 1) Select best action using policy_net
    next_state_actions = policy_net(non_final_next_states).argmax(dim=1, keepdim=True)
    # 2) Evaluate that action using target_net
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    with torch.no_grad():
        next_state_values[non_final_mask] = target_net(non_final_next_states).gather(
            1, next_state_actions).squeeze()

    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    optimizer.zero_grad()
    loss.backward()
    # Gradient clipping
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 10)
    optimizer.step()

# Logging
episode_rewards = []

def plot_rewards(show_result=False):
    plt.figure(1)
    rewards_t = torch.tensor(episode_rewards, dtype=torch.float)
    if show_result:
        plt.title('Training Result')
    else:
        plt.clf()
        plt.title(f'Training... Episode {len(episode_rewards)}')
    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.plot(rewards_t.numpy())
    # Plot 100-episode average
    if len(rewards_t) >= 100:
        means = rewards_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy(), label='100-Episode Average')
        plt.legend()
    plt.pause(0.001)
    if is_ipython:
        if not show_result:
            display.display(plt.gcf())
            display.clear_output(wait=True)
        else:
            display.display(plt.gcf())


  logger.warn(
  logger.warn(
  gym.logger.warn(


In [19]:
# Training loop
for i_episode in range(NUM_EPISODES):
    state, _ = env.reset()
    state = torch.tensor([state], device=device, dtype=torch.float32)
    total_reward = 0

    for t in count():
        action = select_action(state)
        next_state_raw, reward, terminated, truncated, _ = env.step(action.item())

        # Enhanced reward shaping
        position, velocity = next_state_raw
        # Reward for moving further to the right
        shaped_reward = position + 0.5
        # Provide a strong reward for reaching the goal
        if position >= env.goal_position:
            shaped_reward += 10.0

        total_reward += shaped_reward

        done = terminated or truncated
        next_state = None if done else torch.tensor([next_state_raw], device=device, dtype=torch.float32)

        shaped_reward_tensor = torch.tensor([shaped_reward], device=device, dtype=torch.float32)

        memory.push(state, action, next_state, shaped_reward_tensor)

        state = next_state

        optimize_model()

        # Soft update of target network
        for target_param, policy_param in zip(target_net.parameters(), policy_net.parameters()):
            target_param.data.copy_(TAU * policy_param.data + (1.0 - TAU) * target_param.data)

        if done:
            episode_rewards.append(total_reward)
            plot_rewards()
            break

    # Update learning rate scheduler
    scheduler.step()

print("Training Complete")
plot_rewards(show_result=True)
plt.ioff()
plt.show()


KeyboardInterrupt: 

<Figure size 640x480 with 0 Axes>

In [20]:
# Record best model for demonstration
# Save policy net if best reward
if episode_rewards and max(episode_rewards) == episode_rewards[-1]:
    torch.save(policy_net.state_dict(), 'best_model_mountain_car.pth')

# Demonstrate the best policy
from gym.wrappers import RecordVideo
simulation_env = gym.make('MountainCar-v0', render_mode="rgb_array")
simulation_env = RecordVideo(simulation_env, './video_mountain_car', episode_trigger=lambda e_idx: True)
model = DQN(n_observations, n_actions).to(device)
model.load_state_dict(torch.load('best_model_mountain_car.pth'))
model.eval()

state, _ = simulation_env.reset()
state = torch.tensor([state], device=device, dtype=torch.float32)
done = False
while not done:
    with torch.no_grad():
        action = model(state).argmax(dim=1).view(1, 1)
    next_state_raw, _, terminated, truncated, _ = simulation_env.step(action.item())
    next_state = torch.tensor([next_state_raw], device=device, dtype=torch.float32)
    state = next_state
    done = terminated or truncated

simulation_env.close()
print("Video recorded. Check the 'video_mountain_car' folder for output.")


RuntimeError: Error(s) in loading state_dict for DQN:
	Missing key(s) in state_dict: "fc1.weight", "fc1.bias", "fc2.weight", "fc2.bias", "fc3.weight", "fc3.bias". 
	Unexpected key(s) in state_dict: "layer1.weight", "layer1.bias", "layer2.weight", "layer2.bias", "layer3.weight", "layer3.bias". 