In [None]:
# Install Required Packages
!pip install gymnasium[atari] -q
!pip install gymnasium[accept-rom-license] -q
!pip install ale-py -q
!pip install torch torchvision -q
!pip install opencv-python -q
!pip install matplotlib -q

print("All packages installed successfully!")

[0mAll packages installed successfully!


In [None]:
# Test Environment and GPU
import gymnasium as gym
import torch
import numpy as np
import ale_py

# Register the ALE (Atari) environments with Gymnasium
gym.register_envs(ale_py)

# Report which device PyTorch will use
print("=" * 50)
print("SYSTEM CHECK")
print("=" * 50)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
print()

# Smoke-test the Bank Heist environment: reset once and take one random step
print("=" * 50)
print("TESTING BANK HEIST ENVIRONMENT")
print("=" * 50)

env = gym.make('ALE/BankHeist-v5', render_mode='rgb_array')
try:
    state, info = env.reset()

    print("Environment created successfully!")
    print(f"State shape (frame dimensions): {state.shape}")
    print(f"Number of possible actions: {env.action_space.n}")
    print(f"Actions available: {env.unwrapped.get_action_meanings()}")

    # Take a test action
    action = env.action_space.sample()
    next_state, reward, terminated, truncated, info = env.step(action)
    print(f"Test action completed - Reward: {reward}")
finally:
    # Release the emulator even when reset/step raises
    env.close()

print()
print("All tests passed! Ready to build the agent!")
print("=" * 50)

SYSTEM CHECK
Using device: cuda
GPU Name: Tesla T4
GPU Memory: 15.83 GB

TESTING BANK HEIST ENVIRONMENT
Environment created successfully!
State shape (frame dimensions): (210, 160, 3)
Number of possible actions: 18
Actions available: ['NOOP', 'FIRE', 'UP', 'RIGHT', 'LEFT', 'DOWN', 'UPRIGHT', 'UPLEFT', 'DOWNRIGHT', 'DOWNLEFT', 'UPFIRE', 'RIGHTFIRE', 'LEFTFIRE', 'DOWNFIRE', 'UPRIGHTFIRE', 'UPLEFTFIRE', 'DOWNRIGHTFIRE', 'DOWNLEFTFIRE']
Test action completed - Reward: 0.0

All tests passed! Ready to build the agent!


In [None]:
# CELL 3: Build the Deep Q-Network
import torch
import torch.nn as nn
import torch.nn.functional as F

class DQN(nn.Module):
    """
    Deep Q-Network for Bank Heist.

    Maps a stack of preprocessed game frames to one Q-value per action using
    three convolutional layers followed by two fully connected layers.

    Args:
        action_size: Number of discrete actions (width of the output layer).
        in_channels: Number of stacked frames fed in as input channels.
            Defaults to 4, matching the default ``frame_stack_size`` of
            ``FramePreprocessor``.

    Note:
        ``fc1`` is sized for a 7x7x64 feature map, which is what the conv
        stack produces for 84x84 inputs (84 -> 20 -> 9 -> 7). Other input
        resolutions will fail at ``fc1``.
    """
    def __init__(self, action_size, in_channels=4):
        super().__init__()

        # Convolutional feature extractor (attribute names are part of the
        # checkpoint format, so they must stay conv1/conv2/conv3/fc1/fc2)
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)

        # Fully connected head
        self.fc1 = nn.Linear(7 * 7 * 64, 512)
        self.fc2 = nn.Linear(512, action_size)

    def forward(self, x):
        """
        Compute Q-values for a batch of frame stacks.

        Args:
            x: Float tensor of shape (batch, in_channels, 84, 84).

        Returns:
            Tensor of shape (batch, action_size) with one Q-value per action.
        """
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))

        # Collapse (channels, height, width) into one feature vector per sample
        x = torch.flatten(x, start_dim=1)

        x = F.relu(self.fc1(x))
        # No activation on the output: Q-values are unbounded
        return self.fc2(x)

# Smoke-test the network on one random frame stack
print("Testing Deep Q-Network...")
test_dqn = DQN(action_size=18)
# One sample made of 4 frames of 84x84 pixels
test_input = torch.randn(1, 4, 84, 84)
test_output = test_dqn(test_input)

# Emit the whole report in one call; each argument lands on its own line
print(
    "Network created successfully!",
    f"Input shape: {test_input.shape} (1 sample, 4 frames, 84x84 pixels)",
    f"Output shape: {test_output.shape} (1 sample, 18 Q-values)",
    f"Sample Q-values: {test_output[0][:5].detach().numpy()}",
    "Neural network is ready!",
    sep="\n",
)

Testing Deep Q-Network...
Network created successfully!
Input shape: torch.Size([1, 4, 84, 84]) (1 sample, 4 frames, 84x84 pixels)
Output shape: torch.Size([1, 18]) (1 sample, 18 Q-values)
Sample Q-values: [-0.01339139  0.00270121  0.00952767 -0.05176202 -0.02827951]
Neural network is ready!


In [None]:
# Frame Preprocessor
import cv2
from collections import deque

class FramePreprocessor:
    """
    Preprocesses game frames for the DQN:
    - Converts RGB to grayscale
    - Resizes to 84x84
    - Normalizes pixel values to [0, 1] as float32
    - Stacks the most recent frames together (gives the agent a sense of motion)

    Args:
        frame_stack_size: Number of consecutive frames kept in the stack.
    """
    def __init__(self, frame_stack_size=4):
        self.frame_stack_size = frame_stack_size
        # Bounded deque: appending a new frame drops the oldest automatically
        self.frames = deque(maxlen=frame_stack_size)

    def preprocess_frame(self, frame):
        """
        Convert a single RGB frame to a normalized 84x84 grayscale image.

        Returns:
            float32 array of shape (84, 84) with values in [0, 1].
        """
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
        # Cast before dividing so the result stays float32. Dividing the raw
        # uint8 image by 255.0 would give float64, doubling the memory of every
        # state held in the replay buffer for data the network casts to
        # float32 anyway.
        return resized.astype(np.float32) / 255.0

    def reset(self, initial_frame):
        """
        Initialize the frame stack at episode start.

        The stack is filled with copies of the first frame so that a full
        stack is available before any action has been taken.

        Returns:
            Array of shape (frame_stack_size, 84, 84).
        """
        processed = self.preprocess_frame(initial_frame)
        self.frames.clear()
        self.frames.extend([processed] * self.frame_stack_size)
        return np.stack(self.frames, axis=0)

    def add_frame(self, frame):
        """
        Push a new frame onto the stack, discarding the oldest one.

        Returns:
            Array of shape (frame_stack_size, 84, 84), oldest frame first.
        """
        self.frames.append(self.preprocess_frame(frame))
        return np.stack(self.frames, axis=0)

# Smoke-test the preprocessor on a real first frame
print("Testing Frame Preprocessor...")
env = gym.make('ALE/BankHeist-v5', render_mode='rgb_array')
state, _ = env.reset()
preprocessor = FramePreprocessor()
processed_state = preprocessor.reset(state)

# One print call; each argument is written on its own line
print(
    f"Original frame shape: {state.shape}",
    f"Processed state shape: {processed_state.shape}",
    f"Pixel value range: [{processed_state.min():.3f}, {processed_state.max():.3f}]",
    "Frame preprocessor is ready!",
    sep="\n",
)

env.close()

Testing Frame Preprocessor...
Original frame shape: (210, 160, 3)
Processed state shape: (4, 84, 84)
Pixel value range: [0.000, 0.580]
Frame preprocessor is ready!


In [None]:
# Replay Memory
import random
from collections import deque

class ReplayMemory:
    """
    Fixed-capacity experience replay buffer for DQN.

    Keeps (state, action, reward, next_state, done) transitions and hands
    back uniformly sampled batches so consecutive, correlated steps are not
    trained on together.
    """
    def __init__(self, capacity=50000):
        """
        Args:
            capacity: Maximum number of transitions kept; once full, the
                oldest transition is dropped for each new one.
        """
        self.memory = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def sample(self, batch_size):
        """
        Draw `batch_size` distinct transitions at random.

        Returns:
            Tuple of numpy arrays: (states, actions, rewards, next_states, dones)
        """
        drawn = random.sample(self.memory, batch_size)
        # Transpose the list of transitions into five per-field columns
        states, actions, rewards, next_states, dones = zip(*drawn)
        columns = (states, actions, rewards, next_states, dones)
        return tuple(np.array(column) for column in columns)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

# Smoke-test the replay buffer with random transitions
print("Testing Replay Memory...")
memory = ReplayMemory(capacity=50000)

# Fill the buffer with 100 dummy transitions
for i in range(100):
    dummy_state, dummy_action, dummy_reward, dummy_next_state, dummy_done = (
        np.random.rand(4, 84, 84),
        np.random.randint(0, 18),
        np.random.rand(),
        np.random.rand(4, 84, 84),
        False,
    )
    memory.push(dummy_state, dummy_action, dummy_reward, dummy_next_state, dummy_done)

print(f"Memory size: {len(memory)}", "Memory capacity: 50000", sep="\n")

# Draw one batch and report the shapes of its columns
states, actions, rewards, next_states, dones = memory.sample(32)
print(
    f"Sampled batch - States shape: {states.shape}",
    f"Sampled batch - Actions shape: {actions.shape}",
    f"Sampled batch - Rewards shape: {rewards.shape}",
    "Replay memory is ready!",
    sep="\n",
)

Testing Replay Memory...
Memory size: 100
Memory capacity: 50000
Sampled batch - States shape: (32, 4, 84, 84)
Sampled batch - Actions shape: (32,)
Sampled batch - Rewards shape: (32,)
Replay memory is ready!


In [None]:
# DQN Agent
import torch.optim as optim

class DQNAgent:
    """
    Deep Q-Network agent for Bank Heist.

    Holds a trainable policy network, a target network that is synchronised
    on demand, an Adam optimizer and a replay buffer. Implements
    epsilon-greedy action selection and one-step Q-learning updates.
    """
    def __init__(self, state_shape, action_size, learning_rate=0.00025,
                 gamma=0.99, epsilon_start=1.0, epsilon_min=0.01,
                 epsilon_decay=0.995, memory_size=50000):
        """
        Args:
            state_shape: Shape of input state (4, 84, 84); stored for reference
            action_size: Number of possible actions (18 for Bank Heist)
            learning_rate: Step size for the Adam optimizer
            gamma: Discount factor for future rewards
            epsilon_start: Initial exploration rate
            epsilon_min: Minimum exploration rate
            epsilon_decay: Multiplicative factor applied by `decay_epsilon`
            memory_size: Capacity of replay buffer
        """
        self.state_shape = state_shape
        self.action_size = action_size
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate

        # Set device (GPU if available)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Agent using device: {self.device}")

        # Policy network: the one being optimised
        self.policy_net = DQN(action_size).to(self.device)

        # Target network: starts as a copy of the policy network and is only
        # updated through `update_target_network`
        self.target_net = DQN(action_size).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)

        self.memory = ReplayMemory(memory_size)

    def _as_tensor(self, values, dtype):
        """Convert array-like `values` to a tensor of `dtype` on the agent's device."""
        return torch.as_tensor(values, dtype=dtype, device=self.device)

    def select_action(self, state, training=True):
        """
        Select action using epsilon-greedy policy

        Args:
            state: Current game state (a single, unbatched frame stack)
            training: If True, use epsilon-greedy; if False, use greedy

        Returns:
            Selected action (integer)
        """
        # Exploration: random action
        if training and random.random() < self.epsilon:
            return random.randint(0, self.action_size - 1)

        # Exploitation: best action according to the policy network
        with torch.no_grad():
            state_tensor = self._as_tensor(state, torch.float32).unsqueeze(0)
            q_values = self.policy_net(state_tensor)
            return q_values.argmax().item()

    def train_step(self, batch_size=32):
        """
        Perform one training step using a batch from replay memory

        Args:
            batch_size: Number of experiences to sample

        Returns:
            Loss value for this training step, or None when the replay memory
            holds fewer than `batch_size` transitions.
        """
        # Need enough experiences in memory
        if len(self.memory) < batch_size:
            return None

        states, actions, rewards, next_states, dones = self.memory.sample(batch_size)

        # Convert to tensors with explicit dtypes
        states = self._as_tensor(states, torch.float32)
        actions = self._as_tensor(actions, torch.int64)
        rewards = self._as_tensor(rewards, torch.float32)
        next_states = self._as_tensor(next_states, torch.float32)
        dones = self._as_tensor(dones, torch.float32)

        # Current Q values: Q(s, a).
        # squeeze(1) removes only the gathered-action dimension, so the result
        # keeps shape (batch,) even for a batch of one. A bare squeeze() would
        # collapse that case to a scalar and broadcast against the target.
        current_q_values = self.policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Next Q values from target network: max_a' Q_target(s', a')
        with torch.no_grad():
            next_q_values = self.target_net(next_states).max(1)[0]
            # Bellman target: r + gamma * max_a' Q(s', a') * (1 - done)
            target_q_values = rewards + (self.gamma * next_q_values * (1 - dones))

        # Compute loss (Mean Squared Error)
        loss = nn.MSELoss()(current_q_values, target_q_values)

        # Optimize the network
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def update_target_network(self):
        """Copy weights from policy network to target network"""
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def decay_epsilon(self):
        """Multiply epsilon by `epsilon_decay`, never going below `epsilon_min`"""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def save(self, filepath):
        """Save both networks, the optimizer state and the current epsilon"""
        torch.save({
            'policy_net': self.policy_net.state_dict(),
            'target_net': self.target_net.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'epsilon': self.epsilon
        }, filepath)

    def load(self, filepath):
        """
        Restore networks, optimizer state and epsilon written by `save`.

        The replay memory is not part of the checkpoint and is left as is.
        """
        # map_location lets a checkpoint written on a GPU runtime be restored
        # on a CPU-only runtime (and vice versa)
        checkpoint = torch.load(filepath, map_location=self.device)
        self.policy_net.load_state_dict(checkpoint['policy_net'])
        self.target_net.load_state_dict(checkpoint['target_net'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        self.epsilon = checkpoint['epsilon']

# Smoke-test the agent: build it and ask for one action on a random state
print("Testing DQN Agent...")
test_agent = DQNAgent(state_shape=(4, 84, 84), action_size=18)
test_state = np.random.rand(4, 84, 84)
test_action = test_agent.select_action(test_state)

# One print call; each argument is written on its own line
print(
    f"Agent selected action: {test_action}",
    f"Initial epsilon: {test_agent.epsilon}",
    "DQN Agent is ready!",
    sep="\n",
)

Testing DQN Agent...
Agent using device: cuda
Agent selected action: 16
Initial epsilon: 1.0
DQN Agent is ready!


In [None]:
  # Mount Google Drive and Setup Paths
from google.colab import drive
import os

# Attach Google Drive to the Colab filesystem
drive.mount('/content/drive')

# Project folder inside Drive; created on first run, reused afterwards
drive_base_path = '/content/drive/MyDrive/BankHeist_DQN_Project'
os.makedirs(drive_base_path, exist_ok=True)

# One print call; each argument is written on its own line
print(
    "Google Drive mounted successfully!",
    f"Project directory: {drive_base_path}",
    "All checkpoints, models, and metrics will be saved here",
    "-" * 70,
    sep="\n",
)

Mounted at /content/drive
Google Drive mounted successfully!
Project directory: /content/drive/MyDrive/BankHeist_DQN_Project
All checkpoints, models, and metrics will be saved here
----------------------------------------------------------------------


In [None]:
# CELL 9: Memory-Efficient Trainer with Local + Drive Backup
import time
import json
import sys
import gc
import shutil

class BankHeistTrainer:
    """
    Training loop for a DQNAgent on ALE/BankHeist-v5.

    Records per-episode metrics, writes checkpoints to a local directory and
    optionally copies them to a backup directory (e.g. a mounted Drive).
    """
    def __init__(self, save_dir='./bank_heist_checkpoints', drive_backup_dir=None):
        """
        Args:
            save_dir: Local directory for checkpoints and metrics (created if missing).
            drive_backup_dir: Optional directory that `save_dir` is copied to;
                created if missing. No backup is made when None.
        """
        self.save_dir = save_dir
        self.drive_backup_dir = drive_backup_dir
        os.makedirs(save_dir, exist_ok=True)
        if drive_backup_dir:
            os.makedirs(drive_backup_dir, exist_ok=True)

        # Per-episode history; one entry is appended to each list per episode
        self.episode_rewards = []
        self.episode_steps = []
        self.episode_losses = []
        self.epsilon_history = []

    @staticmethod
    def _recent_mean(values, window=100):
        """Mean of the last `window` entries of `values`, or 0.0 when it is empty."""
        recent = values[-window:]
        return float(np.mean(recent)) if recent else 0.0

    def train(self, agent=None, start_episode=0, total_episodes=500,
              max_steps=2000, batch_size=32, learning_rate=0.00025,
              gamma=0.99, epsilon_start=1.0, epsilon_min=0.01,
              epsilon_decay=0.995, memory_size=30000,  # REDUCED from 50000
              target_update_freq=10, print_freq=50,
              local_save_freq=50, drive_save_freq=200):
        """
        Train for `total_episodes` episodes and return the agent.

        Args:
            agent: Existing agent to continue training. When None, a new
                DQNAgent is built from learning_rate, gamma, epsilon_start,
                epsilon_min, epsilon_decay and memory_size; those arguments
                are not used when an agent is supplied.
            start_episode: Index of the first episode (used for numbering,
                checkpoint names and the target-update schedule).
            total_episodes: Number of episodes to run in this call.
            max_steps: Upper bound on environment steps per episode.
            batch_size: Replay batch size for each training step.
            target_update_freq: Sync the target network every N episodes.
            print_freq: Print a progress line every N episodes.
            local_save_freq: Save to local disk every N episodes (fast)
            drive_save_freq: Copy to Drive every N episodes (slow but safe)

        Returns:
            The trained agent.
        """
        env = gym.make('ALE/BankHeist-v5', render_mode='rgb_array')
        try:
            preprocessor = FramePreprocessor()

            if agent is None:
                agent = DQNAgent(
                    state_shape=(4, 84, 84), action_size=18,
                    learning_rate=learning_rate, gamma=gamma,
                    epsilon_start=epsilon_start, epsilon_min=epsilon_min,
                    epsilon_decay=epsilon_decay, memory_size=memory_size
                )
                print(f"New agent created | Memory: {memory_size}")
            else:
                print(f"Resuming from episode {start_episode}")

            start_time = time.time()

            for episode in range(start_episode, start_episode + total_episodes):
                state, _ = env.reset()
                state = preprocessor.reset(state)
                episode_reward = 0
                episode_loss = []
                # Tracked explicitly so the value is defined even if
                # max_steps is 0 and the step loop never runs
                steps_taken = 0

                for step in range(max_steps):
                    action = agent.select_action(state, training=True)
                    next_state, reward, terminated, truncated, _ = env.step(action)
                    next_state = preprocessor.add_frame(next_state)
                    # Only a true terminal state may switch off bootstrapping
                    # in the Bellman target. A time-limit truncation still
                    # ends the episode below, but the stored transition keeps
                    # done=False.
                    agent.memory.push(state, action, reward, next_state, terminated)

                    loss = agent.train_step(batch_size)
                    if loss is not None:
                        episode_loss.append(loss)

                    episode_reward += reward
                    state = next_state
                    steps_taken = step + 1
                    if terminated or truncated:
                        break

                if episode % target_update_freq == 0:
                    agent.update_target_network()
                agent.decay_epsilon()

                self.episode_rewards.append(episode_reward)
                self.episode_steps.append(steps_taken)
                self.episode_losses.append(np.mean(episode_loss) if episode_loss else 0)
                self.epsilon_history.append(agent.epsilon)

                # Fast local save
                if (episode + 1) % local_save_freq == 0:
                    self._save_local_checkpoint(agent, episode + 1)

                # Slow Drive backup
                if self.drive_backup_dir and (episode + 1) % drive_save_freq == 0:
                    self._backup_to_drive()
                    print(f"  Backed up to Drive at episode {episode + 1}")
                    sys.stdout.flush()

                # Print progress
                if (episode + 1) % print_freq == 0:
                    avg_reward = self._recent_mean(self.episode_rewards)
                    elapsed = time.time() - start_time
                    eps_done = episode - start_episode + 1
                    # Linear extrapolation of the remaining time, in minutes
                    remaining = (elapsed/eps_done)*(total_episodes - eps_done)/60
                    print(f"Ep {episode+1} | R:{episode_reward:.0f} | "
                          f"Avg:{avg_reward:.1f} | ε:{agent.epsilon:.3f} | "
                          f"Mem:{len(agent.memory)} | {remaining:.0f}m")
                    sys.stdout.flush()
        finally:
            # Release the emulator even when training is interrupted or fails
            env.close()

        # Final save to both local and Drive
        final_checkpoint = f"final_ep{start_episode + total_episodes}.pt"
        final_metrics = f"metrics_ep{start_episode + total_episodes}.json"

        agent.save(f"{self.save_dir}/{final_checkpoint}")
        self.save_metrics(f"{self.save_dir}/{final_metrics}")

        if self.drive_backup_dir:
            self._backup_to_drive()
            print("Final backup to Drive complete")

        elapsed = (time.time()-start_time)/60
        final_avg = self._recent_mean(self.episode_rewards)
        print(f"\nComplete: {elapsed:.1f}m | Avg:{final_avg:.1f} | ε:{agent.epsilon:.3f}")
        sys.stdout.flush()

        return agent

    def _save_local_checkpoint(self, agent, episode):
        """Quick save of agent weights and metrics to local disk"""
        agent.save(f"{self.save_dir}/checkpoint_ep{episode}.pt")
        self.save_metrics(f"{self.save_dir}/metrics_ep{episode}.json")

    def _backup_to_drive(self):
        """Copy every regular file in `save_dir` to the backup directory"""
        if self.drive_backup_dir:
            for filename in os.listdir(self.save_dir):
                src = os.path.join(self.save_dir, filename)
                # copy2 only handles files; skip anything else (e.g. a
                # sub-directory) instead of aborting the whole backup
                if not os.path.isfile(src):
                    continue
                dst = os.path.join(self.drive_backup_dir, filename)
                shutil.copy2(src, dst)

    def save_metrics(self, filepath):
        """
        Write the recorded history and last-100-episode averages as JSON.

        The averages are 0.0 when no episode has been recorded yet, so the
        file never contains NaN.
        """
        metrics = {
            'episode_rewards': self.episode_rewards,
            'episode_steps': self.episode_steps,
            'episode_losses': self.episode_losses,
            'epsilon_history': self.epsilon_history,
            'total_episodes': len(self.episode_rewards),
            'avg_reward_last_100': self._recent_mean(self.episode_rewards),
            'avg_steps_last_100': self._recent_mean(self.episode_steps)
        }
        with open(filepath, 'w') as f:
            json.dump(metrics, f, indent=2)

    def cleanup(self):
        """Drop the recorded history and release cached Python/CUDA memory"""
        self.episode_rewards.clear()
        self.episode_steps.clear()
        self.episode_losses.clear()
        self.epsilon_history.clear()
        gc.collect()
        torch.cuda.empty_cache()
        print("Memory cleared")

print("Memory-efficient trainer ready")

Memory-efficient trainer ready


In [None]:
# CELL 10: Load Checkpoint and Resume Training
def load_checkpoint_and_resume(checkpoint_path, trainer):
    """
    Rebuild an agent from a checkpoint file and recover its episode number.

    Args:
        checkpoint_path: Path to a file written by `DQNAgent.save`. The
            episode number is read from an ``ep<digits>`` fragment of the
            file name (e.g. ``checkpoint_ep400.pt``).
        trainer: Currently unused; kept so existing calls keep working.

    Returns:
        (agent, last_episode) where `last_episode` is 0 when the file name
        contains no episode number.
    """
    import os
    import re

    agent = DQNAgent(
        state_shape=(4, 84, 84), action_size=18,
        learning_rate=0.00025, gamma=0.99,
        epsilon_start=1.0, epsilon_min=0.01,
        epsilon_decay=0.995, memory_size=30000
    )
    agent.load(checkpoint_path)

    # Look at the file name only: matching against the full path would pick up
    # digits following "ep" in a parent directory name (e.g. ".../deep7/...")
    match = re.search(r'ep(\d+)', os.path.basename(checkpoint_path))
    last_episode = int(match.group(1)) if match else 0

    print(f"Loaded checkpoint from episode {last_episode}")
    print(f"Agent epsilon: {agent.epsilon}")

    return agent, last_episode

# Example usage if you need to resume:
# agent, last_ep = load_checkpoint_and_resume('./local_baseline/checkpoint_ep400.pt', trainer_baseline)
# Then continue training with trainer_baseline.train(agent=agent, start_episode=last_ep, ...)

print("Recovery helper ready")

Recovery helper ready


In [None]:
# CELL 11: Baseline run, first batch (episodes 1-400)
# Checkpoints go to a local folder first and are mirrored to Drive by the trainer
local_dir = './local_baseline'
drive_dir = f"{drive_base_path}/baseline_run"
trainer_baseline = BankHeistTrainer(save_dir=local_dir, drive_backup_dir=drive_dir)

print(
    "=" * 60,
    "BASELINE - BATCH 1: Episodes 1-400",
    "Saving locally every 50 eps, Drive backup every 200 eps",
    "=" * 60,
    sep="\n",
)

agent_baseline = trainer_baseline.train(
    # Fresh agent, starting at episode 0
    agent=None, start_episode=0, total_episodes=400,
    # Episode / optimisation settings
    max_steps=2000, batch_size=32, learning_rate=0.00025, gamma=0.99,
    # Exploration schedule
    epsilon_start=1.0, epsilon_min=0.01, epsilon_decay=0.995,
    # Replay buffer and target network
    memory_size=30000, target_update_freq=10,
    # Logging and checkpoint cadence (in episodes)
    print_freq=50, local_save_freq=50, drive_save_freq=200,
)

BASELINE - BATCH 1: Episodes 1-400
Saving locally every 50 eps, Drive backup every 200 eps
Agent using device: cuda
New agent created | Memory: 30000
Ep 50 | R:10 | Avg:15.2 | ε:0.778 | Mem:26609 | 31m
Ep 100 | R:80 | Avg:14.9 | ε:0.606 | Mem:30000 | 26m
Ep 150 | R:20 | Avg:17.1 | ε:0.471 | Mem:30000 | 22m
  Backed up to Drive at episode 200
Ep 200 | R:10 | Avg:21.8 | ε:0.367 | Mem:30000 | 18m
Ep 250 | R:20 | Avg:23.7 | ε:0.286 | Mem:30000 | 14m
Ep 300 | R:20 | Avg:26.8 | ε:0.222 | Mem:30000 | 9m
Ep 350 | R:20 | Avg:29.3 | ε:0.173 | Mem:30000 | 5m
  Backed up to Drive at episode 400
Ep 400 | R:10 | Avg:27.9 | ε:0.135 | Mem:30000 | 0m
Final backup to Drive complete

Complete: 37.2m | Avg:27.9 | ε:0.135


In [None]:
# Quick test - see how long episodes last under a purely random policy
env = gym.make('ALE/BankHeist-v5', render_mode='rgb_array')
preprocessor = FramePreprocessor()

print("Testing 10 episodes to see natural length...")
step_counts = []

try:
    for i in range(10):
        state, _ = env.reset()
        state = preprocessor.reset(state)

        for step in range(5000):  # High limit
            action = env.action_space.sample()  # Random actions
            next_state, reward, terminated, truncated, _ = env.step(action)

            if terminated or truncated:
                step_counts.append(step + 1)
                print(f"Episode {i+1}: {step+1} steps")
                break
        else:
            # The step limit was reached without the episode ending; it is not
            # included in the statistics below
            print(f"Episode {i+1}: did not finish within 5000 steps")
finally:
    # Release the emulator even if a step raises
    env.close()

if step_counts:
    print(f"\nAverage episode length: {np.mean(step_counts):.0f} steps")
    print(f"Max episode length: {max(step_counts)} steps")
    print(f"Min episode length: {min(step_counts)} steps")
else:
    # max()/min() raise on an empty list, so report the situation instead
    print("\nNo episode finished within the step limit")

Testing 10 episodes to see natural length...
Episode 1: 838 steps
Episode 2: 668 steps
Episode 3: 388 steps
Episode 4: 584 steps
Episode 5: 495 steps
Episode 6: 808 steps
Episode 7: 699 steps
Episode 8: 443 steps
Episode 9: 656 steps
Episode 10: 364 steps

Average episode length: 594 steps
Max episode length: 838 steps
Min episode length: 364 steps


In [None]:
# CELL 12: Baseline run, second batch (episodes 401-800)
# Continues with the agent from batch 1; hyperparameters not listed here fall
# back to the defaults of BankHeistTrainer.train
print("=" * 60, "BASELINE - BATCH 2: Episodes 401-800", "=" * 60, sep="\n")

agent_baseline = trainer_baseline.train(
    agent=agent_baseline, start_episode=400, total_episodes=400,
    max_steps=2000, batch_size=32,
    print_freq=50, local_save_freq=50, drive_save_freq=200,
)

# Write the final weights and metrics straight to the Drive folder
agent_baseline.save(f"{drive_dir}/final_baseline.pt")
trainer_baseline.save_metrics(f"{drive_dir}/final_baseline_metrics.json")
print("Baseline complete!")

# Release the agent and the recorded history before the next experiment.
# Note: after this point `agent_baseline` no longer exists, so this cell
# cannot be re-run without re-running batch 1 first.
del agent_baseline
trainer_baseline.cleanup()
gc.collect()
torch.cuda.empty_cache()
print("Memory freed for next experiment")

BASELINE - BATCH 2: Episodes 401-800
Resuming from episode 400
Ep 450 | R:40 | Avg:33.5 | ε:0.096 | Mem:30000 | 36m
Ep 500 | R:10 | Avg:37.7 | ε:0.075 | Mem:30000 | 33m
Ep 550 | R:50 | Avg:41.9 | ε:0.058 | Mem:30000 | 27m
  Backed up to Drive at episode 600
Ep 600 | R:80 | Avg:62.9 | ε:0.045 | Mem:30000 | 22m
Ep 650 | R:110 | Avg:89.8 | ε:0.035 | Mem:30000 | 17m
Ep 700 | R:200 | Avg:137.1 | ε:0.027 | Mem:30000 | 12m
Ep 750 | R:190 | Avg:195.2 | ε:0.021 | Mem:30000 | 6m
  Backed up to Drive at episode 800
Ep 800 | R:90 | Avg:215.7 | ε:0.017 | Mem:30000 | 0m
Final backup to Drive complete

Complete: 52.6m | Avg:215.7 | ε:0.017
Baseline complete!
Memory cleared
Memory freed for next experiment


In [None]:
# Reconnect after crash
from google.colab import drive
import os
import json
import numpy as np

# Re-attach Drive and restore the project path after a runtime restart
drive.mount('/content/drive')
drive_base_path = '/content/drive/MyDrive/BankHeist_DQN_Project'

# Read back the baseline metrics to confirm they survived on Drive
print("Checking saved baseline...")
with open(f"{drive_base_path}/baseline_run/final_baseline_metrics.json", 'r') as f:
    baseline_metrics = json.load(f)

print(
    f"Baseline (800 eps): {baseline_metrics['avg_reward_last_100']:.1f} points",
    "Ready to start experiments!",
    sep="\n",
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Checking saved baseline...
Baseline (800 eps): 215.7 points
Ready to start experiments!


In [None]:
# CELL: Experiment 3 - Slower Epsilon Decay (400 episodes)
import gc
import torch
import sys

# Experiment 3 uses its own local and Drive folders
local_dir_exp3 = './local_exp3'
drive_dir_exp3 = f"{drive_base_path}/experiment3_epsilon"
trainer_exp3 = BankHeistTrainer(save_dir=local_dir_exp3, drive_backup_dir=drive_dir_exp3)

print(
    "=" * 60,
    "EXPERIMENT 3: Slower Epsilon Decay (0.99 vs baseline 0.995)",
    "Testing if more exploration improves learning",
    "=" * 60,
    sep="\n",
)

agent_exp3 = trainer_exp3.train(
    # Fresh agent, starting at episode 0
    agent=None, start_episode=0, total_episodes=400,
    # Episode / optimisation settings (same as baseline)
    max_steps=2000, batch_size=32, learning_rate=0.00025, gamma=0.99,
    # Exploration schedule
    epsilon_start=1.0, epsilon_min=0.01,
    epsilon_decay=0.99,  # CHANGED: Slower decay (baseline is 0.995)
    # Replay buffer and target network
    memory_size=30000, target_update_freq=10,
    # Logging and checkpoint cadence (in episodes)
    print_freq=50, local_save_freq=50, drive_save_freq=200,
)

# Write the final weights and metrics straight to the Drive folder
agent_exp3.save(f"{drive_dir_exp3}/final_exp3.pt")
trainer_exp3.save_metrics(f"{drive_dir_exp3}/final_exp3_metrics.json")

# Release the agent and the recorded history
del agent_exp3
trainer_exp3.cleanup()
gc.collect()
torch.cuda.empty_cache()
print("Experiment 3 complete, memory cleared")

EXPERIMENT 3: Slower Epsilon Decay (0.99 vs baseline 0.995)
Testing if more exploration improves learning
Agent using device: cuda
New agent created | Memory: 30000
Ep 50 | R:10 | Avg:12.6 | ε:0.605 | Mem:25339 | 29m
Ep 100 | R:10 | Avg:15.1 | ε:0.366 | Mem:30000 | 27m
Ep 150 | R:40 | Avg:20.9 | ε:0.221 | Mem:30000 | 23m
  Backed up to Drive at episode 200
Ep 200 | R:30 | Avg:26.9 | ε:0.134 | Mem:30000 | 19m
Ep 250 | R:20 | Avg:30.2 | ε:0.081 | Mem:30000 | 15m
Ep 300 | R:30 | Avg:31.0 | ε:0.049 | Mem:30000 | 10m
Ep 350 | R:110 | Avg:39.7 | ε:0.030 | Mem:30000 | 5m
  Backed up to Drive at episode 400
Ep 400 | R:170 | Avg:71.9 | ε:0.018 | Mem:30000 | 0m
Final backup to Drive complete

Complete: 45.4m | Avg:71.9 | ε:0.018
Memory cleared
Experiment 3 complete, memory cleared


In [None]:
# CELL: Final Results Summary
import json

print("=" * 70)
print("FINAL RESULTS - ALL EXPERIMENTS")
print("=" * 70)

# All results, entered by hand.
# NOTE(review): the cells visible in this notebook only produce the baseline
# (27.9 at 400 eps, 215.7 at 800 eps) and Exp 3 (71.9). The 1000-episode
# baseline and Exp 1/2/4 values are presumably from runs not shown here -
# confirm.
results = {
    'Baseline (1000 eps)': 493.0,
    'Baseline (400 eps)': 27.9,
    'Exp 1: Higher Alpha (α=0.0005)': 8.8,
    'Exp 2: Lower Gamma (γ=0.95)': 23.0,
    'Exp 3: Slower Epsilon (decay=0.99)': 71.9,
    'Exp 4: Boltzmann Policy': 38.7
}

# 400-episode comparison, taken from `results` so each number exists only once
results_at_400 = [
    ('Baseline', results['Baseline (400 eps)']),
    ('Exp 1: Higher Alpha', results['Exp 1: Higher Alpha (α=0.0005)']),
    ('Exp 2: Lower Gamma', results['Exp 2: Lower Gamma (γ=0.95)']),
    ('Exp 3: Slower Epsilon', results['Exp 3: Slower Epsilon (decay=0.99)']),
    ('Exp 4: Boltzmann', results['Exp 4: Boltzmann Policy'])
]
baseline_400 = results['Baseline (400 eps)']

# Pick the winner by score rather than by comparing against a float literal
winner_name, winner_score = max(results_at_400, key=lambda item: item[1])
improvement_pct = ((winner_score / baseline_400) - 1) * 100

# The narrative below is written for Experiment 3; fail loudly if the numbers
# above ever change so that it no longer holds.
assert winner_name == 'Exp 3: Slower Epsilon', (
    f"Summary text assumes Exp 3 wins, but the best score belongs to {winner_name!r}"
)

print("\n1. PERFORMANCE AT 400 EPISODES")
print("-" * 70)
for name, score in results_at_400:
    change = ((score/baseline_400)-1)*100
    marker = " ← WINNER" if name == winner_name else ""
    print(f"{name:30s}: {score:5.1f} ({change:+6.1f}%){marker}")

print("\n2. KEY FINDINGS")
print("-" * 70)
print("Winner: Experiment 3 (Slower Epsilon Decay)")
print(f"  • {winner_score:.1f} avg at 400 episodes ({improvement_pct:.0f}% improvement)")
print("  • Slower decay (0.99 vs 0.995) = more exploration")
print("  • Better learning rate than all other configurations")
print()
print(f"Best Overall: Baseline at 1000 episodes ({results['Baseline (1000 eps)']:.1f} avg)")
print("  • Shows continued improvement with more training")
print("  • Exp 3 would likely exceed this with 1000 episodes")

print("\n3. TOTAL TRAINING")
print("-" * 70)
print("Total episodes: 2700")
print("  • Baseline: 1000 episodes")
print("  • Experiments 1-4: 1700 episodes")
print("=" * 70)

# Save summary
summary = {
    'all_results': results,
    'winner': 'Experiment 3: Slower Epsilon Decay',
    'winner_score': winner_score,
    'improvement': round(improvement_pct),
    'total_episodes': 2700
}

with open(f'{drive_base_path}/final_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

print(f"\nSummary saved: {drive_base_path}/final_summary.json")

FINAL RESULTS - ALL EXPERIMENTS

1. PERFORMANCE AT 400 EPISODES
----------------------------------------------------------------------
Baseline                      :  27.9 (  +0.0%)
Exp 1: Higher Alpha           :   8.8 ( -68.5%)
Exp 2: Lower Gamma            :  23.0 ( -17.6%)
Exp 3: Slower Epsilon         :  71.9 (+157.7%) ← WINNER
Exp 4: Boltzmann              :  38.7 ( +38.7%)

2. KEY FINDINGS
----------------------------------------------------------------------
Winner: Experiment 3 (Slower Epsilon Decay)
  • 71.9 avg at 400 episodes (158% improvement)
  • Slower decay (0.99 vs 0.995) = more exploration
  • Better learning rate than all other configurations

Best Overall: Baseline at 1000 episodes (493.0 avg)
  • Shows continued improvement with more training
  • Exp 3 would likely exceed this with 1000 episodes

3. TOTAL TRAINING
----------------------------------------------------------------------
Total episodes: 2700
  • Baseline: 1000 episodes
  • Experiments 1-4: 1700 episod