<a href="https://colab.research.google.com/github/i-ganza007/PacMan_Formative/blob/main/CNNPOLICY_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Environment setup cell (intended for a fresh Colab-style runtime).
# Force-reinstall pinned versions of the numeric stack before anything else is installed.
!pip install -q --force-reinstall "numpy==1.26.4" "scipy<1.13" "scikit-learn<1.6" "matplotlib<3.9"

# System packages installed via apt; presumably build prerequisites for the
# box2d extra requested below (swig/cmake) -- TODO confirm they are all still needed.
!apt-get update -qq && apt-get install -y -qq swig cmake libopenmpi-dev zlib1g-dev

# RL stack: Gymnasium with the Atari extras, Stable-Baselines3, ALE, and the deep-learning backends.
!pip install -q \
    "gymnasium[box2d,atari,accept-rom-license]" \
    "stable-baselines3[extra]>=2.0.0" \
    "ale-py" \
    "torch>=2.0" \
    "tensorflow<2.17" \
    "keras<3.0"

# Drop pip's download/wheel cache once the installs above are done.
!pip cache purge

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import warnings
warnings.filterwarnings('ignore')

import ale_py
import gymnasium as gym
gym.register_envs(ale_py)
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
import pandas as pd
import numpy as np
import random
import torch
import gc

In [None]:
# Run a full garbage-collection pass first so that dropped Python objects are
# actually freed, then ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# 1-based index of the hyperparameter set (from `configs` below) used in this run.
CONFIG_ID = 5

# Candidate DQN hyperparameter sets. Each entry provides:
#   lr         -> learning_rate
#   gamma      -> discount factor
#   batch      -> batch_size
#   eps_start  -> exploration_initial_eps
#   eps_end    -> exploration_final_eps
#   eps_decay  -> exploration_fraction
configs = [
    {"lr": 0.0001, "gamma": 0.99,  "batch": 8,  "eps_start": 1.0, "eps_end": 0.05, "eps_decay": 0.3},
    {"lr": 0.0005, "gamma": 0.95,  "batch": 16, "eps_start": 1.0, "eps_end": 0.1,  "eps_decay": 0.2},
    {"lr": 0.0002, "gamma": 0.99,  "batch": 8,  "eps_start": 1.0, "eps_end": 0.01, "eps_decay": 0.4},
    {"lr": 0.0003, "gamma": 0.999, "batch": 12, "eps_start": 1.0, "eps_end": 0.08, "eps_decay": 0.25},
    {"lr": 0.00015,"gamma": 0.98,  "batch": 8,  "eps_start": 1.0, "eps_end": 0.02, "eps_decay": 0.35},
    {"lr": 0.0004, "gamma": 0.99,  "batch": 16, "eps_start": 1.0, "eps_end": 0.1,  "eps_decay": 0.3},
    {"lr": 0.0001, "gamma": 0.995, "batch": 12, "eps_start": 1.0, "eps_end": 0.05, "eps_decay": 0.2},
    {"lr": 0.0002, "gamma": 0.98,  "batch": 8,  "eps_start": 1.0, "eps_end": 0.01, "eps_decay": 0.5},
    {"lr": 0.00025,"gamma": 0.99,  "batch": 16, "eps_start": 1.0, "eps_end": 0.05, "eps_decay": 0.3},
    {"lr": 0.0003, "gamma": 0.999, "batch": 12, "eps_start": 1.0, "eps_end": 0.02, "eps_decay": 0.25},
]


def select_config(config_id, candidates=None):
    """Return the hyperparameter dict for a 1-based ``config_id``.

    Args:
        config_id: Position of the wanted entry, counted from 1.
        candidates: Sequence of config dicts to pick from; defaults to the
            module-level ``configs`` list.

    Raises:
        ValueError: If ``config_id`` is outside ``1..len(candidates)``.
            Without this check, 0 would silently select the *last* entry
            through negative indexing.
    """
    if candidates is None:
        candidates = configs
    if not 1 <= config_id <= len(candidates):
        raise ValueError(
            f"CONFIG_ID must be between 1 and {len(candidates)}, got {config_id!r}"
        )
    return candidates[config_id - 1]


cfg = select_config(CONFIG_ID)


# Echo the selected settings so the saved notebook records what was trained.
print("Policy: CnnPolicy")
print("Hyperparameters:")
print(f"  • Learning Rate (lr): {cfg['lr']}")
print(f"  • Gamma (γ): {cfg['gamma']}")
print(f"  • Batch Size: {cfg['batch']}")
print(f"  • Epsilon Start: {cfg['eps_start']}")
print(f"  • Epsilon End: {cfg['eps_end']}")
print(f"  • Epsilon Decay: {cfg['eps_decay']}")

Policy: CnnPolicy
Hyperparameters:
  • Learning Rate (lr): 0.00015
  • Gamma (γ): 0.98
  • Batch Size: 8
  • Epsilon Start: 1.0
  • Epsilon End: 0.02
  • Epsilon Decay: 0.35


In [None]:
# Random-agent baseline: play 20 episodes with uniformly random actions so the
# trained agent's scores can be compared against chance-level play.
env_test = gym.make("ALE/SpaceInvaders-v5")
# Read the number of discrete actions from the environment rather than
# hard-coding it, so the baseline stays valid if the game/env id changes.
n_actions = int(env_test.action_space.n)
random_scores = []

try:
    for episode in range(1, 21):
        obs, _ = env_test.reset()
        done = False
        truncated = False
        score = 0
        # An episode ends on either termination (game over) or truncation.
        while not (done or truncated):
            action = random.randrange(n_actions)
            obs, reward, done, truncated, _ = env_test.step(action)
            score += reward
        random_scores.append(score)
        # Report every fifth episode only, to keep the output short.
        if episode % 5 == 0:
            print(f"  Episode {episode}: {score}")
finally:
    # Release the emulator even if an episode raised part-way through.
    env_test.close()

avg_random_score = np.mean(random_scores)
print(f"\nRandom Agent Average Score: {avg_random_score:.2f}")

  Episode 5: 220.0
  Episode 10: 135.0
  Episode 15: 50.0
  Episode 20: 75.0

Random Agent Average Score: 142.50


In [None]:
# Training environment: one Space Invaders instance (frameskip=4) inside a
# DummyVecEnv, with the 4 most recent observations stacked together.
# The emulator is created inside the factory, which DummyVecEnv calls when it
# is constructed.
env = VecFrameStack(
    DummyVecEnv([lambda: gym.make("ALE/SpaceInvaders-v5", frameskip=4)]),
    n_stack=4,
)

In [None]:
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback
# TensorBoard logs for this run are written under this directory.
log_dir = "./logs/spaceinvaders/"
os.makedirs(log_dir, exist_ok=True)

# DQN settings. The first group comes from the selected hyperparameter config
# (`cfg`); the second group is fixed for every config.
dqn_kwargs = dict(
    # --- from cfg ---
    learning_rate=cfg["lr"],
    gamma=cfg["gamma"],
    batch_size=cfg["batch"],
    exploration_initial_eps=cfg["eps_start"],
    exploration_final_eps=cfg["eps_end"],
    exploration_fraction=cfg["eps_decay"],
    # --- fixed ---
    buffer_size=10000,
    learning_starts=5000,
    target_update_interval=1000,
    train_freq=4,
    gradient_steps=1,
    tensorboard_log=log_dir,
    verbose=1,
    device="cuda",
)
model = DQN("CnnPolicy", env, **dqn_kwargs)

# Short summary of what was built, plus the GPU memory allocated so far.
print("Model created successfully!")
print(f"   Buffer size: {model.buffer_size:,}")
print(f"   Learning starts at: {model.learning_starts:,} steps")
print(f"   Policy: CnnPolicy (Convolutional Neural Network)")

print(f"\nInitial GPU memory: {torch.cuda.memory_allocated()/1024**3:.2f} GB")

Using cuda device
Wrapping the env in a VecTransposeImage.
Model created successfully!
   Buffer size: 10,000
   Learning starts at: 5,000 steps
   Policy: CnnPolicy (Convolutional Neural Network)

Initial GPU memory: 0.32 GB


In [None]:

# Periodically snapshot the model so a long run can be resumed or inspected.
checkpoint_callback = CheckpointCallback(
    save_freq=50_000,
    save_path="./checkpoints/",
    name_prefix="dqn_spaceinvaders"
)

# Evaluation must NOT share the training environment: EvalCallback resets and
# steps the env it is given, which would interrupt the episode DQN is currently
# collecting and leave it continuing from a stale observation.
# Build an independent env with the same wrappers as the training env; Monitor
# lets the evaluation read per-episode reward/length directly.
eval_env = VecFrameStack(
    DummyVecEnv([lambda: Monitor(gym.make("ALE/SpaceInvaders-v5", frameskip=4))]),
    n_stack=4,
)

# Every 10k steps run a deterministic evaluation and keep the best-scoring model.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path="./best_model/",
    log_path="./eval_logs/",
    eval_freq=10_000,
    deterministic=True,
    render=False
)

callback_list = [checkpoint_callback, eval_callback]

try:
    model.learn(
        total_timesteps=500_000,
        tb_log_name=f"SpaceInvaders_Config{CONFIG_ID}_CnnPolicy",
        log_interval=50,
        progress_bar=True,
        callback=callback_list
    )

    print("\nTraining completed successfully!")
    print(f"Max GPU memory used: {torch.cuda.max_memory_allocated()/1024**3:.2f} GB")

except RuntimeError as e:
    # Give actionable hints for CUDA out-of-memory failures; any RuntimeError
    # (OOM or not) is still propagated to the caller afterwards.
    if "out of memory" in str(e).lower():
        print("\nOUT OF MEMORY ERROR")
        print(f"Current settings: buffer={model.buffer_size}, batch={cfg['batch']}")
        print("\nEMERGENCY FIX:")
        print("   1. Change buffer_size=5000")
        print("   2. Change batch_size=4")
        print("   3. Or use device='cpu'")
    # Bare `raise` re-raises the active exception with its original traceback.
    raise
finally:
    # Release the evaluation emulator whether training finished or failed.
    eval_env.close()

Logging to ./logs/spaceinvaders/SpaceInvaders_Config5_CnnPolicy_1


Output()

----------------------------------
| eval/               |          |
|    mean_ep_length   | 494      |
|    mean_reward      | 105      |
| rollout/            |          |
|    exploration_rate | 0.944    |
| time/               |          |
|    total_timesteps  | 10000    |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.000326 |
|    n_updates        | 1249     |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 506      |
|    mean_reward      | 15       |
| rollout/            |          |
|    exploration_rate | 0.888    |
| time/               |          |
|    total_timesteps  | 20000    |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 2.43     |
|    n_updates        | 3749     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.854    |
| time/               |          |
|    episodes         | 50       |
|    fps              | 211      |
|    time_elapsed     | 123      |
|    total_timesteps  | 26022    |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.00251  |
|    n_updates        | 5255     |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 462      |
|    mean_reward      | 130      |
| rollout/            |          |
|    exploration_rate | 0.832    |
| time/               |          |
|    total_timesteps  | 30000    |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.00295  |
|    n_updates        | 6249     |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 602      |
|    mean_reward      | 140      |
| rollout/            |          |
|    exploration_rate | 0.776    |
| time/               |          |
|    total_timesteps  | 40000    |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.00166  |
|    n_updates        | 8749     |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 678      |
|    mean_reward      | 161      |
| rollout/            |          |
|    exploration_rate | 0.72     |
| time/               |          |
|    total_timesteps  | 50000    |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.00766  |
|    n_updates        | 11249    |
----------------------------------


----------------------------------
| rollout/            |          |
|    exploration_rate | 0.707    |
| time/               |          |
|    episodes         | 100      |
|    fps              | 195      |
|    time_elapsed     | 267      |
|    total_timesteps  | 52324    |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.00772  |
|    n_updates        | 11830    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 657      |
|    mean_reward      | 143      |
| rollout/            |          |
|    exploration_rate | 0.664    |
| time/               |          |
|    total_timesteps  | 60000    |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.418    |
|    n_updates        | 13749    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 601      |
|    mean_reward      | 100      |
| rollout/            |          |
|    exploration_rate | 0.608    |
| time/               |          |
|    total_timesteps  | 70000    |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.0277   |
|    n_updates        | 16249    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 486      |
|    mean_reward      | 131      |
| rollout/            |          |
|    exploration_rate | 0.552    |
| time/               |          |
|    total_timesteps  | 80000    |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.501    |
|    n_updates        | 18749    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.542    |
| time/               |          |
|    episodes         | 150      |
|    fps              | 188      |
|    time_elapsed     | 433      |
|    total_timesteps  | 81814    |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.0151   |
|    n_updates        | 19203    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 791      |
|    mean_reward      | 244      |
| rollout/            |          |
|    exploration_rate | 0.496    |
| time/               |          |
|    total_timesteps  | 90000    |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.371    |
|    n_updates        | 21249    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 660      |
|    mean_reward      | 186      |
| rollout/            |          |
|    exploration_rate | 0.44     |
| time/               |          |
|    total_timesteps  | 100000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.0319   |
|    n_updates        | 23749    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 512      |
|    mean_reward      | 124      |
| rollout/            |          |
|    exploration_rate | 0.384    |
| time/               |          |
|    total_timesteps  | 110000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.0265   |
|    n_updates        | 26249    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.36     |
| time/               |          |
|    episodes         | 200      |
|    fps              | 182      |
|    time_elapsed     | 626      |
|    total_timesteps  | 114225   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.00949  |
|    n_updates        | 27306    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 672      |
|    mean_reward      | 202      |
| rollout/            |          |
|    exploration_rate | 0.328    |
| time/               |          |
|    total_timesteps  | 120000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.464    |
|    n_updates        | 28749    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 665      |
|    mean_reward      | 162      |
| rollout/            |          |
|    exploration_rate | 0.272    |
| time/               |          |
|    total_timesteps  | 130000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.323    |
|    n_updates        | 31249    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 693      |
|    mean_reward      | 0        |
| rollout/            |          |
|    exploration_rate | 0.216    |
| time/               |          |
|    total_timesteps  | 140000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.00332  |
|    n_updates        | 33749    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.188    |
| time/               |          |
|    episodes         | 250      |
|    fps              | 177      |
|    time_elapsed     | 819      |
|    total_timesteps  | 145032   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.00462  |
|    n_updates        | 35007    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 638      |
|    mean_reward      | 120      |
| rollout/            |          |
|    exploration_rate | 0.16     |
| time/               |          |
|    total_timesteps  | 150000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 2.4      |
|    n_updates        | 36249    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 470      |
|    mean_reward      | 5        |
| rollout/            |          |
|    exploration_rate | 0.104    |
| time/               |          |
|    total_timesteps  | 160000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.184    |
|    n_updates        | 38749    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 620      |
|    mean_reward      | 295      |
| rollout/            |          |
|    exploration_rate | 0.048    |
| time/               |          |
|    total_timesteps  | 170000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.104    |
|    n_updates        | 41249    |
----------------------------------


----------------------------------
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 300      |
|    fps              | 172      |
|    time_elapsed     | 1025     |
|    total_timesteps  | 177158   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 3.92     |
|    n_updates        | 43039    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 469      |
|    mean_reward      | 142      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 180000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.172    |
|    n_updates        | 43749    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 458      |
|    mean_reward      | 47       |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 190000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.0133   |
|    n_updates        | 46249    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 477      |
|    mean_reward      | 138      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 200000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.0194   |
|    n_updates        | 48749    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 350      |
|    fps              | 169      |
|    time_elapsed     | 1201     |
|    total_timesteps  | 204168   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.0571   |
|    n_updates        | 49791    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 679      |
|    mean_reward      | 220      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 210000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.0384   |
|    n_updates        | 51249    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 698      |
|    mean_reward      | 375      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 220000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.96     |
|    n_updates        | 53749    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 947      |
|    mean_reward      | 524      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 230000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.109    |
|    n_updates        | 56249    |
----------------------------------


----------------------------------
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 400      |
|    fps              | 166      |
|    time_elapsed     | 1432     |
|    total_timesteps  | 238539   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.0979   |
|    n_updates        | 58384    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 580      |
|    mean_reward      | 175      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 240000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.269    |
|    n_updates        | 58749    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 658      |
|    mean_reward      | 360      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 250000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 3.03     |
|    n_updates        | 61249    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 409      |
|    mean_reward      | 168      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 260000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.0301   |
|    n_updates        | 63749    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 450      |
|    fps              | 164      |
|    time_elapsed     | 1623     |
|    total_timesteps  | 267115   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.348    |
|    n_updates        | 65528    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 391      |
|    mean_reward      | 136      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 270000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.0137   |
|    n_updates        | 66249    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 595      |
|    mean_reward      | 307      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 280000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 1.1      |
|    n_updates        | 68749    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 447      |
|    mean_reward      | 177      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 290000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.052    |
|    n_updates        | 71249    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 500      |
|    fps              | 163      |
|    time_elapsed     | 1786     |
|    total_timesteps  | 291337   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.0603   |
|    n_updates        | 71584    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 402      |
|    mean_reward      | 173      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 300000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 2.4      |
|    n_updates        | 73749    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 386      |
|    mean_reward      | 110      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 310000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.91     |
|    n_updates        | 76249    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 550      |
|    fps              | 162      |
|    time_elapsed     | 1944     |
|    total_timesteps  | 316221   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.17     |
|    n_updates        | 77805    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 423      |
|    mean_reward      | 154      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 320000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.123    |
|    n_updates        | 78749    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 297      |
|    mean_reward      | 109      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 330000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.0127   |
|    n_updates        | 81249    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 503      |
|    mean_reward      | 197      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 340000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.255    |
|    n_updates        | 83749    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 600      |
|    fps              | 161      |
|    time_elapsed     | 2112     |
|    total_timesteps  | 341762   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.958    |
|    n_updates        | 84190    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 297      |
|    mean_reward      | 101      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 350000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.55     |
|    n_updates        | 86249    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 504      |
|    mean_reward      | 212      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 360000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.312    |
|    n_updates        | 88749    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 650      |
|    fps              | 161      |
|    time_elapsed     | 2254     |
|    total_timesteps  | 363665   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.169    |
|    n_updates        | 89666    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 582      |
|    mean_reward      | 300      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 370000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.169    |
|    n_updates        | 91249    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 367      |
|    mean_reward      | 112      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 380000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 5.25     |
|    n_updates        | 93749    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 700      |
|    fps              | 160      |
|    time_elapsed     | 2397     |
|    total_timesteps  | 385485   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.552    |
|    n_updates        | 95121    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 434      |
|    mean_reward      | 157      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 390000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.218    |
|    n_updates        | 96249    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 517      |
|    mean_reward      | 212      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 400000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.277    |
|    n_updates        | 98749    |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 413      |
|    mean_reward      | 180      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 410000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.46     |
|    n_updates        | 101249   |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 750      |
|    fps              | 160      |
|    time_elapsed     | 2570     |
|    total_timesteps  | 411554   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 3.17     |
|    n_updates        | 101638   |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 328      |
|    mean_reward      | 132      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 420000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.198    |
|    n_updates        | 103749   |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 511      |
|    mean_reward      | 242      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 430000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 1.49     |
|    n_updates        | 106249   |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 800      |
|    fps              | 159      |
|    time_elapsed     | 2710     |
|    total_timesteps  | 433407   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.363    |
|    n_updates        | 107101   |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 467      |
|    mean_reward      | 205      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 440000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.407    |
|    n_updates        | 108749   |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 379      |
|    mean_reward      | 155      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 450000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.175    |
|    n_updates        | 111249   |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 850      |
|    fps              | 159      |
|    time_elapsed     | 2849     |
|    total_timesteps  | 454793   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.493    |
|    n_updates        | 112448   |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 524      |
|    mean_reward      | 251      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 460000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 5.05     |
|    n_updates        | 113749   |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 434      |
|    mean_reward      | 181      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 470000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.693    |
|    n_updates        | 116249   |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 900      |
|    fps              | 159      |
|    time_elapsed     | 2987     |
|    total_timesteps  | 476023   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.254    |
|    n_updates        | 117755   |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 379      |
|    mean_reward      | 150      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 480000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.505    |
|    n_updates        | 118749   |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 378      |
|    mean_reward      | 138      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 490000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 0.209    |
|    n_updates        | 121249   |
----------------------------------


----------------------------------
| eval/               |          |
|    mean_ep_length   | 504      |
|    mean_reward      | 234      |
| rollout/            |          |
|    exploration_rate | 0.02     |
| time/               |          |
|    total_timesteps  | 500000   |
| train/              |          |
|    learning_rate    | 0.00015  |
|    loss             | 1.06     |
|    n_updates        | 123749   |
----------------------------------



Training completed successfully!
Max GPU memory used: 0.50 GB


In [None]:
# Final evaluation of the trained agent and comparison against the random baseline.
# `model`, `env` and `avg_random_score` are defined outside this cell.

# Single source of truth for the episode count, so the evaluate_policy call
# and the report header below cannot drift apart.
FINAL_EVAL_EPISODES = 10

mean_reward, std_reward = evaluate_policy(
    model,
    env,
    n_eval_episodes=FINAL_EVAL_EPISODES,
    deterministic=True,
)

# Report the score next to the random baseline; "Improvement" is the plain
# difference between the two means.
print(f"\nResults over {FINAL_EVAL_EPISODES} episodes:")
print(f"   Mean Reward: {mean_reward:.2f} ± {std_reward:.2f}")
print(f"   Random Baseline: {avg_random_score:.2f}")
print(f"   Improvement: {mean_reward - avg_random_score:.2f}")


Results over 10 episodes:
   Mean Reward: 227.00 ± 46.05
   Random Baseline: 142.50
   Improvement: 84.50


In [None]:
# Persist the trained model under two names, both derived from CONFIG_ID so a
# copy of this notebook run with another config cannot mislabel its files.
# `model` and `CONFIG_ID` are defined outside this cell.
model_name = f"dqn_model_{CONFIG_ID}"
model.save(model_name)

# Second copy of the same model under a more descriptive name.
backup_name = f"dqn_spaceinvaders_exp{CONFIG_ID}"
model.save(backup_name)

# The messages assume save() writes "<name>.zip" — TODO confirm against the SB3 version in use.
print("\nModels saved:")
print(f"   • {model_name}.zip (required submission)")
print(f"   • {backup_name}.zip (backup with config ID)")


Models saved:
   • dqn_model_5.zip (required submission)
   • dqn_spaceinvaders_exp5.zip (backup with config ID)
