In [1]:
import torch
import torchvision.datasets as datasets # for Mist
import torchvision.transforms as transforms # Transformations we can perform on our dataset for augmentation
from torch import optim # For optimizers like SGD, Adam, etc.
from torch import nn # To inherit our neural network
from torch.utils.data import DataLoader # For management of the dataset (batches)
from tqdm import tqdm # For nice progress bar!
from torch.optim.lr_scheduler import StepLR
from torch.optim.lr_scheduler import CosineAnnealingLR
import numpy as np
import flappy_bird_gymnasium as flappy_bird
import gymnasium as gym
import random
from collections import deque, namedtuple
from preprocessing.preprocessing import preprocess_frame
from PIL import Image
from gymnasium.wrappers import RecordVideo
from static_variables import CHECKPOINTS_DIR, VIDEOS_DIR, PHOTOS_DIR, make_dirs
import os
import hashlib
from preprocessing.preprocessing import preprocess_frame
from model import DQN_CNN
from save_model.utils import save_model, load_model, record_trained_agent_video
from replay_buffer.ReplayBuffer import ReplayBuffer
from model import save_input_frames
from save_model.utils import transition
import time
from collections import deque


  from pkg_resources import resource_stream, resource_exists


In [2]:
# Prepare the project's output locations before anything is written.
# NOTE(review): make_dirs is imported from static_variables alongside
# CHECKPOINTS_DIR / VIDEOS_DIR / PHOTOS_DIR; presumably it creates those
# directories if they are missing -- confirm in static_variables.py.
make_dirs()

In [3]:
# Select the compute device in priority order: first CUDA GPU, then Apple's
# MPS backend, and finally the CPU as a fallback. The checks short-circuit,
# so the MPS probe only runs when CUDA is unavailable.
device = torch.device(
    "cuda:0"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print("Device: ", device)

Device:  cpu


In [4]:
# ----- Environment interaction -----
frame_skip = 4    # passed to DQN_CNN(...) and transition(...) by the training cell
train_every = 2   # run one optimisation step every `train_every` environment steps

# ----- Optimisation schedule -----
data_type = torch.float32
batch_size = 64
T_iterations = 1_000_000      # episode budget; also the horizon used for epsilon decay
initial_exploration = 15_000  # steps collected before the first gradient update

# ----- Experience replay -----
replay_buffer_size = 400_000

# ----- DQN learning parameters -----
gamma = 0.99   # discount factor
alpha = 1e-4   # Adam learning rate

# ----- Target network: soft (Polyak) update coefficient -----
target_network_incorporation_rate = 0.005

# ----- Epsilon-greedy exploration -----
e_start = 0.1
e_end = 0.013
# Decay constant chosen so that exp(-k_epsilon * T_iterations) == e_end / e_start.
k_epsilon = np.log(e_end / e_start) * (-1 / T_iterations)
decay_factor = np.exp(-k_epsilon)

# ----- Early stopping on the moving-average episode reward -----
avg_window = 100   # number of episodes in the moving average
patience = 2000    # episodes tolerated without improvement
min_delta = 0.5    # smallest gain in the average that counts as improvement

no_improve_counter = 0
best_avg_reward = float("-inf")
reward_window = deque(maxlen=avg_window)


In [None]:

# Double-DQN training on FlappyBird-v0.
#
# Each environment step:
#   1. choose an action epsilon-greedily from the online (policy) network,
#   2. advance the environment through `transition(...)` and store the result
#      in the replay buffer,
#   3. once enough steps have been collected, sample a minibatch, take one
#      optimiser step on the policy network and softly blend its weights into
#      the target network.
# After each episode the moving-average reward drives best-model saving and
# early stopping.

env = gym.make("FlappyBird-v0", render_mode="rgb_array")

# Online network plus a target network initialised to identical weights.
policy_net = DQN_CNN(frame_skip).to(device)
target_net = DQN_CNN(frame_skip).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters(), lr=alpha)
memory = ReplayBuffer(replay_buffer_size)

global_step = 0
transition_steps = 0
frame_id = 0
best_reward = -float("inf")

# Re-initialise the early-stopping state here so that re-running this cell on
# its own starts from a clean slate instead of inheriting values from a
# previous run (the networks above are always created fresh).
reward_window = deque(maxlen=avg_window)
best_avg_reward = -float("inf")
no_improve_counter = 0

criterion = nn.SmoothL1Loss()

# Timing trackers. perf_counter() has an arbitrary reference point, so every
# elapsed-time computation below must also use perf_counter().
training_start_time = time.perf_counter()
episode_times = deque(maxlen=100)

try:
    for episode in range(T_iterations):
        episode_start_time = time.perf_counter()
        env.reset()
        done = False
        episode_reward = 0

        # Obtain the initial state by taking action 0 once.
        # NOTE(review): transition(...) comes from save_model.utils; it returns
        # (state, reward, done) -- presumably after stepping `frame_skip`
        # frames and preprocessing them. Confirm against its definition.
        current_state, _, _ = transition(0, env, frame_skip)

        while not done:
            global_step += 1

            # Exponentially decaying exploration rate, bounded below by e_end.
            epsilon = e_end + (e_start - e_end) * np.exp(-k_epsilon * transition_steps)

            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                # Build the input tensor only when the network is queried.
                state_tensor = torch.tensor(
                    current_state, dtype=torch.float32, device=device
                ).unsqueeze(0)
                with torch.no_grad():
                    action = policy_net(state_tensor).argmax(dim=1).item()

            next_state, reward, done = transition(action, env, frame_skip)

            memory.push(current_state, action, reward, next_state, done)
            episode_reward += reward
            current_state = next_state
            transition_steps += 1

            ready_to_train = (
                len(memory) >= batch_size
                and global_step >= initial_exploration
                and global_step % train_every == 0
            )
            if ready_to_train:
                states, actions, rewards, next_states, dones = memory.sample(batch_size)

                states = torch.from_numpy(states).to(device=device, dtype=torch.float32)
                actions = torch.tensor(actions, dtype=torch.long, device=device).unsqueeze(1)

                # Rewards are clipped to [-1, 1] before forming the TD target.
                rewards = torch.tensor(rewards, dtype=torch.float32, device=device).unsqueeze(1)
                rewards = rewards.clamp(-1, 1)

                next_states = torch.from_numpy(next_states).to(device=device, dtype=torch.float32)
                dones = torch.tensor(dones, dtype=torch.float32, device=device).unsqueeze(1)

                # Q(s, a) for the actions that were actually taken.
                q_values = policy_net(states).gather(1, actions)

                with torch.no_grad():
                    # Double DQN: the online network selects the next action ...
                    next_actions = policy_net(next_states).argmax(1, keepdim=True)

                    # ... and the target network evaluates it.
                    next_q = target_net(next_states).gather(1, next_actions)

                    # Terminal transitions (dones == 1) do not bootstrap.
                    target_q = rewards + gamma * next_q * (1 - dones)

                loss = criterion(q_values, target_q)

                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(policy_net.parameters(), 10.0)
                optimizer.step()

                # Soft update: target <- (1 - tau) * target + tau * policy.
                with torch.no_grad():
                    for t_param, p_param in zip(
                        target_net.parameters(), policy_net.parameters()
                    ):
                        t_param.mul_(1.0 - target_network_incorporation_rate)
                        t_param.add_(target_network_incorporation_rate * p_param)

        # ---------- episode end ----------
        reward_window.append(episode_reward)

        # Early stopping is only evaluated once the moving-average window is full.
        if len(reward_window) == avg_window:
            avg_reward = np.mean(reward_window)

            if avg_reward > best_avg_reward + min_delta:
                best_avg_reward = avg_reward
                no_improve_counter = 0

                # Save the best model
                torch.save(policy_net.state_dict(), "best_dqn.pt")
            else:
                no_improve_counter += 1

            # Only stop once exploration has decayed below 0.05.
            if no_improve_counter >= patience and epsilon < 0.05:
                print(
                    f"\nEarly stopping!\n"
                    f"Best Avg Reward: {best_avg_reward:.2f}\n"
                    f"Episode: {episode}\n"
                    f"Steps: {transition_steps}"
                )
                break

        best_reward = max(best_reward, episode_reward)

        # ---------- timing ----------
        episode_time = time.perf_counter() - episode_start_time
        episode_times.append(episode_time)

        total_training_time = time.perf_counter() - training_start_time
        avg_last_100_time = np.mean(episode_times)

        print(
            f"Episode {episode:5d} | "
            f"Reward: {episode_reward:7.2f} | "
            f"Avg({avg_window}): {np.mean(reward_window):7.2f} | "
            f"Epsilon: {epsilon:.4f} | "
            f"Best Avg Reward: {best_avg_reward:7.2f} | "
            f"Best Reward: {best_reward:7.2f} | "
            f"Steps: {transition_steps:7d} | "
            f"Ep Time: {episode_time:6.2f}s | "
            f"Avg100 Time: {avg_last_100_time:6.2f}s | "
            f"Total Time: {total_training_time/60:7.2f}m"
        )
finally:
    # Release the environment even if training raises or is interrupted.
    env.close()

# Use the same clock as training_start_time; time.time() has a different
# reference point and would produce a meaningless difference.
total_time = time.perf_counter() - training_start_time
print("\n====== Training Time Summary ======")
print(f"Total training time: {total_time/3600:.2f} hours")
print(f"Average episode time (last 100): {np.mean(episode_times):.2f} seconds")


  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(f"{pre} is not within the observation space.")


Episode     0 | Reward:    0.40 | Avg(100):    0.40 | Epsilon: 0.1000 | Best Avg Reward:    -inf | Best Reward:    0.40 | Steps:       7 | Ep Time:   1.07s | Avg100 Time:   1.07s | Total Time:    0.02m
Episode     1 | Reward:   -0.30 | Avg(100):    0.05 | Epsilon: 0.1000 | Best Avg Reward:    -inf | Best Reward:    0.40 | Steps:      18 | Ep Time:   1.72s | Avg100 Time:   1.40s | Total Time:    0.05m
Episode     2 | Reward:    0.40 | Avg(100):    0.17 | Epsilon: 0.1000 | Best Avg Reward:    -inf | Best Reward:    0.40 | Steps:      25 | Ep Time:   0.97s | Avg100 Time:   1.26s | Total Time:    0.06m
Episode     3 | Reward:    0.40 | Avg(100):    0.22 | Epsilon: 0.1000 | Best Avg Reward:    -inf | Best Reward:    0.40 | Steps:      32 | Ep Time:   1.03s | Avg100 Time:   1.20s | Total Time:    0.08m
Episode     4 | Reward:    0.40 | Avg(100):    0.26 | Epsilon: 0.1000 | Best Avg Reward:    -inf | Best Reward:    0.40 | Steps:      39 | Ep Time:   1.07s | Avg100 Time:   1.17s | Total Time:

In [None]:
# Checkpoint both networks and the optimizer under CHECKPOINTS_DIR and keep the
# returned path for the video-recording cell below.
# The best episode reward tracked by the training cell is passed through
# rather than a hard-coded placeholder, so the checkpoint metadata matches
# the run that produced it.
path = save_model(
    policy_net,
    target_net,
    optimizer,
    dir=CHECKPOINTS_DIR,
    global_step=global_step,
    best_reward=best_reward,
)


In [None]:
# Record a video of the agent using the checkpoint written by the previous
# cell (`path`), with the same frame_skip and device as training.
# NOTE(review): record_trained_agent_video is imported from save_model.utils;
# presumably it loads the model from `model_path`, plays the environment and
# writes the recording under VIDEOS_DIR -- confirm against its definition.
record_trained_agent_video(
    model_path=path,
    video_dir=VIDEOS_DIR,
    frame_skip=frame_skip,
    device=device,
)


