In [1]:
import torch
import torchvision.datasets as datasets # for Mist
import torchvision.transforms as transforms # Transformations we can perform on our dataset for augmentation
from torch import optim # For optimizers like SGD, Adam, etc.
from torch import nn # To inherit our neural network
from torch.utils.data import DataLoader # For management of the dataset (batches)
from tqdm import tqdm # For nice progress bar!
from torch.optim.lr_scheduler import StepLR
from torch.optim.lr_scheduler import CosineAnnealingLR
import numpy as np
import flappy_bird_gymnasium as flappy_bird
import gymnasium as gym
import random
from collections import deque, namedtuple
from preprocessing.preprocessing import preprocess_frame
from PIL import Image
from gymnasium.wrappers import RecordVideo
from static_variables import CHECKPOINTS_DIR, VIDEOS_DIR, PHOTOS_DIR, make_dirs
import os
import hashlib
from preprocessing.preprocessing import preprocess_frame
from model import DQN_CNN
from save_model.utils import save_model, load_model, record_trained_agent_video
from replay_buffer.ReplayBuffer import ReplayBuffer
from model import save_input_frames
from save_model.utils import transition

  from pkg_resources import resource_stream, resource_exists


In [2]:
# Prepare the project's output locations before anything tries to write to them.
# NOTE(review): make_dirs is imported from static_variables alongside
# CHECKPOINTS_DIR / VIDEOS_DIR / PHOTOS_DIR, so it presumably creates those
# directories -- confirm in static_variables.
make_dirs()

In [3]:
# Choose the compute backend in priority order: CUDA GPU, then Apple MPS,
# then plain CPU. The conditional expression short-circuits, so the MPS probe
# only runs when CUDA is unavailable.
device = torch.device(
    "cuda:0"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print("Device: ", device)

Device:  mps


In [4]:
# Hyperparameters shared by the training, checkpointing and video cells below.

# Environment
# Passed to DQN_CNN(...) and to transition(...).
# NOTE(review): presumably the number of frames stacked/skipped per agent step -- confirm in model.py.
frame_skip = 4

# Training
# Floating-point dtype for tensors fed to the networks.
data_type = torch.float32
# Transitions sampled per learning step; learning starts once the buffer holds this many.
batch_size = 64
# Number of training episodes. The same value also sets the horizon, in
# environment steps, of the epsilon decay via k_epsilon below.
T_iterations = 50000   # very important

# Replay buffer
# Argument to ReplayBuffer(...); presumably its maximum capacity -- confirm in replay_buffer.
replay_buffer_size = 20000
# DQN
# Learning rate handed to the Adam optimizer.
alpha = 1e-4
# Discount factor applied to the bootstrapped next-state value.
gamma = 0.99

# Target network (soft update)
# Weight of the policy network in each update:
# target <- rate * policy + (1 - rate) * target.
target_network_incorporation_rate = 0.005

# Epsilon-greedy
# Exploration probability: epsilon = e_end + (e_start - e_end) * exp(-k_epsilon * step).
e_start = 1.0
e_end = 0.01
# Decay constant chosen so that exp(-k_epsilon * T_iterations) == e_end / e_start.
k_epsilon = -1 / T_iterations * np.log(e_end / e_start)


In [None]:

# DQN training cell: builds the environment, the policy/target networks, the
# optimizer and the replay buffer, then trains for T_iterations episodes.
# Reads its configuration (frame_skip, alpha, gamma, batch_size, data_type,
# device, epsilon schedule, ...) from the cells above.


def _select_action(policy_net, current_state, epsilon, action_space):
    """Return an epsilon-greedy action for ``current_state``.

    With probability ``epsilon`` a random action is sampled from
    ``action_space``; otherwise the action with the highest predicted Q-value
    is returned as a Python int.
    """
    if random.random() < epsilon:
        return action_space.sample()

    # Only build and transfer the state tensor when the network is queried.
    state_tensor = torch.tensor(
        current_state, dtype=data_type
    ).unsqueeze(0).to(device)
    with torch.no_grad():
        q_values = policy_net(state_tensor)
    return q_values.argmax(dim=1).item()


def _optimize_step(policy_net, target_net, optimizer, criterion, batch):
    """Perform one gradient step on a sampled batch of transitions.

    ``batch`` is the 5-tuple (states, actions, rewards, next_states, dones)
    returned by ``memory.sample``.
    """
    states, actions, rewards, next_states, dones = batch

    states = torch.tensor(states, dtype=data_type).to(device)
    actions = torch.tensor(actions, dtype=torch.long).unsqueeze(1).to(device)
    rewards = torch.tensor(rewards, dtype=data_type).unsqueeze(1).to(device)
    next_states = torch.tensor(next_states, dtype=data_type).to(device)
    dones = torch.tensor(dones, dtype=data_type).unsqueeze(1).to(device)

    # Q(s, a) for the actions that were actually taken.
    q_values = policy_net(states).gather(1, actions)

    # Bootstrapped target; (1 - dones) removes the next-state value for
    # transitions that ended the episode.
    with torch.no_grad():
        next_q = target_net(next_states).max(1, keepdim=True)[0]
        target_q = rewards + gamma * next_q * (1 - dones)

    loss = criterion(q_values, target_q)

    optimizer.zero_grad()
    loss.backward()
    # Gradient clipping keeps a single bad batch from destabilising training.
    torch.nn.utils.clip_grad_norm_(policy_net.parameters(), 10)
    optimizer.step()


def _soft_update(target_net, policy_net, rate):
    """Blend policy weights into the target: target <- rate*policy + (1-rate)*target."""
    # torch.no_grad() is the supported way to edit parameters in place; it
    # replaces the legacy `.data` access used previously.
    with torch.no_grad():
        for target_param, policy_param in zip(
            target_net.parameters(), policy_net.parameters()
        ):
            target_param.copy_(
                rate * policy_param + (1.0 - rate) * target_param
            )


env = gym.make("FlappyBird-v0", render_mode="rgb_array")

policy_net = DQN_CNN(frame_skip).to(device)
target_net = DQN_CNN(frame_skip).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters(), lr=alpha)
# The loss module is loop-invariant, so create it once instead of per step.
criterion = nn.SmoothL1Loss()
memory = ReplayBuffer(replay_buffer_size)

global_step = 0
frame_id = 0
best_reward = -float("inf")

# try/finally guarantees the environment is closed even if training raises or
# is interrupted from the notebook (the exception itself still propagates).
try:
    for episode in range(T_iterations):
        env.reset()

        # ---------- initialize frame stack ----------
        done = False
        episode_reward = 0
        # transition(action, env, frame_skip) yields (state, reward, done);
        # action 0 is used here to obtain the first state of the episode.
        current_state, _, _ = transition(0, env, frame_skip)

        while not done:
            global_step += 1

            # ---------- epsilon decay ----------
            epsilon = e_end + (e_start - e_end) * np.exp(-k_epsilon * global_step)

            # ---------- epsilon-greedy ----------
            action = _select_action(
                policy_net, current_state, epsilon, env.action_space
            )

            next_state, reward, done = transition(action, env, frame_skip)
            memory.push(current_state, action, reward, next_state, done)

            episode_reward += reward
            current_state = next_state

            # ---------- learning ----------
            if len(memory) >= batch_size:
                _optimize_step(
                    policy_net,
                    target_net,
                    optimizer,
                    criterion,
                    memory.sample(batch_size),
                )

                # ---------- soft target update ----------
                _soft_update(
                    target_net, policy_net, target_network_incorporation_rate
                )

        best_reward = max(best_reward, episode_reward)
        print(
            f"Episode {episode} | Reward: {episode_reward:.2f} | Epsilon: {epsilon:.4f} | Best Reward: {best_reward:.2f} | Replay Buffer Size: {len(memory)}"
        )
finally:
    env.close()


  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(f"{pre} is not within the observation space.")


Episode 0 | Reward: -0.70 | Epsilon: 0.9989 | Best Reward: -0.70
Episode 1 | Reward: -2.50 | Epsilon: 0.9978 | Best Reward: -0.70
Episode 2 | Reward: -3.70 | Epsilon: 0.9967 | Best Reward: -0.70
Episode 3 | Reward: -1.30 | Epsilon: 0.9956 | Best Reward: -0.70
Episode 4 | Reward: -1.30 | Epsilon: 0.9945 | Best Reward: -0.70
Episode 5 | Reward: -1.30 | Epsilon: 0.9935 | Best Reward: -0.70
Episode 6 | Reward: -1.30 | Epsilon: 0.9924 | Best Reward: -0.70
Episode 7 | Reward: -1.30 | Epsilon: 0.9913 | Best Reward: -0.70
Episode 8 | Reward: -1.30 | Epsilon: 0.9902 | Best Reward: -0.70
Episode 9 | Reward: -1.30 | Epsilon: 0.9891 | Best Reward: -0.70
Episode 10 | Reward: -1.70 | Epsilon: 0.9879 | Best Reward: -0.70
Episode 11 | Reward: -1.30 | Epsilon: 0.9868 | Best Reward: -0.70
Episode 12 | Reward: -1.30 | Epsilon: 0.9857 | Best Reward: -0.70
Episode 13 | Reward: -1.30 | Epsilon: 0.9846 | Best Reward: -0.70
Episode 14 | Reward: -1.30 | Epsilon: 0.9835 | Best Reward: -0.70
Episode 15 | Reward:

In [None]:
# Checkpoint both networks and the optimizer state. `path` is reused by the
# video-recording cell below.
# best_reward is the value tracked by the training loop, so the checkpoint
# records what was actually measured rather than a hard-coded placeholder.
path = save_model(
    policy_net,
    target_net,
    optimizer,
    dir=CHECKPOINTS_DIR,
    global_step=global_step,
    best_reward=best_reward,
)


✅ Model saved to ../checkpoints/0d3d127ae33c6fc76f0744bd4756046c


In [None]:
# Record a video of the agent stored at `path` (the checkpoint written by the
# save cell above) into VIDEOS_DIR, using the same frame_skip and device as
# training.
# NOTE(review): the rollout/recording logic lives in save_model.utils -- confirm
# there how the model is loaded and how many episodes are recorded.
record_trained_agent_video(
    model_path=path,
    video_dir=VIDEOS_DIR,
    frame_skip=frame_skip,
    device=device,
)


  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(f"{pre} is not within the observation space.")


🎥 Video saved in '../videos/' | Reward: 0.20
