In [3]:
!pip install flappy-bird-gymnasium
!pip install gym

Collecting flappy-bird-gymnasium
  Downloading flappy_bird_gymnasium-0.4.0-py3-none-any.whl (37.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.3/37.3 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gymnasium (from flappy-bird-gymnasium)
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium->flappy-bird-gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium, flappy-bird-gymnasium
Successfully installed farama-notifications-0.0.4 flappy-bird-gymnasium-0.4.0 gymnasium-0.29.1


In [4]:
import numpy as np

from collections import deque

import matplotlib.pyplot as plt
%matplotlib inline

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Gym
import flappy_bird_gymnasium
import gymnasium as gym



In [3]:
env_id = 'FlappyBird-v0'
env = gym.make(env_id, render_mode="human", use_lidar=True)

# evaluation env
eval_env = gym.make(env_id, render_mode="human", use_lidar=True)

# state space and action space
s_size = env.observation_space.shape[0]
a_size = env.action_space.n

In [4]:
print(s_size)
print(a_size)

180
2


In [6]:
class Policy(nn.Module):
    def __init__(self, s_size, a_size, h_size):
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, h_size*2)
        self.fc3 = nn.Linear(h_size*2, a_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return F.softmax(x, dim=1)

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        probs = self.forward(state).cpu()
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

In [6]:
def reinforce(policy, optimizer, n_training_episodes, max_t, gamma, print_every):

    scores_deque = deque(maxlen=100)
    scores = []

    for i_episode in range(1, n_training_episodes+1):
        saved_log_probs = []
        rewards = []
        state = env.reset()[0]

        for t in range(max_t):
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, done, info, _ = env.step(action)
            rewards.append(reward)
            if done:
                break
        scores_deque.append(sum(rewards))
        scores.append(sum(rewards))

        returns = deque(maxlen=max_t)
        n_steps = len(rewards)

        for t in range(n_steps)[::-1]:
            disc_return_t = (returns[0] if len(returns)>0 else 0)
            returns.appendleft( gamma*disc_return_t + rewards[t]   )

        eps = np.finfo(np.float32).eps.item()

        returns = torch.tensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + eps)

        policy_loss = []
        for log_prob, disc_return in zip(saved_log_probs, returns):
            policy_loss.append(-log_prob * disc_return)
        policy_loss = torch.cat(policy_loss).sum()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))

    return scores

In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [8]:
flappybird_hyperparameters = {
    "h_size": 64,
    "n_training_episodes": 50000,
    "n_evaluation_episodes": 10,
    "max_t": 10000,
    "gamma": 0.99,
    "lr": 1e-4,
    "env_id": env_id,
    "state_space": s_size,
    "action_space": a_size,
}

In [9]:
flappybird_policy = Policy(flappybird_hyperparameters["state_space"], flappybird_hyperparameters["action_space"], flappybird_hyperparameters["h_size"]).to(device)
flappybird_optimizer = optim.Adam(flappybird_policy.parameters(), lr=flappybird_hyperparameters["lr"])

In [None]:
scores = reinforce(flappybird_policy,
                   flappybird_optimizer,
                   flappybird_hyperparameters["n_training_episodes"],
                   flappybird_hyperparameters["max_t"],
                   flappybird_hyperparameters["gamma"],
                   100)

  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(f"{pre} is not within the observation space.")


Episode 100	Average Score: -7.20
Episode 200	Average Score: -6.58
Episode 300	Average Score: -3.26
Episode 400	Average Score: -0.59
Episode 500	Average Score: -0.14
Episode 600	Average Score: -0.45
Episode 700	Average Score: -0.41
Episode 800	Average Score: -0.26
Episode 900	Average Score: -0.34
Episode 1000	Average Score: -0.46
Episode 1100	Average Score: -0.37
Episode 1200	Average Score: -0.05
Episode 1300	Average Score: -0.11
Episode 1400	Average Score: -0.49
Episode 1500	Average Score: -0.13
Episode 1600	Average Score: -0.66
Episode 1700	Average Score: -0.60
Episode 1800	Average Score: -0.24
Episode 1900	Average Score: -0.19
Episode 2000	Average Score: -0.37
Episode 2100	Average Score: -0.26
Episode 2200	Average Score: -0.24
Episode 2300	Average Score: -0.13
Episode 2400	Average Score: -0.28
Episode 2500	Average Score: -0.19
Episode 2600	Average Score: 0.06
Episode 2700	Average Score: -0.32
Episode 2800	Average Score: -0.31
Episode 2900	Average Score: -0.09
Episode 3000	Average Sco

In [7]:
torch.save(policy.state_dict(), 'policy.pth')

NameError: name 'policy' is not defined