In [None]:
# !pip install pyyaml
# !pip install dndice
# !pip install python-i18n
# !pip install gymnasium
# !pip install inflect
# !pip install collections-extended
# !pip install openai
# !pip install -e ..


In [1]:
import unittest
from natural20.map import Map, Terrain
from natural20.battle import Battle
from natural20.player_character import PlayerCharacter
from natural20.map_renderer import MapRenderer
from natural20.die_roll import DieRoll
from natural20.generic_controller import GenericController
from natural20.utils.utils import Session
from natural20.actions.move_action import MoveAction
from natural20.action import Action
from natural20.gym.dndenv import dndenv
from gymnasium import register, envs, make
from model import QNetwork
import torch
import tqdm as tqdm
import random
import torch.optim as optim
import torch.nn as nn
import gc
import numpy as np
import sys
import collections

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
env = make("dndenv-v0", root_path="../templates")
print(env.observation_space)
print(env.action_space.sample)

Dict('health_enemy': Box(0.0, 1.0, (1,), float64), 'health_pct': Box(0.0, 1.0, (1,), float64), 'map': Box(-1, 255, (12, 12, 3), int64), 'movement': Discrete(255), 'turn_info': Box(0, 1, (3,), int64))
<bound method Tuple.sample of Tuple(Box(-1, 8, (1,), int64), Box(-1, 1, (2,), int64), Box(-6, 6, (2,), int64), Discrete(2))>


In [4]:
model = QNetwork(device=device)
model.to(device)
state, info = env.reset()
moves = info["available_moves"]

model.eval()
print(model(state, moves[0]))




loading map from ../templates/maps/game_map.yml
map size: [6, 6]
==== Player Character ====
name: gomerin
level: 1
character class: {'fighter': 1}
hp: 24
max hp: 24
ac: 18
speed: 30



==== Player Character ====
name: rumblebelly
level: 1
character class: {'fighter': 1}
hp: 24
max hp: 24
ac: 18
speed: 30



gomerin -> initiative roll: (14) + 5 value: 19.2
rumblebelly -> initiative roll: (10) + 5 value: 15.2
gomerin starts their turn.
tensor([[-0.0542]], device='cuda:0', grad_fn=<AddmmBackward0>)


In [5]:
def generate_trajectory(env, model, policy='e-greedy', temperature=5.0, epsilon=0.1, quick_exit=False):
    state, info = env.reset()
    done = False
    truncated = False
    states = []
    actions = []
    rewards = []
    dones = []
    truncateds = []
    infos = []

    while not done and not truncated:
        # instead of sampling  (e.g. env.action_space.sample()) we can ask help from the enivronment to obtain valid moves
        # as there are sparse valid moves in the environment
        available_moves = info["available_moves"]
        with torch.no_grad():
            if policy == 'e-greedy':
                if random.random() < epsilon:
                    chosen_index = random.choice(range(len(available_moves)))
                else:
                    values = torch.stack([model(state, move) for move in available_moves])
                    chosen_index = torch.argmax(values).item()
            elif policy == 'greedy':
                    values = torch.stack([model(state, move) for move in available_moves])
                    chosen_index = torch.argmax(values).item()
            else:
                raise ValueError(f"Unknown policy: {policy}")
        
        action = available_moves[chosen_index]
        state, reward, done, truncated, info = env.step(action)       
        
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        dones.append(done)
        truncateds.append(truncated)
        infos.append(info)

        if done or truncated:
            break    
        
    return states, actions, rewards, dones, truncateds, infos

In [6]:
trajectory = generate_trajectory(env, model)
print(trajectory)

loading map from ../templates/maps/game_map.yml
map size: [6, 6]
==== Player Character ====
name: gomerin
level: 1
character class: {'fighter': 1}
hp: 24
max hp: 24
ac: 18
speed: 30



==== Player Character ====
name: rumblebelly
level: 1
character class: {'fighter': 1}
hp: 24
max hp: 24
ac: 18
speed: 30



gomerin -> initiative roll: (15) + 5 value: 20.2
rumblebelly -> initiative roll: (14) + 5 value: 19.2
gomerin starts their turn.
gomerin uses Second Wind with (6) + 1 healing
gomerin dashes
gomerin moved to [0, 4] 5 feet
==== end turn ===
rumblebelly starts their turn.
==== current turn rumblebelly 24/24===
rumblebelly: rumblebelly attacks gomerin with longbow
rumblebelly attacks gomerin using Longbow for ((1) + 5) 6 damage!
gomerin takes 6 damage!
rumblebelly: move to [3, 2]
rumblebelly moved to [3, 2] 5 feet
rumblebelly: move to [2, 2]
rumblebelly moved to [2, 2] 5 feet
rumblebelly: move to [1, 3]
rumblebelly moved to [1, 3] 5 feet
rumblebelly: move to [0, 3]
rumblebelly moved to 

In [7]:
EPISODES = 10
avg_reward = 0
for i in tqdm.tqdm(range(EPISODES)):
    states, actions, rewards, dones, truncateds, infos = generate_trajectory(env, model)
    avg_reward += sum(rewards)

avg_reward /= EPISODES
print(f"Average reward: {avg_reward}")

  0%|          | 0/10 [00:00<?, ?it/s]

loading map from ../templates/maps/game_map.yml
map size: [6, 6]
==== Player Character ====
name: gomerin
level: 1
character class: {'fighter': 1}
hp: 24
max hp: 24
ac: 18
speed: 30



==== Player Character ====
name: rumblebelly
level: 1
character class: {'fighter': 1}
hp: 24
max hp: 24
ac: 18
speed: 30



gomerin -> initiative roll: (13) + 5 value: 18.2
rumblebelly -> initiative roll: (8) + 5 value: 13.2
gomerin starts their turn.
gomerin uses Second Wind with (5) + 1 healing
gomerin dashes
gomerin moved to [0, 3] 5 feet
gomerin moved to [0, 2] 5 feet
gomerin moved to [1, 1] 5 feet
gomerin moved to [2, 0] 5 feet
gomerin moved to [3, 0] 5 feet
gomerin moved to [4, 0] 5 feet
gomerin moved to [3, 0] 5 feet
gomerin moved to [2, 0] 5 feet
gomerin moved to [3, 0] 5 feet
gomerin moved to [4, 0] 5 feet
gomerin moved to [3, 0] 5 feet
==== end turn ===
rumblebelly starts their turn.
==== current turn rumblebelly 24/24===
rumblebelly: rumblebelly attacks gomerin with longbow
rumblebelly attacks

 10%|█         | 1/10 [00:00<00:03,  2.32it/s]

rumblebelly attacks gomerin using Longbow for ((2) + 5) 7 damage!
gomerin takes 7 damage!
gomerin died. :(
tpk
Result: tpk
loading map from ../templates/maps/game_map.yml
map size: [6, 6]
==== Player Character ====
name: gomerin
level: 1
character class: {'fighter': 1}
hp: 24
max hp: 24
ac: 18
speed: 30



==== Player Character ====
name: rumblebelly
level: 1
character class: {'fighter': 1}
hp: 24
max hp: 24
ac: 18
speed: 30



gomerin -> initiative roll: (14) + 5 value: 19.2
rumblebelly -> initiative roll: (13) + 5 value: 18.2
gomerin starts their turn.
gomerin uses Second Wind with (6) + 1 healing
gomerin dashes
gomerin moved to [3, 4] 5 feet
gomerin moved to [2, 3] 5 feet
gomerin moved to [1, 2] 5 feet
gomerin moved to [2, 1] 5 feet
gomerin moved to [1, 0] 5 feet
gomerin moved to [1, 1] 5 feet
gomerin moved to [0, 0] 5 feet
gomerin moved to [1, 0] 5 feet
gomerin moved to [2, 0] 5 feet
gomerin moved to [3, 0] 5 feet
gomerin moved to [4, 0] 5 feet
==== end turn ===
rumblebelly starts 

 20%|██        | 2/10 [00:01<00:04,  1.91it/s]

gomerin moved to [5, 0] 5 feet
==== end turn ===
rumblebelly starts their turn.
==== current turn rumblebelly 24/24===
rumblebelly: rumblebelly attacks gomerin with longbow
rumblebelly attacks gomerin using Longbow for ((4) + 5) 9 damage!
gomerin takes 9 damage!
rumblebelly: move to [3, 0]
rumblebelly moved to [3, 0] 5 feet
rumblebelly: move to [4, 0]
rumblebelly moved to [4, 0] 5 feet
rumblebelly: move to [4, 1]
rumblebelly moved to [4, 1] 5 feet
rumblebelly: move to [4, 0]
rumblebelly moved to [4, 0] 5 feet
rumblebelly: move to [4, 1]
rumblebelly moved to [4, 1] 5 feet
no move for rumblebelly
gomerin starts their turn.
Result: False
gomerin dashes
gomerin moved to [4, 0] 5 feet
rumblebelly: rumblebelly uses rapier as a reaction to attack gomerin
gomerin moved to [3, 0] 5 feet
gomerin moved to [3, 1] 5 feet
gomerin moved to [4, 0] 5 feet
rumblebelly: rumblebelly uses rapier as a reaction to attack gomerin
gomerin moved to [5, 0] 5 feet
gomerin moved to [4, 0] 5 feet
rumblebelly: rumbl

 30%|███       | 3/10 [00:01<00:03,  1.92it/s]

gomerin moved to [4, 1] 5 feet
==== end turn ===
rumblebelly starts their turn.
==== current turn rumblebelly 24/24===
rumblebelly: rumblebelly attacks gomerin with rapier
rumblebelly rolls (10) + 7=17 using Rapier and misses gomerin
rumblebelly: move to [3, 0]
rumblebelly moved to [3, 0] 5 feet
rumblebelly: move to [3, 1]
rumblebelly moved to [3, 1] 5 feet
rumblebelly: move to [3, 0]
rumblebelly moved to [3, 0] 5 feet
rumblebelly: move to [3, 1]
rumblebelly moved to [3, 1] 5 feet
rumblebelly: move to [3, 0]
rumblebelly moved to [3, 0] 5 feet
no move for rumblebelly
gomerin starts their turn.
Result: False
gomerin dashes
gomerin moved to [4, 0] 5 feet
rumblebelly: rumblebelly uses rapier as a reaction to attack gomerin
gomerin moved to [4, 1] 5 feet
gomerin moved to [3, 1] 5 feet
gomerin moved to [4, 0] 5 feet
gomerin moved to [3, 1] 5 feet
gomerin moved to [4, 0] 5 feet
gomerin moved to [3, 1] 5 feet
gomerin moved to [2, 0] 5 feet
gomerin moved to [3, 1] 5 feet
gomerin moved to [4, 0]

 40%|████      | 4/10 [00:02<00:04,  1.38it/s]

gomerin dashes
gomerin moved to [3, 0] 5 feet
gomerin moved to [2, 1] 5 feet
rumblebelly: rumblebelly uses rapier as a reaction to attack gomerin
gomerin moved to [1, 2] 5 feet
gomerin moved to [0, 1] 5 feet
gomerin moved to [1, 0] 5 feet
gomerin moved to [2, 1] 5 feet
==== end turn ===
rumblebelly starts their turn.
==== current turn rumblebelly 24/24===
rumblebelly: rumblebelly attacks gomerin with rapier
rumblebelly attacks gomerin using Rapier for ((2) + 5) 7 damage!
gomerin takes 7 damage!
gomerin died. :(
tpk
Result: tpk
loading map from ../templates/maps/game_map.yml
map size: [6, 6]
==== Player Character ====
name: gomerin
level: 1
character class: {'fighter': 1}
hp: 24
max hp: 24
ac: 18
speed: 30



==== Player Character ====
name: rumblebelly
level: 1
character class: {'fighter': 1}
hp: 24
max hp: 24
ac: 18
speed: 30



gomerin -> initiative roll: (18) + 5 value: 23.2
rumblebelly -> initiative roll: (9) + 5 value: 14.2
gomerin starts their turn.
gomerin uses Second Wind with 

 50%|█████     | 5/10 [00:02<00:02,  1.70it/s]

loading map from ../templates/maps/game_map.yml
map size: [6, 6]
==== Player Character ====
name: gomerin
level: 1
character class: {'fighter': 1}
hp: 24
max hp: 24
ac: 18
speed: 30



==== Player Character ====
name: rumblebelly
level: 1
character class: {'fighter': 1}
hp: 24
max hp: 24
ac: 18
speed: 30



gomerin -> initiative roll: (1) + 5 value: 6.2
rumblebelly -> initiative roll: (15) + 5 value: 20.2
rumblebelly starts their turn.
rumblebelly uses Second Wind with (5) + 1 healing
rumblebelly dashes
rumblebelly moved to [1, 4] 5 feet
rumblebelly moved to [0, 3] 5 feet
rumblebelly moved to [1, 2] 5 feet
rumblebelly moved to [2, 1] 5 feet
rumblebelly moved to [3, 0] 5 feet
rumblebelly moved to [4, 0] 5 feet
rumblebelly moved to [3, 0] 5 feet
rumblebelly moved to [2, 0] 5 feet
rumblebelly moved to [3, 0] 5 feet
rumblebelly moved to [4, 0] 5 feet
rumblebelly moved to [3, 0] 5 feet
==== end turn ===
gomerin starts their turn.
==== current turn gomerin 24/24===
Result: False
gomerin uses

 60%|██████    | 6/10 [00:03<00:02,  1.36it/s]

rumblebelly: rumblebelly uses rapier as a reaction to attack gomerin
gomerin moved to [3, 0] 5 feet
gomerin moved to [2, 0] 5 feet
rumblebelly: rumblebelly uses rapier as a reaction to attack gomerin
gomerin moved to [3, 0] 5 feet
gomerin moved to [4, 0] 5 feet
gomerin moved to [5, 0] 5 feet
gomerin moved to [5, 1] 5 feet
==== end turn ===
rumblebelly starts their turn.
==== current turn rumblebelly 24/24===
rumblebelly: rumblebelly attacks gomerin with longbow
rumblebelly rolls (6) + 7=13 using Longbow and misses gomerin
rumblebelly: move to [2, 0]
rumblebelly moved to [2, 0] 5 feet
rumblebelly: move to [3, 0]
rumblebelly moved to [3, 0] 5 feet
rumblebelly: move to [4, 0]
rumblebelly moved to [4, 0] 5 feet
rumblebelly: move to [4, 1]
rumblebelly moved to [4, 1] 5 feet
rumblebelly: move to [4, 0]
rumblebelly moved to [4, 0] 5 feet
no move for rumblebelly
gomerin starts their turn.
Result: False
gomerin dashes
gomerin moved to [5, 0] 5 feet
gomerin moved to [4, 1] 5 feet
rumblebelly: ru

 70%|███████   | 7/10 [00:04<00:02,  1.26it/s]

rumblebelly: rumblebelly uses rapier as a reaction to attack gomerin
gomerin moved to [1, 0] 5 feet
gomerin moved to [0, 0] 5 feet
gomerin moved to [1, 0] 5 feet
==== end turn ===
rumblebelly starts their turn.
==== current turn rumblebelly 24/24===
rumblebelly: rumblebelly attacks gomerin with longbow
rumblebelly rolls (1) + 7=8 using Longbow and misses gomerin
rumblebelly: move to [2, 0]
rumblebelly moved to [2, 0] 5 feet
rumblebelly: move to [1, 1]
rumblebelly moved to [1, 1] 5 feet
rumblebelly: move to [0, 0]
rumblebelly moved to [0, 0] 5 feet
rumblebelly: move to [0, 1]
rumblebelly moved to [0, 1] 5 feet
rumblebelly: move to [0, 0]
rumblebelly moved to [0, 0] 5 feet
no move for rumblebelly
gomerin starts their turn.
Result: False
gomerin dashes
gomerin moved to [0, 1] 5 feet
rumblebelly: rumblebelly uses rapier as a reaction to attack gomerin
gomerin moved to [0, 2] 5 feet
gomerin moved to [0, 1] 5 feet
gomerin moved to [1, 0] 5 feet
rumblebelly: rumblebelly uses rapier as a react

 70%|███████   | 7/10 [00:05<00:02,  1.35it/s]

gomerin uses Second Wind with (6) + 1 healing
gomerin dashes
gomerin moved to [0, 3] 5 feet
gomerin moved to [0, 2] 5 feet
gomerin moved to [0, 3] 5 feet
gomerin moved to [1, 2] 5 feet
gomerin moved to [2, 1] 5 feet
gomerin moved to [3, 0] 5 feet
gomerin moved to [2, 0] 5 feet
gomerin moved to [1, 1] 5 feet
gomerin moved to [2, 2] 5 feet
gomerin moved to [3, 1] 5 feet
gomerin moved to [4, 0] 5 feet
==== end turn ===
rumblebelly starts their turn.
==== current turn rumblebelly 24/24===
rumblebelly: rumblebelly attacks gomerin with longbow
rumblebelly attacks gomerin using Longbow for ((6) + 5) 11 damage!
gomerin takes 11 damage!
rumblebelly: move to [4, 3]
rumblebelly moved to [4, 3] 5 feet
rumblebelly: move to [3, 2]
rumblebelly moved to [3, 2] 5 feet
rumblebelly: move to [3, 1]
rumblebelly moved to [3, 1] 5 feet
rumblebelly: move to [3, 0]
rumblebelly moved to [3, 0] 5 feet
rumblebelly: move to [3, 1]
rumblebelly moved to [3, 1] 5 feet
rumblebelly: <natural20.actions.second_wind_actio




NameError: name 'StandAction' is not defined

In [None]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def push(self, states, actions, rewards, infos, is_terminal):
        self.buffer.append((states, actions, rewards, infos, is_terminal))

    def sample(self, batch_size):
        buffer = list(self.buffer)
        indices = np.random.choice(len(buffer), batch_size)
        states, actions, rewards, infos, is_terminals = zip(*[buffer[idx] for idx in indices])
        return states, actions, rewards, infos, is_terminals

    def __len__(self):
        return len(self.buffer)
    
    # memory usage of the buffer in bytes
    def memory_usage(self):
        total_size = 0
        for item in self.buffer:
            states, actions, rewards, infos, is_terminals = item
            for s in states:
                total_size += sys.getsizeof(s)
            total_size += sys.getsizeof(actions)
            total_size += sys.getsizeof(rewards)
            total_size += sys.getsizeof(infos)
            total_size += sys.getsizeof(is_terminals)

        return total_size

In [None]:

# generate a batch of trajectories and store them in the replay buffer
def generate_batch_trajectories(env, model, n_rollout, replay_buffer: ReplayBuffer, temperature=5.0, epsilon=0.1, policy='e-greedy'):
    print(f"generating {n_rollout} rollouts")
    for _ in range(n_rollout):
        state, action, reward, done, truncated, info = generate_trajectory(env, model, temperature=temperature, epsilon=epsilon, policy=policy)
        replay_buffer.push(state, action, reward, info, done)

In [None]:
TRAJECTORY_POLICY = "e-greedy"
NUM_UPDATES = 2
TEMP_DECAY = 0.999
BUFFER_CAPACITY = 2000
FRAMES_TO_STORE = 2
MAX_STEPS = 500
BATCH_SIZE = 32
TARGET_UPDATE_FREQ = 1
T_HORIZON = 1024
EPSILON_START = 1.0
EPSILON_FINAL = 0.02
EPSILON_DECAY_FRAMES = 10**3

In [None]:


def train(env, gamma, learning_rate, max_steps=MAX_STEPS, use_td_target=True, n_rollout=8, seed=1337):
  print(f"training with gamma {gamma} and learning rate {learning_rate}")
  env.seed(seed)

  replay_buffer = ReplayBuffer(100)

  # load model checkpoint if available
  model = QNetwork(device).to(device)
  target_model = QNetwork(device).to(device)

  # intialize target network with the same weights as the model
  target_model.load_state_dict(model.state_dict())

  optimizer = optim.Adam(model.parameters(), lr=learning_rate)

  temperature = 5.0
  reward_per_episode = []
  epsilon = EPSILON_START

  for step in range(max_steps):
    if TRAJECTORY_POLICY == 'softmax':
      print(f"step {step} t={temperature}")
    elif TRAJECTORY_POLICY == 'e-greedy':
      print(f"step {step} epsilon={epsilon}")
    else:
      print(f"step {step}")
    generate_batch_trajectories(env, model, n_rollout, replay_buffer, temperature=temperature, epsilon=epsilon, policy=TRAJECTORY_POLICY)

    states, actions, rewards, infos, is_terminals = replay_buffer.sample(BATCH_SIZE)
    rewards_collected = 0
    for _ in range(NUM_UPDATES):
      rewards_collected = 0
      total_loss = 0.0
      
      for i in range(len(states)):
        s = states[i]
        a = actions[i]
        env_info = infos[i]
        r = torch.tensor(rewards[i]).to(device)
        is_terminal = torch.tensor(is_terminals[i]).to(device)
        
        if use_td_target:
          with torch.no_grad():
            q_targets = target_model(s, a).detach()
        else:
          q_targets = torch.zeros(len(env_info)).to(device)
          for index, (state, info) in enumerate(zip(s, env_info)):
            q_values = [target_model([state], [avail_action]).detach() for avail_action in info["available_moves"]]
            if len(q_values) == 0:
              q_targets[index] = 0
            else:
              q_targets[index] = torch.max(torch.stack(q_values)).item()
          
        targets = r + gamma * q_targets * is_terminal
        output = model(s, a)
        q_sa = output

        print(f"q_sa {q_sa.shape}")
        print(f"targets {targets.shape}")
        raise ValueError("stop")
        value_loss = nn.MSELoss()(q_sa, targets)
        optimizer.zero_grad()
        value_loss.backward()
        total_loss += value_loss.item()
        rewards_collected += r.sum().item()
        optimizer.step()
      print(f"total loss {total_loss/len(states)}")

    # save model checkpoint

    if step % 10 == 0:
      # torch.save(model.state_dict(), f"model_{step}.pt")
      torch.save(model.state_dict(), f"model_{gamma}_{learning_rate}.pt")

    if step % 100 == 0:
      torch.save(model.state_dict(), f"model_{gamma}_{learning_rate}_{step}.pt")

    # if step % 5 == 0 and step > 0:
    _, _, rewards, _, _, _ = generate_trajectory(env, model, policy='greedy')
    total_reward = sum(rewards)
    
    reward_per_episode.append(total_reward)

    print(f"total reward: {total_reward}")
    print(f"{step}: rewards {rewards_collected}")
    gc.collect()
    
    # decay temp
    temperature = np.max([0.1, temperature * TEMP_DECAY])

    # decay epsilon
    epsilon = EPSILON_FINAL + (EPSILON_START - EPSILON_FINAL) * np.exp(-1.0 * step / EPSILON_DECAY_FRAMES)

    if step % TARGET_UPDATE_FREQ == 0:
      # calculate the avg change weights of the model with the target model
      total_change = 0
      for p, p_target in zip(model.parameters(), target_model.parameters()):
        total_change += torch.abs(p - p_target).sum().item()
      print(f"total change: {total_change}")

      target_model.load_state_dict(model.state_dict())

  env.close()
  return reward_per_episode


In [None]:
env = make("dndenv-v0", root_path="../templates")

seed = 1337
# Create a grid of learning rates and gammas
learning_rates = [0.001]
gammas = [0.99]

results = {}
for lr in learning_rates:
  results[lr] = {}
  for gamma in gammas:
    seed = seed + 1
    reward_per_episode = train(env, gamma, lr, max_steps=MAX_STEPS, seed=seed )
    results[lr][gamma] = reward_per_episode


Summarize rewards per episode

In [None]:
for item in results:
  for gamma in results[item]:
    print(f"lr: {item} gamma: {gamma} rewards: {results[item][gamma]}")

In [None]:
# plot the results
import matplotlib.pyplot as plt

for lr in learning_rates:
    for gamma in gammas:
        plt.plot(results[lr][gamma], label=f"lr={lr}, gamma={gamma}")
plt.legend()
plt.show()
