In [None]:
# !pip install pyyaml
# !pip install dndice
# !pip install python-i18n
# !pip install gymnasium
# !pip install inflect
# !pip install collections-extended
# !pip install openai
# !pip install -e ..


In [None]:
import unittest
from natural20.map import Map, Terrain
from natural20.battle import Battle
from natural20.player_character import PlayerCharacter
from natural20.map_renderer import MapRenderer
from natural20.die_roll import DieRoll
from natural20.generic_controller import GenericController
from natural20.utils.utils import Session
from natural20.actions.move_action import MoveAction
from natural20.action import Action
from natural20.gym.dndenv import dndenv
from gymnasium import register, envs, make
from model import QNetwork
import torch
import tqdm as tqdm
import random
import torch.optim as optim
import torch.nn as nn
import gc
import numpy as np
import sys
import collections

In [None]:
# Run on the GPU when PyTorch reports a usable CUDA device; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Create the "dndenv-v0" environment (root_path points at ../templates) and, as a quick
# sanity check, show its observation space and one randomly drawn action.
env = make("dndenv-v0", root_path="../templates")
print(env.observation_space)
# sample must be *called*; printing the attribute only shows the bound-method repr.
print(env.action_space.sample())

In [None]:
# Smoke test: build an untrained Q-network on the selected device, switch it to eval
# mode, and score the first available move of a freshly reset environment.
model = QNetwork(device=device).to(device)
model.eval()

state, info = env.reset()
moves = info["available_moves"]
print(model(state, moves[0]))




In [None]:
def generate_trajectory(env, model, policy='e-greedy', temperature=5.0, epsilon=0.1, quick_exit=False):
    """Play one episode in ``env`` and record what happened at every step.

    Instead of sampling blindly from the action space, the rollout only picks
    among the moves the environment reports in ``info["available_moves"]``,
    because valid moves are sparse.

    Args:
        env: Object whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns ``(state, reward, done, truncated, info)``.
        model: Callable ``model(state, move)`` returning a tensor score for
            taking ``move`` in ``state``.
        policy: ``'e-greedy'`` (random move with probability ``epsilon``,
            otherwise the highest-scoring move), ``'greedy'`` (always the
            highest-scoring move) or ``'softmax'`` (sample a move with
            probability proportional to ``exp(score / temperature)``).
        temperature: Softmax temperature, must be > 0. Only used by
            ``'softmax'``.
        epsilon: Exploration probability for ``'e-greedy'``.
        quick_exit: Kept for interface compatibility; not used.

    Returns:
        ``(states, actions, rewards, dones, truncateds, infos)`` - six lists
        with one entry per step. ``states[i]`` is the observation in which
        ``actions[i]`` was chosen; ``rewards[i]``, ``dones[i]``,
        ``truncateds[i]`` and ``infos[i]`` are what ``env.step`` returned for
        that action.

    Raises:
        ValueError: If ``policy`` is not one of the supported names.
    """
    if policy not in ('e-greedy', 'greedy', 'softmax'):
        raise ValueError(f"Unknown policy: {policy}")

    state, info = env.reset()
    done = False
    truncated = False
    states = []
    actions = []
    rewards = []
    dones = []
    truncateds = []
    infos = []

    while not (done or truncated):
        available_moves = info["available_moves"]

        if policy == 'e-greedy' and random.random() < epsilon:
            # Explore: uniformly random valid move.
            chosen_index = random.randrange(len(available_moves))
        else:
            with torch.no_grad():
                # One score per valid move, flattened so the index maps back onto available_moves.
                values = torch.stack([model(state, move) for move in available_moves]).reshape(-1)
                if policy == 'softmax':
                    probabilities = torch.softmax(values / temperature, dim=0)
                    chosen_index = torch.multinomial(probabilities, 1).item()
                else:
                    chosen_index = torch.argmax(values).item()

        action = available_moves[chosen_index]
        next_state, reward, done, truncated, info = env.step(action)

        # Store the observation the action was chosen in (not the one it led to), so that
        # states[i] and actions[i] form a valid (state, action) pair.
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        dones.append(done)
        truncateds.append(truncated)
        infos.append(info)

        state = next_state

    return states, actions, rewards, dones, truncateds, infos

In [None]:
# Roll out a single episode with the current model (default e-greedy settings) and
# dump the raw six-list result for inspection.
trajectory = generate_trajectory(env=env, model=model)
print(trajectory)

In [None]:
# Baseline: mean total reward per episode of the current model over EPISODES rollouts.
EPISODES = 10
avg_reward = 0
for episode in tqdm.tqdm(range(EPISODES)):
    # Index 2 of the returned tuple is the per-step reward list.
    episode_rewards = generate_trajectory(env, model)[2]
    avg_reward = avg_reward + sum(episode_rewards)

avg_reward = avg_reward / EPISODES
print(f"Average reward: {avg_reward}")

In [None]:
class ReplayBuffer:
    """Bounded FIFO store of ``(states, actions, rewards, is_terminal)`` records."""

    def __init__(self, capacity):
        # A deque with maxlen drops its oldest entry automatically once it is full.
        self.buffer = collections.deque(maxlen=capacity)

    def push(self, states, actions, rewards, is_terminal):
        """Append one record, evicting the oldest one if the buffer is at capacity."""
        record = (states, actions, rewards, is_terminal)
        self.buffer.append(record)

    def sample(self, batch_size):
        """Draw ``batch_size`` records uniformly at random (with replacement).

        Returns four equally long tuples ``(states, actions, rewards,
        is_terminals)`` whose i-th entries come from the same stored record.
        """
        records = list(self.buffer)
        picked = [records[position] for position in np.random.choice(len(records), batch_size)]
        states, actions, rewards, is_terminals = zip(*picked)
        return states, actions, rewards, is_terminals

    def __len__(self):
        return len(self.buffer)

    def memory_usage(self):
        """Approximate size of the stored records in bytes.

        Uses ``sys.getsizeof``, so only the shallow size of each element of
        ``states`` and of the ``actions`` / ``rewards`` / ``is_terminal``
        containers themselves is counted.
        """
        total_size = 0
        for states, actions, rewards, is_terminals in self.buffer:
            total_size += sum(sys.getsizeof(s) for s in states)
            total_size += sys.getsizeof(actions) + sys.getsizeof(rewards) + sys.getsizeof(is_terminals)
        return total_size

In [None]:

# Roll out several episodes and store each one in the replay buffer as a single record.
def generate_batch_trajectories(env, model, n_rollout, replay_buffer: ReplayBuffer, temperature=5.0, epsilon=0.1, policy='e-greedy'):
    """Collect ``n_rollout`` episodes with ``generate_trajectory`` and push them to ``replay_buffer``.

    Each episode is pushed as one record made of its per-step state, action,
    reward and done lists; the truncation flags and infos are discarded.
    ``temperature``, ``epsilon`` and ``policy`` are forwarded unchanged.
    """
    print(f"generating {n_rollout} rollouts")
    rollout_options = dict(temperature=temperature, epsilon=epsilon, policy=policy)
    for _ in range(n_rollout):
        episode = generate_trajectory(env, model, **rollout_options)
        episode_states, episode_actions, episode_rewards, episode_dones = episode[:4]
        replay_buffer.push(episode_states, episode_actions, episode_rewards, episode_dones)

In [None]:
# Training configuration. MAX_STEPS used to be assigned twice in this cell (10, then
# 1000); only the last assignment ever took effect, so the dead one has been dropped.
TRAJECTORY_POLICY = "e-greedy"   # exploration policy used while collecting rollouts
NUM_UPDATES = 1                  # optimisation passes over each sampled batch
TEMP_DECAY = 0.999               # per-step multiplicative decay of the softmax temperature
BUFFER_CAPACITY = 2000           # not referenced by the cells visible in this notebook
FRAMES_TO_STORE = 2              # not referenced by the cells visible in this notebook
MAX_STEPS = 1000                 # default number of training steps
BATCH_SIZE = 32                  # records drawn from the replay buffer per training step
TARGET_UPDATE_FREQ = 1           # sync the target network every this many steps
T_HORIZON = 1024                 # not referenced by the cells visible in this notebook
EPSILON_START = 1.0              # initial exploration rate
EPSILON_FINAL = 0.02             # exploration rate the exponential decay approaches
EPSILON_DECAY_FRAMES = 10**3     # time constant (in training steps) of the epsilon decay

In [None]:
# Create a new "dndenv-v0" environment instance (root_path set to ../templates),
# rebinding the notebook-level `env` name.
env = make("dndenv-v0", root_path="../templates")

def train(env, gamma, learning_rate, max_steps=MAX_STEPS, n_rollout=8, seed=1337):
  """Train a fresh QNetwork on ``env`` and return its greedy evaluation rewards.

  Every step collects ``n_rollout`` episodes into a replay buffer, samples
  ``BATCH_SIZE`` stored episodes, fits the online network to TD targets
  computed with a target network, then plays one greedy episode to measure
  progress.

  Args:
    env: Environment to train on. It is closed before this function returns.
    gamma: Discount factor applied to the bootstrap term.
    learning_rate: Adam learning rate; also used in checkpoint file names.
    max_steps: Number of training steps.
    n_rollout: Episodes collected per training step.
    seed: Seed handed to the environment.

  Returns:
    List with the total reward of the greedy evaluation episode played after
    each training step.

  Side effects:
    Writes ``model_{gamma}_{learning_rate}.pt`` every 10 steps and
    ``model_{gamma}_{learning_rate}_{step}.pt`` every 100 steps to the
    current directory, prints progress, and closes ``env``.
  """
  print(f"training with gamma {gamma} and learning rate {learning_rate}")

  # Env.seed() is absent from current Gymnasium; use it when the environment still
  # provides it and otherwise seed through reset(seed=...).
  seed_env = getattr(env, "seed", None)
  if callable(seed_env):
    seed_env(seed)
  else:
    env.reset(seed=seed)

  # NOTE(review): BUFFER_CAPACITY is defined in the config cell but a literal 100 is used here.
  replay_buffer = ReplayBuffer(100)

  model = QNetwork(device).to(device)
  target_model = QNetwork(device).to(device)

  # Initialize the target network with the same weights as the online model.
  target_model.load_state_dict(model.state_dict())

  optimizer = optim.Adam(model.parameters(), lr=learning_rate)
  loss_fn = nn.MSELoss()

  temperature = 5.0
  reward_per_episode = []
  epsilon = EPSILON_START

  for step in range(max_steps):
    if TRAJECTORY_POLICY == 'softmax':
      print(f"step {step} t={temperature}")
    elif TRAJECTORY_POLICY == 'e-greedy':
      print(f"step {step} epsilon={epsilon}")
    else:
      print(f"step {step}")
    generate_batch_trajectories(env, model, n_rollout, replay_buffer, temperature=temperature, epsilon=epsilon, policy=TRAJECTORY_POLICY)

    # Each sampled element is one whole stored episode (parallel per-step sequences).
    states, actions, rewards, is_terminals = replay_buffer.sample(BATCH_SIZE)
    rewards_collected = 0
    for _ in range(NUM_UPDATES):
      rewards_collected = 0
      total_loss = 0.0

      for s, a, episode_rewards, episode_dones in zip(states, actions, rewards, is_terminals):
        # Explicit dtypes keep the targets in float32 regardless of how the env reports rewards.
        r = torch.tensor(episode_rewards, dtype=torch.float32, device=device)
        is_terminal = torch.tensor(episode_dones, dtype=torch.bool, device=device)

        # NOTE(review): the bootstrap value is evaluated on the same (s, a) pair rather than
        # on the successor state; a standard TD target would use the next state - confirm intent.
        q_target = target_model(s, a).detach()

        # Bootstrap only on non-terminal steps: a terminal transition has no future return.
        not_terminal = (~is_terminal).to(q_target.dtype)
        targets = r + gamma * q_target * not_terminal
        q_sa = model(s, a)

        value_loss = loss_fn(q_sa, targets)
        optimizer.zero_grad()
        value_loss.backward()
        total_loss += value_loss.item()
        rewards_collected += r.sum().item()
        optimizer.step()
      print(f"total loss {total_loss/len(states)}")

    # Save model checkpoints. File names use the learning_rate argument, not a notebook global.
    if step % 10 == 0:
      # rolling checkpoint, overwritten every 10 steps
      torch.save(model.state_dict(), f"model_{gamma}_{learning_rate}.pt")

    if step % 100 == 0:
      # step-stamped snapshot
      torch.save(model.state_dict(), f"model_{gamma}_{learning_rate}_{step}.pt")

    # Evaluate the current policy with one purely greedy episode.
    _, _, eval_rewards, _, _, _ = generate_trajectory(env, model, policy='greedy')
    total_reward = sum(eval_rewards)

    reward_per_episode.append(total_reward)

    print(f"total reward: {total_reward}")
    print(f"{step}: rewards {rewards_collected}")
    gc.collect()

    # Decay the temperature, never going below 0.1.
    temperature = max(0.1, temperature * TEMP_DECAY)

    # Exponentially decay epsilon from EPSILON_START towards EPSILON_FINAL.
    epsilon = EPSILON_FINAL + (EPSILON_START - EPSILON_FINAL) * np.exp(-1.0 * step / EPSILON_DECAY_FRAMES)

    if step % TARGET_UPDATE_FREQ == 0:
      # Report how far the online weights drifted from the target weights (L1 distance)
      # before syncing; no gradients are needed for this diagnostic.
      total_change = 0
      with torch.no_grad():
        for p, p_target in zip(model.parameters(), target_model.parameters()):
          total_change += torch.abs(p - p_target).sum().item()
      print(f"total change: {total_change}")

      target_model.load_state_dict(model.state_dict())

  env.close()
  return reward_per_episode

seed = 1337
# Hyperparameter grid: every learning rate is trained with every discount factor.
learning_rates = [0.0001, 0.001]
gammas = [0.99, 0.1]

# results[lr][gamma] holds the per-step evaluation rewards of one run. The same list is
# also stored under the (lr_index, gamma_index) tuple key for backward compatibility.
results = {}
for lr_index, lr in enumerate(learning_rates):
  results[lr] = {}
  for g_index, gamma in enumerate(gammas):
    # train() closes the environment it receives, so every run gets its own instance
    # instead of reusing one that has already been closed.
    run_env = make("dndenv-v0", root_path="../templates")
    reward_per_episode = train(run_env, gamma, lr, max_steps=MAX_STEPS, seed=seed*lr_index + g_index)
    results[lr][gamma] = reward_per_episode
    results[lr_index, g_index] = reward_per_episode
