In [None]:
!pip install pyyaml
!pip install dndice
!pip install python-i18n
!pip install gymnasium
!pip install inflect
!pip install collections-extended
!pip install openai
!pip install -e ..
!pip install ipywidgets
!pip install iprogress


In [None]:
from gymnasium import make
from model import QNetwork
from natural20.gym.dndenv import dndenv
import torch
import tqdm as tqdm
import tqdm.notebook as tqdm
import random
import torch.optim as optim
import torch.nn as nn
import gc
import numpy as np
import sys
import collections
from natural20.session import Session
from natural20.event_manager import EventManager
from natural20.gym.dqn.replay_buffer import ReplayBuffer
import os
from llm_interface import GPT4Interfacer

In [None]:
# Pick the compute device: Apple MPS when available, otherwise CUDA, otherwise the CPU.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)
# device = "cpu"
print(device)

In [None]:
# Game configuration directory; the Session and the demo environment are built from it.
env_config = "map_with_obstacles"
# Directory that receives the checkpoints written by train().
PROJECT_OUTPUT_PATH = "model_weights_all"
# exist_ok=True removes the check-then-create race and also works for nested paths,
# so re-running this cell is always a no-op.
os.makedirs(PROJECT_OUTPUT_PATH, exist_ok=True)

In [None]:
# Game session built from the env_config directory with a fresh EventManager;
# it is handed to the demo environment below as custom_session.
session = Session(env_config, event_manager=EventManager())

Show info about the environment and a render of the tabletop map

In [None]:
# Demo environment used by the smoke tests and the random-policy baseline.
# root_path reuses env_config so it always matches the directory `session` was built from.
env = make("dndenv-v0", root_path=env_config, show_logs=True,
           custom_session=session,
           # Spelled exactly as in make_env(); the previous `damaged_based_reward`
           # spelling did not match it. NOTE(review): confirm the key name against dndenv.
           damage_based_reward=True,
           render_mode="ansi")
env.reset()
print(env.render())
print(env.observation_space)
# Call sample() so an example action is shown rather than the bound-method repr.
print(env.action_space.sample())

DQN Parameters

In [None]:
# Hyperparameters read by train() and the trajectory helpers.
TRAJECTORY_POLICY = "e-greedy"  # NOTE(review): not referenced by the visible cells; train() takes trajectory_policy as an argument
NUM_UPDATES = 2 # number of optimisation passes over each sampled batch per training step
TEMP_DECAY = 0.999  # multiplicative decay of the Boltzmann temperature per training step (floored at 0.1 in train())
BUFFER_CAPACITY = 3000  # capacity passed to ReplayBuffer
FRAMES_TO_STORE = 2  # NOTE(review): not referenced by the visible cells
MAX_STEPS = 3000  # default number of training steps in train()
BATCH_SIZE = 64  # size passed to replay_buffer.sample()
TARGET_UPDATE_FREQ = 1 # copy the online weights into the target network every N training steps
T_HORIZON = 512  # maximum environment steps per collected training trajectory
EPSILON_START = 1.0  # initial exploration rate
EPSILON_FINAL = 0.01  # asymptotic exploration rate
EPSILON_DECAY_FRAMES = 10**3  # time constant of the exponential epsilon decay, counted in training steps
EVAL_STEPS = 30  # number of greedy episodes averaged per evaluation round

In [None]:
# Smoke test: build an untrained Q-network and score the first legal move of a fresh episode.
model = QNetwork(device=device)
model.to(device)
model.eval()  # evaluation mode for the single forward pass below

state, info = env.reset()
moves = info["available_moves"]
first_move_value = model(state, moves[0])
print(first_move_value)

In [None]:
def act_with_policy(state, info, model, policy='e-greedy', temperature=5.0, epsilon=0.1):
    """Choose one of ``info["available_moves"]`` according to ``policy``.

    Args:
        state: current observation, passed unchanged to ``model``.
        info: dict whose ``"available_moves"`` entry lists the candidate moves.
        model: callable ``model(state, move)`` returning the value of a move as a tensor.
        policy: ``'greedy'`` (highest value), ``'e-greedy'`` (explore with
            probability ``epsilon``, otherwise greedy) or ``'boltzmann'``
            (sample from a softmax over ``value / temperature``).
        temperature: softmax temperature, only used by ``'boltzmann'``.
        epsilon: exploration probability, only used by ``'e-greedy'``.

    Returns:
        The selected element of ``info["available_moves"]``.

    Raises:
        ValueError: for an unknown ``policy``, or for ``'boltzmann'`` with a
            zero temperature when more than one move is available.
    """
    available_moves = info["available_moves"]

    def greedy_index():
        # Index of the move the model values most.
        scores = torch.stack([model(state, candidate) for candidate in available_moves])
        return torch.argmax(scores).item()

    with torch.no_grad():
        if policy == 'greedy':
            chosen_index = greedy_index()
        elif policy == 'e-greedy':
            if random.random() < epsilon:
                # Explore: draw a move type first and then a move of that type, so the
                # numerous movement options are not picked more often than other types.
                buckets = collections.defaultdict(list)
                for position, candidate in enumerate(available_moves):
                    buckets[candidate[0]].append(position)
                picked_type = random.choice(list(buckets.keys()))
                chosen_index = random.choice(buckets[picked_type])
            else:
                chosen_index = greedy_index()
        elif policy == 'boltzmann':
            scores = torch.stack([model(state, candidate).squeeze() for candidate in available_moves])
            if len(scores) <= 1:
                # A single candidate needs no sampling.
                chosen_index = 0
            else:
                if temperature == 0:
                    raise ValueError("Temperature is zero, which can lead to division by zero.")
                scaled = scores / temperature
                # Shift by the maximum before exponentiating for numerical stability.
                weights = torch.exp(scaled - torch.max(scaled))
                total = torch.sum(weights)
                if total > 0:
                    chosen_index = torch.multinomial(weights / total, 1).item()
                else:
                    print("Sum of exponentiated values is zero. Adjust the model or input.")
                    chosen_index = torch.randint(len(available_moves), (1,)).item()
        else:
            raise ValueError(f"Unknown policy: {policy}")

    return available_moves[chosen_index]

def generate_trajectory(env, model, policy='e-greedy', temperature=5.0, epsilon=0.1, horizon=500, quick_exit=False):
    """Roll out one episode and return everything that was observed.

    Args:
        env: environment exposing ``reset(reaction_callback=...)`` and ``step(action)``.
        model: value model forwarded to ``act_with_policy``.
        policy, temperature, epsilon: forwarded to ``act_with_policy``.
        horizon: maximum number of ``env.step`` calls.
        quick_exit: unused; kept so existing callers keep working.

    Returns:
        ``(states, actions, rewards, dones, truncateds, infos)``. ``states`` and
        ``infos`` end with the last observed next state/info and ``actions``
        ends with a sentinel action, so those three lists are one element longer
        than ``rewards``, ``dones`` and ``truncateds``. With ``horizon <= 0`` the
        result holds only the reset state/info and the sentinel action.
    """
    states = []
    actions = []
    rewards = []
    dones = []
    truncateds = []
    infos = []

    def reaction_callback(state, reward, done, truncated, info):
        # Handed to env.reset(); presumably invoked by the environment when the agent
        # must decide outside the main loop (e.g. a reaction) — TODO confirm.
        # Such decisions are recorded in the same trajectory lists.
        action = act_with_policy(state, info, model, policy, temperature, epsilon)

        states.append(state)
        actions.append(action)
        rewards.append(reward)
        dones.append(done)
        truncateds.append(truncated)
        infos.append(info)

        return action

    state, info = env.reset(reaction_callback=reaction_callback)
    # Pre-bind the "next" pair so the final appends are defined even if the loop never runs.
    next_state, next_info = state, info

    for _ in range(horizon):
        # instead of sampling (e.g. env.action_space.sample()) we ask the environment for
        # the valid moves, as valid moves are sparse in the action space
        action = act_with_policy(state, info, model, policy, temperature, epsilon)
        next_state, reward, done, truncated, next_info = env.step(action)

        states.append(state)
        actions.append(action)
        rewards.append(reward)
        dones.append(done)
        truncateds.append(truncated)
        infos.append(info)

        if done or truncated:
            break
        state = next_state
        info = next_info

    # Close the trajectory with the last observation and a placeholder action.
    states.append(next_state)
    infos.append(next_info)
    actions.append((-1, (0,0), (0,0), 0, 0))
    return states, actions, rewards, dones, truncateds, infos

In [None]:
# Sanity check: roll out one episode with epsilon=1.0 (the e-greedy policy then always
# explores) and dump the raw trajectory tuple.
trajectory = generate_trajectory(env, model, epsilon=1.0)
print(trajectory)

In [None]:
# Baseline: average return of the fully exploring policy (epsilon=1.0) over a few episodes.
EPISODES = 10
episode_returns = []
for i in tqdm.tqdm(range(EPISODES)):
    states, actions, rewards, dones, truncateds, infos = generate_trajectory(env, model, epsilon=1.0)
    episode_returns.append(sum(rewards))

total_rewards = sum(episode_returns)
avg_reward = total_rewards/EPISODES
print(f"Average reward: {avg_reward} Total Reward: {total_rewards}")

In [None]:
# generate a batch of trajectories and store them in the replay buffer
def generate_batch_trajectories(env, model, n_rollout, replay_buffer: ReplayBuffer, temperature=5.0, epsilon=0.1, horizon=30, policy='e-greedy'):
    """Collect ``n_rollout`` episodes and push each one into ``replay_buffer``.

    Every episode is produced by ``generate_trajectory`` with the given
    ``temperature``, ``epsilon``, ``horizon`` and ``policy``. Each push stores the
    episode's states, actions, rewards, infos and done flags; the truncation
    flags are not stored.
    """
    rollout_options = dict(temperature=temperature, epsilon=epsilon, horizon=horizon, policy=policy)
    for _ in range(n_rollout):
        episode = generate_trajectory(env, model, **rollout_options)
        states, actions, rewards, dones, _truncateds, infos = episode
        replay_buffer.push(states, actions, rewards, infos, dones)

In [None]:
def train(env, gamma, learning_rate, max_steps=MAX_STEPS, use_td_target=True,
          trajectory_policy='e-greedy',
          label="dnd_egreedy",
          eval_env=None,
          reward_per_episode=None,
          n_rollout=8,
          seed=1337):
  """Train a fresh QNetwork on ``env`` and return the evaluation-reward history.

  Every training step collects ``n_rollout`` trajectories into a replay buffer,
  samples one batch and runs ``NUM_UPDATES`` passes over it. Every 10 steps the
  greedy policy is evaluated for ``EVAL_STEPS`` episodes, and the weights are
  saved whenever the average evaluation reward beats the best seen so far.

  Args:
    env: environment used to collect training trajectories; it is seeded with
      ``seed`` and closed before returning.
    gamma: discount factor applied to the bootstrapped next-step value.
    learning_rate: learning rate of the Adam optimizer.
    max_steps: number of training steps.
    use_td_target: if True, bootstrap from the target network's value of the
      next state paired with the next recorded action; if False, from the
      maximum target-network value over the moves available in the next state.
    trajectory_policy: exploration policy used while collecting trajectories.
    label: tag embedded in the checkpoint file names.
    eval_env: environment used for evaluation; defaults to ``env``.
    reward_per_episode: optional list that evaluation averages are appended to
      in place; a new list is created when None.
    n_rollout: number of trajectories collected per training step.
    seed: value passed to ``env.seed``.

  Returns:
    The list of average evaluation rewards, one entry per evaluation round.
  """
  print(f"training with gamma {gamma} and learning rate {learning_rate}")
  env.seed(seed)

  replay_buffer = ReplayBuffer(BUFFER_CAPACITY, device)
  # fresh online and target networks (no checkpoint is loaded here)
  model = QNetwork(device).to(device)
  target_model = QNetwork(device).to(device)

  # initialize target network with the same weights as the model
  target_model.load_state_dict(model.state_dict())

  optimizer = optim.Adam(model.parameters(), lr=learning_rate)
  best_avg = -10  # initial bar an evaluation average must beat before a checkpoint is saved
  best_step = 0
  temperature = 5.0
  if reward_per_episode is None:
    reward_per_episode = []

  epsilon = EPSILON_START

  for step in tqdm.tqdm(range(max_steps)):
    # collect new experience with the current online network
    generate_batch_trajectories(env, model, n_rollout, replay_buffer, temperature=temperature,
                                epsilon=epsilon, policy=trajectory_policy, horizon=T_HORIZON)

    # one batch is sampled per step and reused for all NUM_UPDATES passes below
    states, actions, rewards, infos, is_terminals = replay_buffer.sample(BATCH_SIZE)
    rewards_collected = 0
    for _ in range(NUM_UPDATES):
      # NOTE(review): rewards_collected and total_loss are accumulated but never read
      rewards_collected = 0
      total_loss = 0.0

      # presumably each sampled item is one stored trajectory (see the [1:] / [0:-1] slicing) — TODO confirm
      for i in range(len(states)):
        s = states[i]
        a = actions[i]
        env_info = infos[i]
        r = torch.tensor(rewards[i]).to(device).unsqueeze(1)
        is_terminal = torch.tensor(is_terminals[i]).float().to(device).unsqueeze(1)

        if use_td_target:
          with torch.no_grad():
            # next states paired with the next recorded actions
            # NOTE(review): the last next-action is the sentinel appended by generate_trajectory;
            # it is only masked out below when the final step is terminal
            s_next = s[1:]
            a_next = a[1:]
            q_targets = target_model.forward(s_next, a_next, pre_converted=True, pre_converted_action=True).detach()
        else: # max over the available moves of each next state (slower: one forward call per state)
          with torch.no_grad():
            s_next = s[1:]
            s_info = env_info[1:]
            q_targets = torch.zeros(len(s_next)).to(device)

            for index in range(len(s_info)):
              info = s_info[index]
              state = s_next[index]

              # an empty next state contributes a zero target
              if len(state) == 0:
                q_targets[index] = 0
                continue

              # repeat the state once per available move so every (state, move) pair is scored in one call
              total_available_moves = len(info["available_moves"])
              states_t = [state] * total_available_moves
              avail_actions = info["available_moves"]
              assert len(states_t) > 0, "No available states"
              assert len(avail_actions) > 0, "No available moves"

              q_values = target_model.forward(states_t, avail_actions, pre_converted=True).detach().squeeze(1)
              if len(q_values) == 0:
                q_targets[index] = 0
              else:
                q_targets[index] = torch.max(q_values).item()

            q_targets = q_targets.unsqueeze(1)
            assert q_targets.shape == r.shape, f"q_targets shape {q_targets.shape} != r shape {r.shape}"

        # bootstrapped target; (1 - is_terminal) removes the bootstrap term on terminal steps
        targets = r + gamma * q_targets * (1 - is_terminal)

        # current estimates for every (state, action) pair except the trailing sentinel entry
        s_input = s[0:-1]
        a_input = a[0:-1]
        output = model.forward(s_input, a_input, pre_converted=True, pre_converted_action=True)
        q_sa = output

        value_loss = nn.MSELoss()(q_sa, targets)
        optimizer.zero_grad()
        value_loss.backward()
        total_loss += value_loss.item()
        rewards_collected += r.sum().item()
        optimizer.step()

    # checkpoints are written inside the evaluation block below, only on improvement

    # evaluate model performance
    if step % 10 == 0:
      if eval_env is None:
        eval_env = env

      eval_rewards = []
      for _ in range(EVAL_STEPS):
        _, _, rewards, _, _, _ = generate_trajectory(eval_env, model, policy='greedy')
        total_reward = sum(rewards)
        eval_rewards.append(total_reward)

      avg_rewards = np.mean(eval_rewards)

      # print(f"eval rewards: {avg_rewards}")
      reward_per_episode.append(avg_rewards)
      # NOTE(review): this is the spread of the evaluation averages recorded so far,
      # not of the episodes in this evaluation round
      std_rewards = np.std(reward_per_episode)

      # print(f"total reward: {total_reward}")
      if trajectory_policy == "e-greedy":
        print(f"{step}: avg rewards {avg_rewards} std: {std_rewards} best avg {best_avg}@{best_step} epsilon {epsilon}")
      elif trajectory_policy == "boltzmann":
        print(f"{step}: avg rewards {avg_rewards} std: {std_rewards} best avg {best_avg}@{best_step} temperature {temperature}")
      else:
        print(f"{step}: avg rewards {avg_rewards} std: {std_rewards} best avg {best_avg}@{best_step}")

      replay_buffer.print_stats()

      # keep both a per-step snapshot and a rolling "best" file for this label
      if avg_rewards > best_avg:
        print(f"best: {avg_rewards}")
        best_avg = avg_rewards
        best_step = step
        torch.save(model.state_dict(), f"{PROJECT_OUTPUT_PATH}/model_best_{label}@{step}.pt")
        torch.save(model.state_dict(), f"{PROJECT_OUTPUT_PATH}/model_best_{label}.pt")

      # torch.save(model.state_dict(), f"model_{label}_{gamma}_{learning_rate}.pt")


    # if step % 100 == 0:
    #   torch.save(model.state_dict(), f"model_{label}_{gamma}_{learning_rate}_{step}.pt")


    gc.collect()

    # decay temp (never below 0.1)
    temperature = np.max([0.1, temperature * TEMP_DECAY])

    # decay epsilon exponentially from EPSILON_START towards EPSILON_FINAL
    epsilon = EPSILON_FINAL + (EPSILON_START - EPSILON_FINAL) * np.exp(-1.0 * step / EPSILON_DECAY_FRAMES)

    if step % TARGET_UPDATE_FREQ == 0:
      # total absolute weight difference between the model and the target model
      # (diagnostic only: the value is computed but not used)
      total_change = 0
      for p, p_target in zip(model.parameters(), target_model.parameters()):
        total_change += torch.abs(p - p_target).sum().item()
      # print(f"total change: {total_change}")

      # hard update: copy the online weights into the target network
      target_model.load_state_dict(model.state_dict())

  env.close()
  return reward_per_episode


Specify the location of the game configuration

In [None]:
# Root directory of the game configuration; passed to make_env() as root_path.
game_setup_path = "map_with_obstacles"

Create the environment factory. Note that we use damage-based rewards to give a denser reward signal.

In [None]:
def make_env(root_path, render_mode="ansi", show_logs=False, custom_agent=None):
    """Create a ``dndenv-v0`` environment with randomised line-ups and maps.

    The hero profile, the enemy profile and the map file are each supplied as a
    zero-argument callable that returns a uniformly random choice from a fixed
    pool (presumably invoked by the environment when it sets up an episode —
    TODO confirm). Damage-based rewards are always enabled.

    Args:
        root_path: game configuration directory.
        render_mode: render mode forwarded to the environment.
        show_logs: forwarded to the environment.
        custom_agent: forwarded to the environment; None keeps its default opponent.

    Returns:
        The environment created by ``make``.
    """
    character_pool = ['high_elf_fighter', 'high_elf_mage', 'dwarf_cleric', 'halfling_rogue']
    map_pool = ['maps/simple_map', 'maps/complex_map', 'maps/game_map', 'maps/walled_map']

    def pick_profile():
        return random.choice(character_pool)

    def pick_enemy():
        return random.choice(character_pool)

    def pick_map():
        return random.choice(map_pool)

    env_options = dict(
        root_path=root_path,
        show_logs=show_logs,
        render_mode=render_mode,
        damage_based_reward=True,
        custom_agent=custom_agent,
        profiles=pick_profile,
        enemies=pick_enemy,
        map_file=pick_map,
    )
    return make("dndenv-v0", **env_options)

In [None]:
# Rebinds `env`: from here on it is the randomised make_env() environment with the
# default opponent (custom_agent=None), no longer the demo environment created earlier.
env = make_env(game_setup_path)

In [None]:
# Baseline run against the default opponent: a (currently single-point) grid of
# learning rates and discount factors, each combination trained with its own seed.
learning_rates = [0.0001]
gammas = [0.99]
seed = 1337

results = {}
for lr in learning_rates:
  per_gamma = results[lr] = {}
  for gamma in gammas:
    seed += 1
    reward_per_episode = train(env, gamma, lr, max_steps=MAX_STEPS, seed=seed, use_td_target=True)
    per_gamma[gamma] = reward_per_episode


Summarize rewards per episode

In [None]:
# Print the evaluation-reward history of every (learning rate, gamma) run; `item` is the learning rate.
for item in results:
  for gamma in results[item]:
    print(f"lr: {item} gamma: {gamma} rewards: {results[item][gamma]}")

In [None]:
# plot the results
import matplotlib.pyplot as plt

# Iterate the result dict itself rather than the learning_rates / gammas globals:
# later cells rebind those lists, which would make a re-run of this cell fail with KeyError.
for lr, by_gamma in results.items():
    for gamma, reward_history in by_gamma.items():
        plt.plot(reward_history, label=f"lr={lr}, gamma={gamma}")
plt.xlabel("evaluation round (one every 10 training steps)")
plt.ylabel("average greedy-evaluation reward")
plt.title("DQN vs. default opponent")
plt.legend()
plt.show()


Set up a mixed LLM / rules-based AI adversary and compare performance

LLM related config, like which LLM to use

In [None]:
# NOTE(review): USE_OPENAI is not read by any visible cell.
USE_OPENAI=False
# Base URL of the server hosting the Llama 3 model; override with the LLAMA3_BASE_URL
# environment variable (defaults to a local server on port 8000).
LLAMA3_BASE_URL= os.environ.get('LLAMA3_BASE_URL', 'http://localhost:8000/v1')

In [None]:
# LLM client for the Llama 3.1 8B Instruct variant served at LLAMA3_BASE_URL.
# NOTE(review): api_key looks like a placeholder for a local server — confirm.
prompt = GPT4Interfacer(debug=False, tools=False, base_url=LLAMA3_BASE_URL, api_key="token1234", variant="NousResearch/Meta-Llama-3.1-8B-Instruct", explain=False)

Set up the interface for the adversarial moves

In [None]:
class CustomAgent:
    """Adapter that exposes an LLM interface through an ``action`` method.

    ``llm_interface`` must provide ``select_action_for_state(observation, info)``.
    """
    def __init__(self, llm_interface):
        # Without this constructor CustomAgent(prompt) raised TypeError and
        # action() had no llm_interface attribute to call.
        self.llm_interface = llm_interface

    def action(self, observation, info):
        """Return the action the LLM interface selects for this observation."""
        return self.llm_interface.select_action_for_state(observation, info)

    def __str__(self) -> str:
        return "Custom LLM Agent"

# Adversary agent wrapping the Llama 3 client; referenced by mix_controller.
agent = CustomAgent(prompt)

We make an evaluation environment and a training environment. The evaluation environment is the same env config as the previous training run, while the training environment incorporates the LLM as the adversary. We do this to compare the training results.

In [None]:
from natural20.generic_controller import GenericController
from natural20.gym.dndenv_controller import DndenvController

# Evaluation environment built with the make_env() defaults (custom_agent=None).
eval_env = make_env(game_setup_path)

def mix_controller(session, mix=0.8, **kwargs):
    """Pick the opponent controller at random.

    Returns a ``GenericController`` with probability ``mix`` (80% by default),
    otherwise a ``DndenvController`` driven by the module-level ``agent``.
    Extra keyword arguments are accepted and ignored.
    """
    use_rules_ai = random.random() < mix
    controller = GenericController(session) if use_rules_ai else DndenvController(session, agent)
    return controller

# Training environment: mix_controller is passed as custom_agent, so the opponent's
# controller is chosen at random whenever the environment calls it.
env = make_env(game_setup_path, custom_agent=mix_controller)

In [None]:
# Run against the mixed Llama 3 / rules-based opponent, evaluated on eval_env.
learning_rates = [0.001]
gammas = [0.99]
seed = 1337

results_2 = {}
for lr in learning_rates:
  per_gamma = results_2[lr] = {}
  for gamma in gammas:
    seed += 1
    reward_per_episode = train(
        env, gamma, lr,
        max_steps=MAX_STEPS,
        seed=seed,
        eval_env=eval_env,
        trajectory_policy='e-greedy',
        label="llm_adversary",
    )
    per_gamma[gamma] = reward_per_episode

In [None]:
# Print the evaluation-reward history of every (learning rate, gamma) run; `item` is the learning rate.
for item in results_2:
  for gamma in results_2[item]:
    print(f"lr: {item} gamma: {gamma} rewards: {results_2[item][gamma]}")

In [None]:
# plot the results
import matplotlib.pyplot as plt

# Iterate the result dict itself so the plot does not depend on whichever cell
# last assigned learning_rates / gammas.
for lr, by_gamma in results_2.items():
    for gamma, reward_history in by_gamma.items():
        plt.plot(reward_history, label=f"lr={lr}, gamma={gamma}")
plt.xlabel("evaluation round (one every 10 training steps)")
plt.ylabel("average greedy-evaluation reward")
plt.title("DQN trained vs. mixed Llama 3 / rules-based opponent")
plt.legend()
plt.show()

Configure prompter for mistral

In [None]:
# Model identifier passed to the client below. NOTE(review): VARIANT is rebound again by later cells.
VARIANT = 'mistralai/Mistral-7B-Instruct-v0.3'
# Base URL of the server hosting Mistral; override with the MISTRAL_URL environment variable.
MISTRAL_URL = os.environ.get('MISTRAL_URL', "http://localhost:8000/v1")
# NOTE(review): api_key looks like a placeholder for a local server — confirm.
prompt_mistral = GPT4Interfacer(debug=False, tools=False, base_url=MISTRAL_URL, api_key="token1234", variant=VARIANT, explain=False)

In [None]:
class CustomAgent:
    """Thin adapter that delegates action selection to an LLM interface.

    The wrapped object must provide ``select_action_for_state(observation, info)``.
    """
    def __init__(self, llm_interface):
        self.llm_interface = llm_interface

    def action(self, observation, info):
        """Return whatever the wrapped interface selects for this observation."""
        select = self.llm_interface.select_action_for_state
        return select(observation, info)

    def __str__(self) -> str:
        return "Custom LLM Agent"

# Adversary agent wrapping the Mistral client; referenced by mix_controller_mistral.
agent_mistral = CustomAgent(prompt_mistral)

In [None]:
from natural20.generic_controller import GenericController
from natural20.gym.dndenv_controller import DndenvController
from llm_interface import GPT4Interfacer

# Fresh evaluation environment built with the make_env() defaults (custom_agent=None).
eval_env = make_env(game_setup_path)

def mix_controller_mistral(session, mix=0.8, **kwargs):
    """Pick the opponent controller at random.

    Returns a ``GenericController`` with probability ``mix`` (80% by default),
    otherwise a ``DndenvController`` driven by the module-level ``agent_mistral``.
    Extra keyword arguments are accepted and ignored.
    """
    use_rules_ai = random.random() < mix
    controller = GenericController(session) if use_rules_ai else DndenvController(session, agent_mistral)
    return controller

# Training environment whose opponent controller is chosen by mix_controller_mistral.
env = make_env(game_setup_path, custom_agent=mix_controller_mistral)

In [None]:
# Run against the mixed Mistral / rules-based opponent, evaluated on eval_env.
learning_rates = [0.001]
gammas = [0.99]
seed = 1337

results_4 = {}
for lr in learning_rates:
  per_gamma = results_4[lr] = {}
  for gamma in gammas:
    seed += 1
    reward_per_episode = train(
        env, gamma, lr,
        max_steps=MAX_STEPS,
        seed=seed,
        eval_env=eval_env,
        trajectory_policy='e-greedy',
        label="llm_adversary_mistral",
    )
    per_gamma[gamma] = reward_per_episode

In [None]:
# Print the evaluation-reward history of every (learning rate, gamma) run; `item` is the learning rate.
for item in results_4:
  for gamma in results_4[item]:
    print(f"lr: {item} gamma: {gamma} rewards: {results_4[item][gamma]}")

# show the best reward found
# NOTE(review): the keys 0.001 / 0.99 are hard-coded and must match the grid of the training cell.
print(f"Best reward: {max(results_4[0.001][0.99])}")

In [None]:
# plot the results
import matplotlib.pyplot as plt

# Iterate the result dict itself so the plot does not depend on whichever cell
# last assigned learning_rates / gammas.
for lr, by_gamma in results_4.items():
    for gamma, reward_history in by_gamma.items():
        plt.plot(reward_history, label=f"lr={lr}, gamma={gamma}")
plt.xlabel("evaluation round (one every 10 training steps)")
plt.ylabel("average greedy-evaluation reward")
plt.title("DQN trained vs. mixed Mistral / rules-based opponent")
plt.legend()
plt.show()

Configure OpenAI GPT-4

In [None]:
# Primary model for the OpenAI-backed agent. NOTE(review): rebinds the VARIANT set in the Mistral cell.
VARIANT = 'gpt-4o-mini'
# Model used by the backup client (served at LLAMA3_BASE_URL in the next cell).
BACKUP_VARIANT = 'NousResearch/Meta-Llama-3.1-8B-Instruct'

In [None]:

# Read the OpenAI key from the environment (as the gpt-4o cell further down does)
# instead of passing the literal placeholder string "OPENAI_TOKEN".
prompt_gpt4 = GPT4Interfacer(debug=False, tools=True, api_key=os.environ.get('OPENAI_API_KEY', None), variant=VARIANT, explain=False)
# Backup client pointing at the locally served Llama 3 endpoint.
prompt_backup = GPT4Interfacer(debug=False, tools=False, base_url=LLAMA3_BASE_URL, api_key="token1234", variant=BACKUP_VARIANT)

class CustomAgentGPT4:
    """
    Agent that routes every decision to one of two LLM interfaces.

    When a backup interface is configured and the first entry of
    ``observation["turn_info"]`` is 0 (presumably "no action left", i.e. only
    movement and free actions remain — TODO confirm), the backup interface
    decides; otherwise the primary interface does. Calls to each route are
    counted in ``backup_invocation_count`` and ``invocation_count``.
    """
    def __init__(self, llm_interface, backup_interface=None):
        self.llm_interface, self.backup_interface = llm_interface, backup_interface
        self.invocation_count = self.backup_invocation_count = 0

    def action(self, observation, info):
        """Return the action chosen by whichever interface handles this turn state."""
        actions_left, _bonus_left, _ = observation["turn_info"]
        use_backup = self.backup_interface is not None and actions_left == 0
        if not use_backup:
            self.invocation_count += 1
            return self.llm_interface.select_action_for_state(observation, info)

        self.backup_invocation_count += 1
        return self.backup_interface.select_action_for_state(observation, info)

    def __str__(self) -> str:
        return "Custom LLM Agent"

    # repr() shows the same text as str()
    __repr__ = __str__

# Adversary agent: the OpenAI client is primary and the Llama 3 client is the backup.
agent_gpt4 = CustomAgentGPT4(prompt_gpt4, backup_interface=prompt_backup)


In [None]:
from natural20.generic_controller import GenericController
from natural20.gym.dndenv_controller import DndenvController

# Fresh evaluation environment built with the make_env() defaults (custom_agent=None).
eval_env = make_env(game_setup_path)

def mix_controller_gpt4(session, mix=0.8, **kwargs):
    """Pick the opponent controller at random.

    Returns a ``GenericController`` with probability ``mix`` (80% by default),
    otherwise a ``DndenvController`` driven by the module-level ``agent_gpt4``.
    Extra keyword arguments are accepted and ignored.
    """
    use_rules_ai = random.random() < mix
    controller = GenericController(session) if use_rules_ai else DndenvController(session, agent_gpt4)
    return controller

# Training environment whose opponent controller is chosen by mix_controller_gpt4.
env = make_env(game_setup_path, custom_agent=mix_controller_gpt4)

In [None]:
# Run against the mixed GPT / rules-based opponent, evaluated on eval_env.
learning_rates = [0.001]
gammas = [0.99]
seed = 1337

results_5 = {}
for lr in learning_rates:
  per_gamma = results_5[lr] = {}
  for gamma in gammas:
    seed += 1
    # Register the history list before training: train() appends to it in place,
    # so the rewards recorded so far stay reachable through results_5 if the run is interrupted.
    per_gamma[gamma] = []
    reward_per_episode = train(
        env, gamma, lr,
        max_steps=MAX_STEPS,
        seed=seed,
        reward_per_episode=per_gamma[gamma],
        eval_env=eval_env,
        trajectory_policy='e-greedy',
        label="llm_adversary_gpt4",
    )
    per_gamma[gamma] = reward_per_episode

In [None]:
# Print the evaluation-reward history of every (learning rate, gamma) run; `item` is the learning rate.
for item in results_5:
  for gamma in results_5[item]:
    print(f"lr: {item} gamma: {gamma} rewards: {results_5[item][gamma]}")

# show the best reward found
# NOTE(review): the keys 0.001 / 0.99 are hard-coded and must match the grid of the training cell.
print(f"Best reward: {max(results_5[0.001][0.99])}")

In [None]:
# plot the results
import matplotlib.pyplot as plt

# Iterate the result dict itself so the plot does not depend on whichever cell
# last assigned learning_rates / gammas.
for lr, by_gamma in results_5.items():
    for gamma, reward_history in by_gamma.items():
        plt.plot(reward_history, label=f"lr={lr}, gamma={gamma}")
plt.xlabel("evaluation round (one every 10 training steps)")
plt.ylabel("average greedy-evaluation reward")
plt.title("DQN trained vs. mixed GPT / rules-based opponent")
plt.legend()
plt.show()

In [None]:
# Switch the primary model to gpt-4o; the backup model is unchanged.
VARIANT = 'gpt-4o'
BACKUP_VARIANT = 'NousResearch/Meta-Llama-3.1-8B-Instruct'
# API key taken from the environment; None when OPENAI_API_KEY is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', None)

In [None]:
# Rebuild the primary client with the gpt-4o VARIANT and the key read from the environment.
prompt_gpt4 = GPT4Interfacer(debug=False, tools=True, api_key=OPENAI_API_KEY, variant=VARIANT, explain=False)
# Rebinds agent_gpt4; prompt_backup is the backup client created in the earlier GPT cell.
agent_gpt4 = CustomAgentGPT4(prompt_gpt4, backup_interface=prompt_backup)

from natural20.generic_controller import GenericController
from natural20.gym.dndenv_controller import DndenvController

# Fresh evaluation environment built with the make_env() defaults (custom_agent=None).
eval_env = make_env(game_setup_path)

def mix_controller_gpt4(session, mix=0.8, **kwargs):
    """Pick the opponent controller at random.

    Returns a ``GenericController`` with probability ``mix`` (80% by default),
    otherwise a ``DndenvController`` driven by the module-level ``agent_gpt4``
    (looked up at call time, so it follows any rebinding of that name).
    Extra keyword arguments are accepted and ignored.
    """
    if random.random() >= mix:
        return DndenvController(session, agent_gpt4)
    return GenericController(session)

# Training environment whose opponent controller is chosen by mix_controller_gpt4.
env = make_env(game_setup_path, custom_agent=mix_controller_gpt4)

In [None]:
# Only the seed reset is active in this cell: the gpt-4o training run that follows is
# commented out, so results_6 is never produced.
seed = 1337
# # Create a grid of learning rates and gammas
# learning_rates = [0.001]
# gammas = [0.99]

# results_6 = {}
# reward_per_episode = []
# for lr in learning_rates:
#   results_6[lr] = {}
#   for gamma in gammas:
#     seed = seed + 1
#     results_6[lr][gamma] = []
#     reward_per_episode = train(env, gamma, lr, max_steps=MAX_STEPS, seed=seed, \
#                                reward_per_episode=results_6[lr][gamma],
#                                eval_env=eval_env, trajectory_policy='e-greedy', label="llm_adversary_gpt4_2")
#     results_6[lr][gamma] = reward_per_episode

Summarize best Avg. Rewards obtained from each training method

In [None]:
# Replaces the in-memory `results` of the first training run with hard-coded evaluation
# averages, stored under learning rate 0.001 and gamma 0.99.
# NOTE(review): the first training cell uses learning rate 0.0001, so these values
# presumably come from an earlier run — confirm their provenance.
results = {}
results[0.001] = {}
results[0.001][0.99] = [-7.634523809523809, -3.7698412698412698, -2.4611111111111112, 0.12420634920634925, -4.1460317460317455, -3.848412698412699, -1.9047619047619049, 1.5638888888888889, -1.4793650793650794, 0.19523809523809518, -1.801984126984127, 2.0448412698412697, 0.030158730158729982, 0.8888888888888887, 1.3702380952380953, 3.7460317460317456, 1.925, -1.6642857142857141, 2.0055555555555555, 1.753968253968254, 1.8, 1.5722222222222222, 2.4007936507936507, 4.449999999999999, 3.3579365079365084, 1.650793650793651, 1.9138888888888885, 4.8388888888888895, 1.8265873015873018, 0.757936507936508, 2.0436507936507935, -0.9964285714285714, 1.2666666666666668, 3.8055555555555554, 3.211111111111111, 0.1599206349206352, 0.38888888888888873, 2.65, 2.2079365079365085, 0.7555555555555554, 5.116666666666667, 1.7682539682539684, 2.522222222222222, 3.7888888888888888, 4.283333333333333, 3.4333333333333336, 3.2999999999999994, 2.354761904761905, 0.5055555555555555, 0.041666666666666644, 3.0980158730158736, 4.307936507936508, 3.40952380952381, 2.3083333333333336, 1.9777777777777776, 2.0888888888888886, 2.8809523809523814, 0.9626984126984126, 2.7999999999999994, 5.114285714285714, 0.24246031746031732, 3.538492063492064, 3.6317460317460313, 3.8337301587301584, 3.584920634920635, 2.906746031746031, 4.142857142857142, 0.9460317460317461, 4.44404761904762, 3.226190476190476, 3.2, 3.3317460317460315, 3.271428571428571, 1.1944444444444442, 3.425396825396825, 2.7376984126984127, 5.161904761904761, 3.8416666666666663, 3.501587301587301, 1.734920634920635, 2.083333333333333, 3.7476190476190476, 3.2904761904761903, 5.750396825396825, 4.102777777777778, 2.8706349206349207, 2.6519841269841264, 3.9178571428571427, 1.575, 3.088095238095238, 4.911111111111111, 3.738888888888889, 3.4444444444444446, 5.966666666666666, 3.6869047619047617, 5.313888888888889, 5.48452380952381, 4.851984126984127, 2.298809523809524, 4.462301587301587]


In [None]:
def _best_reward(runs_by_lr):
    """Return the highest evaluation reward over every (lr, gamma) run, or None if there is none."""
    histories = [history
                 for by_gamma in runs_by_lr.values()
                 for history in by_gamma.values()
                 if len(history) > 0]
    if not histories:
        return None
    return np.max([np.max(history) for history in histories])


# Best reward per training method. Taking the maximum over the whole grid (instead of
# overwriting the entry once per run) keeps the table correct when a grid holds more
# than one (lr, gamma) combination; methods without any recorded reward are left out.
rewards_per_method = {}
for method, runs_by_lr in (("rules_based", results),
                           ("llm_llama3", results_2),
                           ("llm_mistral", results_4),
                           ("llm_gpt4o", results_5)):
    best = _best_reward(runs_by_lr)
    if best is not None:
        rewards_per_method[method] = best

# show a table of the best rewards from each method:

import pandas as pd
df = pd.DataFrame(rewards_per_method.items(), columns=["method", "rewards" ])
df


Perform some tests on the trained agent. Show a combat log from a fight against the rules based AI. Define a policy based on the Model.

In [None]:
import os

# NOTE(review): this rebinds the MAX_STEPS used by the training cells; re-running one of
# them after this cell would pass 500 as max_steps.
MAX_STEPS = 500  # per-battle step cap used by the evaluation loop below
NUM_EPISODES = 100  # number of battles played by the evaluation loop below

# NOTE(review): unlike the device selection at the top of the notebook, this does not consider MPS.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ModelPolicy:
    """Greedy policy backed by a QNetwork whose weights are loaded from disk.

    Args:
        weights_file: path of a state dict saved with ``torch.save``.
            NOTE(review): train() writes its checkpoints under PROJECT_OUTPUT_PATH,
            so the default path may need that directory prefix — confirm.

    Raises:
        FileNotFoundError: if ``weights_file`` does not exist.
    """
    def __init__(self, weights_file = 'model_best_dnd_egreedy.pt'):
        self.model = QNetwork(device=device)
        self.model.to(device)
        if not os.path.exists(weights_file):
            raise FileNotFoundError(f"Model file {weights_file} not found. Please run dnd_dqn.ipynb notebook to train an agent.")
        # map_location lets weights saved on one accelerator (e.g. MPS or CUDA) load on another.
        self.model.load_state_dict(torch.load(weights_file, map_location=device))
        # This object is only used for inference.
        self.model.eval()

    def action(self, state, info):
        """Print the value of every available move and return the highest-valued one."""
        available_moves = info["available_moves"]
        # No gradients are needed when only picking a move.
        with torch.no_grad():
            values = torch.stack([self.model(state, move) for move in available_moves])
        for index, v in enumerate(values):
            print(f"{index}: {available_moves[index]} {v.item()}")

        chosen_index = torch.argmax(values).item()
        return available_moves[chosen_index]


In [None]:

env = make_env(game_setup_path)

print("=========================================")
print("Battle between an RL agent vs a Rules based AI")
print("=========================================")
# Load the trained weights once; re-reading the checkpoint for every episode is wasted I/O.
policy = ModelPolicy()
win_count = 0
loss_count = 0
for i in range(NUM_EPISODES):
    observation, info = env.reset()
    action = policy.action(observation, info)

    print(f"selected action: {action}")
    terminal = False
    episode = 0  # number of steps taken in the current battle

    while not terminal and episode < MAX_STEPS:
        episode += 1
        observation, reward, terminal, truncated, info = env.step(action)
        print(env.render())
        if terminal or truncated:
            # The sign of the final reward decides whether the battle counts as a win.
            print(f"Reward: {reward}")
            if reward > 0:
                win_count += 1
            else:
                loss_count += 1
            break

        # display entity healths
        print(f"Turn {info['current_index']}\n")
        print(f"Reward: {reward}\n")
        print(f"health hero: {observation['health_pct']}\n")
        print(f"health enemy: {observation['health_enemy']}\n")
        print(env.render())

        action = policy.action(observation, info)
        print(f"agent selected action: {action}")

# Battles that hit MAX_STEPS without ending are counted as neither win nor loss,
# so guard the division for the case where no battle finished.
finished_battles = win_count + loss_count
win_rate = win_count / finished_battles if finished_battles else float("nan")
print(f"Win count: {win_count} Loss count: {loss_count} Win rate: {win_rate}")