## Train MM / explore with random sampling


In [None]:
# Notebook setup for the random-search cell: pick the plotting backend, quiet
# the root logger, then import everything the cell needs.
import matplotlib

# Agg is matplotlib's non-interactive raster backend. It is selected before the
# remaining imports run -- presumably because `agent` pulls in pyplot; confirm.
matplotlib.use("Agg")

import logging

# Switch off the root logger.
# NOTE(review): `disabled` is set on the root logger object only; records sent
# through named child loggers can still reach the root handlers. If the aim is
# to silence all logging, `logging.disable(logging.CRITICAL)` is the documented
# switch -- confirm intent.
logger = logging.getLogger()
logger.disabled = True

import os
import torch
import numpy as np

from agent import DQNAgent
from tqdm.auto import tqdm
import random
import itertools

# Random hyperparameter search: draw `num_combinations` distinct points from
# the grid defined below and train one DQNAgent per point.

# Number of combinations you want
num_combinations = 500  # Change this to however many combinations you need

# Settings shared by every run.
room_size = "xl-different-prob"
capacity_max = 6
terminates_at = 99
num_iterations = (terminates_at + 1) * 100

prob_type = (
    "non-equal-object-probs" if "different-prob" in room_size else "equal-object-probs"
)
root_path = (
    f"./training-results/{prob_type}/dqn/room_size={room_size}/capacity={capacity_max}/"
)

# root_path = f"training-results/TRASH/{room_size}"

# Search grid: one list of candidate values per hyperparameter.
replay_buffer_size_ = [num_iterations // 10]
test_seed_ = [i for i in range(num_combinations)]
target_update_interval_ = [50, 100]
# NOTE(review): the other cells in this notebook pass gamma as a dict of the
# form {"mm": ..., "explore": ...}; confirm DQNAgent accepts a bare float here.
gamma_ = [0.99, 0.9]
semantic_decay_factor_ = [0.8]
pretrain_semantic_ = [False]
relu_between_gcn_layers_ = [False, True]
dropout_between_gcn_layers_ = [False, True]
num_layers_ = [2, 4]
batch_size_ = [32, 64]
embedding_dim_ = [32, 64]
triple_qual_weight_ = [0.8]
intrinsic_explore_reward_ = [0.5, 1.0, 2.0, 5.0, 10]
learning_rate_ = [0.001, 0.0001]
explore_policy_ = ["rl"]
mm_policy_ = ["rl"]
scale_reward_ = [True, False]
gcn_type_ = ["stare"]

# Axis order must match the tuple unpacking in the training loop below.
search_space = (
    test_seed_,
    target_update_interval_,
    gamma_,
    semantic_decay_factor_,
    pretrain_semantic_,
    replay_buffer_size_,
    relu_between_gcn_layers_,
    dropout_between_gcn_layers_,
    num_layers_,
    batch_size_,
    embedding_dim_,
    triple_qual_weight_,
    intrinsic_explore_reward_,
    learning_rate_,
    explore_policy_,
    mm_policy_,
    scale_reward_,
    gcn_type_,
)


def _combination_at(flat_index, axes):
    """Return the `flat_index`-th tuple of the Cartesian product of `axes`.

    The index is decoded as a mixed-radix number in which the last axis varies
    fastest, i.e. the order in which `itertools.product(*axes)` yields tuples.
    """
    picked = []
    for axis in reversed(axes):
        flat_index, position = divmod(flat_index, len(axis))
        picked.append(axis[position])
    picked.reverse()
    return tuple(picked)


num_total_combinations = 1
for axis in search_space:
    num_total_combinations *= len(axis)

# Sample distinct grid points without replacement. Sampling flat indices and
# decoding them avoids materialising the full product (over a million tuples
# for this grid) while selecting the same elements that sampling from the
# materialised list would. Raises ValueError if num_combinations exceeds the
# grid size, exactly as random.sample on the full list would.
random_combinations = [
    _combination_at(flat_index, search_space)
    for flat_index in random.sample(range(num_total_combinations), num_combinations)
]

# Wrap the list itself (not an enumerate object) so tqdm can read its length
# and display a total / ETA.
for params in tqdm(random_combinations):
    (
        test_seed,
        target_update_interval,
        gamma,
        semantic_decay_factor,
        pretrain_semantic,
        replay_buffer_size,
        relu_between_gcn_layers,
        dropout_between_gcn_layers,
        num_layers,
        batch_size,
        embedding_dim,
        triple_qual_weight,
        intrinsic_explore_reward,
        learning_rate,
        explore_policy,
        mm_policy,
        scale_reward,
        gcn_type,
    ) = params

    params_dict = {
        "env_str": "room_env:RoomEnv-v2",
        "num_iterations": num_iterations,
        "replay_buffer_size": replay_buffer_size,
        "warm_start": batch_size,
        "batch_size": batch_size,
        "target_update_interval": target_update_interval,
        "epsilon_decay_until": num_iterations,
        "max_epsilon": 1.0,
        "min_epsilon": 0.1,
        "gamma": gamma,
        "learning_rate": learning_rate,
        "capacity": {"long": capacity_max, "short": 15},
        "pretrain_semantic": pretrain_semantic,
        "semantic_decay_factor": semantic_decay_factor,
        "dqn_params": {
            "gcn_layer_params": {
                "type": gcn_type,
                "embedding_dim": embedding_dim,
                "num_layers": num_layers,
                "gcn_drop": 0.1,
                "triple_qual_weight": triple_qual_weight,
            },
            "relu_between_gcn_layers": relu_between_gcn_layers,
            "dropout_between_gcn_layers": dropout_between_gcn_layers,
            # The MLP head reuses the sampled GCN depth.
            "mlp_params": {"num_hidden_layers": num_layers, "dueling_dqn": True},
        },
        "num_samples_for_results": {"val": 5, "test": 10},
        "validation_interval": 1,
        "plotting_interval": 50,
        # Train seed is offset from the test seed so the two always differ.
        "train_seed": test_seed + 5,
        "test_seed": test_seed,
        "device": "cpu",
        "qa_function": "latest_strongest",
        "env_config": {
            "question_prob": 1.0,
            "terminates_at": terminates_at,
            "randomize_observations": "all",
            "room_size": room_size,
            "rewards": {"correct": 1, "wrong": 0, "partial": 0},
            "make_everything_static": False,
            "num_total_questions": 1000,
            "question_interval": 1,
            "include_walls_in_observations": True,
        },
        "intrinsic_explore_reward": intrinsic_explore_reward,
        "ddqn": True,
        "default_root_dir": root_path,
        "explore_policy": explore_policy,
        "mm_policy": mm_policy,
        "scale_reward": scale_reward,
    }

    agent = DQNAgent(**params_dict)
    agent.train()

## Run fixed combinations

In [None]:
# Notebook setup for the fixed-combination cell: pick the plotting backend,
# quiet the root logger, then import what the cell needs.
import matplotlib

# Agg is matplotlib's non-interactive raster backend. It is selected before the
# remaining imports run -- presumably because `agent` pulls in pyplot; confirm.
matplotlib.use("Agg")

import logging

# Switch off the root logger.
# NOTE(review): `disabled` is set on the root logger object only; records sent
# through named child loggers can still reach the root handlers. If the aim is
# to silence all logging, `logging.disable(logging.CRITICAL)` is the documented
# switch -- confirm intent.
logger = logging.getLogger()
logger.disabled = True

import os
from agent import DQNAgent
from tqdm.auto import tqdm
import random
import itertools


# Train one agent per point of a small, hand-picked hyperparameter grid.

# Settings shared by every run in this cell.
room_size = "xxl-different-prob"
terminates_at = 99
num_iterations = (terminates_at + 1) * 200
batch_size = 32
semantic_decay_factor = 0.8
num_layers = 2
triple_qual_weight = 0.8

# The results folder is keyed on whether the room layout uses unequal
# object probabilities; this depends only on room_size.
prob_type = (
    "non-equal-object-probs" if "different-prob" in room_size else "equal-object-probs"
)

# One axis per swept setting; the rightmost axis varies fastest, which is the
# same visiting order as the equivalent nested for-loops.
sweep = itertools.product(
    [5, 6, 7, 8],  # test_seed
    [96],  # capacity_max
    [num_iterations],  # replay_buffer_size
    [{"mm": 0.95, "explore": 0.95}],  # gamma
    [100],  # target_update_interval
    [90],  # embedding_dim
    [False],  # pretrain_semantic
)

for (
    test_seed,
    capacity_max,
    replay_buffer_size,
    gamma,
    target_update_interval,
    embedding_dim,
    pretrain_semantic,
) in sweep:
    # Hand every run its own gamma dict rather than one shared object.
    gamma = dict(gamma)

    root_path = (
        f"./training-results/{prob_type}/dqn/"
        f"room_size={room_size}/capacity={capacity_max}/"
    )

    gcn_layer_params = {
        "type": "stare",
        "embedding_dim": embedding_dim,
        "num_layers": num_layers,
        "gcn_drop": 0.1,
        "triple_qual_weight": triple_qual_weight,
    }
    dqn_params = {
        "gcn_layer_params": gcn_layer_params,
        "relu_between_gcn_layers": True,
        "dropout_between_gcn_layers": False,
        "mlp_params": {
            "num_hidden_layers": num_layers,
            "dueling_dqn": True,
        },
    }
    env_config = {
        "question_prob": 1.0,
        "terminates_at": terminates_at,
        "randomize_observations": "all",
        "room_size": room_size,
        "rewards": {"correct": 1, "wrong": 0, "partial": 0},
        "make_everything_static": False,
        "num_total_questions": 1000,
        "question_interval": 1,
        "include_walls_in_observations": True,
    }

    params_dict = {
        "env_str": "room_env:RoomEnv-v2",
        "num_iterations": num_iterations,
        "replay_buffer_size": replay_buffer_size,
        "warm_start": batch_size,
        "batch_size": batch_size,
        "target_update_interval": target_update_interval,
        "epsilon_decay_until": num_iterations,
        "max_epsilon": 1.0,
        "min_epsilon": 0.1,
        "gamma": gamma,
        "learning_rate": 0.001,
        "capacity": {"long": capacity_max, "short": 15},
        "pretrain_semantic": pretrain_semantic,
        "semantic_decay_factor": semantic_decay_factor,
        "dqn_params": dqn_params,
        "num_samples_for_results": {"val": 5, "test": 10},
        "validation_interval": 1,
        "plotting_interval": 50,
        "train_seed": test_seed + 5,
        "test_seed": test_seed,
        "device": "cpu",
        "qa_function": "latest_strongest",
        "env_config": env_config,
        "intrinsic_explore_reward": 0,
        "ddqn": True,
        "default_root_dir": root_path,
        "explore_policy": "rl",
        "mm_policy": "rl",
        "scale_reward": False,
    }

    agent = DQNAgent(**params_dict)
    agent.train()

In [1]:
import argparse
from agent.dqn import DQNAgent

# Short single run that starts from a previously trained checkpoint, chosen by
# long-term memory capacity.

# Run settings.
room_size = "xl-different-prob"
terminates_at = 99
num_iterations = (terminates_at + 1) * 1
replay_buffer_size = 16
batch_size = 4
semantic_decay_factor = 0.8
num_layers = 2
triple_qual_weight = 0.8
embedding_dim = 64
target_update_interval = 10
capacity_max = 12
test_seed = 0
pretrain_semantic = False

prob_type = (
    "non-equal-object-probs" if "different-prob" in room_size else "equal-object-probs"
)
root_path = (
    f"./training-results/{prob_type}/dqn/"
    f"room_size={room_size}/capacity={capacity_max}/"
)

# Checkpoint directory for each supported capacity. Adjacent string literals
# are joined by the parser, so every value is a single path string.
pretrained_paths = {
    192: (
        "trained-results/non-equal-object-probs/dqn/"
        "room_size=xl-different-prob/capacity=192/2024-08-12 12:58:16.107541/"
    ),
    96: (
        "trained-results/non-equal-object-probs/dqn/"
        "room_size=xl-different-prob/capacity=96/2024-08-12 23:58:06.290168/"
    ),
    48: (
        "trained-results/non-equal-object-probs/dqn/"
        "room_size=xl-different-prob/capacity=48/2024-08-11 11:07:00.648864/"
    ),
    24: (
        "trained-results/non-equal-object-probs/dqn/"
        "room_size=xl-different-prob/capacity=24/2024-08-11 13:36:54.499426/"
    ),
    12: (
        "trained-results/non-equal-object-probs/dqn/"
        "room_size=xl-different-prob/capacity=12/2024-08-11 16:24:54.492650/"
    ),
}
if capacity_max not in pretrained_paths:
    raise ValueError(f"Invalid capacity_max: {capacity_max}")
pretrained_path = pretrained_paths[capacity_max]

gcn_layer_params = {
    "type": "stare",
    "embedding_dim": embedding_dim,
    "num_layers": num_layers,
    "gcn_drop": 0.1,
    "triple_qual_weight": triple_qual_weight,
}
dqn_params = {
    "gcn_layer_params": gcn_layer_params,
    "relu_between_gcn_layers": True,
    "dropout_between_gcn_layers": False,
    "mlp_params": {
        "num_hidden_layers": num_layers,
        "dueling_dqn": True,
    },
}
env_config = {
    "question_prob": 1.0,
    "terminates_at": terminates_at,
    "randomize_observations": "all",
    "room_size": room_size,
    "rewards": {"correct": 1, "wrong": 0, "partial": 0},
    "make_everything_static": False,
    "num_total_questions": 1000,
    "question_interval": 1,
    "include_walls_in_observations": True,
}
llm_params = {
    "model_id": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "quantization": "4bit",
    "max_new_tokens": 32,
}

params_dict = {
    "env_str": "room_env:RoomEnv-v2",
    "num_iterations": num_iterations,
    "replay_buffer_size": replay_buffer_size,
    "warm_start": batch_size,
    "batch_size": batch_size,
    "target_update_interval": target_update_interval,
    "epsilon_decay_until": num_iterations,
    "max_epsilon": 1.0,
    "min_epsilon": 0.1,
    "gamma": {"mm": 0.90, "explore": 0.90},
    "learning_rate": 0.001,
    "capacity": {"long": capacity_max, "short": 15},
    "pretrain_semantic": pretrain_semantic,
    "semantic_decay_factor": semantic_decay_factor,
    "dqn_params": dqn_params,
    "num_samples_for_results": {"val": 1, "test": 10},
    "validation_interval": 1,
    "plotting_interval": 50,
    "train_seed": test_seed + 5,
    "test_seed": test_seed,
    "device": "cpu",
    "env_config": env_config,
    "intrinsic_explore_reward": 0,
    "ddqn": True,
    "default_root_dir": root_path,
    "explore_policy": "neural",
    "mm_policy": "neural",
    "qa_function": "bandit",
    "pretrained_path": pretrained_path,
    "llm_params": llm_params,
    "scale_reward": False,
}

agent = DQNAgent(**params_dict)
agent.train()


  from .autonotebook import tqdm as notebook_tqdm
  logger.deprecation(
  logger.deprecation(
  logger.warn(f"{pre} should be an int or np.int64, actual type: {type(obs)}")
  logger.warn(f"{pre} is not within the observation space.")


Running on cpu
Assertion passed: dict A is part of dict B.
> /home/tk/repos/agent-room-env-v2-qa/agent/dqn/nn/gnn.py(513)forward()
    511         import pdb; pdb.set_trace()
    512 
--> 513         for layer_ in self.gcn_layers:
    514             if "stare" in self.gcn_type:
    515                 entity_embeddings, relation_embeddings = layer_(

[['west_inv', 'west', 'south_inv', 'south', 'north_inv', 'north', 'east_inv', 'east', 'current_time', 'atlocation_inv', 'atlocation'], ['west_inv', 'west', 'timestamp', 'strength', 'south_inv', 'south', 'north_inv', 

In [5]:
import torch
# Scratch check: keeping the first two rows of a (10, 12) tensor.
random_matrix = torch.randn(10, 12)
random_matrix[:2].shape

torch.Size([2, 12])

In [5]:
import torch

# Scratch check: shape of a pasted 64-value vector once wrapped in a tensor.
pasted_values = [
    0.1815, 0.0025, 0.0979, -0.0953, -0.0693, 0.0169, 0.1384, -0.0427,
    0.1296, 0.0261, -0.1082, -0.0142, -0.1870, -0.0642, -0.0292, -0.1634,
    -0.2894, 0.0402, -0.0284, 0.1223, 0.0674, 0.0541, 0.0152, -0.1415,
    0.1151, -0.1326, -0.1931, 0.0393, 0.0150, -0.0898, 0.0099, 0.0076,
    0.0371, -0.0550, -0.1216, 0.1665, 0.1726, 0.0524, -0.0776, 0.0290,
    0.1052, 0.0510, 0.0469, -0.0721, -0.0614, -0.0033, 0.0174, -0.0381,
    0.0197, -0.1537, 0.0900, -0.1006, -0.0351, 0.0049, -0.0163, -0.1509,
    -0.1135, -0.1118, -0.0411, 0.0645, -0.0284, -0.0641, -0.0260, -0.1871,
]
torch.tensor(pasted_values).shape

torch.Size([64])

In [13]:
# Scratch check: stacking two length-10 vectors along a new leading axis.
first_vector = torch.randn(10)
second_vector = torch.randn(10)
torch.stack((first_vector, second_vector), dim=0).shape

torch.Size([2, 10])

In [10]:
# Scratch check: one batch entry holding seven
# [head, relation, tail, qualifier-dict] quadruples.
single_step = [
    ["agent", "atlocation", "room_000", {"current_time": 0}],
    ["room_000", "east", "room_001", {"current_time": 0}],
    ["dep_001", "atlocation", "room_000", {"current_time": 0}],
    ["room_000", "west", "wall", {"current_time": 0}],
    ["dep_007", "atlocation", "room_000", {"current_time": 0}],
    ["room_000", "north", "wall", {"current_time": 0}],
    ["room_000", "south", "room_004", {"current_time": 0}],
]
# The extra outer list adds the leading batch axis.
np.array([single_step]).shape

(1, 7, 4)

In [12]:
import numpy as np
# Scratch check: pack four steps with different numbers of quadruples into a
# single NumPy array. Each quadruple is [head, relation, tail, qualifier-dict].
# The list literals are used directly; wrapping them in list(...) only made a
# redundant copy.
step_0 = [
    ["agent", "atlocation", "room_000", {"current_time": 0}],
    ["room_000", "east", "room_001", {"current_time": 0}],
    ["dep_001", "atlocation", "room_000", {"current_time": 0}],
    ["room_000", "west", "wall", {"current_time": 0}],
    ["dep_007", "atlocation", "room_000", {"current_time": 0}],
    ["room_000", "north", "wall", {"current_time": 0}],
    ["room_000", "south", "room_004", {"current_time": 0}],
]
step_1 = [
    ["room_001", "south", "room_005", {"current_time": 1}],
    ["agent", "atlocation", "room_001", {"current_time": 1}],
    ["room_001", "west", "room_000", {"current_time": 1}],
    ["room_001", "north", "wall", {"current_time": 1}],
    ["room_001", "east", "wall", {"current_time": 1}],
    ["dep_007", "atlocation", "room_000", {"timestamp": [0]}],
    ["room_000", "north", "wall", {"strength": 1}],
]
step_2 = [
    ["room_005", "east", "room_006", {"current_time": 2}],
    ["agent", "atlocation", "room_005", {"current_time": 2}],
    ["room_005", "north", "room_001", {"current_time": 2}],
    ["room_005", "south", "wall", {"current_time": 2}],
    ["room_005", "west", "room_004", {"current_time": 2}],
    ["dep_007", "atlocation", "room_000", {"timestamp": [0]}],
    ["room_000", "north", "wall", {"strength": 1}],
    ["agent", "atlocation", "room_001", {"strength": 1}],
    ["room_001", "west", "room_000", {"timestamp": [1]}],
    ["room_001", "north", "wall", {"strength": 1}],
    ["room_001", "east", "wall", {"timestamp": [1]}],
]
step_3 = [
    ["agent", "atlocation", "room_006", {"current_time": 3}],
    ["room_006", "north", "wall", {"current_time": 3}],
    ["sta_004", "atlocation", "room_006", {"current_time": 3}],
    ["room_006", "south", "room_010", {"current_time": 3}],
    ["room_006", "west", "room_005", {"current_time": 3}],
    ["room_006", "east", "room_007", {"current_time": 3}],
    ["dep_007", "atlocation", "room_000", {"timestamp": [0]}],
    ["room_000", "north", "wall", {"strength": 1}],
    ["agent", "atlocation", "room_001", {"strength": 1}],
    ["room_001", "west", "room_000", {"timestamp": [1]}],
    ["room_001", "north", "wall", {"strength": 1}],
    ["room_001", "east", "wall", {"timestamp": [1]}],
    ["room_005", "east", "room_006", {"timestamp": [2]}],
    ["room_005", "north", "room_001", {"strength": 1}],
]

# The steps have 7, 7, 11 and 14 rows, so they cannot form a rectangular
# array; dtype=object keeps each step as one Python list element.
ragged_steps = np.array([step_0, step_1, step_2, step_3], dtype=object)
ragged_steps

(4,)

In [None]:
# Scratch data: ten [entity, "atlocation", "?", 0] quadruples -- presumably
# sample questions asking where each entity is; confirm against the env.
queried_entities = [
    "sta_006",
    "ind_001",
    "sta_000",
    "sta_002",
    "sta_003",
    "dep_005",
    "sta_004",
    "dep_002",
    "dep_005",
    "dep_005",
]
[[entity, "atlocation", "?", 0] for entity in queried_entities]

In [2]:
# Display the agent object. The dangling attribute access (`agent.`) was a
# syntax error, so the cell could not run as written.
agent

<agent.dqn.dqn.DQNAgent at 0x72006a157730>

In [6]:
# Display the entity names held by the unwrapped environment.
unwrapped_env = agent.env.unwrapped
unwrapped_env.entities


{'static': ['sta_000',
  'sta_001',
  'sta_002',
  'sta_003',
  'sta_004',
  'sta_005',
  'sta_006',
  'sta_007'],
 'independent': ['ind_000',
  'ind_001',
  'ind_002',
  'ind_003',
  'ind_004',
  'ind_005',
  'ind_006',
  'ind_007'],
 'dependent': ['dep_000',
  'dep_001',
  'dep_002',
  'dep_003',
  'dep_004',
  'dep_005',
  'dep_006',
  'dep_007'],
 'agent': ['agent'],
 'room': ['room_000',
  'room_001',
  'room_002',
  'room_003',
  'room_004',
  'room_005',
  'room_006',
  'room_007',
  'room_008',
  'room_009',
  'room_010',
  'room_011',
  'room_012',
  'room_013',
  'room_014',
  'room_015',
  'room_016',
  'room_017',
  'room_018',
  'room_019',
  'room_020',
  'room_021',
  'room_022',
  'room_023',
  'room_024',
  'room_025',
  'room_026',
  'room_027',
  'room_028',
  'room_029',
  'room_030',
  'room_031'],
 'others': ['wall']}

In [None]:
# The comprehension had no element expression, which is a syntax error.
# NOTE(review): yielding (foo, bar) pairs is a guess at the intended element.
pairs = [(foo, bar) for foo in range(10) for bar in range(10)]
pairs