In [1]:
import nle.dataset as nld

In [2]:
nld_nao_path = "/code/nld-nao/nld-nao-unzipped"

dbfilename = "/code/NetHack-Research/data/raw/nld-nao.db"

dataset_name = "nld-nao"

In [3]:
if not nld.db.exists(dbfilename):
    nld.db.create(dbfilename)
    # Add NLD-NAO data, use the `add_altorg_directory`.
    nld.add_altorg_directory(nld_nao_path, "nld-nao", dbfilename)
else:
    print(f"Database already exists: {dbfilename}")
    
# Connect Database and print games to verify
db_conn = nld.db.connect(filename=dbfilename)
print(f"NLD-NAO Database contains {nld.db.count_games('nld-nao', conn=db_conn)} games.")

Database already exists: /code/NetHack-Research/data/raw/nld-nao.db
NLD-NAO Database contains 1511228 games.


In [16]:
random_sample = nld.TtyrecDataset(
    "nld-nao",
    batch_size=32,
    seq_length=32,
    dbfilename=dbfilename,
)

minibatch = next(iter(random_sample))
minibatch.keys()
# Show structure of data

dict_keys(['tty_chars', 'tty_colors', 'tty_cursor', 'timestamps', 'done', 'gameids'])

In [17]:
print(minibatch["gameids"])

[[2795346 2795346 2795346 ... 2795346 2795346 2795346]
 [4509335 4509335 4509335 ... 4509335 4509335 4509335]
 [4280847 4280847 4280847 ... 4280847 4280847 4280847]
 ...
 [2704779 2704779 2704779 ... 2704779 2704779 2704779]
 [2757287 2757287 2757287 ... 2757287 2757287 2757287]
 [6416377 6416377 6416377 ... 6416377 6416377 6416377]]


In [18]:
import h5py
import numpy as np

# Define output HDF5 file
HDF5_FILE = "/data/processed/data-mon-hum-neu-any.hdf5"

def save_to_hdf5(minibatch, output_file):
    with h5py.File(output_file, "w") as hdf5_file:
        unique_game_ids = np.unique(minibatch["gameids"])

        for game_id in unique_game_ids:
            mask = minibatch["gameids"] == game_id

            grp = hdf5_file.create_group(str(game_id))  # Store each game separately
            grp.create_dataset("tty_chars", data=minibatch["tty_chars"][mask])
            grp.create_dataset("tty_colors", data=minibatch["tty_colors"][mask])
            grp.create_dataset("tty_cursor", data=minibatch["tty_cursor"][mask[0]])
            grp.create_dataset("timestamps", data=minibatch["timestamps"][mask])
            grp.create_dataset("done", data=minibatch["done"][mask].astype(bool))

        print(f"Saved {len(unique_game_ids)} game trajectories to {output_file}")


# Run function to save minibatch
save_to_hdf5(minibatch, HDF5_FILE)

Saved 32 game trajectories to /code/NetHack-Research/data/raw/data-mon-hum-neu-any.hdf5


In [21]:
# Open the HDF5 file and list all groups (game IDs)
with h5py.File(HDF5_FILE, "r") as hdf5_file:
    print("Dataset Structure:")
    for game_id in hdf5_file.keys():
        print(f"Game ID: {game_id}, Keys: {list(hdf5_file[game_id].keys())}")

        # Print the shape of each dataset in this trajectory
        for key in hdf5_file[game_id].keys():
            print(f"  {key}: {hdf5_file[game_id][key].shape}, dtype={hdf5_file[game_id][key].dtype}")
        
        break  # Only print one sample game to avoid excessive output

Dataset Structure:
Game ID: 1964101, Keys: ['done', 'timestamps', 'tty_chars', 'tty_colors', 'tty_cursor']
  done: (32,), dtype=bool
  timestamps: (32,), dtype=int64
  tty_chars: (32, 24, 80), dtype=uint8
  tty_colors: (32, 24, 80), dtype=int8
  tty_cursor: (0, 32, 2), dtype=int16


In [14]:
from katakomba.env import NetHackChallenge, OfflineNetHackChallengeWrapper
from katakomba.utils.datasets import SequentialBuffer

# The task is specified using the character field
env = NetHackChallenge (
  character = "mon-hum-neu",
  observation_keys = ["tty_chars", "tty_colors", "tty_cursor"]
)

# A convenient wrapper that provides interfaces for dataset loading, score normalization, and deathlevel extraction
env = OfflineNetHackChallengeWrapper(env)

# Several options for dataset reading (check the paper for details): 
# - from RAM, decompressed ("in_memory"): fast but requires a lot of RAM, takes 5-10 minutes for decompression first
# - from Disk, decompressed ("memmap"): a bit slower than RAM, takes 5-10 minutes for decompression first
# - from Disk, compressed ("compressed"): very slow but no need for decompression, useful for debugging
# Note that this will download the dataset automatically if not found
dataset = env.get_dataset(mode="compressed", scale="small")


print(f"Successfully loaded dataset with {len(dataset)} episodes")
# Test getting an episode
episode = dataset[0]
print(f"Episode 0 has {len(episode['tty_chars'])} frames and keys: {list(episode.keys())}")


Preparing:   0%|          | 0/683 [00:00<?, ?it/s]

Successfully loaded dataset with 683 episodes
Episode 0 has 17671 frames and keys: ['actions', 'dones', 'rewards', 'tty_chars', 'tty_colors', 'tty_cursor']


In [12]:
buffer = SequentialBuffer(
  dataset=dataset,
  seq_len=32,
  batch_size=32, # Each batch element is a different trajectory
  seed=42,
  add_next_step=True # if you want (s, a, r, s') instead of (s, a, r)
)

# What's inside the batch?
# Note that the next batch will include the +1 element as expected
batch = buffer.sample()
print(
  batch["tty_chars"],  # [batch_size, seq_len + 1, 80, 24]
  batch["tty_colors"], # [batch_size, seq_len + 1, 80, 24]
  batch["tty_cursor"], # [batch_size, seq_len + 1, 2]
  batch["actions"],    # [batch_size, seq_len + 1]
  batch["rewards"],    # [batch_size, seq_len + 1]
  batch["dones"]       # [batch_size, seq_len + 1]
)

print(f"Batch keys: {list(batch.keys())}")
print(f"Batch shapes: {[(k, v.shape) for k, v in batch.items()]}")
# In case you don't want to store the decompressed dataset beyond code execution
dataset.close()

[[[[ 72 101 108 ...  32  32  32]
   [ 32  32  32 ...  32  32  32]
   [ 32  32  32 ...  32  32  32]
   ...
   [ 32  32  32 ...  32  32  32]
   [ 65 103 101 ...  83  58  32]
   [ 68 108 118 ...  32  32  32]]

  [[ 32  32  32 ...  32  32  32]
   [ 32  32  32 ...  32  32  32]
   [ 32  32  32 ...  32  32  32]
   ...
   [ 32  32  32 ...  32  32  32]
   [ 65 103 101 ...  83  58  32]
   [ 68 108 118 ...  32  32  32]]

  [[ 32  32  32 ...  32  32  32]
   [ 32  32  32 ...  32  32  32]
   [ 32  32  32 ...  32  32  32]
   ...
   [ 32  32  32 ...  32  32  32]
   [ 65 103 101 ...  83  58  32]
   [ 68 108 118 ...  32  32  32]]

  ...

  [[ 32  32  32 ...  32  32  32]
   [ 32  32  32 ...  32  32  32]
   [ 32  32  32 ...  32  32  32]
   ...
   [ 32  32  32 ...  32  32  32]
   [ 68 105 115 ...  32  32  32]
   [ 32  45  45 ...  32  32  32]]

  [[ 32  32 115 ...  32  32  32]
   [ 83 112 101 ...  32  32  32]
   [ 32  32 115 ...  32  32  32]
   ...
   [ 32  32  32 ...  32  32  32]
   [ 32  32  32 ...  32  3

In [32]:
import os
import shutil
from katakomba.utils.datasets.small_scale import CACHE_PATH

# Print the cache path to verify
print(f"Cache path: {CACHE_PATH}")

# Check if cache directory exists
if os.path.exists(CACHE_PATH):
    # List contents before deletion
    print(f"Cache contents before clearing: {os.listdir(CACHE_PATH)}")
    
    # Remove all contents in the cache directory
    for item in os.listdir(CACHE_PATH):
        item_path = os.path.join(CACHE_PATH, item)
        if os.path.isdir(item_path):
            shutil.rmtree(item_path)
        else:
            os.remove(item_path)
    
    print("Cache cleared successfully")
else:
    print("Cache directory does not exist yet")

Cache path: /home/danielolds/.katakomba/cache
Cache contents before clearing: []
Cache cleared successfully


In [36]:
from katakomba.utils.roles import Role, Race, Alignment
from katakomba.utils.datasets.small_scale import NLDSmallDataset, load_nld_aa_small_dataset
import os

# Try direct loading first
try:
    dataset = NLDSmallDataset(
        role=Role.MONK,
        race=Race.HUMAN,
        align=Alignment.NEUTRAL,
        mode="compressed"  # or "compressed"
    )
    print(f"Successfully loaded dataset with {len(dataset)} episodes")
    
    # Test getting an episode
    episode = dataset[0]
    print(f"Episode 0 has {len(episode['tty_chars'])} frames and keys: {list(episode.keys())}")
    
except Exception as e:
    print(f"Error loading dataset: {e}")

Preparing:   0%|          | 0/32 [00:00<?, ?it/s]

Successfully loaded dataset with 32 episodes
Episode 0 has 32 frames and keys: ['done', 'timestamps', 'tty_chars', 'tty_colors', 'tty_cursor']


In [35]:
from katakomba_copy.env import NetHackChallenge, OfflineNetHackChallengeWrapper
from katakomba_copy.utils.datasets import SequentialBuffer

buffer = SequentialBuffer(
    dataset=dataset,
    seq_len=16,
    batch_size=32,
    seed=42,
    add_next_step=True  # For inverse modeling
)

# Test sampling a batch
batch = buffer.sample()
print(f"Batch keys: {list(batch.keys())}")
print(f"Batch shapes: {[(k, v.shape) for k, v in batch.items()]}")

KeyError: 'actions'

In [23]:
from katakomba_copy.utils.datasets.small_scale import load_nld_aa_small_dataset

try:
    # Directly load the dataset from your local file
    dataset_path = "/data/processed/data-mon-hum-neu-any.hdf5"
    hdf5_file, trajectories = load_nld_aa_small_dataset(mode="in_memory")

    print(f"Successfully loaded dataset with {len(trajectories)} episodes")
    
    # Test getting an episode
    first_episode_key = list(trajectories.keys())[0]  # Get the first episode ID
    episode = trajectories[first_episode_key]

    print(f"Episode {first_episode_key} has {len(episode['tty_chars'])} frames")
    print(f"Keys in episode data: {list(episode.keys())}")

except Exception as e:
    print(f"Error loading dataset: {e}")

Error loading dataset: load_nld_aa_small_dataset() missing 3 required positional arguments: 'role', 'race', and 'align'
