## Compare the results of handcrafted, baselines, and HumemAI


In [22]:
from typing import Literal
import shutil
import os
from glob import glob
from humemai.utils import read_yaml
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from typing import Literal
import pandas as pd


def get_handcrafted(
    size: Literal["xxs", "xs", "s", "m", "l", "xl", "xxl"],
    include_pretrain_semantic: bool = True,
    base_path: str = "training-results/non-equal-object-probs/",
) -> pd.DataFrame:
    """Load the hand-crafted results for one room size, best row per capacity.

    Reads ``handcrafted/hand-crafted-results-room_size={size}.csv`` under
    ``base_path``, keeps, for every ``long_capacity``, the row with the highest
    ``test_mean``, and reshapes the frame so that its columns line up with the
    learned-agent results.

    Observation statistics noted by the original author:

    room_size=xxs   num_obs=6.0     max_obs=6   min_obs=6
    room_size=xs    num_obs=6.52    max_obs=8   min_obs=5
    room_size=s     num_obs=5.64    max_obs=7   min_obs=5
    room_size=m     num_obs=6.3     max_obs=10  min_obs=5
    room_size=l     num_obs=5.32    max_obs=8   min_obs=5
    room_size=xl    num_obs=5.58    max_obs=7   min_obs=5
    room_size=xxl   num_obs=6.0     max_obs=8   min_obs=5

    Args:
        size: room size
        include_pretrain_semantic: if False, only rows whose
            ``pretrain_semantic`` value is False (bool or the string "False",
            case-insensitive) are considered.
        base_path: directory that contains the ``handcrafted`` folder.

    Returns:
        One row per capacity with the columns ``test``, ``std_test``, ``val``,
        ``test_mm``, ``val_mm``, ``#_runs``, ``capacity``, ``agent_type``,
        ``pretrain_semantic``, ``semantic_decay_factor``, ``room_size``,
        ``history_block_size``, ``mm_policy``, ``qa_function`` and
        ``explore_policy``.

    """
    csv_path = os.path.join(
        base_path, f"handcrafted/hand-crafted-results-room_size={size}.csv"
    )
    df = pd.read_csv(csv_path)

    if not include_pretrain_semantic:
        pretrain = df["pretrain_semantic"]
        # read_csv yields a bool column when the file only holds True/False and
        # a string column when other labels are mixed in. ``.str`` is only
        # available on string data, so normalise to str before lower-casing.
        not_pretrained = pretrain.eq(False) | (
            pretrain.astype(str).str.lower() == "false"
        )
        df = df[not_pretrained]

    # Detach from the (possibly filtered) source frame before adding columns.
    df = df.copy()
    df["test_mean"] = pd.to_numeric(df["test_mean"], errors="coerce")

    # Best test score first within each capacity, then keep that first row.
    df = (
        df.sort_values(by=["long_capacity", "test_mean"], ascending=[True, False])
        .groupby("long_capacity")
        .first()
        .reset_index()
    )

    df["room_size"] = size

    # Align the column names with the learned-agent results.
    df = df.rename(
        columns={
            "long_capacity": "capacity",
            "test_mean": "test",
            "test_std": "std_test",
        }
    )
    df = df.drop(columns=["episodic_capacity", "semantic_capacity", "short_capacity"])

    # Columns that only exist for learned agents are filled with placeholders.
    df["val"] = np.nan
    df["test_mm"] = np.nan
    df["val_mm"] = np.nan
    df["agent_type"] = "handcrafted"
    df["history_block_size"] = np.nan
    df["#_runs"] = 5
    # NOTE(review): "terminates_at" is not part of the column selection below,
    # so it does not appear in the returned frame - confirm whether it should.
    df["terminates_at"] = 99

    return df[
        [
            "test",
            "std_test",
            "val",
            "test_mm",
            "val_mm",
            "#_runs",
            "capacity",
            "agent_type",
            "pretrain_semantic",
            "semantic_decay_factor",
            "room_size",
            "history_block_size",
            "mm_policy",
            "qa_function",
            "explore_policy",
        ]
    ]


def determine_hyper_parameters(train: dict) -> dict:
    """Condense a training configuration into the hyper parameters compared on.

    A configuration with a ``capacity`` entry is classified as "semantic",
    "episodic" or "hybrid" from its episodic / semantic capacities; one without
    is treated as a "baseline" whose capacity is ``6 * history_block_size``.

    Args:
        train: training configuration (nested dict).

    Returns:
        Flat dict of hyper parameters.

    Raises:
        ValueError: if the capacities match none of the known agent types.
    """
    if "capacity" in train:
        memory = train["capacity"]
        episodic, semantic = memory["episodic"], memory["semantic"]
        summary = {"capacity": episodic + semantic}

        if episodic == 0:
            summary["agent_type"] = "semantic"
        elif semantic == 0:
            summary["agent_type"] = "episodic"
        elif episodic > 0 and semantic > 0:
            summary["agent_type"] = "hybrid"
        else:
            raise ValueError("Unknown agent type")

        summary["pretrain_semantic"] = train["pretrain_semantic"]

        # The decay factor is only recorded when there is a semantic memory.
        if "semantic_decay_factor" in train and semantic > 0:
            summary["semantic_decay_factor"] = train["semantic_decay_factor"]
    else:
        summary = {
            "capacity": 6 * int(train["history_block_size"]),
            "history_block_size": int(train["history_block_size"]),
            "agent_type": "baseline",
            "pretrain_semantic": False,
        }

    # (output key, key path into ``train``), in the order the keys are emitted.
    shared_lookups = (
        ("room_size", ("env_config", "room_size")),
        ("num_iterations", ("num_iterations",)),
        ("replay_buffer_size", ("replay_buffer_size",)),
        ("warm_start", ("warm_start",)),
        ("terminates_at", ("env_config", "terminates_at")),
        ("target_update", ("target_update_interval",)),
        ("min_epsilon", ("min_epsilon",)),
        ("embedding_dim", ("lstm_params", "embedding_dim")),
        ("relu_for_attention", ("lstm_params", "relu_for_attention")),
        ("concat_embeddings", ("lstm_params", "concat_embeddings")),
    )
    for name, key_path in shared_lookups:
        node = train
        for key in key_path:
            node = node[key]
        summary[name] = node

    # ``gamma`` is either one value or a dict with "mm" and "explore" entries.
    gamma = train["gamma"]
    if isinstance(gamma, dict):
        summary["gamma_mm"] = gamma["mm"]
        summary["gamma_explore"] = gamma["explore"]
    else:
        summary["gamma_explore"] = gamma

    return summary


def nanmean(data):
    """Return the mean of ``data`` rounded to an integer, or None if any value is NaN."""
    if np.isnan(data).any():
        return None
    return round(np.mean(data))


def nanstd(data):
    """Return the standard deviation of ``data`` rounded to an integer, or None if any value is NaN."""
    if np.isnan(data).any():
        return None
    return round(np.std(data))


def nanmax(data):
    """Return the maximum of ``data`` rounded to an integer, or None if any value is NaN."""
    if np.isnan(data).any():
        return None
    return round(np.max(data))


def nanmin(data):
    """Return the minimum of ``data`` rounded to an integer, or None if any value is NaN."""
    if np.isnan(data).any():
        return None
    return round(np.min(data))


def get_dataframe(
    room_size: Literal["xxs", "xs", "s", "m", "l", "xl", "xxl"],
    base_path: str = "training-results/non-equal-object-probs/",
) -> pd.DataFrame:
    """Aggregate baseline and DQN training results for one room size.

    Result files are searched under ``base_path`` at
    ``baselines/room_size={room_size}/*/*/results.yaml`` and
    ``dqn/room_size={room_size}/*/*/explore/results.yaml``. Runs whose hyper
    parameters (as returned by ``determine_hyper_parameters``) are identical
    are grouped together and their scores are averaged with ``nanmean``.

    Args:
        room_size: room size
        base_path: directory that contains the ``baselines`` and ``dqn``
            folders. It may have any depth.

    Returns:
        One row per hyper-parameter combination, sorted by ascending
        ``capacity`` and descending ``test``. An empty DataFrame is returned
        when no result file is found.
    """
    baseline_paths = glob(
        os.path.join(base_path, f"baselines/room_size={room_size}/*/*/results.yaml")
    )
    dqn_paths = glob(
        os.path.join(base_path, f"dqn/room_size={room_size}/*/*/explore/results.yaml")
    )

    # Remember which pattern matched instead of searching for "baseline" in the
    # full path, which misfires when base_path itself contains that word.
    tagged_paths = [(path, True) for path in baseline_paths]
    tagged_paths += [(path, False) for path in dqn_paths]

    if not tagged_paths:
        return pd.DataFrame()

    final = {}
    for path, is_baseline in tqdm(tagged_paths):
        # Baselines keep results.yaml directly in the run directory, DQN runs
        # keep it one level deeper (explore/ next to mm/).
        if is_baseline:
            run_dir = os.path.dirname(path)
        else:
            run_dir = os.path.dirname(os.path.dirname(path))

        result_explore = read_yaml(path)
        val_score_explore = max(
            entry["mean"] for entry in result_explore["validation_score"]
        )
        test_score_explore = result_explore["test_score"]["mean"]

        if is_baseline:
            # Baselines have no memory-management results.
            val_score_mm = np.nan
            test_score_mm = np.nan
        else:
            result_mm = read_yaml(os.path.join(run_dir, "mm", "results.yaml"))
            val_score_mm = max(entry["mean"] for entry in result_mm["validation_score"])
            test_score_mm = result_mm["test_score"]["mean"]

        train = read_yaml(os.path.join(run_dir, "train.yaml"))
        train["attention"] = "attention" in path

        hp = determine_hyper_parameters(train)

        # Identify the run by the last dot-separated part of its directory
        # name. Derived from the file location so that it does not depend on
        # how many components base_path has.
        run_id = os.path.basename(run_dir).split(".")[-1]

        # Runs with identical hyper parameters share one bucket.
        bucket = final.setdefault(
            str(hp),
            {
                "val_explore": [],
                "test_explore": [],
                "val_mm": [],
                "test_mm": [],
                "hyper_parameters": hp,
                "path": [],
            },
        )
        bucket["val_explore"].append(val_score_explore)
        bucket["test_explore"].append(test_score_explore)
        bucket["val_mm"].append(val_score_mm)
        bucket["test_mm"].append(test_score_mm)
        bucket["path"].append(run_id)

    # Hyper-parameter columns, in output order. Missing entries become None.
    hp_columns = (
        "capacity",
        "agent_type",
        "pretrain_semantic",
        "semantic_decay_factor",
        "room_size",
        "history_block_size",
        "mm_policy",
        "qa_function",
        "explore_policy",
        "num_iterations",
        "replay_buffer_size",
        "warm_start",
        "terminates_at",
        "target_update",
        "gamma_mm",
        "gamma_explore",
        "embedding_dim",
        "concat_embeddings",
        "relu_for_attention",
    )

    records = []
    for data in final.values():
        hp = data["hyper_parameters"]
        record = {
            "test": nanmean(data["test_explore"]),
            "val": nanmean(data["val_explore"]),
            "test_mm": nanmean(data["test_mm"]),
            "val_mm": nanmean(data["val_mm"]),
            "#_runs": len(data["test_explore"]),
        }
        record.update({column: hp.get(column, None) for column in hp_columns})
        record["path"] = data["path"]
        records.append(record)

    return pd.DataFrame(records).sort_values(
        by=["capacity", "test"],
        ascending=[True, False],
    )


# Function to add blank rows and flag them
def add_blank_rows_and_flag(df):
    """Insert a flagged blank row wherever ``capacity`` changes between rows.

    Every original row is copied with ``flag`` set to False. Before the first
    row of each new ``capacity`` value (except the very first row) a row of
    empty strings with ``flag`` set to True is inserted.

    Args:
        df: frame with a ``capacity`` column, iterated in its current order.

    Returns:
        A new frame with the extra ``flag`` column and a fresh 0..n-1 index.
    """
    empty_cells = dict.fromkeys(df.columns, "")
    pieces = []
    last_capacity = None

    for _, record in df.iterrows():
        current_capacity = record["capacity"]
        if last_capacity is not None and current_capacity != last_capacity:
            # Capacity changed: put a separator in front of the new group.
            spacer = pd.Series(empty_cells)
            spacer["flag"] = True
            pieces.append(spacer)

        flagged = record.copy()
        flagged["flag"] = False
        pieces.append(flagged)
        last_capacity = current_capacity

    return pd.DataFrame(pieces).reset_index(drop=True)


# Function to highlight the flagged rows
def highlight_blank_rows(row):
    """Return one CSS string per cell: yellow background if ``row.flag`` is truthy, else empty."""
    css = "background-color: yellow" if row.flag else ""
    return [css] * len(row)


def get_all_data(
    size: Literal["xxs", "xs", "s", "m", "l", "xl", "xxl"],
    include_pretrain_semantic: bool = True,
    base_path: str = "training-results/non-equal-object-probs/",
) -> "tuple[pd.DataFrame, pd.io.formats.style.Styler]":
    """Combine hand-crafted and learned-agent results for one room size.

    Args:
        size: room size
        include_pretrain_semantic: forwarded to ``get_handcrafted``.
        base_path: results directory, forwarded to ``get_handcrafted`` and
            ``get_dataframe``.

    Returns:
        A pair ``(df, styled)``: ``df`` is the combined frame sorted by
        ascending ``capacity`` and descending test score, with shortened
        column names; ``styled`` is the same data with a highlighted blank
        row between capacities, formatted with two decimals and "NaN" for
        missing values.
    """
    handcrafted = get_handcrafted(
        size, include_pretrain_semantic=include_pretrain_semantic, base_path=base_path
    )
    learned = get_dataframe(size, base_path=base_path)
    df = pd.concat([handcrafted, learned], ignore_index=True)

    # Sort before renaming: the sort key still uses the long column name.
    df = df.sort_values(
        by=["capacity", "test"],
        ascending=[True, False],
    )

    # Shorter headers keep the rendered table narrow.
    df = df.rename(
        columns={
            "semantic_decay_factor": "sem_decay",
            "test": "test_explore",
            "val": "val_explore",
            "pretrain_semantic": "pretrain_sem",
            "history_block_size": "history",
        }
    )

    # Separate the capacities visually, highlight the separators, then format.
    styled = (
        add_blank_rows_and_flag(df)
        .style.apply(highlight_blank_rows, axis=1)
        .format(na_rep="NaN", precision=2)
    )

    return df, styled


def filter_paths(
    room_size: Literal["xxs", "xs", "s", "m", "l", "xl", "xxl"],
    agent_type: Literal["baseline", "episodic", "semantic", "hybrid"],
    num_iterations: int,
    capacity: int,
    semantic_decay_factor: float,
    base_path: str,
) -> list:
    """List the result files whose training configuration matches the given values.

    Candidates are the baseline ``results.yaml`` and DQN
    ``explore/results.yaml`` files for ``room_size`` under ``base_path``. A
    candidate is kept when the hyper parameters derived from its ``train.yaml``
    equal ``num_iterations``, ``semantic_decay_factor``, ``agent_type`` and
    ``capacity``. A hyper parameter that is absent compares as None.

    Returns:
        The matching result-file paths (empty list if nothing is found).
    """
    baseline_pattern = os.path.join(
        base_path, f"baselines/room_size={room_size}/*/*/results.yaml"
    )
    dqn_pattern = os.path.join(
        base_path, f"dqn/room_size={room_size}/*/*/explore/results.yaml"
    )
    candidates = glob(baseline_pattern) + glob(dqn_pattern)

    if len(candidates) == 0:
        return []

    required = {
        "num_iterations": num_iterations,
        "semantic_decay_factor": semantic_decay_factor,
        "agent_type": agent_type,
        "capacity": capacity,
    }

    matches = []
    for result_path in tqdm(candidates):
        # train.yaml sits next to results.yaml for baselines and one level
        # above explore/ for DQN runs.
        results_suffix = (
            "results.yaml" if "baseline" in result_path else "explore/results.yaml"
        )
        train_config = read_yaml(result_path.replace(results_suffix, "train.yaml"))
        hp = determine_hyper_parameters(train_config)

        if any(hp.get(name) != wanted for name, wanted in required.items()):
            continue

        matches.append(result_path)

    return matches

In [32]:
# filter_paths(
#     room_size="xl-different-prob",
#     num_iterations=10000,
#     agent_type="hybrid",
#     capacity=12,
#     semantic_decay_factor=0.99,
#     base_path="training-results/non-equal-object-probs/",
# )

# for foo in filter_paths_by_num_iterations("xl-different-prob"):
#     dir_path = '/'.join(foo.split('/')[:-2])
#     shutil.rmtree(dir_path)

# # save dataframe as markdown
# df.to_markdown("./hp-tuning/xl-different-prob.md", index=False)

# Collect every result for the "xl" room from the equal-object-probs runs.
df, df_styled = get_all_data(
    size="xl",
    base_path="training-results/equal-object-probs/",
    include_pretrain_semantic=True,
)
# Bare expression so that the notebook renders the styled table as cell output.
df_styled

100%|██████████| 48/48 [00:09<00:00,  4.99it/s]


Unnamed: 0,test_explore,std_test,val_explore,test_mm,val_mm,#_runs,capacity,agent_type,pretrain_sem,sem_decay,room_size,history,mm_policy,qa_function,explore_policy,num_iterations,replay_buffer_size,warm_start,terminates_at,target_update,gamma_mm,gamma_explore,embedding_dim,concat_embeddings,relu_for_attention,path,flag
0,29.8,0.0,,,,5.0,2.0,handcrafted,False,0.99,xl,,semantic,semantic,avoid_walls,,,,,,,,,,,,False
1,,,,,,,,,,,,,,,,,,,,,,,,,,,True
2,50.7,0.0,,,,5.0,6.0,handcrafted,False,0.99,xl,,semantic,semantic,avoid_walls,,,,,,,,,,,,False
3,,,,,,,,,,,,,,,,,,,,,,,,,,,True
4,90.0,0.0,,,,5.0,12.0,handcrafted,False,0.8,xl,,handcrafted,episodic_semantic,avoid_walls,,,,,,,,,,,,False
5,,,,,,,,,,,,,,,,,,,,,,,,,,,True
6,149.6,0.0,,,,5.0,24.0,handcrafted,include_walls,0.8,xl,,handcrafted,episodic_semantic,avoid_walls,,,,,,,,,,,,False
7,103.0,,118.0,115.0,134.0,18.0,24.0,hybrid,False,0.8,xl,,,,,5000.0,99.0,99.0,99.0,100.0,0.99,0.99,32.0,True,False,"['979854', '791320', '044073', '960173', '489623', '411093', '046225', '388832', '050514', '297062', '356409', '599858', '886241', '394947', '490795', '397670', '097899', '512204']",False
8,103.0,,107.0,96.0,99.0,2.0,24.0,hybrid,False,0.8,xl,,,,,1000.0,99.0,99.0,99.0,100.0,0.99,0.99,32.0,True,False,"['454517', '412807']",False
9,97.0,,110.0,127.0,137.0,18.0,24.0,hybrid,False,0.8,xl,,,,,5000.0,99.0,99.0,99.0,100.0,0.99,1.0,32.0,True,False,"['106983', '049736', '302274', '912739', '006393', '113599', '944326', '640259', '100701', '228064', '668773', '432706', '958182', '062747', '263970', '844072', '820765', '541618']",False
