## Compare the results of handcrafted, baselines, and HumemAI


In [None]:
from typing import Literal
import os
from glob import glob
from humemai.utils import read_yaml
import pandas as pd
import numpy as np

from typing import Literal
import pandas as pd


def get_handcrafted(
    size: Literal["xxs", "xs", "s", "m", "l", "xl", "xxl"],
    include_pretrain_semantic: bool = False,
    base_path: str = "training-results/non-equal-object-probs/",
) -> pd.DataFrame:
    """Load the hand-crafted results and keep the best row per capacity.

    Reads ``handcrafted/hand-crafted-results-room_size={size}.csv`` under
    ``base_path``. For every distinct ``long_capacity`` the single row with
    the highest ``test_mean`` is kept, and the frame is reshaped to the 16
    columns returned below so it can be concatenated with other result
    tables.

    Notes kept from the original docstring:

        room_size=xxs   num_obs=6.0     max_obs=6   min_obs=6
        room_size=xs    num_obs=6.52    max_obs=8   min_obs=5
        room_size=s     num_obs=5.64    max_obs=7   min_obs=5
        room_size=m     num_obs=6.3     max_obs=10  min_obs=5
        room_size=l     num_obs=5.32    max_obs=8   min_obs=5
        room_size=xl    num_obs=5.58    max_obs=7   min_obs=5
        room_size=xxl   num_obs=6.0     max_obs=8   min_obs=5

    Args:
        size: room size. It is interpolated into the CSV file name and copied
            verbatim into the ``room_size`` column.
        include_pretrain_semantic: when False (default), rows whose
            ``pretrain_semantic`` value equals ``False`` are the only ones
            kept; when True, no filtering is applied.
        base_path: directory that contains the ``handcrafted/`` folder.

    Returns:
        One row per capacity, sorted by ascending capacity, with the columns
        ``test, std_test, val, test_mm, std_test_mm, val_mm, num_runs,
        capacity, agent_type, pretrain_semantic, semantic_decay_factor,
        room_size, history_block_size, mm_policy, qa_function,
        explore_policy``.

    Raises:
        FileNotFoundError: if the CSV file does not exist.
        KeyError: if the CSV lacks one of the columns used here.
    """
    csv_path = os.path.join(
        base_path, f"handcrafted/hand-crafted-results-room_size={size}.csv"
    )
    df = pd.read_csv(csv_path)

    if not include_pretrain_semantic:
        df = df[df["pretrain_semantic"].eq(False)]

    # assign() returns a new frame, so we never write into a filtered view
    # (which would trigger pandas' SettingWithCopyWarning).
    df = df.assign(test_mean=pd.to_numeric(df["test_mean"], errors="coerce"))

    # Pick the whole best row per capacity. GroupBy.first() is deliberately
    # avoided: it returns the first *non-null value of each column*, which can
    # blend fields of several configurations into one row. Rows without a
    # capacity are dropped, matching what groupby did with NaN keys.
    best = (
        df.dropna(subset=["long_capacity"])
        .sort_values(by=["long_capacity", "test_mean"], ascending=[True, False])
        .drop_duplicates(subset="long_capacity", keep="first")
        .reset_index(drop=True)
        .rename(
            columns={
                "long_capacity": "capacity",
                "test_mean": "test",
                "test_std": "std_test",
            }
        )
    )

    best["room_size"] = size

    # Columns that are not filled in for hand-crafted agents are set to NaN so
    # that the frame carries the full 16-column layout.
    best["val"] = np.nan
    best["test_mm"] = np.nan
    best["std_test_mm"] = np.nan
    best["val_mm"] = np.nan
    best["agent_type"] = "handcrafted"
    best["history_block_size"] = np.nan
    # The CSV is not consulted for a run count; the value is fixed at 5.
    # NOTE(review): presumably every configuration was run 5 times - confirm.
    best["num_runs"] = 5

    # Selecting the final columns also discards every other CSV column
    # (e.g. the per-memory-type capacities), so no explicit drop is needed.
    return best[
        [
            "test",
            "std_test",
            "val",
            "test_mm",
            "std_test_mm",
            "val_mm",
            "num_runs",
            "capacity",
            "agent_type",
            "pretrain_semantic",
            "semantic_decay_factor",
            "room_size",
            "history_block_size",
            "mm_policy",
            "qa_function",
            "explore_policy",
        ]
    ]


def determine_hyper_parameters(train: dict) -> dict:
    """Derive the hyper-parameters that identify a run from its train config.

    Args:
        train: training configuration. When it has a ``"capacity"`` entry,
            the ``"episodic"`` and ``"semantic"`` capacities decide the agent
            type; otherwise the run is labelled ``"baseline"`` and
            ``"history_block_size"`` is read instead. ``"env_config"`` must
            provide ``"room_size"`` in both cases.

    Returns:
        A dict with ``capacity``, ``agent_type``, ``pretrain_semantic`` and
        ``room_size``, plus ``semantic_decay_factor`` (only if present in
        ``train``) or ``history_block_size`` (baseline only).

    Raises:
        ValueError: if the capacities match none of the agent types, i.e.
            both are non-zero and at least one is not positive.
    """
    if "capacity" not in train:
        # Baseline run: the capacity is 6 times the history block size.
        block_size = int(train["history_block_size"])
        params = {
            "capacity": 6 * block_size,
            "history_block_size": block_size,
            "agent_type": "baseline",
            "pretrain_semantic": False,
        }
    else:
        episodic = train["capacity"]["episodic"]
        semantic = train["capacity"]["semantic"]
        total = episodic + semantic

        # A zero episodic capacity wins over a zero semantic capacity, so a
        # run with both at zero is labelled "semantic".
        if episodic == 0:
            agent_type = "semantic"
        elif semantic == 0:
            agent_type = "episodic"
        elif episodic > 0 and semantic > 0:
            agent_type = "hybrid"
        else:
            raise ValueError("Unknown agent type")

        params = {
            "capacity": total,
            "agent_type": agent_type,
            "pretrain_semantic": train["pretrain_semantic"],
        }
        if "semantic_decay_factor" in train:
            params["semantic_decay_factor"] = train["semantic_decay_factor"]

    params["room_size"] = train["env_config"]["room_size"]
    return params


def nanmean(data):
    """Return the mean of ``data`` rounded to the nearest integer.

    NaNs are not skipped (unlike ``numpy.nanmean``): if any element is NaN
    the result is ``None``.
    """
    if np.any(np.isnan(data)):
        return None
    return round(np.mean(data))


def nanstd(data):
    """Return the standard deviation of ``data`` rounded to the nearest integer.

    Uses ``numpy.std`` with its default arguments. NaNs are not skipped: if
    any element is NaN the result is ``None``.
    """
    if np.any(np.isnan(data)):
        return None
    return round(np.std(data))


def nanmax(data):
    """Return the maximum of ``data`` rounded to the nearest integer.

    NaNs are not skipped: if any element is NaN the result is ``None``.
    """
    if np.any(np.isnan(data)):
        return None
    return round(np.max(data))


def nanmin(data):
    """Return the minimum of ``data`` rounded to the nearest integer.

    NaNs are not skipped: if any element is NaN the result is ``None``.
    """
    if np.any(np.isnan(data)):
        return None
    return round(np.min(data))


def get_dataframe(
    room_size: Literal["xxs", "xs", "s", "m", "l", "xl", "xxl"],
    base_path: str = "training-results/non-equal-object-probs/",
) -> pd.DataFrame:
    """Collect the baseline and DQN run results for one room size.

    Every ``results.yaml`` matched under ``baselines/room_size={room_size}/``
    and every ``explore/results.yaml`` matched under
    ``dqn/room_size={room_size}/`` is read. For each run the validation score
    is the largest ``mean`` among its ``validation_score`` entries and the
    test score is ``test_score["mean"]``. Runs whose path does not contain
    ``"baseline"`` also contribute the scores of the sibling
    ``mm/results.yaml``; for the others those scores are NaN.

    Runs are pooled when ``determine_hyper_parameters`` yields dicts with the
    same string form, and each pool is summarised with ``nanmean`` and
    ``nanstd``.

    Args:
        room_size: room size, interpolated into the glob patterns.
        base_path: directory that contains ``baselines/`` and ``dqn/``.

    Returns:
        One row per hyper-parameter setting, sorted by ascending ``capacity``
        and then descending ``test``. An empty ``DataFrame`` (no columns) is
        returned when no result file is found.
    """
    baseline_pattern = os.path.join(
        base_path, f"baselines/room_size={room_size}/*/*/results.yaml"
    )
    dqn_pattern = os.path.join(
        base_path, f"dqn/room_size={room_size}/*/*/*/*/explore/results.yaml"
    )
    result_files = glob(baseline_pattern) + glob(dqn_pattern)

    if not result_files:
        return pd.DataFrame()

    pooled = {}
    for result_file in result_files:
        is_baseline = "baseline" in result_file

        explore = read_yaml(result_file)
        explore_val = max(entry["mean"] for entry in explore["validation_score"])
        explore_test = explore["test_score"]["mean"]

        if is_baseline:
            # Baseline runs have no separate "mm" results file.
            mm_val = np.nan
            mm_test = np.nan
            train_file = result_file.replace("results.yaml", "train.yaml")
        else:
            mm = read_yaml(
                result_file.replace("explore/results.yaml", "mm/results.yaml")
            )
            mm_val = max(entry["mean"] for entry in mm["validation_score"])
            mm_test = mm["test_score"]["mean"]
            train_file = result_file.replace("explore/results.yaml", "train.yaml")

        train = read_yaml(train_file)
        train["attention"] = "attention" in result_file

        run_hp = determine_hyper_parameters(train)

        # The dict's string form is the grouping key; the first dict seen for
        # a key is the one stored with the pool.
        pool = pooled.setdefault(
            str(run_hp),
            {
                "val_explore": [],
                "test_explore": [],
                "val_mm": [],
                "test_mm": [],
                "hyper_parameters": run_hp,
            },
        )
        pool["val_explore"].append(explore_val)
        pool["test_explore"].append(explore_test)
        pool["val_mm"].append(mm_val)
        pool["test_mm"].append(mm_test)

    records = []
    for pool in pooled.values():
        hp = pool["hyper_parameters"]
        records.append(
            {
                "test": nanmean(pool["test_explore"]),
                "std_test": nanstd(pool["test_explore"]),
                "val": nanmean(pool["val_explore"]),
                "test_mm": nanmean(pool["test_mm"]),
                "std_test_mm": nanstd(pool["test_mm"]),
                "val_mm": nanmean(pool["val_mm"]),
                "num_runs": len(pool["test_explore"]),
                "capacity": hp.get("capacity", None),
                "agent_type": hp.get("agent_type", None),
                "pretrain_semantic": hp.get("pretrain_semantic", None),
                "semantic_decay_factor": hp.get("semantic_decay_factor", None),
                "room_size": hp.get("room_size", None),
                "history_block_size": hp.get("history_block_size", None),
                "mm_policy": hp.get("mm_policy", None),
                "qa_function": hp.get("qa_function", None),
                "explore_policy": hp.get("explore_policy", None),
            }
        )

    summary = pd.DataFrame(records)
    return summary.sort_values(by=["capacity", "test"], ascending=[True, False])


def add_blank_rows_and_flag(df):
    """Insert a flagged blank row wherever ``capacity`` changes.

    The rows of ``df`` are walked in their current order. Each original row
    is copied with ``flag=False``; whenever a row's ``capacity`` differs from
    the previous row's, a separator row is emitted first, holding ``""`` in
    every column of ``df`` and ``flag=True``.

    Args:
        df: frame with a ``capacity`` column.

    Returns:
        A new frame with the separator rows interleaved, an added ``flag``
        column, and a fresh 0..n-1 index.
    """
    blank_values = {column: "" for column in df.columns}

    rows = []
    last_capacity = None
    for _, record in df.iterrows():
        capacity = record["capacity"]
        capacity_changed = last_capacity is not None and capacity != last_capacity
        if capacity_changed:
            separator = pd.Series(blank_values)
            separator["flag"] = True
            rows.append(separator)

        flagged = record.copy()
        flagged["flag"] = False
        rows.append(flagged)

        last_capacity = capacity

    return pd.DataFrame(rows).reset_index(drop=True)


def highlight_blank_rows(row):
    """Return one CSS string per cell, yellow for rows whose ``flag`` is truthy.

    Intended to be passed to ``Styler.apply(..., axis=1)``: the returned list
    has the same length as ``row``; every entry is
    ``"background-color: yellow"`` for a flagged row and ``""`` otherwise.
    """
    css = "background-color: yellow" if row.flag else ""
    return [css] * len(row)


def get_all_data(
    size: Literal["xxs", "xs", "s", "m", "l", "xl", "xxl"],
    base_path: str = "training-results/non-equal-object-probs/",
) -> "pd.io.formats.style.Styler":
    """Build the styled comparison table for one room size.

    The hand-crafted results (``get_handcrafted``) and the baseline/DQN
    results (``get_dataframe``) are stacked, sorted by ascending ``capacity``
    and descending ``test``, separated by a blank row at every capacity
    change, and those blank rows are highlighted.

    Args:
        size: room size, forwarded to both loaders.
        base_path: results directory, forwarded to both loaders.

    Returns:
        A pandas ``Styler`` (not a plain ``DataFrame``) wrapping the table
        with its ``flag`` column; blank separator rows are rendered with a
        yellow background.
    """
    handcrafted = get_handcrafted(size, base_path=base_path)
    learned = get_dataframe(size, base_path=base_path)

    # get_dataframe returns an empty frame when no runs exist. pandas has
    # deprecated letting empty frames take part in concat, so skip them
    # rather than rely on them being ignored.
    frames = [frame for frame in (handcrafted, learned) if not frame.empty]
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        # Nothing to stack: keep the (empty) hand-crafted frame so the
        # expected columns are still present for sorting.
        df = handcrafted

    df = df.sort_values(
        by=["capacity", "test"],
        ascending=[True, False],
    )

    # Add blank rows between capacity groups and flag them.
    df_with_blanks = add_blank_rows_and_flag(df)

    # Highlight the flagged rows.
    return df_with_blanks.style.apply(highlight_blank_rows, axis=1)

In [21]:
# Show the comparison table for room size "xxl" using the equal-object-probs results.
get_all_data("xxl", base_path="training-results/equal-object-probs/")

In [20]:
# Show the comparison table for "xxl-different-prob" using the non-equal-object-probs results.
# NOTE(review): the table captured below lists room_size=xl-different-prob, which
# does not match this call - presumably stale output; re-run the cell to confirm.
get_all_data("xxl-different-prob", base_path="training-results/non-equal-object-probs/")

Unnamed: 0,test,std_test,val,test_mm,std_test_mm,val_mm,num_runs,capacity,agent_type,pretrain_semantic,semantic_decay_factor,room_size,history_block_size,mm_policy,qa_function,explore_policy,flag
0,23.532,4.706223,,,,,5.0,2.0,handcrafted,False,1.0,xl-different-prob,,episodic,episodic,avoid_walls,False
1,,,,,,,,,,,,,,,,,True
2,163.0,19.0,182.0,148.0,19.0,173.0,5.0,6.0,episodic,False,1.0,xl-different-prob,,,,,False
3,162.0,40.0,169.0,105.0,20.0,130.0,5.0,6.0,hybrid,False,1.0,xl-different-prob,,,,,False
4,128.0,8.0,134.0,,,,5.0,6.0,baseline,False,1.0,xl-different-prob,1.0,,,,False
5,42.732,18.336065,,,,,5.0,6.0,handcrafted,False,0.99,xl-different-prob,,random,episodic_semantic,avoid_walls,False
6,36.0,7.0,41.0,107.0,20.0,122.0,5.0,6.0,semantic,False,1.0,xl-different-prob,,,,,False
7,,,,,,,,,,,,,,,,,True
8,194.0,29.0,214.0,191.0,42.0,223.0,5.0,12.0,episodic,False,1.0,xl-different-prob,,,,,False
9,160.0,30.0,175.0,105.0,37.0,136.0,5.0,12.0,hybrid,False,1.0,xl-different-prob,,,,,False


In [2]:
# Show the comparison table for room size "xl" using the equal-object-probs results.
get_all_data("xl", base_path="training-results/equal-object-probs/")

Unnamed: 0,test,std_test,val,test_mm,std_test_mm,val_mm,num_runs,capacity,agent_type,pretrain_semantic,semantic_decay_factor,room_size,history_block_size,mm_policy,qa_function,explore_policy,flag
0,31.134,6.179235,,,,,5.0,2.0,handcrafted,False,1.0,xl,,episodic,episodic,avoid_walls,False
1,,,,,,,,,,,,,,,,,True
2,90.0,11.0,103.0,93.0,18.0,112.0,6.0,6.0,episodic,False,1.0,xl,,,,,False
3,83.0,2.0,93.0,,,,5.0,6.0,baseline,False,,xl,1.0,,,,False
4,81.0,1.0,91.0,146.0,8.0,166.0,5.0,6.0,semantic,False,1.0,xl,,,,,False
5,81.0,3.0,88.0,115.0,15.0,123.0,6.0,6.0,hybrid,False,1.0,xl,,,,,False
6,49.868,9.652445,,,,,5.0,6.0,handcrafted,False,0.99,xl,,semantic,semantic,random,False
7,,,,,,,,,,,,,,,,,True
8,129.0,0.0,138.0,136.0,0.0,150.0,1.0,12.0,episodic,False,1.0,xl,,,,,False
9,107.066,12.385677,,,,,5.0,12.0,handcrafted,False,0.99,xl,,semantic,semantic,random,False


In [3]:
# Show the comparison table for "xl-different-prob" using the non-equal-object-probs results.
# NOTE(review): "xl-different-prob" is not among the sizes listed in get_all_data's
# Literal annotation; the call still runs because annotations are not enforced.
get_all_data("xl-different-prob", base_path="training-results/non-equal-object-probs/")

  df = pd.concat([df_1, df_2], ignore_index=True)


Unnamed: 0,test,std_test,val,test_mm,std_test_mm,val_mm,num_runs,capacity,agent_type,pretrain_semantic,semantic_decay_factor,room_size,history_block_size,mm_policy,qa_function,explore_policy,flag
0,23.8,4.715837,,,,,5.0,2.0,handcrafted,False,0.5,xl-different-prob,,semantic,semantic,avoid_walls,False
1,,,,,,,,,,,,,,,,,True
2,163.0,19.0,182.0,148.0,19.0,173.0,5.0,6.0,episodic,False,,xl-different-prob,,,,,False
3,162.0,40.0,169.0,105.0,20.0,130.0,5.0,6.0,hybrid,False,,xl-different-prob,,,,,False
4,128.0,8.0,134.0,,,,5.0,6.0,baseline,False,,xl-different-prob,1.0,,,,False
5,40.8,10.628787,,,,,5.0,6.0,handcrafted,False,0.5,xl-different-prob,,semantic,semantic,avoid_walls,False
6,36.0,7.0,41.0,107.0,20.0,122.0,5.0,6.0,semantic,False,,xl-different-prob,,,,,False
7,,,,,,,,,,,,,,,,,True
8,194.0,29.0,214.0,191.0,42.0,223.0,5.0,12.0,episodic,False,,xl-different-prob,,,,,False
9,160.0,30.0,175.0,105.0,37.0,136.0,5.0,12.0,hybrid,False,,xl-different-prob,,,,,False


In [4]:
# Show the comparison table for room size "xxl" using the equal-object-probs results.
get_all_data("xxl", base_path="training-results/equal-object-probs/")

Unnamed: 0,test,std_test,val,test_mm,std_test_mm,val_mm,num_runs,capacity,agent_type,pretrain_semantic,semantic_decay_factor,room_size,history_block_size,mm_policy,qa_function,explore_policy,flag
0,13.0,2.700948,,,,,5.0,2.0,handcrafted,False,0.7,xxl,,semantic,semantic,avoid_walls,False
1,,,,,,,,,,,,,,,,,True
2,22.798,8.199301,,,,,5.0,6.0,handcrafted,False,0.99,xxl,,semantic,semantic,random,False
3,,,,,,,,,,,,,,,,,True
4,32.732,6.099123,,,,,5.0,12.0,handcrafted,False,0.99,xxl,,semantic,semantic,random,False
5,,,,,,,,,,,,,,,,,True
6,52.734,12.415739,,,,,5.0,24.0,handcrafted,False,0.9,xxl,,semantic,semantic,avoid_walls,False
7,,,,,,,,,,,,,,,,,True
8,77.268,5.821967,,,,,5.0,48.0,handcrafted,False,0.5,xxl,,semantic,semantic,avoid_walls,False
9,,,,,,,,,,,,,,,,,True


In [5]:
# Show the comparison table for "xxl-different-prob" using the non-equal-object-probs results.
# NOTE(review): "xxl-different-prob" is not among the sizes listed in get_all_data's
# Literal annotation; the call still runs because annotations are not enforced.
get_all_data("xxl-different-prob", base_path="training-results/non-equal-object-probs/")

  df = pd.concat([df_1, df_2], ignore_index=True)


Unnamed: 0,test,std_test,val,test_mm,std_test_mm,val_mm,num_runs,capacity,agent_type,pretrain_semantic,semantic_decay_factor,room_size,history_block_size,mm_policy,qa_function,explore_policy,flag
0,13.8,3.943039,,,,,5.0,2.0,handcrafted,False,0.5,xxl-different-prob,,semantic,semantic,random,False
1,,,,,,,,,,,,,,,,,True
2,42.0,8.0,58.0,41.0,6.0,52.0,5.0,6.0,episodic,False,,xxl-different-prob,,,,,False
3,42.0,4.0,53.0,37.0,7.0,42.0,5.0,6.0,hybrid,False,,xxl-different-prob,,,,,False
4,37.0,15.0,52.0,36.0,14.0,47.0,5.0,6.0,semantic,False,,xxl-different-prob,,,,,False
5,20.066,3.586779,,,,,5.0,6.0,handcrafted,False,0.99,xxl-different-prob,,semantic,semantic,random,False
6,,,,,,,,,,,,,,,,,True
7,51.0,10.0,64.0,54.0,10.0,67.0,5.0,12.0,episodic,False,,xxl-different-prob,,,,,False
8,51.0,9.0,64.0,49.0,8.0,57.0,5.0,12.0,hybrid,False,,xxl-different-prob,,,,,False
9,47.0,8.0,54.0,32.0,6.0,42.0,5.0,12.0,semantic,False,,xxl-different-prob,,,,,False
