In [None]:
"""
Copyright © 2021-2022 The Johns Hopkins University Applied Physics Laboratory LLC

Permission is hereby granted, free of charge, to any person obtaining a copy 
of this software and associated documentation files (the “Software”), to 
deal in the Software without restriction, including without limitation the 
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 
sell copies of the Software, and to permit persons to whom the Software is 
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in 
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""

# STE Analysis Notebook

This notebook is intended to perform analysis on STE runs, focusing on saturation value, experience

to saturation, and distribution of both values across multiple runs. The notebook also generates

line plots for each task variant with mean and 95% confidence intervals.


## Imports


In [None]:
import json
from pathlib import Path

import l2logger.util as l2l
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from l2metrics._localutil import get_block_saturation_perf, smooth
from l2metrics.normalizer import Normalizer
from tqdm.notebook import tqdm

# Format floats in displayed DataFrames with thousands separators and two decimals
pd.options.display.float_format = "{:,.2f}".format
# Apply seaborn's "darkgrid" style to subsequent figures
sns.set_style("darkgrid")
# Select the ipympl Matplotlib backend (IPython magic; only valid inside a notebook kernel)
%matplotlib ipympl


## Notebook Settings


In [None]:
# --- Paths ---
# Directory that holds the log data
# UPDATE THIS PATH
log_dir = Path(".")

# Outputs go to an "analysis" folder inside the log directory; the folder
# (and any missing parents) is created when it is not already present
analysis_dir = log_dir / "analysis"
analysis_dir.mkdir(parents=True, exist_ok=True)

# --- Analysis settings ---
do_save = True
units = "steps"  # {"exp_num", "steps"}

# --- L2Metrics settings ---
perf_measure = "reward"
smoothing_method = "flat"
window_len = None
normalization_method = "task"

# UPDATE THIS PATH
data_range_file = Path("m21_data_range.json")

# Read the data range used for normalization, lowercasing every top-level key
with open(data_range_file) as drf:
    data_range = {key.lower(): val for key, val in json.load(drf).items()}


In [None]:
# M21 environments and their variants, in the order they appear in the task map.
# Each (environment, variant) pair produces one entry:
#   "MiniGrid<Env><Variant>" -> "<env>_<variant>" (both parts lowercased)
_m21_variants_by_env = {
    "SimpleCrossing": ("S9N1", "S9N2", "S9N3"),
    "DistShift": ("R2", "R5", "R3"),
    "DynObstacles": ("S6N1", "S8N2", "S10N3"),
    "CustomFetch": ("S5T1N2", "S8T1N2", "S10T2N4"),
    "CustomUnlock": ("S5", "S7", "S9"),
    "DoorKey": ("S5", "S6", "S7"),
}

# M21 task name map
task_name_remap = {
    f"MiniGrid{env}{variant}": f"{env.lower()}_{variant.lower()}"
    for env, variants in _m21_variants_by_env.items()
    for variant in variants
}

# Top-level tasks (one cluster per environment)
task_clusters = [env.lower() for env in _m21_variants_by_env]

# Task variant names, in map order
task_names = [*task_name_remap.values()]

## Parse Logs


In [None]:
# Per-run frames are collected in lists and concatenated once after the loop;
# growing a DataFrame with pd.concat on every iteration re-copies all earlier rows
run_frames = []
interp_frames = []

# Common step grid onto which each run's training curve is resampled
resampled_step = np.linspace(100, 1_000_000, 10_000)

# Get directories in path to log data
dirs = [p for p in log_dir.glob("*") if p.is_dir()]

# Iterate over log directories
# (loop variable is named run_dir so the builtin dir() is not shadowed)
for run_dir in tqdm(dirs):
    # Read logger info
    try:
        logger_info = l2l.read_logger_info(run_dir)
    except FileNotFoundError:
        print(f"{run_dir.name}: Logger info file not found!")
        continue

    # Check if performance measure exists in logger info
    if perf_measure not in logger_info["metrics_columns"]:
        print(
            f"{run_dir.name}: Performance measure ({perf_measure}) not found in valid metrics columns: {logger_info['metrics_columns']}"
        )
        continue

    # Parse log data from directory
    log_data = l2l.read_log_data(run_dir)

    # Check if performance measure is logged
    if perf_measure not in log_data.columns:
        print(
            f"{run_dir.name}: Performance measure ({perf_measure}) not found in the log data"
        )
        continue

    # Validate data format
    try:
        l2l.validate_log(log_data, logger_info["metrics_columns"])
    except RuntimeError as e:
        print(f"{run_dir.name}: {e}")
        continue

    # Filter data by completed experiences
    log_data = log_data[log_data["exp_status"] == "complete"]

    # Drop rows where the performance measure is NaN
    log_data = log_data[log_data[perf_measure].notna()]

    # Check for log data after filtering
    if log_data.empty:
        print(f"{run_dir.name}: Logs do not contain any valid data for: {perf_measure}")
        continue

    # Fill in regime number and sort
    log_data = l2l.fill_regime_num(log_data)
    log_data = log_data.sort_values(by=["regime_num", "exp_num"]).set_index(
        "regime_num", drop=False
    )

    # Get block summary
    block_info = l2l.parse_blocks(log_data)

    # Drop unused columns
    log_data.drop(
        columns=["worker_id", "block_subtype", "task_params", "exp_status"],
        inplace=True,
    )

    # Smooth the performance measure within each training regime
    # NOTE(review): block_info is indexed positionally by regime_num here, which
    # presumably relies on parse_blocks returning one row per regime in order — confirm
    for regime_num in block_info["regime_num"].unique():
        if block_info.iloc[regime_num].block_type == "train":
            regime_mask = log_data["regime_num"] == regime_num
            log_data.loc[regime_mask, perf_measure] = smooth(
                log_data.loc[regime_mask, perf_measure].to_numpy(),
                window_len=window_len,
                window=smoothing_method,
            )

    # Normalize data
    if normalization_method != "none":
        # Instantiate normalizer
        normalizer = Normalizer(
            perf_measure=perf_measure,
            data=log_data[["task_name", perf_measure]].set_index("task_name"),
            data_range=data_range,
            method=normalization_method,
        )

        log_data = normalizer.normalize(log_data)

    # Add run ID to log data
    log_data.insert(loc=0, column="run_id", value=run_dir.name)

    # Compute within-block step numbers as the running total of episode step counts
    log_data.insert(loc=3, column="step_num", value=0)

    for regime_num in block_info["regime_num"].unique():
        regime_mask = log_data["regime_num"] == regime_num
        log_data.loc[regime_mask, "step_num"] = log_data.loc[
            regime_mask, "episode_step_count"
        ].cumsum()

    run_frames.append(log_data)

    # Resample the training curve onto the common step grid
    train_regime = log_data.loc[log_data["block_type"] == "train"]

    # Without training rows there is no task name to read and nothing to
    # interpolate; report the run and keep its evaluation data
    if train_regime.empty:
        print(f"{run_dir.name}: No training data to resample")
        continue

    # NOTE(review): the "reward" column is read here regardless of perf_measure;
    # the two only agree while perf_measure == "reward" — confirm this is intended.
    # NOTE(review): np.interp expects increasing x-coordinates; step_num restarts
    # in every regime, so this presumably assumes one training regime per run.
    interp_frames.append(
        pd.DataFrame(
            {
                "run_id": run_dir.name,
                "task_name": train_regime["task_name"].unique()[0],
                "steps": resampled_step,
                "reward": np.interp(
                    resampled_step,
                    train_regime["step_num"].to_numpy(),
                    train_regime["reward"].to_numpy(),
                ),
            }
        )
    )

# Build the combined frames once; fall back to an empty frame when nothing was parsed
# (pd.concat raises on an empty list)
tb_df = pd.concat(run_frames, ignore_index=True) if run_frames else pd.DataFrame()
train_tb_interp_df = (
    pd.concat(interp_frames, ignore_index=True) if interp_frames else pd.DataFrame()
)

# Split the combined log data by block type
train_tb_df = tb_df[tb_df["block_type"] == "train"]
eval_tb_df = tb_df[tb_df["block_type"] == "test"]


In [None]:
# Task variants for which a starting performance and clamp value are computed
neg_tasks = [
    "distshift_r2",
    "distshift_r5",
    "distshift_r3",
    "dynobstacles_s6n1",
    "dynobstacles_s8n2",
    "dynobstacles_s10n3",
]

# Starting performance per task: mean interpolated reward over the first 10
# resampled points of every run, minus 1
start_perf = {
    neg_task: train_tb_interp_df.loc[train_tb_interp_df["task_name"] == neg_task]
    .groupby("run_id")
    .head(10)["reward"]
    .mean()
    - 1
    for neg_task in neg_tasks
}

print(start_perf)

# Clamp per task, derived from the starting performance relative to 50
clamps = {task: (50 - perf) / 50 * -1 for task, perf in start_perf.items()}

print(clamps)


## Compute Characteristics


In [None]:
# Collect one record per run and build the task characteristic DataFrame once
# after the loop, instead of re-concatenating a growing DataFrame per run
task_char_records = []

for run_id in tqdm(train_tb_df["run_id"].unique()):
    run_df = train_tb_df[train_tb_df["run_id"] == run_id]

    if run_df.empty:
        print(f"{run_id}: No training data")
        continue

    # Get task variant reward data for current run
    data = run_df["reward"].to_numpy()

    # Get task variant step data for current run
    step_data = run_df["step_num"].to_numpy()

    # Get task variant saturation metrics for current run
    # (third return value is unused; it is not bound to `_`, which IPython
    # uses for the last cell output)
    sat_val, exp_to_sat, _unused = get_block_saturation_perf(data, window_len=5)

    task_char_records.append(
        {
            "task_name": run_df["task_name"].unique()[0],
            "sat_val": sat_val,
            # exp_to_sat is used as a position into the run's step data, so the
            # stored value is expressed in steps
            "exp_to_sat": step_data[exp_to_sat],
        }
    )

# Task characteristic DataFrame: one row per run
task_char_df = pd.DataFrame(task_char_records)


In [None]:
# Summarize the STE characteristics per task with a fixed set of statistics
summary_stats = ["mean", "std", "median", "min", "max", "count"]
summary_df = task_char_df.groupby("task_name").agg(summary_stats)
summary_df

In [None]:
# Show the tasks whose mean saturation value is below 50
# (tasks that did not learn above 50% of max performance)
summary_df.loc[summary_df[("sat_val", "mean")] < 50]

In [None]:
# Save task characteristic summary
if do_save:
    # Flatten the two-level (metric, statistic) column labels into
    # "metric_statistic" on a copy, so summary_df keeps its MultiIndex columns.
    # Flattening summary_df in place made this cell non-repeatable: on a second
    # run "_".join would be applied to plain strings (joining their characters),
    # and cells that index summary_df by ("sat_val", "mean") would stop working.
    summary_out_df = summary_df.copy()
    summary_out_df.columns = summary_df.columns.map("_".join).str.strip("_")
    summary_out_df.to_csv(analysis_dir / "characteristics.tsv", sep="\t")

## Compute Transfer


In [None]:
# Collect one record per (run, transfer task) and build the transfer DataFrame
# once after the loop, instead of re-concatenating a growing DataFrame per row
transfer_records = []

for run_id in tqdm(eval_tb_df["run_id"].unique()):
    # The base task is looked up from the part of the run ID before the first "-"
    base_task = task_name_remap[run_id.split("-", 1)[0]]

    run_df = eval_tb_df[eval_tb_df["run_id"] == run_id]

    # Transfer needs exactly two evaluation blocks
    block_nums = run_df["block_num"].unique()
    if len(block_nums) != 2:
        print(f"{run_id}: Invalid number of evaluation blocks to compute transfer")
        continue

    for transfer_task in run_df["task_name"].unique():
        task_df = run_df[run_df["task_name"] == transfer_task]

        # Mean reward of this task in the first and second evaluation block
        # (NaN when the task has no rows in that block)
        p0 = task_df.loc[task_df["block_num"] == block_nums[0], "reward"].mean()
        p1 = task_df.loc[task_df["block_num"] == block_nums[1], "reward"].mean()

        # A fresh dict per row: the records are stored, so they must not share state
        # NOTE(review): p0 == 0 (ratio) or p1 + p0 == 0 (contrast) gives inf/NaN
        # rather than an error — confirm that is acceptable downstream
        transfer_records.append(
            {
                "base_task": base_task,
                "transfer_task": transfer_task,
                "p0": p0,
                "p1": p1,
                "forward_transfer_ratio": p1 / p0,
                "forward_transfer_contrast": (p1 - p0) / (p1 + p0),
            }
        )

# Transfer DataFrame: one row per (run, transfer task)
transfer_df = pd.DataFrame(transfer_records)


In [None]:
# Average every transfer value over the runs of each (base task, transfer task) pair
pair_means = transfer_df.groupby(by=["base_task", "transfer_task"]).agg(["mean"])

# Bring the grouping keys back as columns, then drop the "mean" column level
avg_transfer_df = pair_means.reset_index()
avg_transfer_df.columns = avg_transfer_df.columns.droplevel(1)
avg_transfer_df

In [None]:
# Full transfer matrix: one row per base task, one column per transfer task,
# holding the average forward transfer contrast
full_transfer_df = pd.DataFrame(columns=task_names)

for base_name in task_names:
    base_rows = avg_transfer_df[avg_transfer_df["base_task"] == base_name]

    # Map each transfer task seen for this base task to its contrast value
    contrast_by_task = dict(
        zip(base_rows["transfer_task"], base_rows["forward_transfer_contrast"])
    )

    # Append the row, labelled with the base task name
    base_row_df = pd.DataFrame(contrast_by_task, index=[base_name])
    full_transfer_df = pd.concat([full_transfer_df, base_row_df])

full_transfer_df


In [None]:
# Write the transfer matrix as a tab-separated file in the analysis directory
if do_save:
    full_transfer_path = analysis_dir / "full_transfer.tsv"
    full_transfer_df.to_csv(full_transfer_path, sep="\t")

In [None]:
# Across-cluster transfer: mean forward transfer contrast over all variant pairs
# whose base task and transfer task names contain the respective cluster names
across_transfer_df = pd.DataFrame(columns=task_clusters)

for base_task_cluster in task_clusters:
    # Rows whose base task belongs to the current cluster
    is_base_cluster = avg_transfer_df["base_task"].str.contains(base_task_cluster)

    across_transfer_row = {
        transfer_task_cluster: avg_transfer_df.loc[
            is_base_cluster
            & avg_transfer_df["transfer_task"].str.contains(transfer_task_cluster),
            "forward_transfer_contrast",
        ].mean()
        for transfer_task_cluster in task_clusters
    }

    cluster_row_df = pd.DataFrame(across_transfer_row, index=[base_task_cluster])
    across_transfer_df = pd.concat([across_transfer_df, cluster_row_df])

across_transfer_df


## Plot Runs


In [None]:
plt.close("all")

# Stays empty when `units` matches neither supported option
ste_data = pd.DataFrame()

# Training data to plot for each supported x-axis unit
sources_by_unit = {"steps": train_tb_interp_df, "exp_num": train_tb_df}
source_df = sources_by_unit.get(units)

for task_cluster in tqdm(task_clusters):
    # Keep only the rows whose task name contains the cluster name
    if source_df is not None:
        ste_data = source_df[source_df["task_name"].str.contains(task_cluster)]

    if ste_data.empty:
        print(f"No data for task cluster: {task_cluster}")
        continue

    # One figure per cluster, one line per task variant
    fig, ax = plt.subplots(figsize=(12, 6), constrained_layout=True)
    sns.lineplot(data=ste_data, hue="task_name", x=units, y="reward", ax=ax)
    ax.title.set_text(task_cluster)

    # Place the legend outside the axes, to the right
    ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))

    if do_save:
        plt.savefig(analysis_dir / f"{task_cluster}.png")