# Multi-Lifetime Metrics Evaluation

In [None]:
# (c) 2019 The Johns Hopkins University Applied Physics Laboratory LLC (JHU/APL).
# All Rights Reserved. This material may be only be used, modified, or reproduced
# by or for the U.S. Government pursuant to the license rights granted under the
# clauses at DFARS 252.227-7013/7014 or FAR 52.227-14. For any other permission,
# please contact the Office of Technology Transfer at JHU/APL.

# NO WARRANTY, NO LIABILITY. THIS MATERIAL IS PROVIDED “AS IS.” JHU/APL MAKES NO
# REPRESENTATION OR WARRANTY WITH RESPECT TO THE PERFORMANCE OF THE MATERIALS,
# INCLUDING THEIR SAFETY, EFFECTIVENESS, OR COMMERCIAL VIABILITY, AND DISCLAIMS
# ALL WARRANTIES IN THE MATERIAL, WHETHER EXPRESS OR IMPLIED, INCLUDING (BUT NOT
# LIMITED TO) ANY AND ALL IMPLIED WARRANTIES OF PERFORMANCE, MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT OF INTELLECTUAL PROPERTY
# OR OTHER THIRD PARTY RIGHTS. ANY USER OF THE MATERIAL ASSUMES THE ENTIRE RISK
# AND LIABILITY FOR USING THE MATERIAL. IN NO EVENT SHALL JHU/APL BE LIABLE TO ANY
# USER OF THE MATERIAL FOR ANY ACTUAL, INDIRECT, CONSEQUENTIAL, SPECIAL OR OTHER
# DAMAGES ARISING FROM THE USE OF, OR INABILITY TO USE, THE MATERIAL, INCLUDING,
# BUT NOT LIMITED TO, ANY DAMAGES FOR LOST PROFITS.

In [None]:
# Import necessary modules
import json
from pathlib import Path

import matplotlib
import pandas as pd
import scipy
import seaborn as sns

from evaluation.evaluate import (compute_eval_metrics,
                                 load_computational_costs,
                                 load_performance_thresholds,
                                 load_task_similarities,
                                 unzip_logs)

# Seaborn appearance used for every figure produced by this notebook
sns.set_style("dark")
sns.set_context("paper")

# Render floats in displayed DataFrames with thousands separators and two decimals
pd.set_option('display.float_format', '{:,.2f}'.format)

# Select the Agg matplotlib backend
matplotlib.use('Agg')

## SG-Specific Application Measures

In [None]:
# Application-specific performance measure to evaluate, keyed by SG name.
#
# M9 mapping (kept for reference only, not active):
#     'argonne':  'score'
#     'hrl':      'norm_reward'
#     'sri':      'reward'
#     'teledyne': 'id_accuracy_incremental'
#     'upenn':    'performance'

# M12 mapping (active)
perf_measures = dict(
    argonne='score',
    hrl='reward',
    sri='reward',
    teledyne='object_id_accuracy',
    upenn='performance',
)

## Configure Metrics Report

In [None]:
# Change the three values below to configure metrics report based on processing mode
eval_dir = 'm12_eval'
sg_name = ''  # Must be one of the keys of perf_measures
processing_mode = 'raw'  # Valid modes: 'raw', 'smoothed', 'normalized', 'normalized_no_outliers'

# Validate the configuration up front. Without these checks an unset sg_name
# surfaces as a bare KeyError on the perf_measures lookup, and a mistyped
# processing mode silently disables smoothing/normalization while still being
# used to name the output directory and files.
valid_processing_modes = ('raw', 'smoothed', 'normalized', 'normalized_no_outliers')
if sg_name not in perf_measures:
    raise ValueError('Unknown sg_name {!r}; expected one of {}'.format(
        sg_name, sorted(perf_measures)))
if processing_mode not in valid_processing_modes:
    raise ValueError('Unknown processing_mode {!r}; expected one of {}'.format(
        processing_mode, list(valid_processing_modes)))

# L2Metrics settings (forwarded as keyword arguments to compute_eval_metrics)
kwargs = {}
kwargs['eval_dir'] = Path('../../sg_' + sg_name + '_eval/' + eval_dir)
kwargs['output_dir'] = Path('results/' + processing_mode + '/' + sg_name)
kwargs['ste_dir'] = 'agent_config'
kwargs['ste_averaging_method'] = 'time'
kwargs['perf_measure'] = perf_measures[sg_name]
kwargs['aggregation_method'] = 'mean'
kwargs['maintenance_method'] = 'both'
kwargs['transfer_method'] = 'both'
kwargs['window_length'] = None
kwargs['show_raw_data'] = True
kwargs['show_eval_lines'] = True
kwargs['do_store_ste'] = False
kwargs['do_plot'] = True
kwargs['do_save_plots'] = True
kwargs['do_save'] = True
kwargs['do_save_settings'] = True

# Base name shared by all files written to the output directory
output = sg_name + '_' + processing_mode
do_unzip = False

# Generate other input arguments based on data processing mode:
#   normalization - only for the two 'normalized*' modes
#   smoothing     - for every mode except 'raw'
#   clamping      - only for 'normalized_no_outliers'
kwargs['normalization_method'] = 'task' if processing_mode in (
    'normalized', 'normalized_no_outliers') else 'none'
kwargs['smoothing_method'] = 'flat' if processing_mode in (
    'smoothed', 'normalized', 'normalized_no_outliers') else 'none'
kwargs['clamp_outliers'] = processing_mode == 'normalized_no_outliers'

# Load data range for normalization and standardize names to lowercase.
# Only the 'sri' SG ships a data range file; all others pass None.
if sg_name == 'sri':
    with open('sri_data_range.json', encoding='utf-8') as f:
        data_range = json.load(f)
    data_range = {key.lower(): val for key, val in data_range.items()}
else:
    data_range = None
kwargs['data_range'] = data_range

# Create output directory if it doesn't exist
if kwargs['do_save_plots'] or kwargs['do_save'] or kwargs['do_save_settings']:
    kwargs['output_dir'].mkdir(parents=True, exist_ok=True)

## Unzip Logs

In [None]:
# Unzip logs in the resolved evaluation directory, i.e., the same path that is
# handed to compute_eval_metrics through kwargs, rather than the bare
# directory name stored in eval_dir.
if do_unzip:
    unzip_logs(kwargs['eval_dir'])

## Compute Metrics for Evaluation

This line of code runs through all the logs in the specified evaluation directory, stores the STE data (if enabled),
then computes metrics on the LL logs with the settings above. The lifetime and task-level metrics for each run are
aggregated into a single DataFrame and dictionary, respectively. The aggregated log data from each run is also returned
as a DataFrame.

In [None]:
# Run the evaluation with the settings configured above and unpack its three
# results: the metrics DataFrame, the metrics dictionaries, and the log data
(ll_metrics_df,
 ll_metrics_dicts,
 log_data_df) = compute_eval_metrics(**kwargs)

## Summary Report

In [None]:
# Show mean and standard deviation of the metrics for each
# (scenario_type, complexity, difficulty) group, leaving out the
# min/max/num_lx/num_ex columns
mean_std_excluded = ['min', 'max', 'num_lx', 'num_ex']
mean_std_group_keys = ['scenario_type', 'complexity', 'difficulty']
ll_metrics_df.drop(columns=mean_std_excluded).groupby(by=mean_std_group_keys).agg(['mean', 'std'])

In [None]:
# Show median and IQR of data
# Import iqr explicitly: a bare `import scipy` does not guarantee that the
# scipy.stats submodule has been loaded on every SciPy version.
from scipy.stats import iqr

ll_metrics_df.drop(columns=['min', 'max', 'num_lx', 'num_ex']).groupby(
    by=['scenario_type', 'complexity', 'difficulty']).agg(['median', iqr])

## Save Metrics and Log Data

In [None]:
# Persist the lifelong learning metrics and the log data when saving is enabled
if kwargs['do_save']:
    save_dir = kwargs['output_dir']

    # Metrics DataFrame -> tab-separated file, indexed by
    # (sg_name, agent_config, run_id) and sorted by agent_config, then run_id
    if not ll_metrics_df.empty:
        with open(save_dir / '{}.tsv'.format(output), 'w', newline='\n') as tsv_file:
            metrics_table = ll_metrics_df.set_index(['sg_name', 'agent_config', 'run_id'])
            metrics_table = metrics_table.sort_values(['agent_config', 'run_id'])
            metrics_table.to_csv(tsv_file, sep='\t')

    # Metrics dictionaries -> JSON file
    if ll_metrics_dicts:
        with open(save_dir / '{}.json'.format(output), 'w', newline='\n') as json_file:
            json.dump(ll_metrics_dicts, json_file)

    # Log data -> Feather file, written with a fresh default index
    if not log_data_df.empty:
        feather_path = save_dir / '{}_data.feather'.format(output)
        log_data_df.reset_index(drop=True).to_feather(feather_path)

# Save settings for evaluation
if kwargs['do_save_settings']:
    # Serialize a copy so the Path entries in kwargs stay Path objects; the
    # save cells join them with `/` and would fail on plain strings if rerun.
    settings = dict(kwargs)
    settings['eval_dir'] = str(kwargs.get('eval_dir', ''))
    settings['output_dir'] = str(kwargs.get('output_dir', ''))

    # The output base name lives in the module-level `output` variable;
    # kwargs has no 'output' key.
    with open(kwargs['output_dir'] / (output + '_settings.json'), 'w') as outfile:
        json.dump(settings, outfile)