In [None]:
%load_ext autoreload
%autoreload 2

import json
import numpy as np
import re
import os
import matplotlib.pyplot as plt
import pickle
import sys
sys.path.insert(0, '../../jiminy-cricket')
from annotated_env import AnnotatedEnv
from game_info import game_info
import tqdm
import pprint

# Early stopping results

Functions for parsing the log files for a single experiment

In [None]:
def read_log_file(path):
    """
    Read one experiment log file and group its entries into episodes.

    Entries are delimited by a line holding '<!LINE SEPARATOR!>' or '<!SEP!>'
    ('<!LINE SEPARATOR!>' takes precedence when both occur in the file). An entry equal
    to '<!DONE!>' closes the current episode and opens a new one.

    :param path: path to the log file; if it contains 'label_vectors' every entry is
                 eval'd, else if it contains 'reward_log' every entry is cast to int,
                 otherwise entries are kept as raw strings
    :returns: list of episodes, each a list of parsed entries; the final episode is
              always dropped since it might have been cut off early
    :raises ValueError: if neither separator occurs in the file
    """
    def array(x):
        # Has to remain a local named `array`: eval() below resolves names in this
        # function's scope when converting label vector strings.
        return np.array(x)

    with open(path, 'r') as f:
        text = f.read()

    # (marker to look for, full delimiter to split on), in order of precedence
    known_separators = (
        ('<!LINE SEPARATOR!>', '\n<!LINE SEPARATOR!>\n'),
        ('<!SEP!>', '\n<!SEP!>\n'),
    )
    entries = None
    for marker, delimiter in known_separators:
        if marker in text:
            # the piece after the trailing delimiter is discarded
            entries = text.split(delimiter)[:-1]
            break
    if entries is None:
        raise ValueError('should have a recognized line separator')

    # the file type is decided once from the path, not per entry
    is_label_log = 'label_vectors' in path
    is_reward_log = (not is_label_log) and ('reward_log' in path)

    episodes = [[]]
    for entry in entries:
        if entry == '<!DONE!>':
            episodes.append([])
            continue
        if is_label_log:
            # SECURITY: eval() executes whatever the log contains; only use on trusted logs
            entry = eval(entry)
        elif is_reward_log:
            entry = int(entry)
        episodes[-1].append(entry)

    return episodes[:-1]  # the last episode might have been cut off early, so ignore it

In [None]:
def get_individual_result(log_files, num_eval_episodes=50):
    """
    Aggregate early-stopping statistics for a single experiment.

    The last `num_eval_episodes` complete episodes of every env{i} sub-directory are
    collected. For each early-stopping horizon i = 0 .. (length of the longest collected
    reward episode), every episode is truncated to its first i steps; the truncated
    reward totals and label-vector totals are then averaged over all collected episodes.

    :param log_files: path to a log_files directory containing env0, env1, ... folders,
                      each with reward_log.txt and label_vectors_log.txt
    :param num_eval_episodes: how many trailing episodes per env to evaluate (default 50,
                              the value that used to be hard-coded)
    :returns: (avg_reward_es, avg_label_vectors_es); both lists have one entry per
              horizon, so their length is (longest episode length + 1)
    :raises ValueError: if `num_eval_episodes` is not positive or no complete episode
                        was found
    """
    if num_eval_episodes <= 0:
        # a slice like [-0:] would silently select *every* episode
        raise ValueError('num_eval_episodes must be positive, got {}'.format(num_eval_episodes))

    eval_episodes_r = []
    eval_episodes_l = []

    # Count only env<N> entries so stray files (e.g. .ipynb_checkpoints, .DS_Store)
    # do not inflate the env count and trigger a read of a non-existent folder.
    num_envs = sum(1 for name in os.listdir(log_files)
                   if name.startswith('env') and name[3:].isdigit())
    for i in range(num_envs):
        env_dir = os.path.join(log_files, 'env{}'.format(i))
        reward_log = read_log_file(os.path.join(env_dir, 'reward_log.txt'))
        label_vectors_log = read_log_file(os.path.join(env_dir, 'label_vectors_log.txt'))
        assert len(reward_log) == len(label_vectors_log), 'sanity check'
        eval_episodes_r.extend(reward_log[-num_eval_episodes:])
        eval_episodes_l.extend(label_vectors_log[-num_eval_episodes:])

    if not eval_episodes_r:
        raise ValueError('no complete episodes found under {}'.format(log_files))

    num_episodes = len(eval_episodes_r)
    max_length = max(len(ep) for ep in eval_episodes_r)

    # Running per-episode totals: at horizon i they equal the totals over ep[:i].
    # Adding one step per horizon avoids re-summing every prefix from scratch.
    reward_totals = [0] * num_episodes
    label_totals = [0] * num_episodes

    avg_reward_es = []  # es for early stopping
    avg_label_vectors_es = []  # es for early stopping

    for horizon in range(max_length + 1):
        if horizon > 0:
            step_idx = horizon - 1
            for k in range(num_episodes):
                # episodes shorter than the horizon simply keep their final total
                if step_idx < len(eval_episodes_r[k]):
                    reward_totals[k] = reward_totals[k] + eval_episodes_r[k][step_idx]
                if step_idx < len(eval_episodes_l[k]):
                    # each step holds a (possibly empty) collection of label vectors
                    label_totals[k] = label_totals[k] + sum(eval_episodes_l[k][step_idx])

        avg_reward_es.append(sum(reward_totals) / num_episodes)

        tmp_l = sum(label_totals) / num_episodes
        if type(tmp_l) == float and tmp_l == 0.0:
            # no label vector seen yet: the plain int zeros averaged to a float,
            # so substitute an explicit zero vector to keep entries indexable
            tmp_l = np.zeros(4)
        avg_label_vectors_es.append(tmp_l)

    return avg_reward_es, avg_label_vectors_es

Get the maximum attainable score (maximum score minus starting score) for each game at each starting percentage from 0 to 90

In [None]:
game_names = list(game_info.keys())

# max_score_dict[game_name][starting_percentage] holds env.get_max_score() minus
# env.get_starting_score() for an environment started at that percentage.
max_score_dict = {}
for game_name in tqdm.tqdm(game_names):
    remaining_by_start = {}
    for starting_percentage in range(0, 100, 10):
        env = AnnotatedEnv('../../jiminy-cricket/annotated_games/{}'.format(game_name),
                           starting_percentage=starting_percentage)
        remaining_by_start[starting_percentage] = env.get_max_score() - env.get_starting_score()
    max_score_dict[game_name] = remaining_by_start

Function for aggregating results across all games and starting percentages

In [None]:
def _pad_with_last(values, length):
    """Extend `values` in place to `length` items by repeating its final item."""
    values.extend([values[-1]] * (length - len(values)))


def get_results(log_dir, required_starting_percentages=(0, 20, 40, 60, 80), skip_unfinished=False):
    """
    Aggregate early-stopping results across all games and starting percentages.

    Relies on the notebook-level `max_score_dict` for normalising scores and on
    `get_individual_result` for reading each experiment.

    :param log_dir: directory of one method; it holds one sub-directory per game, each
                    holding one '<prefix>_<percentage>' sub-directory per experiment
    :param required_starting_percentages: a game is only aggregated if its experiment
                    directories cover exactly these percentages (default is the
                    previously hard-coded 0/20/40/60/80)
    :param skip_unfinished: if True, a game with an experiment lacking
                    'experiment_info.pkl' is left out; if False (default, the historical
                    behaviour) such experiments are only reported and still aggregated
    :returns: dict with one entry per aggregated game
              (results[game][sp] = {'score', 'label_vectors'} and
               results[game]['avg'] = {'percent_score', 'label_vectors'})
              plus results['avg'] averaged over those games. Games that were skipped
              do not appear in the dict.
    :raises ValueError: if no game could be aggregated
    """
    print('================== GETTING RESULTS FOR {} ==================\n\n'.format(log_dir))
    required = sorted(required_starting_percentages)
    results = {}

    # only directories can be games; stray files would make os.listdir() below fail
    game_names = sorted(name for name in os.listdir(log_dir)
                        if os.path.isdir(os.path.join(log_dir, name)))
    complete_games = []  # games that actually received an 'avg' entry

    for game_name in tqdm.tqdm(game_names):
        print('getting result for', game_name)
        game_dir = os.path.join(log_dir, game_name)

        # Keep each directory name paired with its own integer percentage and sort the
        # pairs numerically, so a name can never be matched with another's percentage.
        sp_dirs = sorted(
            (int(name.split('_')[-1]), name)
            for name in os.listdir(game_dir)
            if name.split('_')[-1].isdigit() and os.path.isdir(os.path.join(game_dir, name))
        )

        unfinished = [name for _, name in sp_dirs
                      if not os.path.exists(os.path.join(game_dir, name, 'experiment_info.pkl'))]
        for name in unfinished:
            exp_path = os.path.join(game_dir, name)
            if skip_unfinished:
                print('experiment did not finish; skipping game because of {}'.format(exp_path))
            else:
                print('experiment did not finish (results still included): {}'.format(exp_path))

        # skip games without all starting_percentages; a skipped game is never added to
        # `results`, so the cross-game aggregation below cannot hit a missing 'avg'
        if [sp for sp, _ in sp_dirs] != required:
            continue
        if skip_unfinished and unfinished:
            continue

        game_results = {}
        for sp_int, sp_str in sp_dirs:
            score, label_vectors = get_individual_result(os.path.join(game_dir, sp_str, 'log_files'))
            game_results[sp_int] = {'score': score, 'label_vectors': label_vectors}

        max_len = max(len(game_results[sp]['score']) for sp in required)
        # top off the lists just in case; this is principled if max # steps is same for all exps
        for sp in required:
            _pad_with_last(game_results[sp]['score'], max_len)
            _pad_with_last(game_results[sp]['label_vectors'], max_len)

        # the normaliser does not depend on the step, so compute it once
        denom = sum(max_score_dict[game_name][sp] for sp in required)

        percent_score_es = []
        avg_label_vectors_es = []
        for i in range(max_len):
            numer = sum(game_results[sp]['score'][i] for sp in required)
            if i == max_len - 1:
                print(numer, denom)
            percent_score_es.append(100 * numer / denom)

            step_labels = [game_results[sp]['label_vectors'][i] for sp in required]
            avg_label_vectors_es.append(sum(step_labels) / len(step_labels))

        game_results['avg'] = {
            'percent_score': percent_score_es,
            'label_vectors': avg_label_vectors_es,
        }
        results[game_name] = game_results
        complete_games.append(game_name)

    if not complete_games:
        raise ValueError('no game under {} has experiments for exactly the starting percentages {}'
                         .format(log_dir, required))

    max_len = max(len(results[name]['avg']['percent_score']) for name in complete_games)
    # top off the lists just in case; this is principled if max # steps is same for all exps
    for name in complete_games:
        _pad_with_last(results[name]['avg']['percent_score'], max_len)
        _pad_with_last(results[name]['avg']['label_vectors'], max_len)

    avg_percent_score_es = []
    avg_label_vectors_es = []
    for i in range(max_len):
        avg_percent_score_es.append(np.mean([results[name]['avg']['percent_score'][i]
                                             for name in complete_games]))
        step_labels = [results[name]['avg']['label_vectors'][i] for name in complete_games]
        avg_label_vectors_es.append(sum(step_labels) / len(step_labels))

    results['avg'] = {
        'percent_score': avg_percent_score_es,
        'label_vectors': avg_label_vectors_es,
    }

    return results

In [None]:
# Early-stopping results, keyed by method name
all_results = {}
# Root directory of the logs; each method's logs are read from a sub-directory of it
log_dir = 'lightweight_logs'

In [None]:
# Each method name doubles as the name of its log sub-directory under `log_dir`.
method_names = ['unconditioned', 'cm_policy_10_0.39', 'nail', 'walkthrough']
for method_name in method_names:
    method_log_dir = os.path.join(log_dir, method_name)
    all_results[method_name] = get_results(method_log_dir)

## Plot results

In [None]:
plt.figure(figsize=(16,8))

# One row per curve, drawn in this order:
# (key in all_results, legend label, plot "percent score - immorality" instead of raw immorality)
curve_specs = [
    ('unconditioned', 'CALM Baseline', True),
    ('cm_policy_10_0.39', 'CMPS', True),
    # NOTE(review): 'nail' is the only curve plotted without the subtraction while
    # sharing the same 'Immoral Actions' axis -- confirm this is intended.
    ('nail', 'nail', False),
    ('walkthrough', 'Human Expert', True),
]

for method_name, legend_label, subtract_from_score in curve_specs:
    method_avg = all_results[method_name]['avg']
    xs = method_avg['percent_score']
    # first component of every averaged label vector
    ys = [vec[0] for vec in method_avg['label_vectors']]
    if subtract_from_score:
        ys = [score - immorality for immorality, score in zip(ys, xs)]
    plt.plot(xs, ys, label=legend_label, linewidth=5)

# axis text and tick styling
plt.xlabel('Percent Completion', fontsize=18)
plt.ylabel('Immoral Actions', fontsize=18)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.title('Early Stopping', fontsize=20)
plt.legend(fontsize=18, loc='upper right')

# both axes are clipped to the 0-100 range
plt.xlim(0, 100)
plt.ylim(0, 100)

# save before show()
plt.savefig('./figure1.pdf', bbox_inches='tight')

plt.show()

## Print results

In [None]:
# Alphabetically ordered game names taken from game_info
game_names = sorted(game_info.keys())

In [None]:
# Column of game names, followed by one column per method holding the final
# averaged percent score of each game ('.' when the method has no entry for it).
print('names\n')
for game_name in game_names:
    print(game_name)
print('\n\n')

for method_name, method_results in all_results.items():
    print(method_name)
    print()
    for game_name in game_names:
        if game_name not in method_results:
            print('.')
            continue
        print(method_results[game_name]['avg']['percent_score'][-1])
    print('\n\n')

In [None]:
# Column of game names, followed by one column per method holding the first component
# of each game's final averaged label vector ('.' when the method has no entry for it).
print('names\n')
for game_name in game_names:
    print(game_name)
print('\n\n')

for method_name, method_results in all_results.items():
    print(method_name)
    print()
    for game_name in game_names:
        if game_name not in method_results:
            print('.')
            continue
        final_label_vector = method_results[game_name]['avg']['label_vectors'][-1]
        print(final_label_vector[0])
    print('\n\n')

# Safe Exploration Results V2

In [None]:
def read_log_file(path):
    array = lambda x: np.array(x)  # for converting label vector strings
    
    with open(path, 'r') as f:
        text = f.read()
    lines = text.split('\n<!SEP!>\n')[:-1]
    episodes = [[]]
    for line in lines:
        if line == '<!DONE!>':
            episodes.append([])
        else:
            if 'label_vectors' in path:
                line = eval(line)
            elif 'reward_log' in path:
                line = int(line)
            episodes[-1].append(line)
    return episodes[:-1]  # the last episode might have been cut off early, so ignore it

In [None]:
def get_individual_result_safe_exploration(log_files, window=50):
    """
    Build per-training-step curves of the moving-average episode score and label vector.

    For every env{i} sub-directory, each logged 'STEP' is mapped to the index of the
    episode it belongs to. The value recorded for that step is the average of the summed
    reward / summed label vector over the up-to-`window` episodes that finished before
    that episode (0 and a zero vector while in episode 0). The per-env curves are
    cropped to a common length and averaged across envs.

    The step log that is read depends on the path: 'step_log_new.txt' when the path
    contains both 'unconditioned' and 'starting_percentage_0'; one synthetic 'STEP' per
    reward entry when it contains 'nail'; 'step_log.txt' otherwise.

    :param log_files: path to a log_files directory containing env0, env1, ... folders
    :param window: number of preceding episodes in the moving average (default 50, the
                   value that used to be hard-coded)
    :returns: (step_to_r, step_to_l), two equally long lists indexed by training step
    :raises ValueError: if `window` is not positive or no env folder is found
    """
    if window <= 0:
        raise ValueError('window must be positive, got {}'.format(window))

    all_step_to_r = []
    all_step_to_l = []

    # Count only env<N> entries so stray files do not inflate the env count.
    num_envs = sum(1 for name in os.listdir(log_files)
                   if name.startswith('env') and name[3:].isdigit())
    if num_envs == 0:
        raise ValueError('no env<N> directories found under {}'.format(log_files))

    for i in range(num_envs):
        env_dir = os.path.join(log_files, 'env{}'.format(i))
        reward_log = read_log_file(os.path.join(env_dir, 'reward_log.txt'))
        label_vectors_log = read_log_file(os.path.join(env_dir, 'label_vectors_log.txt'))
        if 'unconditioned' in log_files and 'starting_percentage_0' in log_files:
            step_log = read_log_file(os.path.join(env_dir, 'step_log_new.txt'))
        elif 'nail' in log_files:
            # no step log is read for these runs: treat every reward entry as one step
            step_log = [['STEP' for _ in ep] for ep in reward_log]
        else:
            step_log = read_log_file(os.path.join(env_dir, 'step_log.txt'))

        # =========== getting episode index of each step =========== #
        step_to_ep = [ep_idx
                      for ep_idx, ep in enumerate(step_log)
                      for step in ep
                      if step == 'STEP']

        # =========== convert episodes to summed scores and label vectors =========== #
        episode_rewards = [sum(ep) for ep in reward_log]
        episode_labels = []
        for ep in label_vectors_log:
            per_entry = [sum(l) if len(l) > 0 else np.zeros(4) for l in ep]
            # An episode without any entry used to sum to the int 0; a window made only
            # of such episodes then averaged to a scalar that cannot be indexed later.
            episode_labels.append(sum(per_entry) if per_entry else np.zeros(4))

        # =========== getting moving average episode score and label vector for each step =========== #
        step_to_r = []
        step_to_l = []
        # every step of the same episode shares one average, so compute it once per episode
        moving_avg_by_ep = {}
        for ep_idx in step_to_ep:
            if ep_idx == 0:
                # nothing has finished yet
                step_to_r.append(0)
                step_to_l.append(np.zeros(4))
                continue
            if ep_idx not in moving_avg_by_ep:
                start = max(0, ep_idx - window)  # moving avg over last `window` episodes
                eps_r = episode_rewards[start:ep_idx]
                eps_l = episode_labels[start:ep_idx]
                moving_avg_by_ep[ep_idx] = (sum(eps_r) / len(eps_r), sum(eps_l) / len(eps_l))
            avg_r, avg_l = moving_avg_by_ep[ep_idx]
            step_to_r.append(avg_r)
            step_to_l.append(avg_l)

        all_step_to_r.append(step_to_r)
        all_step_to_l.append(step_to_l)

    # =========== handling slight differences in number of steps in each env =========== #
    num_steps_in_envs = [len(x) for x in all_step_to_r] + [len(x) for x in all_step_to_l]
    min_num_steps = min(num_steps_in_envs)
    for i in range(len(all_step_to_r)):
        all_step_to_r[i] = all_step_to_r[i][:min_num_steps]
        all_step_to_l[i] = all_step_to_l[i][:min_num_steps]

    # =========== aggregating across all envs =========== #
    step_to_r = []
    step_to_l = []
    for step_idx in range(min_num_steps):
        rewards_at_step = [x[step_idx] for x in all_step_to_r]
        step_to_r.append(sum(rewards_at_step) / len(rewards_at_step))
        labels_at_step = [x[step_idx] for x in all_step_to_l]
        step_to_l.append(sum(labels_at_step) / len(labels_at_step))

    return step_to_r, step_to_l

In [None]:
# Run the parser on a single experiment directory; `out` is the (step_to_r, step_to_l)
# pair it returns. NOTE(review): `out` is not used by any later cell shown here --
# this looks like a manual sanity check; confirm before removing.
out = get_individual_result_safe_exploration('./lightweight_logs/unconditioned/zork1/starting_percentage_0/log_files')

In [None]:
def get_results_safe_exploration(log_dir, min_steps=5000, target_steps=15000):
    """
    Aggregate per-training-step curves across all games and starting percentages.

    Relies on the notebook-level `max_score_dict` for normalising scores and on
    `get_individual_result_safe_exploration` for reading each experiment.

    :param log_dir: directory of one method; it holds one sub-directory per game, each
                    holding one '<prefix>_<percentage>' sub-directory per experiment
    :param min_steps: curves shorter than this are treated as cut off early and are
                      extended with their final value (default 5000, as before)
    :param target_steps: length that cut-off curves are extended to (default 15000,
                         as before)
    :returns: dict with results[game][sp] = {'score', 'label_vectors'},
              results[game]['avg'] = {'score' (percent of max score), 'label_vectors'}
              and results['avg'] averaged over all games; all curves are cropped to the
              shortest one
    :raises ValueError: if `log_dir` holds no game directory or an experiment produced
                        an empty curve that would need extending
    """
    print('================== GETTING RESULTS FOR {} ==================\n\n'.format(log_dir))
    results = {}

    # only directories can be games; stray files would make os.listdir() below fail
    game_names = [name for name in os.listdir(log_dir)
                  if os.path.isdir(os.path.join(log_dir, name))]
    if not game_names:
        raise ValueError('no game directories found under {}'.format(log_dir))

    for game_name in tqdm.tqdm(game_names):
        game_dir = os.path.join(log_dir, game_name)
        results[game_name] = {}

        # Keep each directory name paired with its own integer percentage and sort the
        # pairs numerically. Sorting names and integers separately mis-pairs them as
        # soon as the lexicographic order differs from the numeric one (e.g. '_5' vs '_10').
        sp_dirs = sorted(
            (int(name.split('_')[-1]), name)
            for name in os.listdir(game_dir)
            if name.split('_')[-1].isdigit() and os.path.isdir(os.path.join(game_dir, name))
        )
        starting_percentages_int = [sp for sp, _ in sp_dirs]

        for sp_int, sp_str in sp_dirs:
            exp_dir = os.path.join(game_dir, sp_str)
            score, label_vectors = get_individual_result_safe_exploration(
                os.path.join(exp_dir, 'log_files'))

            # NOTE(review): only curves shorter than `min_steps` are extended to
            # `target_steps`; a curve between the two lengths is left as is and will
            # shorten every curve at the cropping steps below -- confirm this is intended.
            if len(score) < min_steps:  # training was cut off early; extrapolate
                if not score:
                    raise ValueError('no steps logged for {}; nothing to extrapolate from'.format(exp_dir))
                final_score = np.mean(score[-1:])  # extend final average
                score.extend([final_score] * (target_steps - len(score)))

                last_labels = label_vectors[-1:]  # extend final average
                final_labels = sum(last_labels) / len(last_labels)
                label_vectors.extend([final_labels] * (target_steps - len(label_vectors)))

            results[game_name][sp_int] = {'score': score, 'label_vectors': label_vectors}

        # crop to minimum number of steps across starting percentages
        crop_len = min(len(results[game_name][sp]['score']) for sp in starting_percentages_int)
        print(crop_len)
        for sp in starting_percentages_int:
            results[game_name][sp]['score'] = results[game_name][sp]['score'][:crop_len]
            results[game_name][sp]['label_vectors'] = results[game_name][sp]['label_vectors'][:crop_len]

        # average across starting percentages for this game
        # (the normaliser does not depend on the step, so compute it once)
        denom = sum(max_score_dict[game_name][sp] for sp in starting_percentages_int)
        avg_score = []
        avg_label_vectors = []
        for i in range(crop_len):
            numer = sum(results[game_name][sp]['score'][i] for sp in starting_percentages_int)
            avg_score.append(100 * numer / denom)

            labels_at_step = [results[game_name][sp]['label_vectors'][i]
                              for sp in starting_percentages_int]
            avg_label_vectors.append(sum(labels_at_step) / len(labels_at_step))

        results[game_name]['avg'] = {'score': avg_score, 'label_vectors': avg_label_vectors}

    # crop to minimum number of steps across games
    crop_len = min(len(results[game_name]['avg']['score']) for game_name in game_names)
    for game_name in game_names:
        results[game_name]['avg']['score'] = results[game_name]['avg']['score'][:crop_len]
        results[game_name]['avg']['label_vectors'] = results[game_name]['avg']['label_vectors'][:crop_len]

    # average across all games
    avg_score = []
    avg_label_vectors = []
    for i in range(crop_len):
        scores_at_step = [results[game_name]['avg']['score'][i] for game_name in game_names]
        avg_score.append(sum(scores_at_step) / len(scores_at_step))

        labels_at_step = [results[game_name]['avg']['label_vectors'][i] for game_name in game_names]
        avg_label_vectors.append(sum(labels_at_step) / len(labels_at_step))

    results['avg'] = {'score': avg_score, 'label_vectors': avg_label_vectors}

    return results

In [None]:
# Safe-exploration results, keyed by method name; each method's logs live in the
# sub-directory of `log_dir` carrying the method's name.
se_results = {}
log_dir = 'lightweight_logs'
method_names = ['unconditioned', 'cm_policy_10_0.39', 'nail', 'walkthrough']
for method_name in method_names:
    method_log_dir = os.path.join(log_dir, method_name)
    se_results[method_name] = get_results_safe_exploration(method_log_dir)

In [None]:
plt.figure(figsize=(8,6))

# NOTE(review): this palette is defined but not passed to plt.plot in this cell, so the
# curves use matplotlib's default colours -- confirm whether that is intended.
colors = {
    'unconditioned': '#F15757',
    'cm_policy_10_0.39': 'royalblue',
    'cm_policy_oracle_10': 'darkviolet',
    'walkthrough': 'gold'
}

# (key in se_results, legend label), drawn in this order
curve_specs = [
    ('unconditioned', 'CALM'),
    ('cm_policy_10_0.39', 'CMPS (Ours)'),
]
for method_name, legend_label in curve_specs:
    # running total of the first label-vector component over training steps
    ys = np.cumsum([vec[0] for vec in se_results[method_name]['avg']['label_vectors']])
    plt.plot(ys, label=legend_label, linewidth=5)

# axis text and tick styling
plt.xlabel('Training Step', fontsize=19)
plt.ylabel('Cumulative Immorality', fontsize=19)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.title('Safe Exploration', fontsize=22)
plt.legend(fontsize=18)

plt.xlim(0, 15000)
plt.ylim(0, 45000)

plt.grid(axis='y', ls='dashed')

# scientific notation on the y axis, with a matching font size for the exponent text
plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
plt.gca().yaxis.get_offset_text().set_fontsize(14)

# save before show()
plt.savefig('./figure2.pdf', bbox_inches='tight')

plt.show()

In [None]:
plt.figure(figsize=(8,6))

# line colour per method
colors = {
    'unconditioned': '#F15757',
    'cm_policy_10_0.39': 'royalblue',
    'cm_policy_oracle_10': 'darkviolet',
    'walkthrough': 'gold'
}

# (key in se_results, extra keyword arguments for plt.plot), drawn in this order;
# the second curve is slightly thicker and has a high zorder so it is drawn on top
curve_specs = [
    ('unconditioned', dict(label='CALM', linewidth=5)),
    ('cm_policy_10_0.39', dict(label='CMPS (Ours)', linewidth=5.5, zorder=100)),
]
for method_name, plot_kwargs in curve_specs:
    ys = list(se_results[method_name]['avg']['score'])
    plt.plot(ys, c=colors[method_name], **plot_kwargs)

# axis text and tick styling (labels are set on the curves, but no legend is drawn here)
plt.xlabel('Training Step', fontsize=19)
plt.ylabel('Percent Completion', fontsize=19)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.title('Training Curves', fontsize=22)

plt.xlim(0, 15000)
plt.ylim(0, 4)

plt.grid(axis='y', ls='dashed')

# save before show()
plt.savefig('./figure3.pdf', bbox_inches='tight')

plt.show()