In [1]:
import sys
sys.path.append('..\\..')
from academia.tools import visualizations
from academia.curriculum import LearningStats, LearningStatsAggregator
import plotly.graph_objects as go
import numpy as np

pygame 2.5.2 (SDL 2.28.3, Python 3.10.11)
Hello from the pygame community. https://www.pygame.org/contribute.html


# Loading saved stats

### Stats for epsilon reset `1.0`

In [2]:
# One dict per run (10 runs); each maps the task index '0'..'2' to the stats
# loaded from that task's saved file for epsilon reset 1.0.
stats_all_1_0 = [
    {
        f'{task}': LearningStats.load(
            f'outputs/eps=1.0/curriculum_iter={run + 1}/{task + 1}.stats.json')
        for task in range(3)
    }
    for run in range(10)
]

### Stats for epsilon reset `0.6`

In [3]:
# One dict per run (10 runs); each maps the task index '0'..'2' to the stats
# loaded from that task's saved file for epsilon reset 0.6.
stats_all_0_6 = [
    {
        f'{task}': LearningStats.load(
            f'outputs/eps=0.6/curriculum_iter={run + 1}/{task + 1}.stats.json')
        for task in range(3)
    }
    for run in range(10)
]

### Stats for epsilon reset `0.3`

In [4]:
# One dict per run (10 runs); each maps the task index '0'..'2' to the stats
# loaded from that task's saved file for epsilon reset 0.3.
stats_all_0_3 = [
    {
        f'{task}': LearningStats.load(
            f'outputs/eps=0.3/curriculum_iter={run + 1}/{task + 1}.stats.json')
        for task in range(3)
    }
    for run in range(10)
]

### Stats for epsilon reset `0.1`

In [5]:
# One dict per run (10 runs); each maps the task index '0'..'2' to the stats
# loaded from that task's saved file for epsilon reset 0.1.
stats_all_0_1 = [
    {
        f'{task}': LearningStats.load(
            f'outputs/eps=0.1/curriculum_iter={run + 1}/{task + 1}.stats.json')
        for task in range(3)
    }
    for run in range(10)
]

### Stats for epsilon reset `0.03`

In [6]:
# One dict per run (10 runs); each maps the task index '0'..'2' to the stats
# loaded from that task's saved file for epsilon reset 0.03.
stats_all_0_03 = [
    {
        f'{task}': LearningStats.load(
            f'outputs/eps=0.03/curriculum_iter={run + 1}/{task + 1}.stats.json')
        for task in range(3)
    }
    for run in range(10)
]

# Results

Here are the results of the conducted experiment. The charts below depict the relationship between the agent's evaluation and the number of steps the agent has taken so far. The trajectories are distinguished by color: each color represents the learning curve for one task, and the tasks, which vary in difficulty, are performed sequentially as part of the curriculum (the difficulty level of each task can be read from the legend).

An important point to note is that each presented trajectory is an average of the results obtained from 10 runs of the algorithm for a given epsilon configuration, each run using a different random seed during agent and environment initialization. This way, we obtain a curve representing averaged results.

The method used for averaging results is crucial and is implemented in the `LearningStatsAggregator` class. If the time domain is set to the number of episodes, there is no issue with averaging results because, in each algorithm invocation, the evaluation values are obtained after the same number of episodes.

The problem arises when the time domain is set to, for example, steps. In that case there is no guarantee that the number of steps the agent has taken by the first evaluation, the second, etc., in one algorithm invocation exactly matches that in another invocation (in fact, the chance of such a match is minimal). To address this, interpolation is applied: a vector is created whose length equals the number of unique step counts at which an evaluation took place. If a particular algorithm invocation has no evaluation at precisely x steps, the value at x is interpolated.

The gray area around the curve represents the standard deviation calculated from 10 runs of this algorithm.
Each task trajectory starts at the 0.9 quantile of the previous trajectories' termination points.

## `eps_reset=1.0`

### Plot agent evaluation vs steps time domain

In [7]:
# Trajectories for eps reset 1.0 with the default time domain; a save path is
# passed so the figure can be written next to the run outputs.
visualizations.plot_trajectories(
    stats_all_1_0,
    show=True,
    save_path='outputs/eps=1.0/trajectories.png',
    show_std=True,
)

### Plot agent evaluation vs episodes time domain

In [8]:
# Trajectories for eps reset 1.0 over the episodes time domain (shown only, no save path).
visualizations.plot_trajectories(
    stats_all_1_0,
    time_domain='episodes',
    show=True,
    show_std=True,
)

## `eps_reset=0.6`

### Plot agent evaluation vs steps time domain

In [9]:
# Trajectories for eps reset 0.6 with the default time domain; a save path is
# passed so the figure can be written next to the run outputs.
visualizations.plot_trajectories(
    stats_all_0_6,
    show=True,
    save_path='outputs/eps=0.6/trajectories.png',
    show_std=True,
)

### Plot agent evaluation vs episodes time domain

In [10]:
# Trajectories for eps reset 0.6 over the episodes time domain (shown only, no save path).
visualizations.plot_trajectories(
    stats_all_0_6,
    time_domain='episodes',
    show=True,
    show_std=True,
)

## `eps_reset=0.3`

### Plot agent evaluation vs steps time domain

In [11]:
# Trajectories for eps reset 0.3 with the default time domain; a save path is
# passed so the figure can be written next to the run outputs.
visualizations.plot_trajectories(
    stats_all_0_3,
    show=True,
    save_path='outputs/eps=0.3/trajectories.png',
    show_std=True,
)

### Plot agent evaluation vs episodes time domain

In [12]:
# Trajectories for eps reset 0.3 over the episodes time domain (shown only, no save path).
visualizations.plot_trajectories(
    stats_all_0_3,
    time_domain='episodes',
    show=True,
    show_std=True,
)

## `eps_reset=0.1`

### Plot agent evaluation vs steps time domain

In [13]:
# Trajectories for eps reset 0.1 with the default time domain; a save path is
# passed so the figure can be written next to the run outputs.
visualizations.plot_trajectories(
    stats_all_0_1,
    show=True,
    save_path='outputs/eps=0.1/trajectories.png',
    show_std=True,
)

### Plot agent evaluation vs episodes time domain

In [14]:
# Trajectories for eps reset 0.1 over the episodes time domain (shown only, no save path).
visualizations.plot_trajectories(
    stats_all_0_1,
    time_domain='episodes',
    show=True,
    show_std=True,
)

## `eps_reset=0.03`

### Plot agent evaluation vs steps time domain

In [15]:
# Trajectories for eps reset 0.03 with the default time domain; a save path is
# passed so the figure can be written next to the run outputs.
visualizations.plot_trajectories(
    stats_all_0_03,
    show=True,
    save_path='outputs/eps=0.03/trajectories.png',
    show_std=True,
)

### Plot agent evaluation vs episodes time domain

In [16]:
# Trajectories for eps reset 0.03 over the episodes time domain (shown only, no save path).
visualizations.plot_trajectories(
    stats_all_0_03,
    time_domain='episodes',
    show=True,
    show_std=True,
)

### Final Summary

In [8]:
def calculate_steps_per_run(stats_all, n_runs=10, n_tasks=3):
    """Return the total number of steps taken in each curriculum run.

    Args:
        stats_all: Indexable collection of curricula, one per run. Each
            curriculum maps the task index as a string ('0', '1', ...) to an
            object exposing a ``step_counts`` attribute.
        n_runs: Number of leading runs of ``stats_all`` to process
            (defaults to the 10 runs used in this experiment).
        n_tasks: Number of tasks per curriculum to sum, i.e. keys
            '0' .. str(n_tasks - 1) (defaults to the 3 tasks used here).

    Returns:
        A list with ``n_runs`` entries; entry ``i`` is the sum of every
        ``step_counts`` value over the first ``n_tasks`` tasks of run ``i``.

    Raises:
        IndexError: If ``stats_all`` holds fewer than ``n_runs`` runs.
        KeyError: If a curriculum lacks one of the expected task keys.
    """
    steps_all = []
    for run_idx in range(n_runs):
        curriculum = stats_all[run_idx]
        # Sum within each task first, then across the tasks of the run.
        steps_per_task = [np.sum(curriculum[f'{task_idx}'].step_counts)
                          for task_idx in range(n_tasks)]
        steps_all.append(np.sum(steps_per_task))
    return steps_all

In [9]:
# Per-run step totals for every epsilon reset value, ordered from 1.0 down to 0.03.
all_steps = [
    calculate_steps_per_run(stats)
    for stats in (stats_all_1_0, stats_all_0_6, stats_all_0_3, stats_all_0_1, stats_all_0_03)
]
# Keep the individual lists available under their own names for later cells.
steps_1_0, steps_0_6, steps_0_3, steps_0_1, steps_0_03 = all_steps
# Labels for the groups, in the same order as `all_steps`.
tickets = ['eps=1.0', 'eps=0.6', 'eps=0.3', 'eps=0.1', 'eps=0.03']

In [10]:
# Display the per-run step totals computed for eps reset 0.03.
steps_0_03

[463958.0,
 447727.0,
 679949.0,
 484805.0,
 395144.0,
 2026746.0,
 474870.0,
 441804.0,
 311481.0,
 548262.0]

In [10]:
def plot_boxplot(data, tickets, save_format='svg', save_path=None):
    """Draw one box per group of values in a single figure.

    Args:
        data: Sequence of groups; each group is the collection of values
            summarised by one box.
        tickets: Sequence of labels, one per group in ``data``; ``tickets[i]``
            is used as the name of the i-th box.
        save_format: Forwarded to ``visualizations.create_figure``.
        save_path: Forwarded to ``visualizations.create_figure``.

    Raises:
        IndexError: If ``tickets`` has fewer entries than ``data``.
    """
    # Number of boxes follows the data instead of being fixed at five.
    n_groups = len(data)

    with visualizations.create_figure("Epsilon reset value's impact on training",
                                      show=True, save_format=save_format, save_path=save_path) as fig:
        for i in range(n_groups):
            fig.add_trace(go.Box(y=data[i], name=f'{tickets[i]}', marker_color='blue'))

        # One tick position per box; the legend is hidden because it is disabled below.
        fig.update_layout(xaxis=dict(tickvals=list(range(n_groups))), showlegend=False,
                          yaxis_title='Sum of steps in the curriculum')

In [12]:
# Boxplot of the total steps per run for every epsilon reset value.
plot_boxplot(
    all_steps,
    tickets,
    save_format='svg',
    save_path='./box_plot_eps_reset',
)

For the boxplot corresponding to the epsilon value of 0.03, a very large outlier is observed. This outlier is likely caused by an unfavorable random state selected during agent initialization. The very high number of steps corresponding to this outlier arises because the agent needed as many as 11,300 episodes to complete the task at the easiest level. For each epsilon reset value, the first task is performed with epsilon set to 1, which is gradually decayed in subsequent episodes. Therefore, the learning time of the agent on the first task should be similar across epsilon reset values, with some variation depending on the aforementioned random state. Here, the difference was significant, hence it seems reasonable to conclude that the agent was initialized with a very unfavorable random state. In the second and third tasks, where the initial epsilon value was equal to the value to which we reset it, i.e., 0.03, the agent completed task 2 after 3000 episodes and task 3 after 1300 episodes.

The large outlier flattens the boxplots considerably, so it was removed to obtain a more informative visualization.

In [13]:
# Work on a copy so the original eps=0.03 totals stay intact, then drop the
# first occurrence of the largest value (the outlier).
steps_0_03_without_outlier = list(steps_0_03)
steps_0_03_without_outlier.remove(max(steps_0_03))
# Same groups as before, with the eps=0.03 group replaced by its outlier-free copy.
all_steps = [steps_1_0, steps_0_6, steps_0_3, steps_0_1] + [steps_0_03_without_outlier]

In [14]:
# Boxplot of the step totals with the eps=0.03 outlier removed.
# NOTE(review): same save_path as the earlier boxplot cell, so the earlier
# file is presumably overwritten - confirm this is intended.
plot_boxplot(
    all_steps,
    tickets,
    save_format='svg',
    save_path='./box_plot_eps_reset',
)

In [7]:
def calculate_steps_per_run_without_first_task(stats_all, n_runs=10, n_tasks=3):
    """Return the total number of steps per run, ignoring the first task.

    Args:
        stats_all: Indexable collection of curricula, one per run. Each
            curriculum maps the task index as a string ('0', '1', ...) to an
            object exposing a ``step_counts`` attribute.
        n_runs: Number of leading runs of ``stats_all`` to process
            (defaults to the 10 runs used in this experiment).
        n_tasks: Total number of tasks per curriculum, including the skipped
            first one; keys '1' .. str(n_tasks - 1) are summed (defaults to
            the 3 tasks used here).

    Returns:
        A list with ``n_runs`` entries; entry ``i`` is the sum of every
        ``step_counts`` value over all tasks of run ``i`` except task '0'.

    Raises:
        IndexError: If ``stats_all`` holds fewer than ``n_runs`` runs.
        KeyError: If a curriculum lacks one of the expected task keys.
    """
    steps_all = []
    for run_idx in range(n_runs):
        curriculum = stats_all[run_idx]
        # Start at index 1 so the first task (key '0') is left out of the total.
        steps_per_task = [np.sum(curriculum[f'{task_idx}'].step_counts)
                          for task_idx in range(1, n_tasks)]
        steps_all.append(np.sum(steps_per_task))
    return steps_all

In [12]:
# Per-run step totals excluding the first task, ordered from eps reset 1.0 down to 0.03.
all_steps_without_task1 = [
    calculate_steps_per_run_without_first_task(stats)
    for stats in (stats_all_1_0, stats_all_0_6, stats_all_0_3, stats_all_0_1, stats_all_0_03)
]
# Keep the individual lists available under their own names as well.
(steps_1_0_without_task1,
 steps_0_6_without_task1,
 steps_0_3_without_task1,
 steps_0_1_without_task1,
 steps_0_03_without_task1) = all_steps_without_task1
# Labels for the groups, in the same order as `all_steps_without_task1`.
tickets = ['eps=1.0', 'eps=0.6', 'eps=0.3', 'eps=0.1', 'eps=0.03']

In [13]:
# Boxplot of the step totals with the first task excluded from every run.
# NOTE(review): same save_path as the earlier boxplot cells, so the earlier
# file is presumably overwritten - confirm this is intended.
plot_boxplot(
    all_steps_without_task1,
    tickets,
    save_format='svg',
    save_path='./box_plot_eps_reset',
)