In [None]:
import numpy as np
import warnings

from src.rl_agent import RLAgent
from src.rl_experiments import RLExperiments
from src.state_representation import StateRepresentation

from src.environments.env_pendulum import EnvironmentPendulum
from src.environments.env_redpillbluepill import EnvironmentRedPillBluePill

# Suppress every warning category from this point on (no category or module
# filter is given, so this applies to warnings raised by any library).
warnings.filterwarnings("ignore")

# Device string passed to each RLAgent below through its `pytorch_device`
# argument. Uncomment one of the alternatives to select a different device.
# pytorch_device = 'cuda'
# pytorch_device = 'mps'
pytorch_device = 'cpu'

## Red-Pill Blue-Pill (CVaR)

In [None]:
# Single-quantile CVaR run (var_quantile fixed at 0.25).

# Environment, constructed with render_mode=None.
env = EnvironmentRedPillBluePill(render_mode=None)

# State and action collections are read off the environment's dictionaries.
states = list(env.state_dict.values())
actions = list(env.action_dict.keys())

# No policy object is supplied to the agent.
policy = None

# Tabular q_learning agent using the 'differential' average-reward method,
# with the CVaR options switched on.
agent = RLAgent(
    agent_type='q_learning',
    action_type='discrete',
    action_selection_rule='epsilon_greedy',
    policy_type='tabular',
    value_type='tabular',
    states=states,
    actions=actions,
    policy=policy,
    avg_reward_method='differential',
    initial_avg_reward=0.0,
    use_cvar=True,
    var_quantile=0.25,
    initial_var_reward=0.0,
    pytorch_device=pytorch_device,
)

# One step size per quantity named by the dict keys.
step_sizes = {'value': 0.02, 'avg_reward': 0.1, 'var': 0.1}

# Run the continuing-task experiment; the returned object is consumed by the
# performance figure cell further down.
rl_experiments = RLExperiments()
df_rpbp_cvar = rl_experiments.run_experiment_continuing(
    experiment='redpillbluepill_cvar',
    env=env,
    agent=agent,
    num_runs=50,
    max_steps=100_000,
    step_size=step_sizes,
    epsilon=0.1,
    discount=1.0,
)

In [None]:
# Quantile sweep: repeat the CVaR experiment once per tau and collect the
# result of each run in a dict keyed by tau.

# Environment, constructed with render_mode=None and shared by every tau.
env = EnvironmentRedPillBluePill(render_mode=None)

# State and action collections are read off the environment's dictionaries.
states = list(env.state_dict.values())
actions = list(env.action_dict.keys())

# No policy object is supplied to the agents.
policy = None

tau_experiment_results = {}
for tau in (0.1, 0.25, 0.5, 0.75, 0.85, 0.9):
    # Progress marker: shows which quantile is currently being run.
    print(tau)

    # Fresh agent per quantile; only `var_quantile` differs between iterations.
    agent = RLAgent(
        agent_type='q_learning',
        action_type='discrete',
        action_selection_rule='epsilon_greedy',
        policy_type='tabular',
        value_type='tabular',
        states=states,
        actions=actions,
        policy=policy,
        avg_reward_method='differential',
        initial_avg_reward=0.0,
        use_cvar=True,
        var_quantile=tau,
        initial_var_reward=0.0,
        pytorch_device=pytorch_device,
    )

    # NOTE(review): the step sizes and the RLExperiments instance are rebuilt on
    # every iteration even though their inputs do not change; whether they can
    # be hoisted depends on state kept inside RLExperiments — confirm.
    step_sizes = {'value': 0.02, 'avg_reward': 0.1, 'var': 0.1}
    rl_experiments = RLExperiments()

    df_rpbp_tau = rl_experiments.run_experiment_continuing(
        experiment='redpillbluepill_cvar_tau',
        env=env,
        agent=agent,
        num_runs=50,
        max_steps=500_000,
        step_size=step_sizes,
        epsilon=0.1,
        discount=1.0,
    )

    # Keep this quantile's result for the tau figure cell.
    tau_experiment_results[tau] = df_rpbp_tau

## Red-Pill Blue-Pill (Differential)

In [None]:
# Baseline run: same agent setup as the CVaR cells but without any of the
# CVaR-specific arguments (use_cvar / var_quantile / initial_var_reward).

# Environment, constructed with render_mode=None.
env = EnvironmentRedPillBluePill(render_mode=None)

# State and action collections are read off the environment's dictionaries.
states = list(env.state_dict.values())
actions = list(env.action_dict.keys())

# No policy object is supplied to the agent.
policy = None

agent = RLAgent(
    agent_type='q_learning',
    action_type='discrete',
    action_selection_rule='epsilon_greedy',
    policy_type='tabular',
    value_type='tabular',
    states=states,
    actions=actions,
    policy=policy,
    avg_reward_method='differential',
    initial_avg_reward=0.0,
    pytorch_device=pytorch_device,
)

# Only two step sizes here: there is no 'var' entry in this configuration.
step_sizes = {'value': 0.0002, 'avg_reward': 1}

# Run the continuing-task experiment; the returned object is consumed by the
# performance figure cell further down.
rl_experiments = RLExperiments()
df_rpbp_diff = rl_experiments.run_experiment_continuing(
    experiment='redpillbluepill_diff',
    env=env,
    agent=agent,
    num_runs=50,
    max_steps=100_000,
    step_size=step_sizes,
    epsilon=0.1,
    discount=1.0,
)

## Figures

In [None]:
# Performance figure comparing the differential and CVaR runs.
# Each entry maps a label to the run's result object plus two colour codes.
# Requires the cells that define `df_rpbp_diff` and `df_rpbp_cvar` to have run.
df_dict = {
    'Differential': {'df': df_rpbp_diff, 'color_cvar': '#007FA3', 'color_average': '#2FD1FF'},
    'RED CVaR': {'df': df_rpbp_cvar, 'color_cvar': '#AB1368', 'color_average': '#EC52A8'},
}

# NOTE(review): `quantile` presumably has to agree with the `var_quantile`
# (0.25) used when producing df_rpbp_cvar — confirm against RLExperiments.
rl_experiments = RLExperiments()
rl_experiments.get_performance_figure(
    experiment='redpillbluepill',
    df_dict=df_dict,
    quantile=0.25,
    rolling_average_amount=1000,
    x_max=49900,
)

In [None]:
# Single run of the CVaR agent through `cvar_redpillbluepill_estimates`.
# The agent configuration is the same as in the single-quantile CVaR cell.

# Environment, constructed with render_mode=None.
env = EnvironmentRedPillBluePill(render_mode=None)

# State and action collections are read off the environment's dictionaries.
states = list(env.state_dict.values())
actions = list(env.action_dict.keys())

# No policy object is supplied to the agent.
policy = None

agent = RLAgent(
    agent_type='q_learning',
    action_type='discrete',
    action_selection_rule='epsilon_greedy',
    policy_type='tabular',
    value_type='tabular',
    states=states,
    actions=actions,
    policy=policy,
    avg_reward_method='differential',
    initial_avg_reward=0.0,
    use_cvar=True,
    var_quantile=0.25,
    initial_var_reward=0.0,
    pytorch_device=pytorch_device,
)

# One step size per quantity named by the dict keys.
step_sizes = {'value': 0.02, 'avg_reward': 0.1, 'var': 0.1}

# The return value is not captured; the call is made for its effects only.
rl_experiments = RLExperiments()
rl_experiments.cvar_redpillbluepill_estimates(
    experiment='cvar_rpbp_estimates',
    env=env,
    agent=agent,
    num_runs=1,
    max_steps=100_000,
    step_size=step_sizes,
    epsilon=0.1,
    discount=1.0,
)

In [None]:
# Plot the CVaR of the red and blue policies as a function of tau
# (values estimated by Monte Carlo sampling).
# NOTE(review): the keyword is spelled `epsillon`; presumably this matches the
# parameter name in RLExperiments.get_cvar_by_tau_plot — verify before renaming.
rl_experiments = RLExperiments()
rl_experiments.get_cvar_by_tau_plot(epsillon=0.1, n_samples=100_000)

In [None]:
# Figure for the quantile sweep. Requires `tau_experiment_results` from the
# multi-tau cell; n_runs and x_max repeat that cell's num_runs / max_steps.
rl_experiments = RLExperiments()
rl_experiments.get_tau_results_figure(
    experiment='rpbp_by_tau',
    results_dict=tau_experiment_results,
    n_runs=50,
    x_max=500_000,
    rolling_average_amount=2500,
)