In [None]:
import numpy as np
import warnings

from src.rl_agent import RLAgent
from src.rl_experiments import RLExperiments
from src.state_representation import StateRepresentation

from src.environments.env_pendulum import EnvironmentPendulum
from src.environments.env_redpillbluepill import EnvironmentRedPillBluePill

# Suppress all warnings for the rest of the notebook session.
warnings.filterwarnings(action="ignore")

# Device string handed to every RLAgent below via `pytorch_device=...`.
# Swap in 'cuda' or 'mps' here to run on an accelerator instead.
pytorch_device = 'cpu'

## Inverted Pendulum (CVaR)

In [None]:
# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------
env = EnvironmentPendulum(render_mode=None)

# ---------------------------------------------------------------------------
# Agent with function approximation (tile coding + linear PyTorch networks)
# ---------------------------------------------------------------------------
num_tiles = 8
num_tilings = 32
iht_size = 4096

# One [low, high] pair per observation dimension, read from the gym space.
obs_space = env.gym_env.observation_space
state_limits = [
    [obs_space.low[dim], obs_space.high[dim]]
    for dim in range(obs_space.shape[0])
]

state_representation = StateRepresentation(
    method='tilecoding',
    settings={
        'num_tiles': num_tiles,
        'num_tilings': num_tilings,
        'iht_size': iht_size,
        'state_limits': state_limits,
    },
)

# Zero column vector with one row per hash-table slot.
# NOTE(review): presumably only the shape matters to RLAgent -- confirm.
states = np.zeros((iht_size, 1))

actions = list(env.action_dict.keys())

# The policy and the value network are both initialised from the same
# network definition file.
policy = {'initialize': {'file_path': './src/pytorch_networks/linear_discrete_actions.py'}}
value_network = {'initialize': {'file_path': './src/pytorch_networks/linear_discrete_actions.py'}}

agent = RLAgent(
    agent_type='ac',
    states=states,
    actions=actions,
    policy=policy,
    # average-reward settings
    avg_reward_method='differential',
    initial_avg_reward=0.0,
    # action selection / policy settings
    action_type='discrete',
    action_selection_rule='softmax',
    policy_type='nn_pytorch',
    policy_update_type='stochastic_gradient_descent',
    policy_loss='ac_policy_loss',
    # value-function settings
    value_type='nn_pytorch',
    value_network=value_network,
    value_update_type='stochastic_gradient_descent',
    value_loss='mse_loss',
    pytorch_device=pytorch_device,
    # CVaR-specific settings (this is what distinguishes the CVaR agent)
    use_cvar=True,
    var_quantile=0.1,
    initial_var_reward=0.0,
)

# ---------------------------------------------------------------------------
# Experiment
# ---------------------------------------------------------------------------
# Per-component step sizes; the 'var' entry accompanies use_cvar=True.
step_sizes = {
    'value': 0.002,
    'policy': 1,
    'avg_reward': 0.01,
    'var': 0.001,
}

rl_experiments = RLExperiments()

# The returned frame is consumed by the figures cell further down.
df_pendulum_cvar = rl_experiments.run_experiment_continuing(
    experiment='pendulum_cvar',
    agent=agent,
    env=env,
    state_representation=state_representation,
    num_runs=10,
    max_steps=25000,
    discount=1.0,
    step_size=step_sizes,
)

## Inverted Pendulum (Differential)

In [None]:
# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------
env = EnvironmentPendulum(render_mode=None)

# ---------------------------------------------------------------------------
# Agent with function approximation (tile coding + linear PyTorch networks)
# ---------------------------------------------------------------------------
num_tiles = 8
num_tilings = 32
iht_size = 4096

# One [low, high] pair per observation dimension, read from the gym space.
obs_space = env.gym_env.observation_space
state_limits = [
    [obs_space.low[dim], obs_space.high[dim]]
    for dim in range(obs_space.shape[0])
]

state_representation = StateRepresentation(
    method='tilecoding',
    settings={
        'num_tiles': num_tiles,
        'num_tilings': num_tilings,
        'iht_size': iht_size,
        'state_limits': state_limits,
    },
)

# Zero column vector with one row per hash-table slot.
# NOTE(review): presumably only the shape matters to RLAgent -- confirm.
states = np.zeros((iht_size, 1))

actions = list(env.action_dict.keys())

# The policy and the value network are both initialised from the same
# network definition file.
policy = {'initialize': {'file_path': './src/pytorch_networks/linear_discrete_actions.py'}}
value_network = {'initialize': {'file_path': './src/pytorch_networks/linear_discrete_actions.py'}}

# No CVaR arguments are passed here, so RLAgent falls back to whatever
# its defaults are for them.
agent = RLAgent(
    agent_type='ac',
    states=states,
    actions=actions,
    policy=policy,
    # average-reward settings
    avg_reward_method='differential',
    initial_avg_reward=0.0,
    # action selection / policy settings
    action_type='discrete',
    action_selection_rule='softmax',
    policy_type='nn_pytorch',
    policy_update_type='stochastic_gradient_descent',
    policy_loss='ac_policy_loss',
    # value-function settings
    value_type='nn_pytorch',
    value_network=value_network,
    value_update_type='stochastic_gradient_descent',
    value_loss='mse_loss',
    pytorch_device=pytorch_device,
)

# ---------------------------------------------------------------------------
# Experiment
# ---------------------------------------------------------------------------
# Per-component step sizes for the value, policy and average-reward updates.
step_sizes = {
    'value': 0.002,
    'policy': 2,
    'avg_reward': 0.01,
}

rl_experiments = RLExperiments()

# The returned frame is consumed by the figures cell further down.
df_pendulum_diff = rl_experiments.run_experiment_continuing(
    experiment='pendulum_diff',
    agent=agent,
    env=env,
    state_representation=state_representation,
    num_runs=10,
    max_steps=25000,
    discount=1.0,
    step_size=step_sizes,
)

## Figures

In [None]:
# Plot spec for each method: the results frame produced by its experiment
# cell, plus the hex colours stored under 'color_cvar' and 'color_average'.
differential_spec = {
    'df': df_pendulum_diff,
    'color_cvar': '#007FA3',
    'color_average': '#2FD1FF',
}
red_cvar_spec = {
    'df': df_pendulum_cvar,
    'color_cvar': '#AB1368',
    'color_average': '#EC52A8',
}

# Keys are the display labels; insertion order is Differential, then RED CVaR.
df_dict = {
    'Differential': differential_spec,
    'RED CVaR': red_cvar_spec,
}

rl_experiments = RLExperiments()

# Kept as the cell's trailing expression so any returned figure is displayed.
rl_experiments.get_performance_figure(
    experiment='pendulum',
    df_dict=df_dict,
    rolling_average_amount=1000,
    x_max=6900,
    quantile=0.1,
)

In [None]:
# Single long run of the CVaR agent on the pendulum, passed to
# RLExperiments.cvar_pendulum_estimates instead of run_experiment_continuing.

# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------
env = EnvironmentPendulum(render_mode=None)

# ---------------------------------------------------------------------------
# Agent with function approximation (tile coding + linear PyTorch networks)
# ---------------------------------------------------------------------------
num_tiles = 8
num_tilings = 32
iht_size = 4096

# One [low, high] pair per observation dimension, read from the gym space.
obs_space = env.gym_env.observation_space
state_limits = [
    [obs_space.low[dim], obs_space.high[dim]]
    for dim in range(obs_space.shape[0])
]

state_representation = StateRepresentation(
    method='tilecoding',
    settings={
        'num_tiles': num_tiles,
        'num_tilings': num_tilings,
        'iht_size': iht_size,
        'state_limits': state_limits,
    },
)

# Zero column vector with one row per hash-table slot.
# NOTE(review): presumably only the shape matters to RLAgent -- confirm.
states = np.zeros((iht_size, 1))

actions = list(env.action_dict.keys())

# The policy and the value network are both initialised from the same
# network definition file.
policy = {'initialize': {'file_path': './src/pytorch_networks/linear_discrete_actions.py'}}
value_network = {'initialize': {'file_path': './src/pytorch_networks/linear_discrete_actions.py'}}

agent = RLAgent(
    agent_type='ac',
    states=states,
    actions=actions,
    policy=policy,
    # average-reward settings
    avg_reward_method='differential',
    initial_avg_reward=0.0,
    # action selection / policy settings
    action_type='discrete',
    action_selection_rule='softmax',
    policy_type='nn_pytorch',
    policy_update_type='stochastic_gradient_descent',
    policy_loss='ac_policy_loss',
    # value-function settings
    value_type='nn_pytorch',
    value_network=value_network,
    value_update_type='stochastic_gradient_descent',
    value_loss='mse_loss',
    pytorch_device=pytorch_device,
    # CVaR-specific settings
    use_cvar=True,
    var_quantile=0.1,
    initial_var_reward=0.0,
)

# ---------------------------------------------------------------------------
# Experiment
# ---------------------------------------------------------------------------
# Per-component step sizes; the 'var' entry accompanies use_cvar=True.
step_sizes = {
    'value': 0.002,
    'policy': 1,
    'avg_reward': 0.01,
    'var': 0.001,
}

rl_experiments = RLExperiments()

# Kept as the cell's trailing expression so any returned value is displayed.
rl_experiments.cvar_pendulum_estimates(
    agent=agent,
    env=env,
    experiment='cvar_pendulum_estimates',
    state_representation=state_representation,
    num_runs=1,
    max_steps=500000,
    discount=1.0,
    step_size=step_sizes,
)