In [13]:
from rlberry.agents.torch import PPOAgent
from rlberry.agents.torch import TRPOAgent
from rlberry.envs.benchmarks.ball_exploration import PBall2D
from rlberry.envs.benchmarks.ball_exploration.ball2d import BallLevel1, BallLevel2, BallLevel3, BallLevel4, BallLevel5

In [14]:
# Benchmark environments keyed by a small integer index. Each environment is
# instantiated exactly once here, in index order.
env_dict = dict(enumerate([
    PBall2D(),
    BallLevel1(),
    BallLevel2(),
    BallLevel3(),
    BallLevel4(),
    BallLevel5(),
]))

# Display name for each index; the order must stay aligned with env_dict so
# that results are reported under the right environment name.
idx_to_env_dict = dict(enumerate([
    "PBall2D",
    "BallLevel1",
    "BallLevel2",
    "BallLevel3",
    "BallLevel4",
    "BallLevel5",
]))

In [15]:
# Train one PPO agent per benchmark environment, then estimate the mean
# undiscounted reward collected per evaluation episode with the trained policy.
env_ppo_reward = {}

n_episodes = 500  # budget handed to agent.fit
horizon = 256     # cap on the number of steps in a single evaluation episode
nb_runs = 1000    # number of evaluation episodes per environment

for env_index, env in env_dict.items():
    agent = PPOAgent(env, horizon=horizon, gamma=0.99, learning_rate=0.05)
    agent.fit(budget=n_episodes)

    # Sum the rewards over all evaluation episodes; each episode stops at
    # `horizon` steps or as soon as the environment reports `done`.
    total_reward = 0
    for episode in range(nb_runs):
        state = env.reset()
        for step in range(horizon):
            action = agent.policy(state)
            state, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break

    # Mean reward per episode, stored under the environment's display name.
    average_reward = total_reward / nb_runs
    env_ppo_reward[idx_to_env_dict[env_index]] = average_reward

print(env_ppo_reward)
    

[INFO] Could not find least used device (nvidia-smi might be missing), use cuda:0 instead 
[INFO] [PPO[worker: -1]] | max_global_step = 9 | episode_rewards = 0.0 | dw_time_elapsed = 3.3264735000002474 | fit/surrogate_loss = -0.0002451092004776001 | fit/entropy_loss = 1.3782919645309448 |  
[INFO] [PPO[worker: -1]] | max_global_step = 17 | episode_rewards = 0.0 | dw_time_elapsed = 6.398319899999933 | fit/surrogate_loss = 0.020981911569833755 | fit/entropy_loss = 1.3759486675262451 |  
[INFO] [PPO[worker: -1]] | max_global_step = 25 | episode_rewards = 0.0 | dw_time_elapsed = 9.471227200000158 | fit/surrogate_loss = 0.0020792679861187935 | fit/entropy_loss = 1.3678390979766846 |  
[INFO] [PPO[worker: -1]] | max_global_step = 34 | episode_rewards = 0.0 | dw_time_elapsed = 12.672927000000072 | fit/surrogate_loss = -0.009415924549102783 | fit/entropy_loss = 1.3516408205032349 |  
[INFO] [PPO[worker: -1]] | max_global_step = 43 | episode_rewards = 6.131213870853292 | dw_time_elapsed = 15.904

{'PBall2D': 37.227532098202595, 'BallLevel1': 138.80946265300412, 'BallLevel2': 0.018943956928484496, 'BallLevel3': 4.949471369577155, 'BallLevel4': 11.665117145512502, 'BallLevel5': 8.801657449463386}


In [16]:
# Train one TRPO agent per benchmark environment, then estimate the mean
# undiscounted reward collected per evaluation episode with the trained policy.
env_trpo_reward = {}

n_episodes = 100  # budget handed to agent.fit
horizon = 256     # cap on the number of steps in a single evaluation episode
nb_runs = 1000    # number of evaluation episodes per environment

# TRPO-specific settings, forwarded unchanged to the TRPOAgent constructor.
delta = 0.05
num_rollouts = 50

for env_index, env in env_dict.items():
    agent = TRPOAgent(
        env,
        horizon=horizon,
        gamma=0.99,
        learning_rate=0.05,
        delta=delta,
        num_rollouts=num_rollouts,
    )
    agent.fit(budget=n_episodes)

    # Sum the rewards over all evaluation episodes; each episode stops at
    # `horizon` steps or as soon as the environment reports `done`.
    total_reward = 0
    for episode in range(nb_runs):
        state = env.reset()
        for step in range(horizon):
            action = agent.policy(state)
            state, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break

    # Mean reward per episode, stored under the environment's display name.
    average_reward = total_reward / nb_runs
    env_trpo_reward[idx_to_env_dict[env_index]] = average_reward

print(env_trpo_reward)
    

[INFO] Could not find least used device (nvidia-smi might be missing), use cuda:0 instead 
[INFO] [TRPO[worker: -1]] | max_global_step = 1 | episode_rewards = 1.2633329576934749 | dw_time_elapsed = 9.944675300000199 |  
[INFO] [TRPO[worker: -1]] | max_global_step = 2 | episode_rewards = 2.543491440743506 | dw_time_elapsed = 19.712180000000444 |  
[INFO] [TRPO[worker: -1]] | max_global_step = 3 | episode_rewards = 1.2572987162485167 | dw_time_elapsed = 29.265161199999966 |  
[INFO] [TRPO[worker: -1]] | max_global_step = 4 | episode_rewards = 10.018511326572407 | dw_time_elapsed = 39.12466360000053 |  
[INFO] [TRPO[worker: -1]] | max_global_step = 5 | episode_rewards = 11.302029438253394 | dw_time_elapsed = 49.040924100000666 |  
[INFO] [TRPO[worker: -1]] | max_global_step = 6 | episode_rewards = 11.009017904561697 | dw_time_elapsed = 59.120819499999925 |  
[INFO] [TRPO[worker: -1]] | max_global_step = 7 | episode_rewards = 18.825187337759804 | dw_time_elapsed = 69.28494649999993 |  
[IN

{'PBall2D': 195.1966830550651, 'BallLevel1': 226.25525090947286, 'BallLevel2': 191.56621455155172, 'BallLevel3': 168.00207100699856, 'BallLevel4': 186.08204157626048, 'BallLevel5': 178.93837605541373}
