In [1]:
import numpy as np

from diffrascape.env import BadSeedsTheSequel

In [2]:
def play_sequential_game(clist,vlist,bad_list,max_turns):
    """Play one BadSeedsTheSequel game by cycling through the seeds in order.

    A fresh environment is built from the arguments and then stepped with the
    actions 0, 1, ..., env.N - 1, 0, 1, ... until the environment reports
    that the game has terminated.

    Parameters
    ----------
    clist
        Forwarded to the environment as ``centers``.
    vlist
        Forwarded to the environment as ``variances``.
    bad_list
        Forwarded to the environment as ``bad_seeds``.
    max_turns
        Forwarded to the environment as ``max_turns``.

    Returns
    -------
    The sum of the rewards returned by every ``env.execute`` call.
    """
    env = BadSeedsTheSequel(
            centers=clist,
            variances=vlist,
            bad_seeds=bad_list,
            max_turns=max_turns)

    sum_points = 0
    game_terminated = False

    iguess = 0
    while not game_terminated:
        # Act on the current index and only then advance it, so the cycle
        # starts at seed 0 rather than skipping it on the first turn.
        _, game_terminated, next_reward = env.execute(iguess % env.N)
        sum_points += next_reward
        iguess += 1
    return sum_points


In [3]:
# Small sanity-check scenario: N seeds that share one center, with seed 0
# as the only bad one.
N = 5
clist = N*[10.0]
vlist = np.ones(N)*0.2
vlist[0] *= 20.0 #20 times higher variance in bad ones
bad_list = N*[False]
# Flag seed 0 with a plain boolean. Assigning 5*[True] here would store a
# nested five-element list in slot 0 instead of marking a single seed.
bad_list[0] = True
max_turns = 20

print (f'sequential score {play_sequential_game(clist, vlist, bad_list, max_turns)}')


sequential score 0.0


In [4]:
# View the training summaries with:
#   $ tensorboard --logdir data/summaries
# Notes:
#   - more state
#   - more exploration
#   - expert trajectories

from tensorforce.agents import Agent

# Problem size: N samples in total, of which the first n are bad seeds.
N = 30
n = 5

# Every sample shares the same center value.
clist = np.full(N, 10.0)

# Baseline variance everywhere; the leading n (bad) entries get 20 times that.
vlist = np.full(N, 0.2)
vlist[:n] = 0.2 * 20.0

# Boolean flags: True for the first n entries, False for the rest.
bad_list = [index < n for index in range(N)]

max_turns = 200

bad_seeds_env = BadSeedsTheSequel(
    max_turns=max_turns,
    bad_seeds=bad_list,
    variances=vlist,
    centers=clist,
)

# PPO agent bound to bad_seeds_env; summaries are written to data/summaries.
# Settings that were tried and are currently disabled:
#   agent="tensorforce", update=64, objective="policy_gradient",
#   reward_estimation=dict(horizon=max_turns)
#   exploration=0.01
good_ppo_agent = Agent.create(
    agent="ppo",
    environment=bad_seeds_env,
    max_episode_timesteps=max_turns,
    batch_size=10,
    # noise and regularization
    variable_noise=0.01,
    l2_regularization=0.1,
    entropy_regularization=0.2,
    summarizer={
        "directory": "data/summaries",
        # list of labels, or 'all'
        "labels": ["graph", "entropy", "kl-divergence", "losses", "rewards"],
        # store values every 10 timesteps
        "frequency": 10,
    },
)

# A2C agent bound to bad_seeds_env; this is the agent the later cells train.
# Settings that were tried and are currently disabled:
#   agent="tensorforce", update=64, objective="policy_gradient",
#   reward_estimation=dict(horizon=max_turns)
# Experiment notes:
#   - ppo works with batch_size=10; a2c with batch_size=10 does not work
#     (unclear whether batch size is the actual problem), 100 seems to help
#   - exploration was not set at first
#   - variable_noise=0.01 may be bad
#   - horizon=0 works for ppo; for a2c it works worse than ppo, and
#     horizon=200 helps a2c
agent = Agent.create(
    agent="a2c",
    environment=bad_seeds_env,
    max_episode_timesteps=max_turns,
    batch_size=100,
    horizon=200,
    # exploration, noise and regularization
    exploration=0.01,
    variable_noise=0.05,
    l2_regularization=0.1,
    entropy_regularization=0.2,
    summarizer={
        "directory": "data/summaries",
        # list of labels, or 'all'
        "labels": ["graph", "entropy", "kl-divergence", "losses", "rewards"],
        # store values every 10 timesteps
        "frequency": 10,
    },
)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [5]:
from tensorforce.execution import Runner

# Train in ten rounds of 100 episodes each, saving the agent to
# "saved_models" after every round.
runner = Runner(environment=bad_seeds_env, agent=agent)
for _ in range(10):
    runner.run(num_episodes=100)
    agent.save(directory="saved_models")
# runner.close() is intentionally left disabled here.

Episodes: 100%|██████████| 100/100 [01:51, reward=7.00, ts/ep=200, sec/ep=1.24, ms/ts=6.2, agent=99.1%] 
Episodes: 100%|██████████| 100/100 [01:48, reward=8.00, ts/ep=200, sec/ep=1.16, ms/ts=5.8, agent=99.1%] 
Episodes: 100%|██████████| 100/100 [01:50, reward=19.00, ts/ep=200, sec/ep=1.02, ms/ts=5.1, agent=99.1%]
Episodes: 100%|██████████| 100/100 [01:48, reward=138.00, ts/ep=200, sec/ep=1.08, ms/ts=5.4, agent=99.1%]
Episodes: 100%|██████████| 100/100 [01:49, reward=25.00, ts/ep=200, sec/ep=1.00, ms/ts=5.0, agent=99.1%]
Episodes: 100%|██████████| 100/100 [01:45, reward=151.00, ts/ep=200, sec/ep=1.02, ms/ts=5.1, agent=99.1%]
Episodes: 100%|██████████| 100/100 [01:45, reward=10.00, ts/ep=200, sec/ep=0.97, ms/ts=4.9, agent=99.1%]
Episodes: 100%|██████████| 100/100 [01:43, reward=14.00, ts/ep=200, sec/ep=1.10, ms/ts=5.5, agent=99.1%]
Episodes: 100%|██████████| 100/100 [01:45, reward=154.00, ts/ep=200, sec/ep=0.99, ms/ts=5.0, agent=99.1%]
Episodes: 100%|██████████| 100/100 [01:42, reward=21

# Keep training the same agent on environments with 15, then 10, then 5 bad
# seeds. N and max_turns are reused from the earlier setup cell.
for n in (15, 10, 5):
    clist = np.ones(N)*10.0
    vlist = np.ones(N)*0.2
    vlist[:n] = 0.2 * 20.0 #20 times higher variance in bad ones
    bad_list = N * [False]
    bad_list[:n] = n * [True]
    #max_turns = 20

    # Only BadSeedsTheSequel is imported in this notebook, so build the
    # environment with it and its centers/variances/bad_seeds keywords
    # (a bare BadSeeds(...) call raises NameError on a fresh kernel).
    bad_seeds_env = BadSeedsTheSequel(
        centers=clist,
        variances=vlist,
        bad_seeds=bad_list,
        max_turns=max_turns
    )

    runner = Runner(agent=agent, environment=bad_seeds_env)
    runner.run(num_episodes=5000)
    #runner.close()


In [None]:
# Evaluate for 100 episodes
# The agent is stepped by hand against bad_seeds_env and the reward of every
# step is accumulated across all episodes.
num_eval_episodes = 100
sum_rewards = 0.0
for _ in range(num_eval_episodes):
    states = bad_seeds_env.reset()
    print(states)
    internals = agent.initial_internals()
    terminal = False
    while not terminal:
        actions, internals = agent.act(states=states, internals=internals, evaluation=True)
        # Step the environment that was just reset; the name `environment`
        # is never defined in this notebook.
        states, terminal, reward = bad_seeds_env.execute(actions=actions)
        sum_rewards += reward

print('Mean episode reward:', sum_rewards / num_eval_episodes)
