In [None]:
import os
import tensorflow as tf
import numpy as np
from envs.vases_grid import VasesGrid, VasesEnvState, print_state, str_to_state, state_to_str
from envs.utils import unique_perm, zeros_with_ones, printoptions
from envs.vases_spec import VasesEnvState2x3V2D3, VasesEnvSpec2x3V2D3, VasesEnvState2x3Broken, VasesEnvSpec2x3Broken

import os.path as osp
from baselines import logger
from baselines.common import set_global_seeds

def train(env, num_timesteps, seed):
    """Train an MLP policy on ``env`` with Baselines' PPO1 and return it.

    The baselines logger is configured and set to level 30, a single-threaded
    TensorFlow session is entered (and never exited here -- presumably so the
    returned policy stays usable by the caller), global seeds are set, and the
    environment is wrapped in ``bench.Monitor`` before learning starts.

    Args:
        env: environment handed to ``bench.Monitor`` and then to the learner.
        num_timesteps: nominal training budget; the learner is given
            ``int(num_timesteps * 1.1)`` as ``max_timesteps``.
        seed: base random seed. The worker rank is fixed at 0, so the seed
            passed to ``set_global_seeds`` equals ``seed``.

    Returns:
        The value returned by ``pposgd_simple.learn``.
    """
    from baselines.ppo1 import pposgd_simple, cnn_policy, mlp_policy
    import baselines.common.tf_util as U
    from baselines import bench

    logger.configure()
    logger.set_level(30)

    # Enter the session's context by hand; there is intentionally no matching
    # __exit__ in this function.
    tf_session = U.single_threaded_session()
    tf_session.__enter__()

    worker_rank = 0
    set_global_seeds(seed + 10000 * worker_rank)

    def build_policy(name, ob_space, ac_space):  # pylint: disable=W0613
        # Two hidden layers of 16 units each.
        return mlp_policy.MlpPolicy(
            name=name,
            ob_space=ob_space,
            ac_space=ac_space,
            hid_size=16,
            num_hid_layers=2,
        )

    # Monitor output goes to "<log dir>/<rank>"; when the logger has no
    # directory, that falsy value is forwarded to Monitor unchanged.
    log_dir = logger.get_dir()
    monitor_path = osp.join(logger.get_dir(), str(worker_rank)) if log_dir else log_dir
    monitored_env = bench.Monitor(env, monitor_path)

    ppo_settings = {
        'max_timesteps': int(num_timesteps * 1.1),
        'timesteps_per_actorbatch': 256,
        'clip_param': 0.2,
        'entcoeff': 0.01,
        'optim_epochs': 4,
        'optim_stepsize': 2 * 1e-3,
        'optim_batchsize': 64,
        'gamma': 0.99,
        'lam': 0.95,
        'schedule': 'linear',
    }
    return pposgd_simple.learn(monitored_env, build_policy, **ppo_settings)


def rollout(env, model, steps_printed=50, stochastic=True):
    """Reset ``env`` and print the trajectory that ``model`` produces on it.

    The state right after ``env.reset()`` is printed first. Then, for each of
    ``steps_printed`` steps, the model chooses an action from the current
    observation, the environment is stepped, and the new state is printed
    together with a reward recomputed as ``env.r_vec.T @ env.s_to_f(env.s)``
    (the reward returned by ``env.step`` is not used).

    Args:
        env: environment exposing ``reset()``, ``step(action)``, the current
            state ``s``, a reward vector ``r_vec`` and a feature map ``s_to_f``.
        model: object exposing ``act(stochastic, obs)`` that returns a pair
            whose first element is the action to take.
        steps_printed: number of environment steps to take and print.
        stochastic: forwarded as the first argument of ``model.act``. The
            default ``True`` reproduces the previously hard-coded value.

    Returns:
        None. All output goes through ``print_state`` and ``print``.
    """
    obs = env.reset()
    print_state(env.s)
    print()
    for _ in range(steps_printed):
        action, _vpred = model.act(stochastic, obs)
        # NOTE(review): the `done` flag is ignored, so stepping continues past
        # an episode end -- presumably the env never terminates; confirm.
        obs, _reward, _done, _info = env.step(action)
        print_state(env.s)
        print('reward: ', env.r_vec.T @ env.s_to_f(env.s), '\n')


In [None]:
# Reduce TensorFlow log output: set the native log-level environment variable
# and restrict Python-side logging to ERROR.
os.environ.update(TF_CPP_MIN_LOG_LEVEL='3')
tf.logging.set_verbosity(tf.logging.ERROR)

# Build the 2x3 vases grid with feature masks enabled and without computing
# transitions.
env = VasesGrid(
    VasesEnvSpec2x3V2D3(),
    VasesEnvState2x3V2D3(),
    compute_transitions=False,
    f_include_masks=True,
)

# Nominal training budget handed to train().
num_timesteps = 128 * 128 * 1
model = train(env, num_timesteps=num_timesteps, seed=1)

In [None]:
# Print 300 steps of the trained policy acting in the (unwrapped) environment.
rollout(env=env, model=model, steps_printed=300)