In [1]:
import time
from hanabi_learning_environment import rl_env
import numpy as np
import random
import os


from PPOAgent.util import *
from tf_agents_lib.pyhanabi_env_wrapper import *
from PPOAgent.Model import Model
from PPOAgent.Game import Game
import tensorflow as tf

from PBT import pbt

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def load_hanabi(ENV_CONFIG, rewards_config=None):
    """Create a Hanabi environment and wrap it in ``PyhanabiEnvWrapper``.

    Args:
        ENV_CONFIG: mapping whose items are forwarded to ``rl_env.make`` as
            keyword arguments (``**`` unpacking).
        rewards_config: mapping forwarded to ``rl_env.make`` under the
            ``rewards_config`` keyword. ``None`` (the default) is replaced
            by a new empty dict on every call.

    Returns:
        The ``PyhanabiEnvWrapper`` built around the created environment.
    """
    # Use a fresh dict per call: a literal ``{}`` default is a single shared
    # object, so any mutation made downstream would leak into later calls.
    if rewards_config is None:
        rewards_config = {}
    return PyhanabiEnvWrapper(rl_env.make(**ENV_CONFIG, rewards_config=rewards_config))

In [3]:
# Model architecture, reward shaping and environment selection.

# Base model configuration; handed to pbt.create_pool when the population
# is built.
MODEL_CONFIG_BASE = {
    'nsteps': 36,
    'nminibatches': 1,
    'nenvs': 16,
    'fc_input_layers': [128],
    'lstm_layers': [],
    'noisy_fc': True,
    'noisy_lstm': False,
    'v_net': 'copy',
    'gamma': 0.99,
    'ent_coef': 0.0,
    'vf_coef': 1,
    'lr': 1e-3,
    'masked': True,
    'max_grad_norm': None,
    'total_timesteps': int(6500e6),
    'normalize_advs': True,
    'layer_norm': False,
    'scope': None,
}

# Base reward weights; passed to load_hanabi and copied as the starting point
# for each model's randomized reward weights.
REWARDS_CONFIG = {
    'play0': 1,
    'play1': 3,
    'play2': 9,
    'play3': 27,
    'play4': 81,
    'discard_last_copy': -100,
    'discard_extra': 0.5,
    'hint_last_copy': 0.1,
    'hint_penalty': 0.5,
    'hint_playable': 0.1,
    'use_hamming': False,
    'loose_life': -8,
}

# Keyword arguments unpacked into rl_env.make by load_hanabi.
ENV_CONFIG = {
    'environment_name': 'Hanabi-Small',
    'num_players': 2,
    'use_custom_rewards': True,
}

# NOTE(review): not referenced by any other cell visible in this notebook.
CUSTOM_REWARDS_KEYS = [
    'hint_reward',
    'play_reward',
    'discard_reward',
]

In [7]:
# Population-based-training settings.

# --- population size and how much of it is replaced per evolution step ---
nmodels = 16
evolve_ratio = 0.25
nmodels_to_evolve = int(nmodels * evolve_ratio)

# --- mutation multipliers handed to pbt.mutate_models ---
# Eight non-unit factors followed by 22 entries equal to 1 (30 in total).
mutations = [0.5, 0.7, 0.8, 0.9, 1.2, 1.3, 1.5, 2]
mutations += [1] * 22

# --- training / comparison schedule ---
# NOTE(review): the evolution loop cell assigns both of these names again
# before using them, so the values here only matter if that cell is changed.
updates_until_comparasion = 70
evaluate_last_updates = 15

# Key passed to pbt.get_worst_and_best_model_nums to rank the models.
evolution_criterium = 'scores'

# --- persistence ---
folder = './experiments/PBT/'
population_name = 'pbt_each70'
save_every = 10

# When True, the population is restored with pbt.load_population.
load = False

In [12]:
# pbt.randomize_dict

<function PBT.pbt.randomize_dict(d, min_val=0.5, max_val=2)>

In [5]:
# Build the TF session, a probe environment and the model population.

tf.reset_default_graph()
sess = tf.Session()


def load_env():
    """Return a new wrapped Hanabi env built from ENV_CONFIG and REWARDS_CONFIG."""
    return load_hanabi(ENV_CONFIG, REWARDS_CONFIG)


def _scope_variables(scope):
    """Return the global TF variables collected under ``scope``."""
    return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)


# One environment is created here only to read the action/observation specs.
env = load_env()
action_spec = env.action_spec()
obs_spec = env.observation_spec()
nactions = action_spec.maximum - action_spec.minimum + 1
nobs = obs_spec['state'].shape[0]
nplayers = ENV_CONFIG['num_players']

model_pool = pbt.create_pool(nactions, nobs, nplayers, sess, MODEL_CONFIG_BASE, nmodels)

# One saver per model, restricted to the variables in that model's scope.
saver_pool = [tf.train.Saver(var_list=_scope_variables(member.scope))
              for member in model_pool]

# Summary writers: one for population-level parameters, one per model scope.
population_dir = folder + population_name
pool_summary_writer = tf.summary.FileWriter(population_dir + '/params/')
summary_writer_pool = [tf.summary.FileWriter(population_dir + '/' + member.scope + '/')
                       for member in model_pool]

# Per-model reward weights: each starts from its own copy of REWARDS_CONFIG and
# is passed through pbt.randomize_dict with min_val=0.8, max_val=1.2.
reward_weights_pool = [pbt.randomize_dict(dict(REWARDS_CONFIG), 0.8, 1.2)
                       for _ in range(nmodels)]
# Per-model integer k drawn from [10, 20); consumed by the pbt helpers.
k_pool = [np.random.randint(10, 20) for _ in range(nmodels)]

W1125 20:18:52.063797 140613080966976 deprecation_wrapper.py:119] From /home/gr1/Documents/HLE github/hanabi/PPOAgent/Model.py:34: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.

W1125 20:18:52.068178 140613080966976 deprecation_wrapper.py:119] From /home/gr1/Documents/HLE github/hanabi/PPOAgent/Model.py:38: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1125 20:18:52.074136 140613080966976 deprecation_wrapper.py:119] From /home/gr1/Documents/HLE github/hanabi/PPOAgent/Model.py:47: The name tf.AUTO_REUSE is deprecated. Please use tf.compat.v1.AUTO_REUSE instead.

W1125 20:18:52.075902 140613080966976 deprecation.py:323] From /home/gr1/Documents/HLE github/hanabi/PPOAgent/util.py:313: flatten (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.flatten instead.
W1125 20:18:52.213707 140613080966976 deprecation_wrapper.py:119

In [8]:
# Build the game around the first pool member and, when `load` is set,
# restore a previously saved population.

start_epoch = 0
game = Game(nplayers, model_pool[0], load_env, wait_rewards=True)

if load:
    # load_population returns the reward weights, the k values and the epoch
    # to resume from, in that order.
    restored_state = pbt.load_population(model_pool, saver_pool, population_name, folder)
    reward_weights_pool, k_pool, start_epoch = restored_state

In [9]:
# Main population-based-training loop: train every model, rank the pool,
# evolve each drawn pair, mutate, then save and log the population.
#
# NOTE(review): the recorded output of this cell ends with
# `TypeError: reset_model() takes 2 positional arguments but 3 were given`.
# The failing call is not in this cell, so it has to be fixed in the code
# reached from here (the traceback is truncated) — confirm before re-running.

# Number of most recent updates used when ranking models. It does not depend
# on the epoch, so it is assigned once rather than on every iteration.
# NOTE(review): this overrides the value 15 assigned in the settings cell —
# confirm which of the two is intended.
evaluate_last_updates = 20

for evolution_epoch in range(start_epoch, start_epoch + 1000):
    # Training updates per epoch: 70 at epoch 0, +10 per epoch, capped at 1200.
    updates_until_comparasion = min(1200, 70 + 10 * evolution_epoch)
    print('Updates untill mutation', updates_until_comparasion)

    # Train all models in the pool, writing their summaries.
    time_start = time.time()
    losses, pool_results = pbt.train_pool(game, model_pool, summary_writer_pool,
                                          reward_weights_pool, k_pool,
                                          updates_until_comparasion, population_name,
                                          folder, save_every)

    # Rank the models.
    sorted_model_nums = pbt.get_worst_and_best_model_nums(pool_results, nmodels,
                                                          evolution_criterium,
                                                          evaluate_last_updates)

    # Report the results; the elapsed time covers training and ranking.
    time_taken = time.time() - time_start
    pbt.print_pool_results(pool_results, sorted_model_nums,
                           'EPOCH %d' % evolution_epoch, time_taken)

    # The first and last `nmodels_to_evolve` entries of the ranking are the
    # two groups from which copy pairs are drawn.
    worst_model_nums = sorted_model_nums[:nmodels_to_evolve]
    best_model_nums = sorted_model_nums[-nmodels_to_evolve:]
    pairs_for_evolution = pbt.draw_copy_pairs(worst_model_nums, best_model_nums)

    # Evolve each drawn pair.
    for pair in pairs_for_evolution:
        # `(pair)` is not a tuple, so the original format call only worked when
        # draw_copy_pairs already returned tuples; tuple() also accepts lists
        # and arrays.
        print('Evolving %d ---> %d' % tuple(pair))
        # NOTE(review): if update_target_graph creates new ops on every call,
        # the graph grows with each evolution step — confirm in PPOAgent.util.
        sess.run(update_target_graph(model_pool[pair[1]].scope, model_pool[pair[0]].scope))
        pbt.update_model_lr(pair, model_pool)
        pbt.update_rewards(pair, reward_weights_pool)
        pbt.update_k(pair, k_pool)
    pbt.mutate_models(model_pool, reward_weights_pool, k_pool, mutations)

    # Save the population.
    pbt.save_population(model_pool, saver_pool, reward_weights_pool, k_pool,
                        population_name, folder)

    # Write the evolution summary for this epoch.
    pbt.write_pool_summary(pool_summary_writer, model_pool, reward_weights_pool, k_pool,
                           evolution_epoch, population_name, folder)
    

Updates untill mutation 70


TypeError: reset_model() takes 2 positional arguments but 3 were given