In [1]:
import itertools
import os
from collections import defaultdict
def check_create_path(path):
    """Make sure the directory ``path`` exists, creating missing parents.

    ``exist_ok=True`` replaces the former "check, then create" sequence, so a
    directory that appears between the check and the creation (for example
    when two runs start at the same time) no longer raises ``FileExistsError``.
    A ``FileExistsError`` is still raised when ``path`` exists but is not a
    directory.
    """
    os.makedirs(path, exist_ok=True)

def make_load_hanabi_fn(ENV_CONFIG):
    """Return a zero-argument factory that builds a wrapped Hanabi environment.

    Each call of the returned function runs ``rl_env.make(**ENV_CONFIG)`` and
    wraps the result in ``PyhanabiEnvWrapper``.
    """
    def load_hanabi_env():
        return PyhanabiEnvWrapper(rl_env.make(**ENV_CONFIG))

    return load_hanabi_env

def get_env_spec(ENV_CONFIG,):
    """Build an environment from ``ENV_CONFIG`` and report its dimensions.

    Returns:
        ``(nobs, nactions, nplayers)`` where ``nobs`` is the first dimension of
        the ``'state'`` observation spec, ``nactions`` is
        ``maximum + 1 - minimum`` of the action spec and ``nplayers`` is
        ``ENV_CONFIG['num_players']``. The three values are also printed.
    """
    probe_env = PyhanabiEnvWrapper(rl_env.make(**ENV_CONFIG))
    actions = probe_env.action_spec()
    observations = probe_env.observation_spec()

    nactions = actions.maximum + 1 - actions.minimum
    nobs = observations['state'].shape[0]
    nplayers = ENV_CONFIG['num_players']

    report = ('GAME PARAMETERS: \n observation length = %d \n number of actions = %d, number of players = %d '
              % (nobs, nactions, nplayers))
    print(report)
    return nobs, nactions, nplayers


def create_savers_and_summaries(path, models, ENV_CONFIG):
    """Create one checkpoint saver and one summary writer per model.

    For every model a ``tf.train.Saver`` is built over the global variables
    collected under ``model.scope``, and a ``tf.summary.FileWriter`` is opened
    on ``path + model.scope + '/summary/'`` (the directory is created first).
    ``ENV_CONFIG`` is accepted but not used here.

    Returns:
        ``(savers, summary_writers)`` - two lists aligned with ``models``.
    """
    check_create_path(path)
    savers, summary_writers = [], []
    for model in models:
        scope_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = model.scope)
        savers.append(tf.train.Saver(var_list = scope_variables))

        summary_dir = path + model.scope + '/summary/'
        check_create_path(summary_dir)
        summary_writers.append(tf.summary.FileWriter(summary_dir))

    return savers, summary_writers


def save_models(models, savers, path, kl_th):
    """Checkpoint every model and pickle its training hyper-parameters.

    For each ``(model, saver)`` pair the weights are written to
    ``path + model.scope + '/model/model.cptk'`` and the dict
    ``{'lr': ..., 'kl_th': ...}`` is pickled to
    ``path + model.scope + '/params_dict.pkl'``.

    Args:
        models: models exposing ``sess``, ``scope`` and ``lr.value()``.
        savers: savers aligned with ``models``.
        path: run folder, expected to end with a path separator.
        kl_th: either a single KL threshold shared by all models, or a
            sequence/array holding one threshold per model.
    """
    models = list(models)
    # The training loop passes the full per-model threshold array. Storing that
    # whole array in every model's file made load_models() append an array per
    # model on resume; store only the entry that belongs to each model.
    per_model = np.ndim(kl_th) > 0 and len(kl_th) == len(models)
    for idx, (model, saver) in enumerate(zip(models, savers)):
        saver.save(model.sess, path + model.scope + '/model/model.cptk')
        lr = model.lr.value()
        params_dict = {'lr' : lr, 'kl_th' : kl_th[idx] if per_model else kl_th}
        with open(path + model.scope + '/params_dict.pkl', 'wb') as f:
            pickle.dump(params_dict, f)

        
        
def load_models(path, savers, models, kl_init, lr_init):
    """Restore every model that has a checkpoint and return its KL threshold.

    For each ``(saver, model)`` pair the checkpoint state is looked up under
    ``path + model.scope + '/model/'``:

    * no checkpoint - a message is printed and ``kl_init`` is used;
    * checkpoint found - weights are restored, ``params_dict.pkl`` is read,
      the stored learning rate is applied through ``model.change_lr`` and the
      stored ``'kl_th'`` is used. If the params file cannot be read,
      ``{'lr': lr_init, 'kl_th': kl_init}`` is used instead.

    Returns:
        list with one KL threshold per model, in the order of ``models``.
    """
    kl_th_list = []
    for saver, model in zip(savers, models):
        ckpt = tf.train.get_checkpoint_state(path + model.scope + '/model/')
        if ckpt is None:
            print('Could not load model %s' % model.scope)
            kl_th_list.append(kl_init)
            continue

        saver.restore(model.sess, ckpt.model_checkpoint_path)
        init_num_upd = model.sess.run(model.updates)
        try:
            # NOTE(review): pickle.load can execute arbitrary code; only load
            # params files written by save_models() of a trusted run.
            with open(path + model.scope + '/params_dict.pkl', 'rb') as f:
                params_dict = pickle.load(f)
        except Exception:
            # Best-effort fallback for a missing or unreadable params file.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate instead of being silently swallowed.
            print('No params dict at', path + model.scope)
            params_dict = {'lr' : lr_init, 'kl_th' : kl_init}
            print(params_dict)
        model.change_lr(params_dict['lr'])
        kl_th_list.append(params_dict['kl_th'])
        print('Successfully loaded model %s trained for %d updates' % (model.scope, init_num_upd))
    return kl_th_list
                  
def save_summary(models, history_buffers, summary_writers, speed):
    """Write the averaged contents of each history buffer to its summary writer.

    For every ``(model, buffer, writer)`` triple one ``tf.Summary`` is built
    with the NaN-ignoring mean of each buffer entry plus ``speed`` under the
    tag ``'Perf/ts per sec'``. It is written at step
    ``model.sess.run(model.total_steps)`` and flushed; the buffer is then
    emptied in place.

    Args:
        models: models exposing ``sess`` and ``total_steps``.
        history_buffers: per-model mappings ``tag -> list of values``.
        summary_writers: per-model writers with ``add_summary`` and ``flush``.
        speed: value logged under ``'Perf/ts per sec'``.
    """
    for model, buffer, writer in zip(models, history_buffers, summary_writers):
        summary = tf.Summary()
        for key, values in buffer.items():
            summary.value.add(tag = key, simple_value = np.nanmean(values))

        summary.value.add(tag = 'Perf/ts per sec', simple_value = speed)
        model_steps = model.sess.run(model.total_steps)
        writer.add_summary(summary, model_steps)
        writer.flush()
        # The previous `buffer = defaultdict(list)` only rebound the loop
        # variable and reset nothing; clear the caller's buffer in place.
        buffer.clear()
def print_learning_rates(models):
    """Print one line per model with its scope and current learning rate."""
    for model in models:
        current_lr = model.lr.value()
        message = 'Model %s has lr %.5f' % (model.scope, current_lr)
        print(message)
    
def write_into_buffer(buffer, training_stats, policy_loss, value_loss, policy_entropy, updates):
    """Append the statistics of one training round to ``buffer``.

    ``buffer`` maps summary tags to lists. The mean of each ``training_stats``
    entry and of each loss is appended under its tag; ``updates`` is appended
    as given.
    """
    stat_tags = (
        ('Perf/Score', 'scores'),
        ('Perf/Reward', 'rewards'),
        ('Perf/Length', 'lengths'),
        ('Perf/Reward by "play"', 'play_reward'),
        ('Perf/Reward by "discard"', 'discard_reward'),
        ('Perf/Reward by "hint"', 'hint_reward'),
    )
    for tag, stat_key in stat_tags:
        buffer[tag].append(np.mean(training_stats[stat_key]))

    buffer['Perf/Updates done'].append(updates)

    loss_tags = (
        ('Losses/Policy loss', policy_loss),
        ('Losses/Value loss', value_loss),
        ('Losses/Policy entropy', policy_entropy),
    )
    for tag, loss in loss_tags:
        buffer[tag].append(np.mean(loss))
    

def _play_train_and_log(game, model, model_buffer, k, kl_th, episodes, nsteps,
                        train_players = 'all', cliprange = 0.2):
    """Collect one batch with ``game``, train ``model`` on it and log the stats."""
    # run the game until there is enough data for training
    training_stats = game.play_untill_train(episodes, nsteps)
    policy_loss, value_loss, policy_entropy, _ = train_model(game, model, k, kl_th,
                                                              train_players, cliprange)
    updates = model.sess.run(model.updates)
    write_into_buffer(model_buffer, training_stats,
                      policy_loss, value_loss, policy_entropy, updates)


def train_one_epoch(game, models, history_buffers, k, kl_th_list, episodes = 90, nsteps = None,
                    method = 'self play',
                    cliprange = 0.2, epochs_for_ma = 5,):
    """Train every model once and append the statistics to its history buffer.

    Args:
        game: object exposing ``players`` and ``play_untill_train``.
        models: models to train.
        history_buffers: one buffer per model, filled via ``write_into_buffer``.
        k: number of training passes handed to ``train_model``.
        kl_th_list: one KL threshold per model, handed to ``train_model``.
        episodes, nsteps: forwarded to ``game.play_untill_train``.
        method: one of

            * ``'single agent'`` - the model is assigned to ``game.players[0]``;
            * ``'self play'`` - the model is assigned to every player;
            * ``'multi agent'`` - the model is assigned to a randomly chosen
              player and the other seats get randomly chosen models, repeated
              ``epochs_for_ma`` times. Only the main player's data is used for
              training unless every seat happens to hold the main model.
        cliprange: forwarded to ``train_model``.
        epochs_for_ma: rounds per model in ``'multi agent'`` mode.

    Raises:
        ValueError: if ``method`` is not one of the three names above
            (previously such a call silently trained nothing).
    """
    if method == 'single agent':
        for model, model_buffer, kl_th in zip(models, history_buffers, kl_th_list):
            # a single-agent game uses one model
            game.players[0].assign_model(model)
            _play_train_and_log(game, model, model_buffer, k, kl_th, episodes, nsteps,
                                cliprange = cliprange)

    elif method == 'self play':
        for model, model_buffer, kl_th in zip(models, history_buffers, kl_th_list):
            for player in game.players:
                player.assign_model(model)
            _play_train_and_log(game, model, model_buffer, k, kl_th, episodes, nsteps,
                                cliprange = cliprange)

    elif method == 'multi agent':
        # Fixed: this loop used to zip over the undefined name `kl_th` instead
        # of `kl_th_list`, so the branch failed before doing any work.
        for main_model, model_buffer, kl_th in zip(models, history_buffers, kl_th_list):
            # the main model trains from a random seat; the other seats are
            # filled with randomly chosen models (possibly the main one)
            main_player = random.choice(game.players)
            main_player.assign_model(main_model)
            other_players = [p for p in game.players if p.num != main_player.num]
            for _ in range(epochs_for_ma):
                self_play_case = True
                for p in other_players:
                    p_model = random.choice(models)
                    self_play_case = self_play_case and (p_model.scope == main_model.scope)
                    p.assign_model(p_model)

                # when every seat holds the main model all data is usable
                train_players = 'all' if self_play_case else [main_player.num]
                # Fixed: the update counter was read from the undefined name
                # `model`; the helper reads it from the model just trained.
                _play_train_and_log(game, main_model, model_buffer, k, kl_th, episodes, nsteps,
                                    train_players, cliprange)

    else:
        raise ValueError("Unknown training method %r; expected 'single agent', "
                         "'self play' or 'multi agent'" % (method,))
    

        
import tensorflow as tf

import numpy as np

from PPOAgentDynamic.Model import Model
from PPOAgentDynamic.Game import Game
from PPOAgentDynamic.learn import learn

from tf_agents_lib.pyhanabi_env_wrapper import PyhanabiEnvWrapper
from hanabi_learning_environment import rl_env



  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
import random
import time
import pickle

def train_model(game, model, k, target_kl, player_nums = 'all', cliprange = 0.2):
    """Run ``k`` training passes of ``model`` on the data collected by ``game``.

    Args:
        game: object whose ``collect_data(player_nums)`` returns the 9-tuple
            (obs, actions, probs, neglogps, legal_moves, values, rewards,
            dones, noise).
        model: object exposing ``train(...)``, ``sess`` and
            ``increment_updates``.
        k: number of passes over the collected batch.
        target_kl: accepted for interface compatibility; not used here.
        player_nums: forwarded to ``game.collect_data``.
        cliprange: forwarded to ``model.train``.

    Returns:
        ``(policy_loss, value_loss, policy_entropy, passes)`` where the first
        three are means over all ``k`` passes and ``passes`` is the number of
        passes run. With ``k <= 0`` the three means are NaN and ``passes`` is 0.
    """
    (mb_obs, mb_actions, mb_probs, mb_neglogps, mb_legal_moves, mb_values, mb_rewards,
     mb_dones, mb_noise) = game.collect_data(player_nums)
    update_kl = False
    # Created once, outside the loop: they used to be re-created on every pass,
    # so the returned "means" only reflected the last pass.
    p_losses, v_losses, p_ents = [], [], []
    for _ in range(k):
        policy_loss, value_loss, policy_entropy, _kl, _probs = model.train(mb_obs, cliprange,
                                                                           mb_neglogps, mb_probs,
                                                                           mb_rewards, mb_actions,
                                                                           mb_values, mb_legal_moves,
                                                                           mb_noise, update_kl)
        p_losses.append(policy_loss)
        v_losses.append(value_loss)
        p_ents.append(policy_entropy)

    model.sess.run(model.increment_updates)
    if not p_losses:
        # no pass was run (k <= 0); this used to raise NameError
        nan = float('nan')
        return nan, nan, nan, 0
    return np.mean(p_losses), np.mean(v_losses), np.mean(p_ents), len(p_losses)

def run_evaluation(models, game, nepisodes = 10):
    """Evaluate models against each other and return the mean-score matrix.

    In a two-player game, entry ``[i, j]`` is the mean of the ``'scores'``
    returned by ``game.eval_results`` with ``models[i]`` on seat 0 and
    ``models[j]`` on seat 1. For any other number of players only seat 0 is
    assigned and only the diagonal ``[i, i]`` is filled; the remaining entries
    keep their initial value of -1.

    Args:
        models: models to evaluate.
        game: object exposing ``players``, ``nplayers``, ``reset`` and
            ``eval_results``.
        nepisodes: passed to ``eval_results`` as ``episodes_per_env``.
    """
    result_matrix = -np.ones((len(models), len(models)))
    for p1_model_num, p1_model in enumerate(models):
        game.players[0].assign_model(p1_model)
        if game.nplayers == 2:
            for p2_model_num, p2_model in enumerate(models):
                # Fixed: seat 1 used to receive the seat-0 model, so every
                # cell of the matrix measured self-play of models[i].
                game.players[1].assign_model(p2_model)
                game.reset()
                result_scores = np.mean(game.eval_results(episodes_per_env = nepisodes)['scores'])
                result_matrix[p1_model_num, p2_model_num] = result_scores
        else:
            game.reset()
            result_scores = np.mean(game.eval_results(episodes_per_env = nepisodes)['scores'])
            result_matrix[p1_model_num, p1_model_num] = result_scores

    return result_matrix

def run_experiment(run_name, MODEL_CONFIGS, ENV_CONFIG,  REWARDS_CONFIG,
                   nupdates = 10000,  k = 24,  episodes = 90, nsteps = None,
                   kl_init = 0.4, target_kl = 0.01, kl_factor = 0.996, cliprange = 0.2,
                   save_every = 100, summary_every = 10, evaluation_every = 200, eval_eps = 10,
                   root_folder = './experiments/newframework/', method = 'self play'
                  ):
    """Build models and a game, then train with periodic saving and evaluation.

    Results are stored under
    ``root_folder + <environment_name>-<num_players>/<run_name>/``. Existing
    checkpoints in that folder are restored before training starts.

    Args:
        run_name: sub-folder name of this run.
        MODEL_CONFIGS: list of keyword dicts, one per ``Model``. ``'nenvs'``
            and ``'lr'`` of the first entry are used for the game size and as
            the fallback learning rate when loading.
        ENV_CONFIG: keyword dict for the environment; ``'environment_name'``
            and ``'num_players'`` are also used to build the output path.
        REWARDS_CONFIG: passed to ``game.reset``.
        nupdates: number of training updates; the loop runs
            ``nupd = 1 .. nupdates`` inclusive.
        k, episodes, nsteps, cliprange: forwarded to ``train_one_epoch``.
        kl_init: KL threshold for models without a stored one.
        target_kl: lower bound of the KL threshold.
        kl_factor: factor applied to every KL threshold after each update.
        save_every, summary_every, evaluation_every: periods, in updates, of
            checkpointing, summary writing and evaluation.
        eval_eps: episodes per environment used by ``run_evaluation``.
        root_folder: root of the output tree; expected to end with a separator.
        method: training mode, forwarded to ``train_one_epoch``.
            ``'single agent'`` additionally builds a one-player game.
    """
    # set session
    tf.reset_default_graph()
    sess = tf.Session()
    summary_writers = []
    try:
        # environment-dependent sizes (the extra, unused environment that used
        # to be built here has been dropped)
        load_env_fn = make_load_hanabi_fn(ENV_CONFIG)
        nobs, nactions, nplayers = get_env_spec(ENV_CONFIG)
        nenvs = MODEL_CONFIGS[0]['nenvs']
        path = root_folder + ENV_CONFIG['environment_name'] + '-' + str(ENV_CONFIG['num_players'])
        path += '/' + run_name + '/'
        # create models
        models = [Model(nactions, nobs, nplayers, sess = sess, **MC) for MC in MODEL_CONFIGS]
        # create game
        if method == 'single agent':
            game = Game(1, nenvs, load_env_fn)
        else:
            game = Game(nplayers, nenvs, load_env_fn, wait_rewards = True)

        game.reset(REWARDS_CONFIG)
        sess.run(tf.global_variables_initializer())
        # make savers, summaries, history buffers
        savers, summary_writers = create_savers_and_summaries(path, models, ENV_CONFIG)
        history_buffers = [defaultdict(list) for _ in range(len(models))]
        # restore checkpoints where available; otherwise start from kl_init
        kl_th_list = np.array(load_models(path, savers, models, kl_init,  MODEL_CONFIGS[0]['lr']))
        steps_start, time_start = game.total_steps, time.time()
        # nupdates + 1 so that exactly `nupdates` updates run and the final
        # update can still hit the save / evaluation periods
        for nupd in range(1, nupdates + 1):

            # trains each model once; `method` used to be dropped here, so
            # 'self play' was always used regardless of the argument
            train_one_epoch(game, models, history_buffers, k, kl_th_list, episodes, nsteps,
                            method = method, cliprange = cliprange)
            kl_th_list = np.array([max(kl_factor*kl_th, target_kl) for kl_th in kl_th_list])
            # save models once in a while
            if nupd % save_every == 0:
                print_learning_rates(models)
                save_models(models, savers, path, kl_th_list)
            # save summaries, more often than models
            if nupd % summary_every == 0:
                speed = (game.total_steps - steps_start) / (time.time() - time_start)
                print('Speed = %d ts/second' % (speed))
                steps_start, time_start = game.total_steps, time.time()
                save_summary(models, history_buffers, summary_writers, speed)
                history_buffers = [defaultdict(list) for _ in range(len(models))]
            if nupd % evaluation_every == 0:
                matrix = run_evaluation(models, game, eval_eps)
                print('---------------%d---------------' % nupd)
                print('Matrix of models performance with each others:')
                print(matrix)
    finally:
        # release event files and the session even when the run is interrupted
        for writer in summary_writers:
            writer.close()
        sess.close()

In [3]:
# Reward settings; handed to game.reset() by run_experiment.
REWARDS_CONFIG = dict(
    play0 = 1,
    play1 = 3,
    play2 = 9,
    play3 = 27,
    play4 = 81,
    baseline = 3,
    discard_last_copy = -100,
    discard_extra = 0.5,
    hint_last_copy = 0.2,
    hint_penalty = 0.1,
    hint_playable = 0.2,
    use_hamming = True,
    loose_life = -50,
)

# Keyword arguments for rl_env.make(); name and player count also name the run folder.
ENV_CONFIG = dict(
    environment_name = 'Hanabi-Small',
    num_players = 2,
    use_custom_rewards = True,
    open_hands = True,
)

# Base keyword arguments for Model(); copied before each experiment.
MC_BASE = dict(
    nenvs = 32,
    fc_input_layers = [128],
    noisy_fc = False,
    v_net = 'copy',
    gamma = 1,
    ent_coef = 0.0,
    vf_coef = 1,
    lr = 8e-5,
    masked = True,
    max_grad_norm = None,
    total_timesteps = 100000,
    normalize_advs = True,
    normalize_rewards = True,
    layer_norm = False,
    lrschedule = 'constant',
    scope = 'agent',
    use_kl_penalty = False,
)



In [4]:
# Work on a copy so MC_BASE itself stays untouched.
mc = MC_BASE.copy()

# Run 1: batches are sized by episode count (nsteps = None).
run_name = 'neps_90_k=24'
run_experiment(
    run_name,
    [mc],
    ENV_CONFIG,
    REWARDS_CONFIG,
    nupdates = 2000,
    k = 24,
    episodes = 90,
    nsteps = None,
    kl_init = 1,
    target_kl = 1,
    kl_factor = 1,
    cliprange = 0.2,
    save_every = 500,
    summary_every = 40,
    evaluation_every = 250,
    eval_eps = 30,
    method = 'self play',
)

# Run 2: same model settings with nsteps = 36 and more frequent saves/summaries.
run_name = 'nsteps_36'
run_experiment(
    run_name,
    [mc],
    ENV_CONFIG,
    REWARDS_CONFIG,
    nupdates = 2200,
    k = 24,
    episodes = 90,
    nsteps = 36,
    kl_init = 1,
    target_kl = 1,
    kl_factor = 1,
    cliprange = 0.2,
    save_every = 100,
    summary_every = 10,
    evaluation_every = 250,
    eval_eps = 30,
    method = 'self play',
)


# Disabled experiments, kept as a bare string literal: the text below is never executed.
'''run_name = 'singlagent_oh'
run_experiment(run_name, [mc], ENV_CONFIG,  REWARDS_CONFIG,
               nupdates = 6000, k = 24, episodes = 90, 
               kl_init = 1, target_kl = 1, kl_factor = 1, 
               save_every = 100, summary_every = 10, evaluation_every = 250, eval_eps = 30,
               method = 'single agent')


run_name = 'selfplay_norm'
ENV_CONFIG['open_hands'] = False
run_experiment(run_name, [mc], ENV_CONFIG,  REWARDS_CONFIG,
               nupdates = 20000, k = 24, episodes = 90, 
               kl_init = 1, target_kl = 1, kl_factor = 1, 
               save_every = 100, summary_every = 10, evaluation_every = 250, eval_eps = 30,
               method = 'self play')'''




W1129 01:48:21.580241 140436059715392 deprecation_wrapper.py:119] From /home/gr1/Documents/HLE github/hanabi/PPOAgentDynamic/Model.py:38: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.

W1129 01:48:21.588923 140436059715392 deprecation_wrapper.py:119] From /home/gr1/Documents/HLE github/hanabi/PPOAgentDynamic/Model.py:43: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1129 01:48:21.593518 140436059715392 deprecation.py:506] From /home/gr1/.local/lib/python3.6/site-packages/tf_agents/utils/common.py:147: calling Constant.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W1129 01:48:21.624544 140436059715392 deprecation_wrapper.py:119] From /home/gr1/Documents/HLE github/hanabi/PPOAgentDynamic/Model.py:57: The name tf.AUTO_REUSE 

GAME PARAMETERS: 
 observation length = 191 
 number of actions = 11, number of players = 2 


W1129 01:48:21.888865 140436059715392 deprecation_wrapper.py:119] From /home/gr1/Documents/HLE github/hanabi/PPOAgentDynamic/util.py:268: The name tf.get_variable is deprecated. Please use tf.compat.v1.get_variable instead.

W1129 01:48:22.113499 140436059715392 deprecation.py:323] From /home/gr1/Documents/HLE github/hanabi/PPOAgentDynamic/network_building.py:72: multinomial (from tensorflow.python.ops.random_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.random.categorical` instead.
W1129 01:48:22.131980 140436059715392 deprecation_wrapper.py:119] From /home/gr1/Documents/HLE github/hanabi/PPOAgentDynamic/Model.py:79: The name tf.log is deprecated. Please use tf.math.log instead.

W1129 01:48:22.196376 140436059715392 deprecation.py:323] From /home/gr1/.local/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed i

Could not load model agent
Speed = 2683 ts/second
Speed = 2846 ts/second
Speed = 2874 ts/second
Speed = 2610 ts/second
Speed = 2630 ts/second
Speed = 2620 ts/second
---------------250---------------
Matrix of models performance with each others:
[[0.00729167]]
Speed = 2804 ts/second
Speed = 2622 ts/second
Speed = 2597 ts/second
Speed = 2668 ts/second
Speed = 2568 ts/second
Speed = 2589 ts/second
Model agent has lr 0.00008
---------------500---------------
Matrix of models performance with each others:
[[0.31145833]]
Speed = 2687 ts/second
Speed = 2640 ts/second
Speed = 2812 ts/second
Speed = 2659 ts/second
Speed = 2624 ts/second
Speed = 2639 ts/second
---------------750---------------
Matrix of models performance with each others:
[[0.88541667]]
Speed = 2710 ts/second


KeyboardInterrupt: 

In [None]:
# Self-play run with nsteps = 36 (same settings as the 'nsteps_36' call in the cell above).
run_name = 'nsteps_36'
run_experiment(
    run_name,
    [mc],
    ENV_CONFIG,
    REWARDS_CONFIG,
    nupdates = 2200,
    k = 24,
    episodes = 90,
    nsteps = 36,
    kl_init = 1,
    target_kl = 1,
    kl_factor = 1,
    cliprange = 0.2,
    save_every = 100,
    summary_every = 10,
    evaluation_every = 250,
    eval_eps = 30,
    method = 'self play',
)

In [None]:
# Self-play run whose batches are sized by episode count (nsteps = None).
run_name = 'nepisodes_90'
run_experiment(
    run_name,
    [mc],
    ENV_CONFIG,
    REWARDS_CONFIG,
    nupdates = 2200,
    k = 24,
    episodes = 90,
    nsteps = None,
    kl_init = 1,
    target_kl = 1,
    kl_factor = 1,
    cliprange = 0.2,
    save_every = 100,
    summary_every = 10,
    evaluation_every = 250,
    eval_eps = 30,
    method = 'self play',
)