In [1]:
import numpy as np
import time
import matplotlib.pyplot as plt
%matplotlib inline

from env import *
from states import *
from agents import *
from models import *
from util import *

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Length of a fresh Game's `decks`; passed as `deck_size` to every model constructor in this notebook.
deck_size = len(Game().decks)

### Training Pipeline

In [3]:
## function to benchmark the learnt agents
def compare_agents(a_base,a_ai_1,a_ai_2,N_iter_loops=100,N_iter_games=100,state_generator=StateExtendV2,
                   compare_first_base_flag=False,compare_second_base_flag=False,compare_both_flag=True,
                   fname='dataset/valid_set_q.p'):
    """Benchmark agents head-to-head and summarise the evaluation scores as text.

    Every enabled comparison adds two four-seat line-ups: the pair alternating
    around the table, then the same pair with the seats swapped.

    Args:
        a_base: baseline agent, used only by the two ``*_base_flag`` comparisons.
        a_ai_1, a_ai_2: the agents under evaluation.
        N_iter_loops: number of repetitions; mean/std are taken across them.
            Must be at least 1.
        N_iter_games: ``N_iter`` forwarded to every ``run_games`` call.
        state_generator: forwarded unchanged to ``run_games``.
        compare_first_base_flag: include ``a_ai_1`` vs ``a_base``.
        compare_second_base_flag: include ``a_ai_2`` vs ``a_base``.
        compare_both_flag: include ``a_ai_2`` vs ``a_ai_1``.
        fname: file forwarded to ``run_games`` together with
            ``if_random_game=False`` (presumably the fixed validation games --
            confirm against ``run_games``).

    Returns:
        str: elapsed seconds plus the mean and standard deviation (rounded to
        two decimals) of the per-line-up average evaluation score. Each row is
        one enabled comparison, in the order listed above; the two columns are
        the two seat orders.

    Raises:
        ValueError: if ``N_iter_loops`` is smaller than 1.
    """
    if N_iter_loops < 1:
        # np.stack on an empty list would fail later with a message that does
        # not point at the offending argument.
        raise ValueError('N_iter_loops must be at least 1, got {}'.format(N_iter_loops))

    # (first, second) pairs; each one expands to two seat orders below.
    pairings = []
    if compare_first_base_flag:
        pairings.append((a_ai_1,a_base))
    if compare_second_base_flag:
        pairings.append((a_ai_2,a_base))
    if compare_both_flag:
        pairings.append((a_ai_2,a_ai_1))

    cases = []
    for first,second in pairings:
        cases.append([first,second,first,second])
        cases.append([second,first,second,first])

    ttt = time.time()
    eval_results_all = []
    for _ in range(N_iter_loops):
        eval_results = []
        for lineup in cases:
            # Only the evaluation scores are aggregated; the first return value is unused.
            _curr_scores,eval_scores = \
                run_games(N_iter=N_iter_games,agents=lineup,if_random_game=False,
                          fname=fname,state_generator=state_generator)
            eval_results.append(np.mean(eval_scores))
        # One row per comparison, one column per seat order.
        eval_results_all.append(np.array(eval_results).reshape((-1,2)))
    eval_results_all = np.stack(eval_results_all,axis=0)
    eval_mean = np.mean(eval_results_all,axis=0)
    eval_std = np.std(eval_results_all,axis=0)
    return 'time: {:.2f}. eval results: mean={}, std={}.'.format(
                time.time()-ttt,eval_mean.round(2),eval_std.round(2))

#### DQN

In [5]:
def load_q_agent(header,iter_,infer_flag=True):
    """Return an RlAgent backed by the Q-model checkpoint ``<header>/<iter_>``.

    A new QModel is created under ``model/`` and partially restored from the
    checkpoint. The agent gets the requested ``infer_flag`` and has sample
    collection switched off.
    """
    checkpoint = '{}/{}'.format(header,iter_)
    restored_model = QModel(save_dir='model/',deck_size=deck_size)
    restored_model.partial_restore(checkpoint)

    q_agent = RlAgent(restored_model)
    q_agent.infer_flag = infer_flag
    q_agent.sample_collect_flag = False
    return q_agent

In [4]:
# Experiment tag for the DQN run; `header` is the checkpoint path prefix ('dqn-v1').
version = 1
header = 'dqn-v{}'.format(version)

In [5]:
## initialize model
# Q-model instance that the DQN training cell below passes to run_games.
q_model = QModel(save_dir='model/',deck_size=deck_size)
q_model.learn_iter = 1
# Offset added to every checkpoint id written by the training loop.
curr_iter = 0
# Save the untrained weights as '<header>/0' and restore them straight away;
# the final evaluation cell later loads this checkpoint via load_q_agent(header,0).
q_model.save('{}/{}'.format(header,curr_iter))
q_model.restore('{}/{}'.format(header,curr_iter))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from model/dqn-v1/0/model


In [7]:
## train model using DQN
# Four agents, all backed by the same q_model and all collecting samples.
agents = []
for _ in range(4):
    agent = RlAgent(q_model)
    agent.sample_collect_flag = True
    agents.append(agent)

tt = time.time()
N_iter_inner = 10000  # games per outer iteration (passed as N_iter to run_games)
eval_freq = 1  # evaluate every `eval_freq` outer iterations
for iter_outer in range(5):
    # NOTE(review): training uses StateExtend, while compare_agents evaluates with its
    # default StateExtendV2 -- confirm the mismatch is intended.
    run_games(N_iter=N_iter_inner,agents=agents,if_train=True,models=[q_model],state_generator=StateExtend)
    # Checkpoint id = curr_iter + games played so far. The 'dqn-v{}' prefix built here
    # is the same string as `header`.
    q_model.save('dqn-v{}/{}'.format(version,curr_iter+(iter_outer+1)*N_iter_inner))
    q_model.partial_restore('dqn-v{}/{}'.format(version,curr_iter+(iter_outer+1)*N_iter_inner))
    # 1-based outer iteration and cumulative wall-clock seconds.
    print('{}:{:.2f}'.format(iter_outer+1,time.time() - tt))
    
    # `iter_outer > 0` skips the evaluation after the very first outer iteration.
    if (iter_outer+1) % eval_freq == 0 and iter_outer > 0:
        ## do some evaluation
        # a_ai_1 = checkpoint from `eval_freq` outer iterations ago, a_ai_2 = the one just saved.
        a_ai_1 = load_q_agent(header,curr_iter+(iter_outer+1-eval_freq)*N_iter_inner)
        a_ai_2 = load_q_agent(header,curr_iter+(iter_outer+1)*N_iter_inner)
        a_base = Agent()
        
        # Two comparisons are enabled, so the reported matrices have two rows:
        # a_ai_1 vs a_base, then a_ai_2 vs a_ai_1.
        out_str = compare_agents(a_base,a_ai_1,a_ai_2,N_iter_loops=100,N_iter_games=100,
                    compare_first_base_flag=True,compare_both_flag=True)
        # NOTE(review): this prefix is the 0-based iter_outer, whereas the timing line
        # above prints iter_outer+1.
        print('{}, {}'.format(iter_outer,out_str))

INFO:tensorflow:Restoring parameters from model/dqn-v1/10000/model
1:2713.14
INFO:tensorflow:Restoring parameters from model/dqn-v1/20000/model
2:5512.90
INFO:tensorflow:Restoring parameters from model/dqn-v1/10000/model
INFO:tensorflow:Restoring parameters from model/dqn-v1/20000/model
1, time: 680.59. eval results: [[ 19.6 -35.8]
 [-15.1  -8.8]].
INFO:tensorflow:Restoring parameters from model/dqn-v1/30000/model
3:8993.65
INFO:tensorflow:Restoring parameters from model/dqn-v1/20000/model
INFO:tensorflow:Restoring parameters from model/dqn-v1/30000/model
2, time: 692.78. eval results: [[ 33.6 -37.8]
 [ -6.3  -7.9]].
INFO:tensorflow:Restoring parameters from model/dqn-v1/40000/model
4:12487.96
INFO:tensorflow:Restoring parameters from model/dqn-v1/30000/model
INFO:tensorflow:Restoring parameters from model/dqn-v1/40000/model
3, time: 696.97. eval results: [[ 26.2 -37.4]
 [-11.9  -4.1]].
INFO:tensorflow:Restoring parameters from model/dqn-v1/50000/model
5:16037.00
INFO:tensorflow:Restor

In [6]:
## do some final evaluation
# Untrained checkpoint (0) against the last checkpoint written by the training
# cell (5 outer iterations x 10000 games = 50000).
a_ai_1 = load_q_agent(header,0)
a_ai_2 = load_q_agent(header,50000)
a_base = Agent()

# Only compare_both_flag is enabled (the *_base flags default to False), so
# a_base is passed but not seated in any line-up here.
out_str = compare_agents(a_base,a_ai_1,a_ai_2,N_iter_loops=100,N_iter_games=100,compare_both_flag=True)
print(out_str)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from model/dqn-v1/0/model
INFO:tensorflow:Restoring parameters from model/dqn-v1/50000/model
time: 495.63. eval results: mean=[[ 15.44 -32.92]], std=[[6.72 4.7 ]].


#### CFR

In [8]:
## useful utility functions
def init_cfr_models(learn_iter,header,curr_iter=0):
    """Create the CFR model set and save it as checkpoint ``curr_iter``.

    Builds four regret models (target flag 'r', ``learn_iter`` each), one
    policy model (target flag 'p', ``4 * learn_iter``) and one baseline model
    (``learn_iter``), then saves them all under ``header`` through
    ``save_cfr_models``.

    Returns:
        tuple: ``(regret_models, policy_model, baseline_model)``.
    """
    def _fresh_cfr_model(n_learn,target):
        # Build one CFRModel and set its training attributes.
        net = CFRModel(save_dir='model/',deck_size=deck_size)
        net.learn_iter = n_learn
        net.train_target_flag = target
        return net

    regret_models = [_fresh_cfr_model(learn_iter,'r') for _ in range(4)]
    policy_model = _fresh_cfr_model(learn_iter * 4,'p')

    baseline_model = CFRBaselineModel(save_dir='model/',deck_size=deck_size)
    baseline_model.learn_iter = learn_iter

    save_cfr_models(regret_models,policy_model,baseline_model,header,iter_=curr_iter)
    return regret_models,policy_model,baseline_model

def save_cfr_models(regret_models,policy_model,baseline_model,header,iter_):
    """Save the CFR model set for checkpoint ``iter_`` under ``header``.

    The first four regret models go to ``<header>/r<k>-<iter_>`` (k = 0..3),
    the policy model to ``<header>/p-<iter_>`` and the baseline model to
    ``<header>/b-<iter_>``, in that order.
    """
    for seat in range(4):
        regret_path = '{}/r{}-{}'.format(header,seat,iter_)
        regret_models[seat].save(regret_path)

    policy_path = '{}/p-{}'.format(header,iter_)
    policy_model.save(policy_path)

    baseline_path = '{}/b-{}'.format(header,iter_)
    baseline_model.save(baseline_path)

def load_cfr_models(header,iter_):
    """Rebuild the CFR model set from checkpoint ``iter_`` under ``header``.

    New model objects are created and partially restored from the paths
    written by ``save_cfr_models``: four regret models, then the policy model,
    then the baseline model.

    Returns:
        tuple: ``(regret_models, policy_model, baseline_model)``.
    """
    def _restore(model_cls,checkpoint):
        # Build one model of the given class and restore it from `checkpoint`.
        net = model_cls(save_dir='model/',deck_size=deck_size)
        net.partial_restore(checkpoint)
        return net

    regret_models = [_restore(CFRModel,'{}/r{}-{}'.format(header,seat,iter_))
                     for seat in range(4)]
    policy_model = _restore(CFRModel,'{}/p-{}'.format(header,iter_))
    baseline_model = _restore(CFRBaselineModel,'{}/b-{}'.format(header,iter_))
    return regret_models,policy_model,baseline_model

def load_cfr_agent(header,iter_,infer_flag=True):
    """Return a CFRAgent built from CFR checkpoint ``iter_`` under ``header``.

    The agent gets the requested ``infer_flag`` and has sample collection
    switched off.
    """
    # The baseline model is restored with the rest of the checkpoint but is
    # not handed to the agent.
    regret_models,policy_model,_ = load_cfr_models(header,iter_)

    cfr_agent = CFRAgent(regret_models,policy_model)
    cfr_agent.infer_flag = infer_flag
    cfr_agent.sample_collect_flag = False
    return cfr_agent

In [9]:
# Experiment tag for the CFR run. This rebinds the `version`/`header` names used by
# the DQN section, so `header` is now the checkpoint path prefix 'cfr-v1'.
version = 1
header = 'cfr-v{}'.format(version)

In [10]:
## initialize model
# Passed to init_cfr_models as `learn_iter`: regret/baseline models get this value,
# the policy model gets four times as much.
N_iter_train = 500
# Offset added to every checkpoint id; the initial models are saved as checkpoint 0.
curr_iter = 0
regret_models,policy_model,baseline_model = init_cfr_models(N_iter_train,header,curr_iter)

In [11]:
## train model using CFR
# NOTE(review): e_sim is only referenced by the commented-out agent variants below;
# the active CFRBaselineAgent does not receive it.
e_sim = Env(Game(),DefaultPlayer(),state_generator=StateExtendV2)
explore_alpha = 0.1
# One agent per seat; each is constructed with explore_player set to its own seat index.
agents = []
for p_ in range(4):
#     agent = CFRAgent(regret_models,policy_model,explore_alpha=explore_alpha,explore_player=p_)
#     agent = CFRRolloutAgent(regret_models,policy_model,e_sim,explore_alpha=explore_alpha,explore_player=p_)
#     agent = CFRExternalAgent(regret_models,policy_model,e_sim,explore_alpha=explore_alpha,explore_player=p_)
    agent = CFRBaselineAgent(regret_models,policy_model,baseline_model,
                             explore_alpha=explore_alpha,explore_player=p_)
    agent.infer_flag = False
    agents.append(agent)

tt = time.time()
N_iter_inner = 100  # games per player per outer iteration; also the checkpoint id step
eval_freq = 1  # evaluate every `eval_freq` outer iterations
for iter_outer in range(20):
    for agent in agents:
        agent.update_train_meta()
        
    # Players take turns: only agents[p_] collects samples during its run of games.
    for p_ in range(4):
        for agent in agents:
            agent.sample_collect_flag = False
        agents[p_].sample_collect_flag = True
        
        # if_train=False here; the models are updated explicitly by the learn() calls below.
        run_games(N_iter=N_iter_inner,agents=agents,if_train=False,
                  models=regret_models,state_generator=StateExtendV2)
        
        # This player's regret model and the shared baseline model learn after each player's games.
        regret_models[p_].learn()
        baseline_model.learn()
        
    # The policy model learns once per outer iteration, after all four players.
    policy_model.learn()
    
    save_cfr_models(regret_models,policy_model,baseline_model,header,curr_iter+(iter_outer+1)*N_iter_inner)
    # 1-based outer iteration and cumulative wall-clock seconds.
    print('{}:{:.2f}'.format(iter_outer+1,time.time() - tt))
    
    # `iter_outer > 0` skips the evaluation after the very first outer iteration.
    if (iter_outer+1) % eval_freq == 0 and iter_outer > 0:
        ## do some evaluation
        # a_ai_1 = checkpoint from `eval_freq` outer iterations ago, a_ai_2 = the one just saved.
        a_ai_1 = load_cfr_agent(header,curr_iter+(iter_outer+1-eval_freq)*N_iter_inner)
        a_ai_2 = load_cfr_agent(header,curr_iter+(iter_outer+1)*N_iter_inner)
        a_base = Agent()
        
        # Two comparisons are enabled, so the reported matrices have two rows:
        # a_ai_1 vs a_base, then a_ai_2 vs a_ai_1.
        out_str = compare_agents(a_base,a_ai_1,a_ai_2,N_iter_loops=100,N_iter_games=100,
                    compare_first_base_flag=True,compare_both_flag=True)
        # NOTE(review): this prefix is the 0-based iter_outer, whereas the timing line
        # above prints iter_outer+1.
        print('{}, {}'.format(iter_outer,out_str))

1:222.84
2:1813.69
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from model/cfr-v1/r0-100/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r1-100/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r2-100/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r3-100/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/p-100/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/b-100/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r0-200/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r1-200/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r2-200/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r3-200/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/p-200/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/b-200/model
1, time: 1249.13. eval results: mean=[[-6.7  -5.46]
 [-6.75 -6.54]], std=[[7.51 7.

INFO:tensorflow:Restoring parameters from model/cfr-v1/r1-1000/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r2-1000/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r3-1000/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/p-1000/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/b-1000/model
9, time: 1279.81. eval results: mean=[[-8.39 -1.52]
 [-3.18 -1.76]], std=[[7.41 7.72]
 [7.43 6.46]].
11:42221.27
INFO:tensorflow:Restoring parameters from model/cfr-v1/r0-1000/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r1-1000/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r2-1000/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r3-1000/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/p-1000/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/b-1000/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r0-1100/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r1-1100/

INFO:tensorflow:Restoring parameters from model/cfr-v1/p-1800/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/b-1800/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r0-1900/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r1-1900/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r2-1900/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r3-1900/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/p-1900/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/b-1900/model
18, time: 1315.90. eval results: mean=[[-7.65 -6.56]
 [-5.81 -6.18]], std=[[6.95 7.48]
 [8.44 7.08]].
20:85090.47
INFO:tensorflow:Restoring parameters from model/cfr-v1/r0-1900/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r1-1900/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r2-1900/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r3-1900/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/p-1900/

In [13]:
## do some final evaluation
# NOTE(review): the agents are loaded with infer_flag=False here, whereas the in-loop
# evaluation uses load_cfr_agent's default infer_flag=True -- confirm this is intended.
infer_flag = False
# Initial checkpoint (0) against the last checkpoint written by the training
# cell (20 outer iterations x 100 = 2000).
a_ai_1 = load_cfr_agent(header,0,infer_flag)
a_ai_2 = load_cfr_agent(header,2000,infer_flag)
a_base = Agent()

# Only compare_both_flag is enabled (the *_base flags default to False), so
# a_base is passed but not seated in any line-up here.
out_str = compare_agents(a_base,a_ai_1,a_ai_2,N_iter_loops=100,N_iter_games=100,compare_both_flag=True)
print(out_str)

INFO:tensorflow:Restoring parameters from model/cfr-v1/r0-0/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r1-0/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r2-0/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r3-0/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/p-0/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/b-0/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r0-2000/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r1-2000/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r2-2000/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/r3-2000/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/p-2000/model
INFO:tensorflow:Restoring parameters from model/cfr-v1/b-2000/model
time: 491.76. eval results: mean=[[-2.8  -6.08]], std=[[6.5  7.68]].


#### Fictitious Play (FP)

In [17]:
## useful utility functions
def init_fp_models(learn_iter,header,curr_iter=0):
    """Create the fictitious-play model set and save it as checkpoint ``curr_iter``.

    Builds four per-player policy models (``learn_iter`` each) and one
    aggregate policy model (``2 * learn_iter``), all CFRModel instances with
    target flag 'p', then saves them under ``header`` through
    ``save_fp_models``.

    Returns:
        tuple: ``(policy_models, agg_policy_model)``.
    """
    def _fresh_policy_model(n_learn):
        # Build one CFRModel configured for the 'p' target.
        net = CFRModel(save_dir='model/',deck_size=deck_size)
        net.learn_iter = n_learn
        net.train_target_flag = 'p'
        return net

    policy_models = [_fresh_policy_model(learn_iter) for _ in range(4)]
    agg_policy_model = _fresh_policy_model(learn_iter * 2)

    save_fp_models(policy_models,agg_policy_model,header,iter_=curr_iter)
    return policy_models,agg_policy_model

def save_fp_models(policy_models,agg_policy_model,header,iter_):
    """Save the fictitious-play model set for checkpoint ``iter_`` under ``header``.

    The first four per-player policy models go to ``<header>/p<k>-<iter_>``
    (k = 0..3), followed by the aggregate policy model at
    ``<header>/agg-<iter_>``.
    """
    for seat in range(4):
        seat_path = '{}/p{}-{}'.format(header,seat,iter_)
        policy_models[seat].save(seat_path)

    agg_path = '{}/agg-{}'.format(header,iter_)
    agg_policy_model.save(agg_path)

def load_fp_models(header,iter_):
    """Rebuild the fictitious-play model set from checkpoint ``iter_`` under ``header``.

    New CFRModel objects are created and partially restored from the paths
    written by ``save_fp_models``: four per-player policy models, then the
    aggregate policy model.

    Returns:
        tuple: ``(policy_models, agg_policy_model)``.
    """
    def _restore(checkpoint):
        # Build one CFRModel and restore it from `checkpoint`.
        net = CFRModel(save_dir='model/',deck_size=deck_size)
        net.partial_restore(checkpoint)
        return net

    policy_models = [_restore('{}/p{}-{}'.format(header,seat,iter_))
                     for seat in range(4)]
    agg_policy_model = _restore('{}/agg-{}'.format(header,iter_))
    return policy_models,agg_policy_model

def load_fp_agent(header,iter_,infer_flag=True):
    """Return an FPAgent built from FP checkpoint ``iter_`` under ``header``.

    The agent gets the requested ``infer_flag``, status 'infer', and has
    sample collection switched off.
    """
    # NOTE(review): the per-player policy models are restored but unused; the aggregate
    # model fills both FPAgent arguments -- confirm this is intended.
    _,agg_policy_model = load_fp_models(header,iter_)

    fp_agent = FPAgent(agg_policy_model,agg_policy_model)
    fp_agent.infer_flag = infer_flag
    fp_agent.status_flag = 'infer'
    fp_agent.sample_collect_flag = False
    return fp_agent

In [18]:
# Experiment tag for the fictitious-play run. This rebinds the `version`/`header` names
# used by the earlier sections, so `header` is now the checkpoint path prefix 'fp-v1'.
version = 1
header = 'fp-v{}'.format(version)

In [19]:
## initialize model
# Passed to init_fp_models as `learn_iter`: per-player policy models get this value,
# the aggregate policy model gets twice as much.
N_iter_train = 500
# Offset added to every checkpoint id; the initial models are saved as checkpoint 0.
curr_iter = 0
policy_models,agg_policy_model = init_fp_models(N_iter_train,header,curr_iter)

In [20]:
## train model using FP
# One agent per seat: its own policy model plus the shared aggregate policy model.
agents = []
for p_ in range(4):
    agent = FPAgent(policy_models[p_],agg_policy_model)
    agents.append(agent)

tt = time.time()
N_iter_inner = 2000  # games in each 'BR' phase; also the checkpoint id step
eval_freq = 1  # evaluate every `eval_freq` outer iterations
for iter_outer in range(20):
    for agent in agents:
        agent.update_train_meta()
        
    # Players take turns: everyone is reset to 'infer' with sample collection off,
    # then only agents[p_] collects samples.
    for p_ in range(4):
        for agent in agents:
            agent.sample_collect_flag = False
            agent.status_flag = 'infer'
        agents[p_].sample_collect_flag = True
        
        ## BR
        # A brand-new QModel is attached to agents[p_] and trained by run_games
        # (if_train=True) while that agent is in 'BR' status.
        # NOTE(review): this constructs a new QModel four times per outer iteration (80 in
        # total); if QModel holds resources that are not released, memory may grow -- confirm.
        q_model_tmp = QModel(save_dir='model/',deck_size=deck_size)
        q_model_tmp.learn_iter = 1
        agents[p_].q_model = q_model_tmp
        agents[p_].status_flag = 'BR'
        run_games(N_iter=N_iter_inner,agents=agents,if_train=True,
                  models=[q_model_tmp],state_generator=StateExtendV2)
        
        ## avg
        # agents[p_] switches to 'avg' status for 500 games without in-game training,
        # then its own policy model learns.
        agents[p_].status_flag = 'avg'
        run_games(N_iter=500,agents=agents,if_train=False,state_generator=StateExtendV2)
        
        policy_models[p_].learn()
    # The aggregate policy model learns once per outer iteration, after all four players.
    agg_policy_model.learn()
    
    save_fp_models(policy_models,agg_policy_model,header,curr_iter+(iter_outer+1)*N_iter_inner)
    # 1-based outer iteration and cumulative wall-clock seconds.
    print('{}:{:.2f}'.format(iter_outer+1,time.time() - tt))
    
    # `iter_outer > 0` skips the evaluation after the very first outer iteration.
    if (iter_outer+1) % eval_freq == 0 and iter_outer > 0:
        ## do some evaluation
        # a_ai_1 = checkpoint from `eval_freq` outer iterations ago, a_ai_2 = the one just saved.
        a_ai_1 = load_fp_agent(header,curr_iter+(iter_outer+1-eval_freq)*N_iter_inner)
        a_ai_2 = load_fp_agent(header,curr_iter+(iter_outer+1)*N_iter_inner)
        a_base = Agent()
        
        # Two comparisons are enabled, so the reported matrices have two rows:
        # a_ai_1 vs a_base, then a_ai_2 vs a_ai_1.
        out_str = compare_agents(a_base,a_ai_1,a_ai_2,N_iter_loops=100,N_iter_games=100,
                    compare_first_base_flag=True,compare_both_flag=True)
        # NOTE(review): this prefix is the 0-based iter_outer, whereas the timing line
        # above prints iter_outer+1.
        print('{}, {}'.format(iter_outer,out_str))

1:3841.25
2:7587.94
INFO:tensorflow:Restoring parameters from model/fp-v1/p0-2000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/p1-2000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/p2-2000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/p3-2000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/agg-2000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/p0-4000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/p1-4000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/p2-4000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/p3-4000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/agg-4000/model
1, time: 783.93. eval results: mean=[[-5.29 -6.2 ]
 [-8.38 -8.72]], std=[[6.47 6.96]
 [6.18 6.41]].
3:12275.24
INFO:tensorflow:Restoring parameters from model/fp-v1/p0-4000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/p1-4000/model
INFO:tensorflow:Restoring parameters from model/fp-

INFO:tensorflow:Restoring parameters from model/fp-v1/p3-22000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/agg-22000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/p0-24000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/p1-24000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/p2-24000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/p3-24000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/agg-24000/model
11, time: 812.29. eval results: mean=[[-6.17 -6.29]
 [-7.02 -5.75]], std=[[7.49 7.91]
 [6.14 7.01]].
13:69509.62
INFO:tensorflow:Restoring parameters from model/fp-v1/p0-24000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/p1-24000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/p2-24000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/p3-24000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/agg-24000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/p0

In [22]:
## do some final evaluation
# Initial checkpoint (0) against the last checkpoint written by the training
# cell (20 outer iterations x 2000 = 40000).
a_ai_1 = load_fp_agent(header,0)
a_ai_2 = load_fp_agent(header,40000)
a_base = Agent()

# Only compare_both_flag is enabled (the *_base flags default to False), so
# a_base is passed but not seated in any line-up here.
out_str = compare_agents(a_base,a_ai_1,a_ai_2,N_iter_loops=100,N_iter_games=100,compare_both_flag=True)
print(out_str)

INFO:tensorflow:Restoring parameters from model/fp-v1/p0-0/model
INFO:tensorflow:Restoring parameters from model/fp-v1/p1-0/model
INFO:tensorflow:Restoring parameters from model/fp-v1/p2-0/model
INFO:tensorflow:Restoring parameters from model/fp-v1/p3-0/model
INFO:tensorflow:Restoring parameters from model/fp-v1/agg-0/model
INFO:tensorflow:Restoring parameters from model/fp-v1/p0-40000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/p1-40000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/p2-40000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/p3-40000/model
INFO:tensorflow:Restoring parameters from model/fp-v1/agg-40000/model
time: 508.81. eval results: mean=[[-5.5  -7.26]], std=[[7.54 6.56]].
