# Setup for Performance Test

## Path Configuration

In [1]:
import os
import sys
import argparse

# "../" repeated three times: the repository root is expected three directory
# levels above the directory this notebook is run from.
root = "../"*3
src_path = os.path.join(root, "kyoka")
sample_path = os.path.join(root, "sample")
# Make the root and its "kyoka" / "sample" sub-directories importable so the
# `kyoka.*` and `sample.*` imports below resolve.
sys.path.append(root)
sys.path.append(src_path)
sys.path.append(sample_path)

import logging as log
log.basicConfig(format='[%(levelname)s] %(message)s', level=log.INFO)

from kyoka.algorithm.montecarlo.montecarlo import MonteCarlo
from kyoka.algorithm.td_learning.sarsa import Sarsa
from kyoka.algorithm.td_learning.q_learning import QLearning
from kyoka.algorithm.td_learning.sarsa_lambda import SarsaLambda
from kyoka.algorithm.td_learning.q_lambda import QLambda

from kyoka.policy.greedy_policy import GreedyPolicy
from kyoka.policy.epsilon_greedy_policy import EpsilonGreedyPolicy
from kyoka.finish_rule.watch_iteration_count import WatchIterationCount

from sample.ticktacktoe.ticktacktoe_domain import TickTackToeDomain
from sample.ticktacktoe.ticktacktoe_keras_value_function import TickTackToeKerasValueFunction
from sample.ticktacktoe.ticktacktoe_helper import TickTackToeHelper
from sample.ticktacktoe.ticktacktoe_manual_policy import TickTackToeManualPolicy
from sample.ticktacktoe.ticktacktoe_perfect_policy import TickTackToePerfectPolicy
from sample.ticktacktoe.ticktacktoe_performance_logger import TickTackToePerformanceLogger

Using Theano backend.


## Define Constants for Performance Test

In [2]:
# Passed to WatchIterationCount as `log_interval` in run_performance_test.
LOG_INTERVAL = 1000
# Passed to the performance logger via set_test_game_count.
TEST_GAME_COUNT = 10
# Passed to the performance logger via set_performance_test_interval; also the
# spacing of the x-axis labels in visualize_test_result.
TEST_INTERVAL = 10000
# Passed as `is_first_player` to the domain and to the performance logger
# (presumably: the learning agent makes the first move -- confirm).
IS_FIRST_PLAYER = True

# Single domain instance shared by every run_performance_test call.
domain = TickTackToeDomain(is_first_player=IS_FIRST_PLAYER)

## Set Up Global Helpers for Performance Test

In [3]:
def gen_performance_logger():
    """Return a fresh TickTackToePerformanceLogger set up from the notebook constants.

    The logger receives TEST_INTERVAL, IS_FIRST_PLAYER and TEST_GAME_COUNT
    through its setter methods, in that order.
    """
    perf_logger = TickTackToePerformanceLogger()
    settings = (
        (perf_logger.set_performance_test_interval, TEST_INTERVAL),
        (perf_logger.set_is_first_player, IS_FIRST_PLAYER),
        (perf_logger.set_test_game_count, TEST_GAME_COUNT),
    )
    for apply_setting, value in settings:
        apply_setting(value)
    return perf_logger

def run_performance_test(rl_algo, epsilon, test_length):
    """Run `rl_algo.GPI` on the shared domain and return the logger's game log.

    Args:
        rl_algo: algorithm object exposing `set_gpi_callback` and `GPI`.
        epsilon: passed as `eps` to the EpsilonGreedyPolicy used during the run.
        test_length: passed as `target_count` to the WatchIterationCount
            finish rule.

    Returns:
        The `game_log` attribute of the performance logger attached to the run.
    """
    stop_rule = WatchIterationCount(target_count=test_length, log_interval=LOG_INTERVAL)

    # A new value function is created and set up for every run.
    value_function = TickTackToeKerasValueFunction()
    value_function.setUp()
    behaviour_policy = EpsilonGreedyPolicy(domain, value_function, eps=epsilon)

    perf_logger = gen_performance_logger()
    rl_algo.set_gpi_callback(perf_logger)
    rl_algo.GPI(domain, behaviour_policy, value_function, [stop_rule])
    return perf_logger.game_log

In [4]:
%matplotlib inline
import seaborn
import matplotlib.pyplot as plt

def visualize_test_result(title, performance_test_result):
    """Plot the lose / draw / win curves recorded during a performance test.

    Args:
        title: title shown above the figure.
        performance_test_result: sequence of log entries; element 1 of each
            entry is indexed as (lose, draw, win).
    """
    # The n-th entry (1-based) is plotted at x = n * TEST_INTERVAL.
    iterations = [TEST_INTERVAL * n for n in range(1, len(performance_test_result) + 1)]
    outcomes = [entry[1] for entry in performance_test_result]
    series = [[outcome[column] for outcome in outcomes] for column in range(3)]
    legend_names = ("lose rate", "draw rate", "win rate")

    plt.figure(figsize=(10, 5))
    for curve, legend_name in zip(series, legend_names):
        plt.plot(iterations, curve, label=legend_name)

    plt.xlabel("GPI iteration")
    plt.ylabel("rate(%)")
    plt.title(title)
    plt.legend(loc=1)

    plt.show()

In [42]:
# Readable names for the action values returned by the domain. Every action is
# a single set bit: bit 0 (value 1) is the lower-right cell and bit 8
# (value 256) is the upper-left cell.
ACTION_NAME_MAP = {
    1 << bit: cell_name
    for bit, cell_name in enumerate((
        "lower_right", "lower_center", "lower_left",
        "middle_right", "middle_center", "middle_left",
        "upper_right", "upper_center", "upper_left",
    ))
}

def gen_test_case():
    """Build the fixed boards used to inspect a learned policy.

    A board is a ``(first_player_board, second_player_board)`` pair of 9-bit
    masks written as binary strings. The leftmost digit is the upper-left
    cell and the rightmost digit the lower-right cell, the same layout as
    ACTION_NAME_MAP. In the diagrams ``O`` is a first-player stone and ``X``
    a second-player stone.

    Returns:
        A list of ``(board, answer)`` pairs. ``answer`` maps the name of each
        empty cell to its expected value (1, 0 or -1; presumably win / draw /
        lose for the first player -- confirm against the reward definition).
    """
    def bin2i(bits):
        return int(bits, 2)

    # - - -
    # - - -
    # - - -
    case1 = (bin2i("000000000"), bin2i("000000000"))
    answer1 = {
        'upper_right': 0, 'lower_left': 0, 'upper_left': 0, 'lower_right': 0, 'lower_center': 0,
        'middle_right': 0, 'middle_left': 0, 'upper_center': 0, 'middle_center': 0
    }

    # O O -
    # - - -
    # - X X
    case2 = (bin2i("110000000"), bin2i("000000011"))
    answer2 = {'middle_right': -1, 'upper_right': 1, 'middle_left': -1, 'lower_left': 1, 'middle_center': -1}

    # - - -
    # - O X
    # - - -
    # (The literal used to be the 10-digit "0000010000"; the value is unchanged.)
    case3 = (bin2i("000010000"), bin2i("000001000"))
    answer3 = {'upper_right': 1, 'lower_left': 1, 'upper_left': 1, 'lower_right': 1, 'lower_center': 1, 'middle_left': 0, 'upper_center': 1}

    # - O -
    # - - X
    # - - -
    case4 = (bin2i("010000000"), bin2i("000001000"))
    answer4 = {'upper_right': 1, 'lower_left': 0, 'upper_left': -1, 'lower_right': 0, 'lower_center': -1, 'middle_left': 0, 'middle_center': 1}

    # - - O
    # - - X
    # - - -
    case5 = (bin2i("001000000"), bin2i("000001000"))
    answer5 = {'lower_left': 0, 'upper_left': 1, 'lower_right': 0, 'lower_center': 0, 'middle_left': 0, 'upper_center': 1, 'middle_center': 1}

    cases = [case1, case2, case3, case4, case5]
    answers = [answer1, answer2, answer3, answer4, answer5]
    # list(...) keeps the result reusable on Python 3, where zip is a one-shot iterator.
    return list(zip(cases, answers))

def visualize_policy(model_weights_path):
    """Print the expected answers next to the learned action values for each test board.

    For every board from gen_test_case() this prints the board, then the
    expected answers and the values computed by the loaded value function,
    each sorted from highest to lowest value.

    Args:
        model_weights_path: path handed to the value function's
            `load_model_weights`.
    """
    domain = TickTackToeDomain()
    value_func = TickTackToeKerasValueFunction()
    value_func.setUp()
    value_func.load_model_weights(model_weights_path)
    for state, answer in gen_test_case():
        # Parenthesised single-argument print gives the same output on Python 2 and 3.
        print(TickTackToeHelper.visualize_board(state))
        # Ask the domain once and reuse the result for both names and values.
        possible_actions = list(domain.generate_possible_actions(state))
        policy = {
            ACTION_NAME_MAP[action]: value_func.calculate_value(state, action)
            for action in possible_actions
        }
        # sorted(...)[::-1] is kept rather than reverse=True so that entries with
        # equal values are listed in the same order as before.
        print("answer => %s" % sorted(answer.items(), key=lambda item: item[1])[::-1])
        print("policy => %s" % sorted(policy.items(), key=lambda item: item[1])[::-1])
        print("")

# Sarsa

In [1]:
# Train Sarsa with an epsilon-greedy policy (eps=EPSILON) until the iteration
# count reaches TEST_LENGTH, then plot the logged lose/draw/win rates.
TEST_LENGTH = 1000000
EPSILON = 0.3
performance_test_result = run_performance_test(Sarsa(alpha=0.1, gamma=0.7), epsilon=EPSILON, test_length=TEST_LENGTH)
visualize_test_result("Sarsa", performance_test_result)

<img src="./resource/Sarsa_eps_0.300000_test_result.png" />

In [43]:
# Load the weights stored for Sarsa (eps=0.3) and print expected vs. learned action values.
visualize_policy("./resource/Sarsa_0.300000.h5")

- - -
- - -
- - -
answer => [('lower_center', 0), ('middle_center', 0), ('lower_right', 0), ('upper_left', 0), ('upper_center', 0), ('lower_left', 0), ('middle_left', 0), ('middle_right', 0), ('upper_right', 0)]
policy => [('upper_left', 0.21549654006958008), ('upper_right', 0.20649242401123047), ('middle_center', 0.19720602035522461), ('upper_center', 0.19693684577941895), ('lower_left', 0.19380760192871094), ('lower_right', 0.1855013370513916), ('middle_left', 0.17978930473327637), ('lower_center', 0.17633581161499023), ('middle_right', 0.16120791435241699)]

O O -
- - -
- X X
answer => [('lower_left', 1), ('upper_right', 1), ('middle_center', -1), ('middle_left', -1), ('middle_right', -1)]
policy => [('upper_right', 1.010073184967041), ('lower_left', 0.48578739166259766), ('middle_left', 0.44510889053344727), ('middle_right', 0.3795771598815918), ('middle_center', 0.19780659675598145)]

- - -
- O X
- - -
answer => [('upper_center', 1), ('lower_center', 1), ('lower_right', 1), ('uppe

In [None]:
# `time` is not among the notebook's imports, so import it here; without it
# the time.time() call below raises NameError.
import time

# Train Sarsa with an epsilon-greedy policy (eps=EPSILON) until the iteration
# count reaches TEST_LENGTH, then plot the logged lose/draw/win rates.
start_time = time.time()  # wall-clock start of this run
TEST_LENGTH = 1000000
EPSILON = 0.7
performance_test_result = run_performance_test(Sarsa(alpha=0.1, gamma=0.7), epsilon=EPSILON, test_length=TEST_LENGTH)
visualize_test_result("Sarsa", performance_test_result)

<img src="./resource/Sarsa_eps_0.700000_test_result.png" />

In [44]:
# Load the weights stored for Sarsa (eps=0.7) and print expected vs. learned action values.
visualize_policy("./resource/Sarsa_0.700000.h5")

- - -
- - -
- - -
answer => [('lower_center', 0), ('middle_center', 0), ('lower_right', 0), ('upper_left', 0), ('upper_center', 0), ('lower_left', 0), ('middle_left', 0), ('middle_right', 0), ('upper_right', 0)]
policy => [('lower_left', 0.11326086521148682), ('upper_left', 0.1124175488948822), ('upper_right', 0.10433401167392731), ('lower_center', 0.101408451795578), ('upper_center', 0.10070347785949707), ('lower_right', 0.096334934234619141), ('middle_center', 0.095318853855133057), ('middle_right', 0.088748008012771606), ('middle_left', 0.082444429397583008)]

O O -
- - -
- X X
answer => [('lower_left', 1), ('upper_right', 1), ('middle_center', -1), ('middle_left', -1), ('middle_right', -1)]
policy => [('upper_right', 0.98915904760360718), ('middle_left', 0.38471138477325439), ('lower_left', 0.3119371235370636), ('middle_center', 0.13226363062858582), ('middle_right', 0.12553438544273376)]

- - -
- O X
- - -
answer => [('upper_center', 1), ('lower_center', 1), ('lower_right', 1), ('

# QLearning

In [None]:
# Train QLearning with an epsilon-greedy policy (eps=EPSILON) until the
# iteration count reaches TEST_LENGTH, then plot the logged lose/draw/win rates.
TEST_LENGTH = 1000000
EPSILON = 0.3
performance_test_result = run_performance_test(QLearning(alpha=0.1, gamma=0.7), epsilon=EPSILON, test_length=TEST_LENGTH)
visualize_test_result("QLearning", performance_test_result)

<img src="./resource/QLearning_eps_0.300000_test_result.png" />

In [45]:
# Load the weights stored for QLearning (eps=0.3) and print expected vs. learned action values.
visualize_policy("./resource/QLearning_0.300000.h5")

- - -
- - -
- - -
answer => [('lower_center', 0), ('middle_center', 0), ('lower_right', 0), ('upper_left', 0), ('upper_center', 0), ('lower_left', 0), ('middle_left', 0), ('middle_right', 0), ('upper_right', 0)]
policy => [('middle_center', 0.25571227073669434), ('lower_left', 0.24657893180847168), ('middle_left', 0.24586057662963867), ('lower_right', 0.24305272102355957), ('lower_center', 0.23974251747131348), ('middle_right', 0.23496556282043457), ('upper_left', 0.23403549194335938), ('upper_right', 0.23064780235290527), ('upper_center', 0.22963213920593262)]

O O -
- - -
- X X
answer => [('lower_left', 1), ('upper_right', 1), ('middle_center', -1), ('middle_left', -1), ('middle_right', -1)]
policy => [('upper_right', 0.89198422431945801), ('middle_left', 0.54090332984924316), ('lower_left', 0.48573088645935059), ('middle_center', 0.3596651554107666), ('middle_right', 0.3500361442565918)]

- - -
- O X
- - -
answer => [('upper_center', 1), ('lower_center', 1), ('lower_right', 1), ('up

In [None]:
# Train QLearning with an epsilon-greedy policy (eps=EPSILON) until the
# iteration count reaches TEST_LENGTH, then plot the logged lose/draw/win rates.
TEST_LENGTH = 1000000
EPSILON = 0.7
performance_test_result = run_performance_test(QLearning(alpha=0.1, gamma=0.7), epsilon=EPSILON, test_length=TEST_LENGTH)
visualize_test_result("QLearning", performance_test_result)

<img src="./resource/QLearning_eps_0.700000_test_result.png" />

In [46]:
# Load the weights stored for QLearning (eps=0.7) and print expected vs. learned action values.
visualize_policy("./resource/QLearning_0.700000.h5")

- - -
- - -
- - -
answer => [('lower_center', 0), ('middle_center', 0), ('lower_right', 0), ('upper_left', 0), ('upper_center', 0), ('lower_left', 0), ('middle_left', 0), ('middle_right', 0), ('upper_right', 0)]
policy => [('middle_right', 0.27092453837394714), ('middle_center', 0.26903492212295532), ('middle_left', 0.26543664932250977), ('lower_right', 0.26532420516014099), ('upper_left', 0.26419740915298462), ('upper_right', 0.26265621185302734), ('upper_center', 0.26167270541191101), ('lower_center', 0.26039361953735352), ('lower_left', 0.25689363479614258)]

O O -
- - -
- X X
answer => [('lower_left', 1), ('upper_right', 1), ('middle_center', -1), ('middle_left', -1), ('middle_right', -1)]
policy => [('upper_right', 1.1040643453598022), ('middle_left', 0.74620306491851807), ('lower_left', 0.63522964715957642), ('middle_right', 0.58852189779281616), ('middle_center', 0.44708132743835449)]

- - -
- O X
- - -
answer => [('upper_center', 1), ('lower_center', 1), ('lower_right', 1), ('u

# SarsaLambda

In [None]:
# run_performance_test requires `epsilon` and `test_length`, and
# visualize_test_result requires a title; the values below mirror the eps=0.3
# cells of the other algorithms.
# NOTE(review): the other cells construct their algorithm with alpha=0.1,
# gamma=0.7, while SarsaLambda() uses its constructor defaults -- confirm
# which hyper-parameters are intended here.
TEST_LENGTH = 1000000
EPSILON = 0.3
performance_test_result = run_performance_test(SarsaLambda(), epsilon=EPSILON, test_length=TEST_LENGTH)
visualize_test_result("SarsaLambda", performance_test_result)

# QLambda

In [None]:
# Train QLambda with an epsilon-greedy policy (eps=EPSILON) until the
# iteration count reaches TEST_LENGTH, then plot the logged lose/draw/win rates.
TEST_LENGTH = 1000000
EPSILON = 0.3
performance_test_result = run_performance_test(QLambda(alpha=0.1, gamma=0.7), epsilon=EPSILON, test_length=TEST_LENGTH)
visualize_test_result("QLambda", performance_test_result)

<img src="./resource/QLambda_eps_0.300000_test_result.png" />

In [50]:
# Load the weights stored for QLambda (eps=0.3) and print expected vs. learned action values.
visualize_policy("./resource/QLambda_0.300000.h5")

- - -
- - -
- - -
answer => [('lower_center', 0), ('middle_center', 0), ('lower_right', 0), ('upper_left', 0), ('upper_center', 0), ('lower_left', 0), ('middle_left', 0), ('middle_right', 0), ('upper_right', 0)]
policy => [('upper_right', 0.24756813049316406), ('upper_left', -0.12658882141113281), ('lower_left', -0.15363121032714844), ('lower_center', -0.23360061645507812), ('middle_left', -0.24387359619140625), ('middle_center', -0.28474235534667969), ('upper_center', -0.3054351806640625), ('lower_right', -0.33952713012695312), ('middle_right', -0.35486221313476562)]

O O -
- - -
- X X
answer => [('lower_left', 1), ('upper_right', 1), ('middle_center', -1), ('middle_left', -1), ('middle_right', -1)]
policy => [('upper_right', 0.71581268310546875), ('lower_left', -0.16546821594238281), ('middle_left', -0.25571250915527344), ('middle_center', -0.29657936096191406), ('middle_right', -0.36669921875)]

- - -
- O X
- - -
answer => [('upper_center', 1), ('lower_center', 1), ('lower_right', 1

In [None]:
# Train QLambda with an epsilon-greedy policy (eps=EPSILON) until the
# iteration count reaches TEST_LENGTH, then plot the logged lose/draw/win rates.
TEST_LENGTH = 1000000
EPSILON = 0.7
performance_test_result = run_performance_test(QLambda(alpha=0.1, gamma=0.7), epsilon=EPSILON, test_length=TEST_LENGTH)
visualize_test_result("QLambda", performance_test_result)

<img src="./resource/QLambda_eps_0.700000_test_result.png" />

In [49]:
# Load the weights stored for QLambda (eps=0.7) and print expected vs. learned action values.
visualize_policy("./resource/QLambda_0.700000.h5")

- - -
- - -
- - -
answer => [('lower_center', 0), ('middle_center', 0), ('lower_right', 0), ('upper_left', 0), ('upper_center', 0), ('lower_left', 0), ('middle_left', 0), ('middle_right', 0), ('upper_right', 0)]
policy => [('lower_right', 0.24588648974895477), ('upper_left', 0.23946882784366608), ('middle_center', 0.23672489821910858), ('lower_center', 0.23346804082393646), ('middle_right', 0.23132719099521637), ('upper_center', 0.2300858348608017), ('lower_left', 0.22817088663578033), ('upper_right', 0.21583981812000275), ('middle_left', 0.19873262941837311)]

O O -
- - -
- X X
answer => [('lower_left', 1), ('upper_right', 1), ('middle_center', -1), ('middle_left', -1), ('middle_right', -1)]
policy => [('upper_right', 0.80934619903564453), ('middle_left', 0.40353798866271973), ('middle_right', 0.40341246128082275), ('lower_left', 0.36922961473464966), ('middle_center', 0.17810873687267303)]

- - -
- O X
- - -
answer => [('upper_center', 1), ('lower_center', 1), ('lower_right', 1), ('u